In [3]:
import pandas as pd
import numpy as np
import glob
import zipfile

## utility

# 定义转换函数
def format_stock_code(df, column_name):
    """
    股票代码转换成6位,并添加交易所名称
    """
    df[column_name] = df[column_name].astype(str)
    df[column_name] = df[column_name].apply(lambda x: x.zfill(6))
    # 根据股票代码的开头进行后缀的添加
    df.loc[df[column_name].str.startswith(('0', '3')), column_name] += '.SZ'
    df.loc[df[column_name].str.startswith(('4', '8')), column_name] += '.BJ'
    df.loc[df[column_name].str.startswith('6'), column_name] += '.SH'
    return df

def read_and_concatenate_csv(folder_path,columns):
    """
    使用glob模块获取文件夹下所有CSV文件的路径
    """
    csv_files = glob.glob(folder_path + '*.csv')
    
    # 创建一个空列表，用于存储每个CSV文件的DataFrame
    dfs = []
    
    # 逐个读取CSV文件
    for file in csv_files:
        df = pd.read_csv(file)
        df = df[columns]
        dfs.append(df)
        
    # 使用concat方法合并所有DataFrame
    concatenated_df = pd.concat(dfs, ignore_index=True)
    
    return concatenated_df

def get_oth_data(folder_path,columns=None):

    ## 导入文件夹所有csv
    df = read_and_concatenate_csv(folder_path,columns)
    ## 修改股票代码
    df = format_stock_code(df, '股票代码_Stkcd')
    ## 日期标准格式
    df['日期_Date'] = pd.to_datetime(df['日期_Date'] )
    ## 按code和date排序
    df = df.sort_values(by=['股票代码_Stkcd','日期_Date'])
    df = df[columns] 

    return df

def get_OHLC(file_path):

    ## 获取数据
    df = pd.read_csv(file_path,encoding='gbk')
    keywords = ["开盘价", "最高价", "最低价", "收盘价"]
    keywords_1 = ["Open", "High", "Low", "Close"]

    # 找到含关键字的行的索引
    indices = [i for i, val in enumerate(df.iloc[:,0]) if val in keywords]
    # 添加最后一个索引+1，以便能够捕获到最后一个关键字后的所有行
    indices.append(len(df))
    
    # 根据关键字分割DataFrame
    dfs = [df.iloc[indices[n]:indices[n+1]-2] for n in range(len(indices)-1)]

    ## 构造标准格式
    y = []
    for ind,x in enumerate(dfs):
        ## 删除、修改列
        x = x.drop(columns=['Unnamed: 0','Unnamed: 1','日期'])
        x = x.rename(columns={'Date':'code'})
        ## 宽格式转换为长格式
        x= pd.melt(x, id_vars=['code'], var_name='date', value_name=keywords_1[ind])
        ## 时间标准格式
        x['date'] = pd.to_datetime(x['date'])
        # 设置双索引
        x.set_index(['date', 'code'], inplace=True)
        x = x.sort_index(level='date')
        ## 转换成数值格式
        x = x.apply(pd.to_numeric, errors='coerce')
        y.append(x)

    return  pd.concat(y,axis=1)

def get_data():
    
    pass    

def get_preprocess():

    ##
    pass

## 29指标名称
columns = ['股票代码_Stkcd','日期_Date','上市状态_Listedstate','证监会行业门类代码_Csrciccd1',
           '成交量_Trdvol','成交金额_Trdsum','日振幅(%)_Dampltd','总股数日换手率(%)_DFulTurnR',
           '流通股日换手率(%)_DTrdTurnR','累积股价调整乘子_Mcfacpr','汇率_Ex','日收益率_Dret',
           '等权平均市场日收益率_Dreteq','流通市值加权平均市场日收益率_Drettmv','总市值加权平均市场日收益率_Dretmc',
           '等权平均市场日资本收益率_Dareteq','流通市值加权平均市场日资本收益_Darettmv','总市值加权平均日资本收益_Daretmc',
           '市盈率_PE','市净率_PB','市现率_PCF','市销率_PS','每股收益(摊薄)(元/股)_EPS','净资产收益率(摊薄)_ROE',
           '每股公积金(元/股)_AccumFundPS','每股营业利润(元/股)_OpPrfPS','每股净资产(元/股)_NAPS',
           '每股营业收入_IncomePS','每股经营活动现金流量净额(元/股)_NCFfropePS','已上市流通股_Lsttrdshr'
           ]

In [5]:
df =  get_oth_data(folder_path='../量化data/csv_data/', columns=columns)
df = df.rename(columns={'股票代码_Stkcd':'code','日期_Date':'date'})
dfs = get_OHLC('../量化data/(open,high,low,close).csv')
df_ohlc = dfs.reset_index()
merged_df = pd.merge(df, df_ohlc, on=['code', 'date'], how='left')
merged_df.reset_index(inplace = True)
condition = (merged_df['date'] >= '2021-02-01') & (merged_df['date'] < '2024-03-01') #时间选取2021-04-01至2023-09-30
data = merged_df[condition]
data.set_index(['date', 'code'], inplace=True)
data= data.sort_index(level='date')
data

  df = pd.read_csv(file)
  df = pd.read_csv(file_path,encoding='gbk')


Unnamed: 0_level_0,Unnamed: 1_level_0,index,上市状态_Listedstate,证监会行业门类代码_Csrciccd1,成交量_Trdvol,成交金额_Trdsum,日振幅(%)_Dampltd,总股数日换手率(%)_DFulTurnR,流通股日换手率(%)_DTrdTurnR,累积股价调整乘子_Mcfacpr,汇率_Ex,...,每股公积金(元/股)_AccumFundPS,每股营业利润(元/股)_OpPrfPS,每股净资产(元/股)_NAPS,每股营业收入_IncomePS,每股经营活动现金流量净额(元/股)_NCFfropePS,已上市流通股_Lsttrdshr,Open,High,Low,Close
date,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2021-02-01,000001.SZ,20,Norm,J,147523930.0,3.529557e+09,9.9177,0.7602,0.7602,122.9998,1.0,...,4.72,0.58,14.88,1.97,1.58,1.940576e+10,21.8838,23.7772,21.5984,23.3586
2021-02-01,000002.SZ,748,Norm,K,88669362.0,2.456622e+09,3.0946,0.9118,0.9118,119.2797,1.0,...,9.21,4.33,17.45,24.83,2.87,9.724197e+09,23.6902,23.8173,23.0889,23.7749
2021-02-01,000004.SZ,1477,Norm,I,4688700.0,9.048045e+07,5.6269,2.8407,4.0725,4.5505,1.0,...,7.35,0.35,8.78,0.57,-0.11,1.151316e+08,19.9600,19.9600,18.8200,19.1400
2021-02-01,000005.SZ,2205,Norm,N,19355012.0,3.788038e+07,7.4766,1.8285,1.8295,5.9172,1.0,...,0.79,0.01,1.53,0.08,-0.01,1.057946e+09,2.0800,2.0900,1.9300,1.9400
2021-02-01,000006.SZ,2933,Norm,K,4760431.0,2.327165e+07,2.2634,0.3526,0.3526,55.3750,1.0,...,1.69,0.47,5.19,1.19,-0.03,1.349995e+09,4.4554,4.5473,4.4463,4.5289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-29,873679.BJ,3266587,Norm,C,1615423.0,3.762497e+07,3.3391,2.8924,12.6422,1.0000,1.0,...,0.98,1.23,7.00,3.15,1.68,1.277800e+07,22.9700,23.5900,22.8200,23.4800
2023-12-29,873693.BJ,3266632,Norm,C,1421837.0,5.952405e+07,4.7470,1.9558,7.0041,1.0000,1.0,...,1.56,0.29,4.39,0.70,0.21,2.030000e+07,42.0600,42.9800,41.0100,41.5400
2023-12-29,873703.BJ,3266650,Norm,C,2031030.0,7.836494e+07,2.7146,2.7207,14.2528,1.0193,1.0,...,1.39,1.63,5.06,5.08,1.82,1.425000e+07,38.4900,39.1600,38.1100,38.4800
2023-12-29,873726.BJ,3266701,Norm,C,4661968.0,2.004090e+08,25.5395,5.6800,26.7293,1.0187,1.0,...,1.87,0.99,4.58,2.81,2.00,1.744142e+07,39.4300,47.9600,37.9000,43.6400


In [68]:
data.to_csv('../量化data/股票整合数据_未预处理.zip', compression='zip', index=True)