In [3]:
import requests
import calendar
import qstock as qs
from datetime import datetime
from jsonpath import jsonpath
from tqdm import tqdm
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
from qstock.data.trade import latest_report_date,market_realtime
from qstock.data.util import (trans_num,cn_headers,get_code_id,request_header, session,)
import warnings
warnings.filterwarnings("ignore")
def stock_indicator(code):
    """
    修改自qstock源码
    获取个股历史报告期所有财务分析指标
    https://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/stockid/600004/ctrl/2019/displaytype/4.phtml
    code: 股票代码或简称
    """
    if not code.isdigit():
        code=stock_code_dict()[code]
    
    url = f"https://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/stockid/{code}/ctrl/2020/displaytype/4.phtml"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    year_context = soup.find(attrs={"id": "con02-1"}).find("table").find_all("a")
    year_list = [item.text for item in year_context]
    df = pd.DataFrame()
    for year in year_list[:6]:
        url = f"https://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/stockid/{code}/ctrl/{year}/displaytype/4.phtml"
        r = requests.get(url)
        temp_df = pd.read_html(r.text)[12].iloc[:, :-1]
        temp_df.columns = temp_df.iloc[0, :]
        temp_df = temp_df.iloc[1:, :]
        df0 = pd.DataFrame()
        indicator_list = ["每股指标", "盈利能力", "成长能力", "营运能力", "偿债及资本结构", "现金流量", "其他指标"]
        for i in range(len(indicator_list)):
            if i == 6:
                inner_df = temp_df[
                    temp_df.loc[
                        temp_df.iloc[:, 0].str.find(indicator_list[i]) == 0, :
                    ].index[0] :
                ].T
            else:
                inner_df = temp_df[
                    temp_df.loc[temp_df.iloc[:, 0].str.find(indicator_list[i]) == 0, :]
                    .index[0] : temp_df.loc[
                        temp_df.iloc[:, 0].str.find(indicator_list[i + 1]) == 0, :
                    ]
                    .index[0]
                    - 1
                ].T
            inner_df = inner_df.reset_index(drop=True)
            df0 = pd.concat([df0, inner_df], axis=1)
        df0.columns = df0.iloc[0, :].tolist()
        df0 = df0.iloc[1:, :]
        df0.index = temp_df.columns.tolist()[1:]
        df = pd.concat([df, df0])

    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df.rename(columns={'index': '日期'}, inplace=True)
    fields=['日期','摊薄每股收益(元)','每股净资产_调整后(元)','每股经营性现金流(元)',
            '每股资本公积金(元)','每股未分配利润(元)','总资产(元)','扣除非经常性损益后的净利润(元)',
            '主营业务利润率(%)','总资产净利润率(%)','销售净利率(%)','净资产报酬率(%)','资产报酬率(%)',
            '净资产收益率(%)','加权净资产收益率(%)','成本费用利润率(%)','主营业务成本率(%)',
            '应收账款周转率(次)','存货周转率(次)','固定资产周转率(次)','总资产周转率(次)',
            '流动资产周转率(次)','流动比率','速动比率','现金比率(%)','产权比率(%)','资产负债率(%)',
            '经营现金净流量对销售收入比率(%)','经营现金净流量与净利润的比率(%)','经营现金净流量对负债比率(%)',
            '主营业务收入增长率(%)','净利润增长率(%)','净资产增长率(%)','总资产增长率(%)']

    new_names=['日期','每股收益','调整每股净资产','每股现金流','每股公积金','每股未分配利润','总资产','扣非净利润',
          '主营利润率','总资产净利率','销售净利率','净资产报酬率','资产报酬率','净资产收益率','加权净资产收益率',
          '成本费用利润率','主营业务成本率','应收账款周转率','存货周转率','固定资产周转率','总资产周转率',
          '流动资产周转率','流动比率','速动比率','现金比率','产权比率','资产负债率','现金流销售比',
          '现金流净利润比','现金流负债比','主营收入增长率','净利润增长率','净资产增长率','总资产增长率']
    
    result=df[fields].rename(columns=dict(zip(fields,new_names)))
    return result

In [4]:
import akshare as ak

index_stock_cons_csindex_df = ak.index_stock_cons_csindex(symbol="000906")
Z800_stocks = index_stock_cons_csindex_df.成分券代码.values
Z800_stocks

array(['000001', '000002', '000009', '000021', '000027', '000031',
       '000032', '000039', '000050', '000060', '000063', '000066',
       '000100', '000155', '000156', '000157', '000166', '000301',
       '000333', '000338', '000400', '000401', '000408', '000423',
       '000425', '000513', '000519', '000537', '000538', '000539',
       '000559', '000563', '000568', '000591', '000596', '000598',
       '000617', '000623', '000625', '000629', '000630', '000636',
       '000651', '000661', '000683', '000703', '000708', '000709',
       '000723', '000725', '000728', '000729', '000733', '000738',
       '000739', '000750', '000768', '000776', '000778', '000783',
       '000785', '000786', '000792', '000800', '000807', '000818',
       '000825', '000830', '000831', '000858', '000876', '000878',
       '000883', '000887', '000893', '000895', '000898', '000921',
       '000932', '000933', '000937', '000938', '000958', '000959',
       '000960', '000963', '000967', '000977', '000983', '0009

In [5]:
import time
from random import randint as Rint
stock_dfs_dict = {}
for code in tqdm(Z800_stocks):
    while code not in stock_dfs_dict:
        try:
            stock_dfs_dict[code] =  stock_indicator(code)
            time.sleep(Rint(1,6))
        except:
            time.sleep(Rint(30,45))

100%|██████████████████████████████████████████████████████████████████████████████| 800/800 [2:43:53<00:00, 12.29s/it]


In [101]:
stock_dfs_dict['000002']

Unnamed: 0,日期,每股收益,调整每股净资产,每股现金流,每股公积金,每股未分配利润,总资产,扣非净利润,主营利润率,总资产净利率,...,资产负债率,现金流销售比,现金流净利润比,现金流负债比,主营收入增长率,净利润增长率,净资产增长率,总资产增长率,code,date
1,2023-12-31,1.7145,21.15,0.3279,2.0998,7.8467,1504850172117.8,9793841193.09,11.2446,1.2542,...,73.2243,0.0084,0.1913,0.0036,-7.5618,-45.5258,-0.5082,-14.3572,2,2024-03-31
3,2023-06-30,1.2714,21.03,0.1563,2.0826,8.3656,1684196409372.7,8702699763.64,14.5767,0.8815,...,76.0928,0.0093,0.1229,0.0015,-2.9112,-16.2104,2.7919,-10.3771,2,2023-08-31
5,2022-12-31,3.2286,21.0,0.2365,1.9661,8.3727,1757124444202.9,19762103017.38,14.712,2.0321,...,76.9515,0.0055,0.0732,0.002,11.2723,-1.3623,3.1109,-9.3629,2,2023-03-31
7,2022-06-30,1.5572,20.28,0.7129,1.7614,8.0552,1879202244259.1,11720990962.68,16.4224,0.9483,...,79.1556,0.0401,0.4578,0.0056,23.8197,11.9283,7.6949,-3.8794,2,2022-08-31
9,2021-12-31,3.2747,20.3,0.3538,1.7706,7.9738,1938638128699.1,22381781882.05,17.1743,1.9995,...,79.7398,0.0091,0.108,0.0027,8.0375,-35.7998,12.2707,3.7161,2,2022-03-31
11,2021-06-30,1.3921,19.1,0.5837,1.5844,8.1721,1955046076839.6,10727286571.01,18.2197,0.8458,...,81.3958,0.0406,0.4193,0.0043,14.1862,-13.1819,24.88,8.2416,2,2021-08-31
13,2020-12-31,5.1041,19.32,4.5782,1.5971,8.4713,1869177094005.6,40237711134.26,22.7467,3.2952,...,81.2835,0.1269,0.897,0.035,13.9219,7.5574,29.2947,8.0493,2,2021-03-31
15,2020-06-30,1.6035,16.84,1.9458,1.6572,8.2674,1806186613769.1,12114207886.28,24.1393,1.0537,...,83.8745,0.1545,1.2135,0.0149,5.0455,-3.404,25.2889,14.5289,2,2020-08-31
17,2019-12-31,4.878,16.64,4.0423,1.0958,8.4366,1729929450401.2,38314387512.31,27.3009,3.3839,...,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724,2,2020-03-31
19,2019-06-30,1.7064,14.25,0.7833,1.0795,8.1183,1577056909941.3,11749678942.31,28.0576,1.242,...,85.2594,0.0635,0.4591,0.0066,31.4656,42.6181,12.9667,17.2401,2,2019-08-31


In [128]:
stock_dfs_dict_Price = {}
price_df = qs.get_data(Z800_stocks, start='2019-03-30').reset_index()

                                                                                                                       

In [129]:
def zhang_5(df):
    df = df.ffill()
    df = df[df.close>0]
    df['30day_close'] = df.close.shift(-21)
    df = df.dropna()
    df['30dGain'] = df['30day_close'] / df['close'] - 1
    return df
price_df = price_df.groupby('name').apply(zhang_5).reset_index(drop=True)
price_df['date'] = price_df['date'].apply(lambda x : x.strftime('%Y-%m-%d'))
price_df

Unnamed: 0,date,name,code,open,high,low,close,volume,turnover,turnover_rate,30day_close,30dGain
0,2020-12-30,DR派能科,688063,139.73,182.61,135.27,164.93,273047,5.848337e+09,75.95,166.86,0.011702
1,2020-12-31,DR派能科,688063,171.86,183.29,168.36,181.59,97925,2.458007e+09,27.24,161.15,-0.112561
2,2021-01-04,DR派能科,688063,189.58,224.01,182.71,211.87,88136,2.488008e+09,24.52,166.70,-0.213197
3,2021-01-05,DR派能科,688063,212.49,228.29,201.24,211.24,76809,2.317842e+09,21.37,146.94,-0.304393
4,2021-01-06,DR派能科,688063,216.15,232.58,211.86,220.44,65476,2.056985e+09,18.21,145.97,-0.337824
...,...,...,...,...,...,...,...,...,...,...,...,...
934370,2024-05-16,龙源电力,001289,18.00,18.10,17.81,17.88,79126,1.416493e+08,5.93,18.13,0.013982
934371,2024-05-17,龙源电力,001289,17.89,18.18,17.89,18.16,68306,1.234622e+08,5.12,18.00,-0.008811
934372,2024-05-20,龙源电力,001289,18.18,18.65,18.13,18.48,104716,1.931888e+08,7.85,17.77,-0.038420
934373,2024-05-21,龙源电力,001289,18.49,18.49,18.16,18.24,48725,8.910555e+07,3.65,17.75,-0.026864


In [130]:
price_df.describe()

Unnamed: 0,open,high,low,close,volume,turnover,turnover_rate,30day_close,30dGain
count,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0
mean,33.122671,33.863437,32.451881,33.147654,330153.7,543165200.0,1.642359,33.255185,0.013538
std,74.258007,75.606477,73.00646,74.308012,661809.4,900202600.0,2.611931,74.655262,0.289921
min,-0.24,0.04,-0.39,0.01,108.0,232487.0,0.01,0.01,-0.982759
25%,6.9,7.01,6.8,6.9,62117.0,120124900.0,0.48,6.93,-0.067896
50%,15.01,15.3,14.75,15.02,150079.0,257957500.0,0.91,15.11,-0.00444
75%,33.71,34.44,33.04,33.74,348235.5,583092700.0,1.81,33.89,0.069915
max,2449.21,2489.11,2346.23,2462.23,41144530.0,47979120000.0,84.52,2462.23,166.0


In [131]:
def sort_percentile(df):
    df = df.sort_values(by='30dGain', ascending=True).reset_index(drop=True)
    df = df.reset_index()
    df['percentile'] = df['index']/len(df)
    return df.drop(['index'],axis=1)
price_df = price_df.groupby('date').apply(sort_percentile).reset_index(drop=True)
price_df

Unnamed: 0,date,name,code,open,high,low,close,volume,turnover,turnover_rate,30day_close,30dGain,percentile
0,2019-04-01,九安医疗,002432,3.54,4.26,3.44,4.26,514423,3.818892e+08,11.89,1.87,-0.561033,0.000000
1,2019-04-01,石英股份,603688,5.06,5.19,4.82,5.15,46474,7.495228e+07,1.38,2.53,-0.508738,0.001531
2,2019-04-01,天华新能,300390,1.98,2.22,1.98,2.16,152842,1.659723e+08,8.40,1.15,-0.467593,0.003063
3,2019-04-01,弘元绿能,603185,11.19,11.53,11.15,11.52,28056,1.518937e+08,8.91,6.75,-0.414062,0.004594
4,2019-04-01,锦浪科技,300763,13.67,14.38,13.44,14.01,66833,4.479488e+08,33.42,8.61,-0.385439,0.006126
...,...,...,...,...,...,...,...,...,...,...,...,...,...
934370,2024-05-22,新易盛,300502,89.20,89.98,87.11,88.65,214272,1.890106e+09,3.47,112.77,0.272081,0.993750
934371,2024-05-22,思瑞浦,688536,84.61,89.17,84.61,89.07,13038,1.136772e+08,0.98,114.20,0.282138,0.995000
934372,2024-05-22,佰维存储,688525,47.20,47.98,46.40,47.75,112739,5.334265e+08,4.49,63.30,0.325654,0.996250
934373,2024-05-22,鹏鼎控股,002938,28.10,28.79,27.65,28.45,189503,5.455245e+08,0.82,37.90,0.332162,0.997500


In [132]:
price_df.describe()

Unnamed: 0,open,high,low,close,volume,turnover,turnover_rate,30day_close,30dGain,percentile
count,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0,934375.0
mean,33.122671,33.863437,32.451881,33.147654,330153.7,543165200.0,1.642359,33.255185,0.013538,0.499333
std,74.258007,75.606477,73.00646,74.308012,661809.4,900202600.0,2.611931,74.655262,0.289921,0.288675
min,-0.24,0.04,-0.39,0.01,108.0,232487.0,0.01,0.01,-0.982759,0.0
25%,6.9,7.01,6.8,6.9,62117.0,120124900.0,0.48,6.93,-0.067896,0.249339
50%,15.01,15.3,14.75,15.02,150079.0,257957500.0,0.91,15.11,-0.00444,0.499334
75%,33.71,34.44,33.04,33.74,348235.5,583092700.0,1.81,33.89,0.069915,0.749344
max,2449.21,2489.11,2346.23,2462.23,41144530.0,47979120000.0,84.52,2462.23,166.0,0.99875


In [133]:
def Y_label(x):
    if x >= 0.8:
        return np.array([1,0,0])
    elif x> 0.2 and x <0.8:
        return np.array([0,1,0])
    elif x<=0.2:
        return np.array([0,0,1])

price_df['Y'] = price_df['percentile'].apply(lambda x:Y_label(x))
price_df = price_df.dropna()
price_df

Unnamed: 0,date,name,code,open,high,low,close,volume,turnover,turnover_rate,30day_close,30dGain,percentile,Y
0,2019-04-01,九安医疗,002432,3.54,4.26,3.44,4.26,514423,3.818892e+08,11.89,1.87,-0.561033,0.000000,"[0, 0, 1]"
1,2019-04-01,石英股份,603688,5.06,5.19,4.82,5.15,46474,7.495228e+07,1.38,2.53,-0.508738,0.001531,"[0, 0, 1]"
2,2019-04-01,天华新能,300390,1.98,2.22,1.98,2.16,152842,1.659723e+08,8.40,1.15,-0.467593,0.003063,"[0, 0, 1]"
3,2019-04-01,弘元绿能,603185,11.19,11.53,11.15,11.52,28056,1.518937e+08,8.91,6.75,-0.414062,0.004594,"[0, 0, 1]"
4,2019-04-01,锦浪科技,300763,13.67,14.38,13.44,14.01,66833,4.479488e+08,33.42,8.61,-0.385439,0.006126,"[0, 0, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
934370,2024-05-22,新易盛,300502,89.20,89.98,87.11,88.65,214272,1.890106e+09,3.47,112.77,0.272081,0.993750,"[1, 0, 0]"
934371,2024-05-22,思瑞浦,688536,84.61,89.17,84.61,89.07,13038,1.136772e+08,0.98,114.20,0.282138,0.995000,"[1, 0, 0]"
934372,2024-05-22,佰维存储,688525,47.20,47.98,46.40,47.75,112739,5.334265e+08,4.49,63.30,0.325654,0.996250,"[1, 0, 0]"
934373,2024-05-22,鹏鼎控股,002938,28.10,28.79,27.65,28.45,189503,5.455245e+08,0.82,37.90,0.332162,0.997500,"[1, 0, 0]"


In [134]:
temp = {}
for code,df in price_df.groupby('code'):
    temp[code] = df
price_df = temp
del temp
price_df

{'000001':               date  name    code   open   high    low  close   volume  \
 565     2019-04-01  平安银行  000001  11.06  11.78  11.06  11.41  1951401   
 1184    2019-04-02  平安银行  000001  11.51  11.71  11.46  11.59  1100384   
 1823    2019-04-03  平安银行  000001  11.44  11.68  11.38  11.67   792916   
 2397    2019-04-04  平安银行  000001  11.66  12.23  11.66  12.09  2034365   
 3041    2019-04-08  平安银行  000001  12.13  12.66  11.95  12.19  1743176   
 ...            ...   ...     ...    ...    ...    ...    ...      ...   
 930829  2024-05-16  平安银行  000001  10.13  10.49  10.11  10.45  3076292   
 931525  2024-05-17  平安银行  000001  10.46  10.70  10.41  10.70  2841749   
 932445  2024-05-20  平安银行  000001  10.71  10.81  10.59  10.66  2170210   
 933150  2024-05-21  平安银行  000001  10.60  10.90  10.59  10.83  2030049   
 933914  2024-05-22  平安银行  000001  10.84  11.02  10.74  10.84  2115531   
 
             turnover  turnover_rate  30day_close   30dGain  percentile  \
 565     2.588269e+09    

In [156]:
def date_adjust(x):
    x = x.split('-')
    if x[1] == '03':
        x[1] = '04'
        x[2] = '30'
    elif x[1] == '06':
        x[1] = '08'
        x[2] = '31'
    elif x[1] == '09':
        x[1] = '10'
        x[2] = '31'
    elif x[1] == '12':
        x[0] = str(int(x[0])+1)
        x[1] = '03'
        x[2] = '31'
    return '-'.join(x)
stock_dfs_dict_withPrice = {}
for code in tqdm(Z800_stocks):
    stock_dfs_dict[code]['code'] = code
    stock_dfs_dict[code]['date'] = stock_dfs_dict[code]['日期'].apply(lambda x: date_adjust(x))
    stock_dfs_dict[code] = stock_dfs_dict[code].fillna(0)
    stock_dfs_dict[code].replace('--', np.nan, inplace=True)
    stock_dfs_dict[code] = stock_dfs_dict[code].dropna()
    price_df[code] = price_df[code].fillna(0)
    stock_dfs_dict_withPrice[code] = pd.merge(left = price_df[code],right = stock_dfs_dict[code],how='left',on = ['code','date'])
    stock_dfs_dict_withPrice[code] = stock_dfs_dict_withPrice[code].ffill()
    stock_dfs_dict_withPrice[code] = stock_dfs_dict_withPrice[code].dropna()
    if len(stock_dfs_dict_withPrice[code]) == 0:
        del stock_dfs_dict_withPrice[code]

100%|███████████████████████████████████████████████████████████████████████████████| 800/800 [00:07<00:00, 112.22it/s]


In [157]:
train_df =  pd.concat(stock_dfs_dict_withPrice.values(), axis=0)

In [158]:
train_df.columns

Index(['date', 'name', 'code', 'open', 'high', 'low', 'close', 'volume',
       'turnover', 'turnover_rate', '30day_close', '30dGain', 'percentile',
       'Y', '日期', '每股收益', '调整每股净资产', '每股现金流', '每股公积金', '每股未分配利润', '总资产',
       '扣非净利润', '主营利润率', '总资产净利率', '销售净利率', '净资产报酬率', '资产报酬率', '净资产收益率',
       '加权净资产收益率', '成本费用利润率', '主营业务成本率', '应收账款周转率', '存货周转率', '固定资产周转率',
       '总资产周转率', '流动资产周转率', '流动比率', '速动比率', '现金比率', '产权比率', '资产负债率', '现金流销售比',
       '现金流净利润比', '现金流负债比', '主营收入增长率', '净利润增长率', '净资产增长率', '总资产增长率'],
      dtype='object')

In [159]:
train_df = train_df.drop(['日期','30day_close','percentile'],axis = 1)

In [160]:
train_df 

Unnamed: 0,date,name,code,open,high,low,close,volume,turnover,turnover_rate,...,现金比率,产权比率,资产负债率,现金流销售比,现金流净利润比,现金流负债比,主营收入增长率,净利润增长率,净资产增长率,总资产增长率
243,2020-03-31,万 科Ａ,000002,22.43,22.50,21.58,21.73,607044,1.568993e+09,0.62,...,13.0593,512.5784,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724
244,2020-04-01,万 科Ａ,000002,22.53,23.04,22.33,22.71,1074332,2.866233e+09,1.11,...,13.0593,512.5784,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724
245,2020-04-02,万 科Ａ,000002,22.58,23.02,22.43,22.96,663097,1.768780e+09,0.68,...,13.0593,512.5784,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724
246,2020-04-03,万 科Ａ,000002,22.83,23.26,22.68,22.85,569453,1.531608e+09,0.59,...,13.0593,512.5784,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724
247,2020-04-07,万 科Ａ,000002,23.42,23.50,22.88,23.15,671540,1.817213e+09,0.69,...,13.0593,512.5784,84.359,0.1242,0.8287,0.0313,23.5873,11.8917,14.8367,13.1724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,2024-05-16,九号公司-WD,689009,41.92,42.20,39.74,40.43,123043,5.028714e+08,2.39,...,87.1942,84.6035,47.0831,0.2416,4.7657,0.2243,-9.1347,-13.0051,15.3859,11.5099
860,2024-05-17,九号公司-WD,689009,40.23,42.04,40.21,41.10,89619,3.721017e+08,1.74,...,87.1942,84.6035,47.0831,0.2416,4.7657,0.2243,-9.1347,-13.0051,15.3859,11.5099
861,2024-05-20,九号公司-WD,689009,40.58,42.60,39.22,39.70,129609,5.269698e+08,2.51,...,87.1942,84.6035,47.0831,0.2416,4.7657,0.2243,-9.1347,-13.0051,15.3859,11.5099
862,2024-05-21,九号公司-WD,689009,39.51,40.55,39.51,39.79,62921,2.527247e+08,1.22,...,87.1942,84.6035,47.0831,0.2416,4.7657,0.2243,-9.1347,-13.0051,15.3859,11.5099


In [161]:

for col in [ 'open', 'high', 'low', 'close', 'volume',
       'turnover', 'turnover_rate','30dGain',
         '每股收益', '调整每股净资产', '每股现金流', '每股公积金', '每股未分配利润', '总资产', '扣非净利润',
       '主营利润率', '总资产净利率', '销售净利率', '净资产报酬率', '资产报酬率', '净资产收益率', '加权净资产收益率',
       '成本费用利润率', '主营业务成本率', '应收账款周转率', '存货周转率', '固定资产周转率', '总资产周转率',
       '流动资产周转率', '流动比率', '速动比率', '现金比率', '产权比率', '资产负债率', '现金流销售比', '现金流净利润比',
       '现金流负债比', '主营收入增长率', '净利润增长率', '净资产增长率', '总资产增长率']:

    train_df[col] = train_df[col].astype(float)

In [175]:
for col in ['每股收益', '调整每股净资产', '每股现金流', '每股未分配利润']:
    train_df[f'priceTo{col}'] = train_df.close / train_df[col]
    
def MinMaxScalefunc(data,window):   
    max1 = data.rolling(window).max()   
    min1 = data.rolling(window).min()   
    return (data - min1) / (max1- min1+ 1e-9)

temp = []
for code,group in tqdm(train_df.groupby('code')):
    for col in ['每股收益', '调整每股净资产', '每股现金流', '每股公积金', '每股未分配利润', '总资产', '扣非净利润',
       '主营利润率', '总资产净利率', '销售净利率', '净资产报酬率', '资产报酬率', '净资产收益率', '加权净资产收益率',
       '成本费用利润率', '主营业务成本率', '应收账款周转率', '存货周转率', '固定资产周转率', '总资产周转率',
       '流动资产周转率', '流动比率', '速动比率', '现金比率', '产权比率', '资产负债率', '现金流销售比', '现金流净利润比',
       '现金流负债比', '主营收入增长率', '净利润增长率', '净资产增长率', '总资产增长率']:
        group[f'{col}_scale'] = MinMaxScalefunc(group[col],100)
        try:
            group[f'priceTo{col}_scale'] = MinMaxScalefunc(group[col],100)
        except:
            group[f'priceTo{col}_scale'] = MinMaxScalefunc(group[col],100)
    temp.append(group)
train_df =  pd.concat(temp)
train_df['code'] = train_df['code'].astype(str)
train_df

100%|████████████████████████████████████████████████████████████████████████████████| 695/695 [00:22<00:00, 31.10it/s]


Unnamed: 0,date,name,code,open,high,low,close,volume,turnover,turnover_rate,...,现金流负债比_scale,priceTo现金流负债比_scale,主营收入增长率_scale,priceTo主营收入增长率_scale,净利润增长率_scale,priceTo净利润增长率_scale,净资产增长率_scale,priceTo净资产增长率_scale,总资产增长率_scale,priceTo总资产增长率_scale
171,2020-03-31,万 科Ａ,000002,22.43,22.50,21.58,21.73,607044.0,1.568993e+09,0.62,...,,,,,,,,,,
671,2020-04-01,万 科Ａ,000002,22.53,23.04,22.33,22.71,1074332.0,2.866233e+09,1.11,...,,,,,,,,,,
1399,2020-04-02,万 科Ａ,000002,22.58,23.02,22.43,22.96,663097.0,1.768780e+09,0.68,...,,,,,,,,,,
1961,2020-04-03,万 科Ａ,000002,22.83,23.26,22.68,22.85,569453.0,1.531608e+09,0.59,...,,,,,,,,,,
2585,2020-04-07,万 科Ａ,000002,23.42,23.50,22.88,23.15,671540.0,1.817213e+09,0.69,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641610,2024-05-16,九号公司-WD,689009,41.92,42.20,39.74,40.43,123043.0,5.028714e+08,2.39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
642788,2024-05-17,九号公司-WD,689009,40.23,42.04,40.21,41.10,89619.0,3.721017e+08,1.74,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643258,2024-05-20,九号公司-WD,689009,40.58,42.60,39.22,39.70,129609.0,5.269698e+08,2.51,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643694,2024-05-21,九号公司-WD,689009,39.51,40.55,39.51,39.79,62921.0,2.527247e+08,1.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
train_df = train_df.dropna(axis=0)
train_df = train_df.sort_values(by='date').reset_index(drop = True)
train_df

In [178]:
train_size = int(len(train_df) * 0.6)
val_size = int(len(train_df) * 0.2)
train = train_df[:train_size]  # 训练集
validation = train_df[train_size:train_size + val_size]  # 验证集
test = train_df[train_size + val_size:] # 测试集

In [179]:
train.to_pickle('train.pkl')
validation.to_pickle('val.pkl')
test.to_pickle('test.pkl')