## 导入tushare

In [1]:
import os

import pandas as pd
import tushare as ts
# Show the installed tushare version as the cell output (the recorded run shows '1.2.78').
ts.__version__

'1.2.78'

## 设置token

In [2]:
# Read the Tushare API token from the environment instead of hardcoding it:
# a token stored in the notebook is exposed to everyone who can read the file.
# NOTE(review): the token that used to be embedded in this cell should be treated
# as leaked and regenerated in the Tushare account settings.
_tushare_token = os.environ.get('TUSHARE_TOKEN')
if not _tushare_token:
    raise RuntimeError('Set the TUSHARE_TOKEN environment variable before running this notebook.')
ts.set_token(_tushare_token)

## 初始化pro接口

In [3]:
# Create the Tushare Pro client used by the `pro.*` queries in later cells.
# No token is passed here; presumably pro_api() picks up the one registered
# through ts.set_token() in the previous cell -- confirm against the tushare docs.
pro = ts.pro_api()

## 上证50成份股股票代码
直接在东方财富网上（ https://data.eastmoney.com/other/index/sz50.html ）获取最新上证50成份股的代码

In [5]:
# SSE 50 constituents as bare six-digit codes (no exchange suffix), copied by hand
# from the Eastmoney page referenced in the markdown above. Kept as a whitespace
# separated table so the list is easy to diff when the index is rebalanced.
sz50_codes = """
600000 600028 600030 600031 600036 600048 600050 600104 600196 600276
600309 600436 600438 600519 600547 600570 600585 600588 600690 600745
600809 600837 600887 600893 600900 601012 601066 601088 601138 601166
601211 601288 601318 601336 601398 601601 601628 601633 601668 601688
601728 601857 601888 601899 601919 601995 603259 603288 603501 603986
""".split()
print(len(sz50_codes))

50


## 查看上证50成份股信息
获取股票的Tushare代码（ts_code）以及股票名称、行业、上市日期等信息，保存到csv文件

In [6]:
# Metadata (ts_code, symbol, name, area, industry, list_date) of the SSE 50 constituents.
# The Tushare query runs only when the cached CSV is missing, so a plain re-run of
# the notebook does not hit the API again, and a fresh checkout can still rebuild
# the file instead of failing on read_csv.
SZ50_INFO_CSV = './data/上证50股票信息.csv'
if not os.path.exists(SZ50_INFO_CSV):
    all_stocks_info = pro.stock_basic(exchange='', list_status='L',
                                      fields='ts_code,symbol,name,area,industry,list_date')
    sz50_info = all_stocks_info[all_stocks_info['symbol'].isin(sz50_codes)].reset_index(drop=True)
    print(len(sz50_info))
    print(sz50_info)
    sz50_info.to_csv(SZ50_INFO_CSV, index=False)

# Always load from the CSV so the dtypes seen by later cells do not depend on
# whether the data was just fetched or came from the cache.
sz50_info = pd.read_csv(SZ50_INFO_CSV)

## 获取上证50成份股历史数据
提取股票信息的ts_code列，循环通过pro_bar接口（adj='qfq'，前复权）查询各股票2010-2021年的日线数据并拼接，结果按股票代码和交易日期排序后保存到csv文件。

In [7]:
# Download forward-adjusted (adj='qfq') bars for every SSE 50 constituent,
# 2010-01-01 .. 2021-12-31, one API call per stock.
query_codes = sz50_info['ts_code'].tolist()
sz50_prices = []
for code in query_codes:
    bars = ts.pro_bar(ts_code=code, adj='qfq', start_date='20100101', end_date='20211231')
    # pd.concat silently drops None entries, so a request that yields None would
    # make a stock disappear from the dataset without any error. Fail loudly.
    # NOTE(review): pro_bar appears to return None rather than raise on some
    # failures -- confirm for the tushare version in use.
    if bars is None:
        raise RuntimeError(f'ts.pro_bar returned no data for {code}')
    sz50_prices.append(bars)

# One long table, ordered by stock code and then by trading date.
sz50_prices = pd.concat(sz50_prices, axis=0, ignore_index=True)
sz50_prices = sz50_prices.sort_values(by=['ts_code', 'trade_date']).reset_index(drop=True)
print(sz50_prices.shape)

# index=False (instead of the falsy index=None) keeps the row index out of the file.
sz50_prices.to_csv('./data/上证50股票历史股价2010-2021.csv', index=False)

(124128, 11)


## 选取交易股
在上证50成份股中，先保留2009年1月1日之前上市的股票，剔除曾长时间停牌的股票，再按2010-2021年间交易日数量从多到少取前20只作为交易股

In [3]:
# Reload the cached inputs so this cell can run right after a kernel restart.
sz50_info = pd.read_csv('./data/上证50股票信息.csv')
sz50_prices = pd.read_csv('./data/上证50股票历史股价2010-2021.csv')

# Candidate codes ordered by number of price rows in 2010-2021, most rows first
# (value_counts sorts by count, descending).
picked_stockcode = sz50_prices['ts_code'].value_counts().index.to_list()

# Keep only stocks listed before 2009-01-01. list_date is compared as an integer
# in YYYYMMDD form; one indexed lookup replaces a boolean-mask scan per code.
# Per the original author's note, 34 of the SSE 50 constituents pass this filter.
list_date_by_code = sz50_info.drop_duplicates('ts_code').set_index('ts_code')['list_date']
picked_stockcode = [code for code in picked_stockcode if list_date_by_code[code] < 20090101]

# Drop stocks that were suspended for long periods:
#   600745.SH (Wingtech) was suspended for about half a year, 2018-04 .. 2018-12
#   600547.SH (Shandong Gold) was suspended in 2013, 2014 and 2017
suspended_codes = {'600745.SH', '600547.SH'}
picked_stockcode = [code for code in picked_stockcode if code not in suspended_codes]

# Take the first 20 remaining candidates. Because of the value_counts ordering
# above, these are the stocks with the most trading days in the price file --
# not strictly the ones with the earliest listing date.
picked_stockcode = picked_stockcode[:20]
print(picked_stockcode)

picked_stocks = sz50_prices[sz50_prices['ts_code'].isin(picked_stockcode)]
print(picked_stocks.head())
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
picked_stocks.info()

['601601.SH', '601628.SH', '600028.SH', '601857.SH', '600519.SH', '600276.SH', '600837.SH', '600585.SH', '601398.SH', '600031.SH', '600048.SH', '600809.SH', '601166.SH', '600036.SH', '600436.SH', '600588.SH', '600196.SH', '600030.SH', '601899.SH', '600887.SH']
        ts_code  trade_date    open    high     low   close  pre_close  \
2870  600028.SH    20100104  6.0869  6.1041  5.9281  5.9324     6.0483   
2871  600028.SH    20100105  5.9410  6.0311  5.8079  5.9667     5.9324   
2872  600028.SH    20100106  5.9453  5.9581  5.8251  5.8380     5.9667   
2873  600028.SH    20100107  5.8422  5.8723  5.6062  5.6834     5.8380   
2874  600028.SH    20100108  5.6190  5.6448  5.4688  5.6233     5.6834   

      change  pct_chg         vol       amount  
2870 -0.1159  -1.9162   842260.21  1180334.595  
2871  0.0343   0.5782  1100179.18  1517311.545  
2872 -0.1287  -2.1570  1042644.64  1430293.213  
2873 -0.1546  -2.6482  1241500.42  1663420.138  
2874 -0.0601  -1.0575  1321802.82  1712045.170  


## 选取交易日期
由于各股票在某些日期没有股价数据，故选择各股票数据的交集作为实际的交易日期

In [4]:
# Trading dates of each picked stock, keyed by ts_code. groupby replaces a Python
# loop over every price row; sort=False keeps the codes in first-appearance order.
trade_dates_dict = {
    code: set(dates.tolist())
    for code, dates in picked_stocks.groupby('ts_code', sort=False)['trade_date']
}

# Stock with the most trading dates (the first one wins on ties).
max_tradecode = max(trade_dates_dict, key=lambda code: len(trade_dates_dict[code]), default='')
max_tradecount = len(trade_dates_dict.get(max_tradecode, ()))
print('原数据中交易日期最多的股票:', max_tradecode)
print('交易天数:', max_tradecount)
print('--------------------------')

# Dates on which every picked stock has a price row. Intersecting all sets in one
# call avoids using "empty set" as a not-yet-initialised marker, which would
# restart the intersection from the next stock whenever it became empty.
common_dates = set.intersection(*trade_dates_dict.values()) if trade_dates_dict else set()
print('交易股取交集的交易天数:', len(common_dates))

# Ascending one-column table of the shared dates; later cells filter with it.
trade_dates = pd.DataFrame({'trade_date': sorted(common_dates)})
print(trade_dates.head())
print(trade_dates.tail())

# Written with the default index, so the file's first column is the row number.
trade_dates.to_csv('./data/上证50成份股（20只）交易日期2010-2021.csv')

原数据中交易日期最多的股票: 601601.SH
交易天数: 2914
--------------------------
交易股取交集的交易天数: 2622
   trade_date
0    20100104
1    20100105
2    20100106
3    20100107
4    20100112
      trade_date
2617    20211227
2618    20211228
2619    20211229
2620    20211230
2621    20211231


## 最终提取的交易数据

In [27]:
# Keep only the rows whose date is shared by all picked stocks, so every stock
# ends up with exactly the same set of trading dates.
final_stocks = picked_stocks[picked_stocks['trade_date'].isin(trade_dates['trade_date'])].reset_index(drop=True)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
final_stocks.info()
print(final_stocks.head())
# Written with the default index: the file's first (unnamed) column is the row number.
final_stocks.to_csv('./data/szstock_20.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52440 entries, 0 to 52439
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ts_code     52440 non-null  object 
 1   trade_date  52440 non-null  int64  
 2   open        52440 non-null  float64
 3   high        52440 non-null  float64
 4   low         52440 non-null  float64
 5   close       52440 non-null  float64
 6   pre_close   52440 non-null  float64
 7   change      52440 non-null  float64
 8   pct_chg     52440 non-null  float64
 9   vol         52440 non-null  float64
 10  amount      52440 non-null  float64
dtypes: float64(9), int64(1), object(1)
memory usage: 4.4+ MB
None
     ts_code  trade_date    open    high     low   close  pre_close  change  \
0  600028.SH    20100104  6.0869  6.1041  5.9281  5.9324     6.0483 -0.1159   
1  600028.SH    20100105  5.9410  6.0311  5.8079  5.9667     5.9324  0.0343   
2  600028.SH    20100106  5.9453  5.9581  5.8251  5.83

验证处理后数据所有股票交易日期相同

In [28]:
# Re-read the exported file and check that every stock covers every trading date.
data = pd.read_csv('./data/szstock_20.csv')
rows_per_date = data['trade_date'].value_counts()
print(rows_per_date.sort_values())

# The printed Series is truncated, so enforce the invariant instead of eyeballing it:
# no (stock, date) pair is repeated and each date has one row per stock.
stock_count = data['ts_code'].nunique()
assert not data.duplicated(subset=['ts_code', 'trade_date']).any(), 'duplicate (ts_code, trade_date) rows'
assert (rows_per_date == stock_count).all(), 'some trading dates are missing for at least one stock'

20100104    20
20100204    20
20100119    20
20100106    20
20100107    20
            ..
20211227    20
20211228    20
20211229    20
20211108    20
20211231    20
Name: trade_date, Length: 2622, dtype: int64


## 获取大盘历史指数

In [30]:
# 上证指数
sz_code = '000001.SH'
sz_index = pro.index_daily(ts_code=sz_code, start_date='20100101', end_date='20211231')
print(sz_index.info())
print(sz_index.head())
sz_index.to_csv('./data/上证指数2010-2021.csv')

# 上证50指数
sz50_code = '000016.SH'
sz50_index = pro.index_daily(ts_code=sz50_code, start_date='20100101', end_date='20211231')
print(sz50_index.info())
print(sz50_index.head())
sz50_index.to_csv('./data/上证50指数2010-2021.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2917 entries, 0 to 2916
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ts_code     2917 non-null   object 
 1   trade_date  2917 non-null   object 
 2   close       2917 non-null   float64
 3   open        2917 non-null   float64
 4   high        2917 non-null   float64
 5   low         2917 non-null   float64
 6   pre_close   2917 non-null   float64
 7   change      2917 non-null   float64
 8   pct_chg     2917 non-null   float64
 9   vol         2917 non-null   float64
 10  amount      2917 non-null   float64
dtypes: float64(9), object(2)
memory usage: 250.8+ KB
None
     ts_code trade_date      close       open       high        low  \
0  000001.SH   20211231  3639.7754  3626.2420  3642.8430  3624.9419   
1  000001.SH   20211230  3619.1886  3596.4921  3628.9177  3595.4957   
2  000001.SH   20211229  3597.0002  3630.9159  3630.9159  3596.3225   
3  000001.SH   20

In [8]:
# Align the SSE 50 index with the trading dates shared by the picked stocks.
# NOTE: this relies on `trade_dates` from the "select trading dates" cell; the
# ChiNext cell later in this notebook rebinds the same name, so run this first.
sz50_index = pd.read_csv('./data/上证50指数2010-2021.csv', index_col=0)
sz50_index = sz50_index[sz50_index['trade_date'].isin(trade_dates['trade_date'])]
# final_stocks['ajexdi'] = [1] * len(final_stocks)  # temporary: compatibility with the original project's data format

# Sort ascending by date explicitly instead of reversing the rows, so the result
# does not depend on the file happening to be stored newest-first.
sz50_index = sz50_index.sort_values(by='trade_date').reset_index(drop=True)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
sz50_index.info()
print(sz50_index.head())
sz50_index.to_csv('./data/sz50.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2622 entries, 0 to 2621
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ts_code     2622 non-null   object 
 1   trade_date  2622 non-null   int64  
 2   close       2622 non-null   float64
 3   open        2622 non-null   float64
 4   high        2622 non-null   float64
 5   low         2622 non-null   float64
 6   pre_close   2622 non-null   float64
 7   change      2622 non-null   float64
 8   pct_chg     2622 non-null   float64
 9   vol         2622 non-null   float64
 10  amount      2622 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 225.5+ KB
None
     ts_code  trade_date     close      open      high       low  pre_close  \
0  000016.SH    20100104  2514.646  2565.108  2570.152  2514.237   2553.800   
1  000016.SH    20100105  2543.991  2526.291  2560.667  2487.048   2514.646   
2  000016.SH    20100106  2514.014  2538.285  2549.571

## 获取上证50ETF基金数据

In [14]:
# How the fund code 510050.SH was looked up (kept for reference, not executed):
# all_funds_info = pro.fund_basic(market='E', fields='ts_code,name,management,fund_type,found_date')
# sz50etf_info = all_funds_info[all_funds_info['ts_code'].str.contains('510050')].reset_index(drop=True)
# print(sz50etf_info)

# Per the original author's note, the fund endpoint caps the rows returned by a
# single call, so 2010-2021 is fetched as six two-year windows
# (20100101-20111231, 20120101-20131231, ..., 20200101-20211231).
sz50etf_prices = []
for first_year in range(2010, 2022, 2):
    window_start = f'{first_year}0101'
    window_end = f'{first_year + 1}1231'
    sz50etf_prices.append(pro.fund_daily(ts_code='510050.SH', start_date=window_start, end_date=window_end))

# Stitch the windows together and order the rows by date, oldest first.
sz50etf_prices = pd.concat(sz50etf_prices, axis=0, ignore_index=True)
sz50etf_prices = sz50etf_prices.sort_values(by=['trade_date']).reset_index(drop=True)

# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
sz50etf_prices.info()
print(sz50etf_prices.head())
print(sz50etf_prices.tail())

sz50etf_prices.to_csv('./data/华夏上证50ETF2010-2021.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2917 entries, 0 to 2916
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ts_code     2917 non-null   object 
 1   trade_date  2917 non-null   object 
 2   pre_close   2917 non-null   float64
 3   open        2917 non-null   float64
 4   high        2917 non-null   float64
 5   low         2917 non-null   float64
 6   close       2917 non-null   float64
 7   change      2917 non-null   float64
 8   pct_chg     2917 non-null   float64
 9   vol         2917 non-null   float64
 10  amount      2917 non-null   float64
dtypes: float64(9), object(2)
memory usage: 250.8+ KB
None
     ts_code trade_date  pre_close   open   high    low  close  change  \
0  510050.SH   20100104      2.559  2.569  2.572  2.510  2.512  -0.047   
1  510050.SH   20100105      2.512  2.525  2.562  2.489  2.544   0.032   
2  510050.SH   20100106      2.544  2.539  2.550  2.511  2.512  -0.032   
3  51

## 成长股（第二组数据）股票信息
从创业板指数成分股中选取10只作为成长股数据

In [4]:
# Ten hand-picked ChiNext stocks (bare six-digit codes) used as the growth-stock group.
cyb10_codes = ['300024', '300088', '300070', '300146',
               '300015', '300033', '300059', '300003', '300212', '300171']

# Look the codes up in the full list of currently listed stocks to obtain their
# Tushare codes (ts_code) and basic metadata.
all_stocks_info = pro.stock_basic(exchange='', list_status='L', fields='ts_code,symbol,name,area,industry,list_date')
cyb10_info = all_stocks_info[all_stocks_info['symbol'].isin(cyb10_codes)].reset_index(drop=True)
print(cyb10_info)

# index=False (instead of the falsy index=None) keeps the row index out of the file.
cyb10_info.to_csv('./data/创业板10只股票信息.csv', index=False)
# Reload from disk so later cells see the same frame they would get from the saved CSV.
cyb10_info = pd.read_csv('./data/创业板10只股票信息.csv')

     ts_code  symbol  name area industry list_date
0  300003.SZ  300003  乐普医疗   北京     医疗保健  20091030
1  300015.SZ  300015  爱尔眼科   湖南     医疗保健  20091030
2  300024.SZ  300024   机器人   辽宁     专用机械  20091030
3  300033.SZ  300033   同花顺   浙江     软件服务  20091225
4  300059.SZ  300059  东方财富   上海       证券  20100319
5  300070.SZ  300070   碧水源   北京     环境保护  20100421
6  300088.SZ  300088  长信科技   安徽      元器件  20100526
7  300146.SZ  300146  汤臣倍健   广东     医疗保健  20101215
8  300171.SZ  300171   东富龙   上海     医疗保健  20110201
9  300212.SZ  300212   易华录   北京     软件服务  20110505


## 获取数据
调用API获取数据，选取交易日期等步骤，方法与之前相同。数据区间为20120101-20211231

In [5]:
# Download forward-adjusted (adj='qfq') bars for the ten ChiNext stocks,
# 2012-01-01 .. 2021-12-31, one API call per stock.
query_codes = cyb10_info['ts_code'].tolist()
cyb10_prices = []
for code in query_codes:
    bars = ts.pro_bar(ts_code=code, adj='qfq', start_date='20120101', end_date='20211231')
    # pd.concat silently drops None entries, so a request that yields None would
    # make a stock disappear from the dataset without any error. Fail loudly.
    # NOTE(review): pro_bar appears to return None rather than raise on some
    # failures -- confirm for the tushare version in use.
    if bars is None:
        raise RuntimeError(f'ts.pro_bar returned no data for {code}')
    cyb10_prices.append(bars)

# One long table, ordered by stock code and then by trading date.
cyb10_prices = pd.concat(cyb10_prices, axis=0, ignore_index=True)
cyb10_prices = cyb10_prices.sort_values(by=['ts_code', 'trade_date']).reset_index(drop=True)
print(cyb10_prices.shape)

# index=False (instead of the falsy index=None) keeps the row index out of the file.
cyb10_prices.to_csv('./data/创业板10只股票历史股价2012-2021.csv', index=False)

# Trading dates of each stock, keyed by ts_code. groupby replaces a Python loop
# over every price row; sort=False keeps the codes in first-appearance order.
trade_dates_dict = {
    code: set(dates.tolist())
    for code, dates in cyb10_prices.groupby('ts_code', sort=False)['trade_date']
}

# Stock with the most trading dates (the first one wins on ties).
max_tradecode = max(trade_dates_dict, key=lambda code: len(trade_dates_dict[code]), default='')
max_tradecount = len(trade_dates_dict.get(max_tradecode, ()))
print('原数据中交易日期最多的股票:', max_tradecode)
print('交易天数:', max_tradecount)
print('--------------------------')

# Dates on which every stock has a price row. Intersecting all sets in one call
# avoids using "empty set" as a not-yet-initialised marker, which would restart
# the intersection from the next stock whenever it became empty.
common_dates = set.intersection(*trade_dates_dict.values()) if trade_dates_dict else set()
print('交易股取交集的交易天数:', len(common_dates))

# Ascending one-column table of the shared dates.
# NOTE: this rebinds `trade_dates`, replacing the SSE 50 table built earlier in the notebook.
trade_dates = pd.DataFrame({'trade_date': sorted(common_dates)})

trade_dates.to_csv('./data/创业板10只股票交易日期2012-2021.csv')

# Rows restricted to the shared trading dates.
final_stocks = cyb10_prices[cyb10_prices['trade_date'].isin(trade_dates['trade_date'])].reset_index(drop=True)
# NOTE(review): the file below is written from the unfiltered cyb10_prices, not from
# final_stocks. The "fulldata" file name suggests this is deliberate, but final_stocks
# is then never saved in this cell -- confirm which frame downstream code expects.
cyb10_prices.to_csv('./data/cybstock_10_fulldata.csv')

(23217, 11)
原数据中交易日期最多的股票: 300024.SZ
交易天数: 2430
--------------------------
交易股取交集的交易天数: 1604
