In [None]:
"""
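Purpose:
Process the financial-statement and other data obtained from JoinQuant.
Use pd.pivot_table to split the data by factor into separate DataFrames,
forward-fill (ffill) the missing values,
and save each result as a CSV file.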
"""

In [4]:
import pandas as pd
import numpy as np
import datetime
import math
import os
from pandas.core.groupby.groupby import DataError

In [5]:
# Trading calendar: read the CSV, then parse its 'date' column as datetimes.
td = (
    pd.read_csv('dataset/trading_date.csv', low_memory=False)
    .assign(date=lambda cal: pd.to_datetime(cal['date'], format='%Y-%m-%d'))
)

In [11]:
def ffill_func(df_dict):
    """Align each table to the trading calendar, forward-fill it and save it as CSV.

    For every ``key`` in ``df_dict`` the sub-frame ``df_dict[key][key]`` is
    taken, its index is parsed as ``%Y-%m-%d`` dates, and the result is written
    to ``dataset/processed_data/<key>.csv`` with one row per calendar date.

    The forward fill runs on the union of the data dates and the calendar
    dates, and only afterwards is the frame restricted to the calendar.
    Values dated on a day that is not in the calendar (or before the calendar
    window starts) are therefore carried forward to the next calendar date
    instead of being discarded by the join.

    Parameters
    ----------
    df_dict : dict
        Maps a field name to a DataFrame whose columns can be selected with
        that same field name and whose index holds ``%Y-%m-%d`` date labels.

    Notes
    -----
    Reads the module-level trading calendar ``td`` and skips its first 252
    rows. The frames stored in ``df_dict`` are not modified.
    """
    # First column of the calendar, without its first 252 rows.
    calendar = pd.DataFrame(index=td.iloc[252:, 0])
    for key, frame in df_dict.items():
        dataset_df = frame[key].copy()
        dataset_df.index = pd.to_datetime(dataset_df.index, format='%Y-%m-%d')
        # Outer join keeps the data rows that fall outside the calendar so
        # that the forward fill below can still see them.
        merged = pd.merge(
            calendar, dataset_df, how='outer', left_index=True, right_index=True
        ).sort_index()
        merged = merged.ffill()
        # Keep calendar dates only, and label the index like the calendar.
        merged = merged.loc[merged.index.isin(calendar.index)]
        merged.index.name = calendar.index.name
        merged.to_csv('dataset/processed_data/' + key + '.csv')

# 资产负债表

In [52]:
# One empty placeholder frame per balance-sheet field to extract.
# ('total_assets' used to be listed twice; a dict keeps a single entry per
# key, so listing it once yields the same keys in the same order.)
balance_dict = {
    field: pd.DataFrame()
    for field in (
        'total_owner_equities',
        'total_assets',
        'equities_parent_company_owners',
        'total_liability',
        'inventories',
        'total_non_current_liability',
        'fixed_assets',
        'construction_materials',
        'constru_in_process',
        'total_current_assets',
        'total_current_liability',
        'preferred_shares_equity',
    )
}

In [53]:
# Pivot every balance-sheet export into one wide table per field
# (rows: pub_date, columns: stock code), then hand the tables to ffill_func.
balance_dir = 'dataset/original_data/A股各公司财务数据/合并资产负债表/'
# Read only the CSV files directly inside the folder, in sorted order, so that
# files in subfolders are never opened with a wrong path and the column order
# of the output is reproducible.
balance_files = sorted(
    name for name in os.listdir(balance_dir) if name.lower().endswith('.csv')
)
# Collect the per-file pivots and concatenate once per field: calling
# pd.concat inside the loop re-copies the growing frame for every file, and
# re-running the cell would append the same columns again.
balance_parts = {key: [] for key in balance_dict}
for f in balance_files:
    data_df = pd.read_csv(os.path.join(balance_dir, f), encoding='gbk')
    for key in balance_dict:
        try:
            pt = pd.pivot_table(data_df, index=['pub_date'], columns=['code'], values=[key], margins=False)
        except DataError:
            # Nothing usable for this field in this file: report it and move on.
            print(f, key, 'No data')
            continue
        balance_parts[key].append(pt)

for key, parts in balance_parts.items():
    if parts:  # pd.concat rejects an empty list; keep the placeholder frame
        balance_dict[key] = pd.concat(parts, axis=1)
ffill_func(balance_dict)

000043.csv total_owner_equities No data


KeyboardInterrupt: 

# 一致预期数据

In [73]:
# Placeholder frames for the consensus-forecast tables, keyed by file stem.
balance_dict = {name: pd.DataFrame() for name in ('FY1', 'FY3', 'FY12')}

In [74]:
# Load every table named in balance_dict from its CSV file.
for key in balance_dict:
    print(key)
    source_path = 'dataset/original_data/' + key + '.csv'
    # Index the rows by the 'date' column and store every value as float64.
    balance_dict[key] = pd.read_csv(source_path).set_index('date').astype('float64')

FY1
FY3


In [75]:
def fill_fy(df_dict, fy):
    """Align one table to the trading calendar with a 32-day carry-forward and save it.

    Every calendar date receives the most recent source row dated on or
    before it, provided that row is at most 32 calendar days older; calendar
    dates without such a row are left as NaN. Whole rows are carried forward,
    so a NaN inside a source row stays NaN on the dates that row covers.

    This yields the same table as inserting a copy of each source row for each
    of the following 32 days (stopping at the next existing date) and then
    left-joining onto the calendar, but without enlarging the frame one row at
    a time.

    Parameters
    ----------
    df_dict : dict
        Maps a name to a DataFrame whose index holds ``%Y-%m-%d`` date labels.
        The entry under ``fy`` is replaced by the aligned frame.
    fy : str
        Key of the table to process; also the stem of the output file
        ``dataset/processed_data/<fy>.csv``.

    Notes
    -----
    Reads the module-level trading calendar ``td`` and skips its first 252
    rows. The source index must not contain duplicate dates.
    """
    frame = df_dict[fy].sort_index(axis=1)
    frame.index = pd.to_datetime(frame.index, format='%Y-%m-%d')
    # First column of the calendar, without its first 252 rows.
    calendar = pd.Index(td.iloc[252:, 0])
    # reindex(method='ffill') needs a sorted source index; the tolerance caps
    # how far back the matched source row may lie.
    aligned = (
        frame.sort_index()
        .reindex(calendar, method='ffill', tolerance=pd.Timedelta(days=32))
        .sort_index()
    )
    df_dict[fy] = aligned
    aligned.to_csv('dataset/processed_data/' + fy + '.csv')
# Only the FY1 and FY3 tables are aligned and saved here.
for fy_name in ('FY1', 'FY3'):
    fill_fy(balance_dict, fy_name)

# 利润表

In [None]:
# One empty placeholder frame per income-statement field to extract.
income_dict = {
    field: pd.DataFrame()
    for field in (
        'operating_revenue',
        'operating_cost',
        'net_profit',
        'total_operating_revenue',
    )
}

In [None]:
# Pivot every income-statement export into one wide table per field
# (rows: pub_date, columns: stock code), then hand the tables to ffill_func.
income_dir = 'dataset/original_data/A股各公司财务数据/合并利润表/'
# Read only the CSV files directly inside the folder, in sorted order, so that
# files in subfolders are never opened with a wrong path and the column order
# of the output is reproducible.
income_files = sorted(
    name for name in os.listdir(income_dir) if name.lower().endswith('.csv')
)
# Collect the per-file pivots and concatenate once per field: calling
# pd.concat inside the loop re-copies the growing frame for every file, and
# re-running the cell would append the same columns again.
income_parts = {key: [] for key in income_dict}
for f in income_files:
    data_df = pd.read_csv(os.path.join(income_dir, f), encoding='gbk')
    for key in income_dict:
        try:
            pt = pd.pivot_table(data_df, index=['pub_date'], columns=['code'], values=[key], margins=False)
        except DataError:
            # Nothing usable for this field in this file: report it and move on.
            print(f, key, 'No data')
            continue
        income_parts[key].append(pt)

for key, parts in income_parts.items():
    if parts:  # pd.concat rejects an empty list; keep the placeholder frame
        income_dict[key] = pd.concat(parts, axis=1)
ffill_func(income_dict)

# 现金流量表

In [None]:
# One empty placeholder frame per cash-flow-statement field to extract.
cf_dict = {
    field: pd.DataFrame()
    for field in ('inventory_decrease', 'net_operate_cash_flow')
}

In [None]:
# Pivot every cash-flow-statement export into one wide table per field
# (rows: pub_date, columns: stock code), then hand the tables to ffill_func.
cf_dir = 'dataset/original_data/A股各公司财务数据/合并现金流量表/'
# Read only the CSV files directly inside the folder, in sorted order, so that
# files in subfolders are never opened with a wrong path and the column order
# of the output is reproducible.
cf_files = sorted(
    name for name in os.listdir(cf_dir) if name.lower().endswith('.csv')
)
# Collect the per-file pivots and concatenate once per field: calling
# pd.concat inside the loop re-copies the growing frame for every file, and
# re-running the cell would append the same columns again.
cf_parts = {key: [] for key in cf_dict}
for f in cf_files:
    data_df = pd.read_csv(os.path.join(cf_dir, f), encoding='gbk')
    for key in cf_dict:
        try:
            pt = pd.pivot_table(data_df, index=['pub_date'], columns=['code'], values=[key], margins=False)
        except DataError:
            # Nothing usable for this field in this file: report it and move on.
            print(f, key, 'No data')
            continue
        cf_parts[key].append(pt)

for key, parts in cf_parts.items():
    if parts:  # pd.concat rejects an empty list; keep the placeholder frame
        cf_dict[key] = pd.concat(parts, axis=1)
ffill_func(cf_dict)

# 财务指标数据

In [8]:
def financial_indicator(ind):
    """Build one wide table for a single financial indicator.

    Every CSV file directly inside the indicator folder is pivoted to
    ``pubDate`` rows by ``code`` columns for the column ``ind``, and the
    per-file pivots are joined side by side (``axis=1``), the same way the
    statement tables in this notebook are assembled. Each publication date
    therefore appears once in the result; stacking the pivots row-wise would
    repeat a date once for every file that contains it.

    Parameters
    ----------
    ind : str
        Name of the indicator column to extract from each file.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``pubDate`` and sorted by index, with two-level columns
        ``(ind, code)``. Empty when the folder is missing or holds no CSV file.
    """
    source_dir = 'dataset/original_data/A股各公司财务数据/财务指标数据/'
    if not os.path.isdir(source_dir):
        return pd.DataFrame()

    # Top-level CSV files only, in sorted order for a reproducible column order.
    csv_names = sorted(
        name for name in os.listdir(source_dir) if name.lower().endswith('.csv')
    )
    # Gather the pivots first and concatenate once; concatenating inside the
    # loop re-copies the growing frame for every file.
    pivots = []
    for name in csv_names:
        data_df = pd.read_csv(os.path.join(source_dir, name), encoding='gbk')
        pivots.append(
            pd.pivot_table(data_df, index=['pubDate'], columns=['code'], values=[ind], margins=False)
        )

    if not pivots:
        return pd.DataFrame()
    return pd.concat(pivots, axis=1).sort_index()

In [9]:
# Indicator columns to extract from the financial-indicator files.
indicator_keys = [
    'roe_df',
    'roa_df',
    'net_profit_margin_df',
    'gross_profit_margin_df',
    'inc_net_profit_year_on_year_df',
    'financing_expense_to_total_revenue_df',
    'operating_expense_to_total_revenue_df',
    'eps',
]

# Build one table per indicator, then align them all to the trading calendar.
indicator_dict = {}
for key in indicator_keys:
    indicator_table = financial_indicator(key)
    indicator_dict[key] = indicator_table
ffill_func(indicator_dict)