In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


## All function to create features


In [3]:
# calculate factors functions
def returnMAE(df, n):
    avr = pd.Series()
    for id in df.SecuritiesCode.unique():
        avr = pd.concat([avr, df[df.SecuritiesCode == id].Close.rolling(window=n, min_periods=1).mean()], ignore_index = False)    
    df[f'average{n}'] = avr 
    return df


def returnEWM(df,n=12):
    avr = pd.Series()
    for id in df.SecuritiesCode.unique():
        avr = pd.concat([avr, df[df.SecuritiesCode == id].Close.ewm(com=0.5, adjust=True, span=10).mean()], ignore_index = False)    
    
    df[f'EWM_{n}'] = avr 
    return df

def returnBoll(df,n=20):
    bl1 = pd.Series()
    bl2 = pd.Series()
    for id in df.SecuritiesCode.unique():
        bl1 = pd.concat([bl1, df[df.SecuritiesCode == id].Close.rolling(window=n, min_periods=1).apply(lambda x: x.mean()-2*x.std(), raw=False)], ignore_index = False)    
        bl2 = pd.concat([bl2, df[df.SecuritiesCode == id].Close.rolling(window=n, min_periods=1).apply(lambda x: x.mean()+2*x.std(), raw=False)], ignore_index = False)
    df[f'bollDown'] = bl1 
    df[f'bollUp'] = bl2
    return df

def returnMAEVolume(df, n=12):
    avr = pd.Series()
    for id in df.SecuritiesCode.unique():
        avr = pd.concat([avr, df[df.SecuritiesCode == id].Volume.rolling(window=n, min_periods=1).mean()], ignore_index = False)    
    
    df[f'MAEVolume_{n}'] = avr 
    return df

def returnSTDVolume(df, n=10):
    avr = pd.Series()
    for id in df.SecuritiesCode.unique():
        avr = pd.concat([avr, df[df.SecuritiesCode == id].Volume.rolling(window=n, min_periods=1).std()], ignore_index = False)    
    
    df[f'STDVolume_{n}'] = avr 
    return df

def calUpNumber(x):
    data = x.iloc[1:]
    data_shift = x.shift(1).iloc[2:]
    new = data_shift - data
    return new[new > 0].count()
        
def returnUpDate(df, n=13):
    number = pd.Series(dtype='float64')
    for id in df.SecuritiesCode.unique():
        number = pd.concat([number, df[df.SecuritiesCode == id].Volume.rolling(window=n, min_periods=1).apply(calUpNumber, raw=False)], ignore_index = False)
    
    df[f'NumberUp_{n-1}'] = number 
    return df

def returnWillingness(df, n=26):
    will = pd.Series(dtype='float64')
    for id in df.SecuritiesCode.unique():
        df1 = df[df.SecuritiesCode == id]
        df1['diff1'] = df1.High - df1.Close.shift(1)
        df1['diff2'] = df1.Close.shift(1) - df1.Low
        sum1 = df1.diff1.rolling(window=n, min_periods=1).sum()
        sum2 = df1.diff2.rolling(window=n, min_periods=1).sum()
        will = pd.concat([will, sum1/sum2], ignore_index=False)
    df['Willness'] = will
    return df

## apply features function
def applyFactors(df):
    df['money_flow'] = (df.Close + df.High + df.Low)/3 * df.Volume
    df = returnMAE(df, 5) 
    df = returnMAE(df, 10) 
    df = returnMAE(df, 20) 
    df = returnMAE(df, 60) 
    df = returnMAE(df, 120) 
    df = returnBoll(df,n=20)
    df = returnEWM(df,n=10)
    df = returnUpDate(df)
    df = returnMAEVolume(df)
    df = returnSTDVolume(df)
    df = returnSTDVolume(df, 20)
    df = returnWillingness(df)

# data preprocess 3 sigma rule (不一定有效的方法)
def calculateOutlier(column):
    mu = column.mean()
    std = column.std()
    outlier = column[np.abs(column - mu) > 3*std]
    if pd.isnull(outlier.any()):
        return column
    else:
        column.clip(lower=mu - 3*std, upper=mu + 3*std)
        
# StandardScaler 神经网络训练适用
def applyStandN(df):
    df_ = pd.DataFrame(columns = df.columns)

    for id in df.SecuritiesCode.unique():
        std_data = df[df.SecuritiesCode == id]
        ss = StandardScaler()
        std_data.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']] = ss.fit_transform(std_data.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']])
        std_data.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']].apply(calculateOutlier, axis=0)
        price_std = pd.concat([df_,std_data])
        df_ = price_std
        
    return df

# reduce memory usage
def reduce_mem_usage(df, verbose=True):
    ''' 
    reduce every columns mem useage 
    change the dtype of it 
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# 读取原始数据
df = pd.read_csv('/Users/dylan/iCollections/桌面文件/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
df = reduce_mem_usage(df)

Mem. usage decreased to 113.45 Mb (42.7% reduction)


In [5]:
df = applyFactors(df)

  avr = pd.Series()
  avr = pd.Series()
  avr = pd.Series()
  avr = pd.Series()
  avr = pd.Series()
  bl1 = pd.Series()
  bl2 = pd.Series()
  avr = pd.Series()


ValueError: comass, span, halflife, and alpha are mutually exclusive

In [None]:
df.to_csv('/Users/dylan/iCollections/桌面文件/jpx-tokyo-stock-exchange-prediction/code_JPX /data_preprocess/data_output/prices_update_.csv')