In [1]:
from gplearn import genetic

import pandas as pd
import numpy as np
import math

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn import metrics

from xgboost import XGBClassifier

In [2]:
# data preparation
df = pd.read_csv("sz50.csv")

order = ['close', 'open', 'high', 'low', 'vol']
x = df[order]
y = np.where(df['change'] > 0,1,-1)

split_rate = 0.75
split = int(split_rate * len(x))

x_train, x_test, y_train, y_test = x[:split], x[split:], y[:split], y[split:]
x

Unnamed: 0,close,open,high,low,vol
0,1011.3470,996.9960,1021.5680,993.8920,8064653.0
1,1060.8010,1008.2790,1060.8980,1008.2790,14468180.0
2,1075.6560,1059.1410,1086.6940,1059.0950,16991334.0
3,1086.3030,1075.5620,1095.8410,1070.9800,13729419.0
4,1102.6620,1087.6800,1108.2910,1082.5090,10780427.0
...,...,...,...,...,...
3933,2841.9824,2858.5110,2866.1354,2828.2901,37023632.0
3934,2798.7695,2720.6127,2826.7235,2715.1375,49061355.0
3935,2694.0229,2799.9951,2800.1779,2689.8974,46971206.0
3936,2685.5907,2696.6972,2734.7985,2629.1822,41532694.0


In [3]:
# using genetic programming to create factors
function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt', 'abs', 'neg', 'max', 'min']
gp = genetic.SymbolicTransformer(generations = 3, 
                          population_size = 1000,
                          n_components = 10,
                          function_set = function_set,
                          init_depth = (1, 4),
                          tournament_size = 20,
                          p_crossover = 0.4,
                          p_subtree_mutation = 0.01,
                          p_hoist_mutation = 0,
                          p_point_mutation = 0.01,
                          p_point_replace = 0.4)
gp.fit(x_train, y_train)
print(f'This is our factors getting by genetic programming algorithm \n{gp}.')

This is our factors getting by genetic programming algorithm 
[div(max(X3, X0), add(X1, 0.547)),
 mul(abs(sub(neg(-0.371), div(X0, X1))), abs(sub(mul(0.834, 0.111), div(X0, X1)))),
 mul(mul(abs(sub(neg(-0.371), log(0.265))), abs(sub(mul(0.834, 0.111), div(X0, X1)))), abs(sub(mul(0.834, 0.111), div(X0, X1)))),
 mul(div(X0, X1), abs(sub(mul(0.834, 0.111), div(X0, X1)))),
 sqrt(div(max(X3, X0), add(X1, 0.547))),
 div(div(X0, X1), sqrt(neg(0.975))),
 div(X0, X1),
 div(X0, X1),
 min(sqrt(X2), div(X0, X1)),
 div(X0, X1)].


In [4]:
df['f1'] = df['close'] / df['open']

tmp1 = df[['close', 'vol']].min(1) / df['open']
tmp2 = np.log(df['low'])
tmp1.max() < tmp2.min()
df['f2'] = tmp1

df['f3'] = -df['low'] / df['open'] * np.log(tmp2)

df

Unnamed: 0,trade_date,ts_code,close,open,high,low,pre_close,change,pct_chg,vol,amount,f1,f2,f3
0,20040102,000016.SH,1011.3470,996.9960,1021.5680,993.8920,1000.0000,11.3470,1.1347,8064653.0,5.742759e+06,1.014394,1.014394,-1.925743
1,20040105,000016.SH,1060.8010,1008.2790,1060.8980,1008.2790,1011.3470,49.4540,4.8899,14468180.0,1.070646e+07,1.052091,1.052091,-1.933838
2,20040106,000016.SH,1075.6560,1059.1410,1086.6940,1059.0950,1060.8010,14.8550,1.4004,16991334.0,1.241700e+07,1.015593,1.015593,-1.940838
3,20040107,000016.SH,1086.3030,1075.5620,1095.8410,1070.9800,1075.6560,10.6470,0.9898,13729419.0,1.035253e+07,1.009986,1.009986,-1.934248
4,20040108,000016.SH,1102.6620,1087.6800,1108.2910,1082.5090,1086.3030,16.3590,1.5059,10780427.0,8.188927e+06,1.013774,1.013774,-1.934814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3933,20200312,000016.SH,2841.9824,2858.5110,2866.1354,2828.2901,2888.3612,-46.3788,-1.6057,37023632.0,6.074492e+07,0.994218,0.994218,-2.050934
3934,20200313,000016.SH,2798.7695,2720.6127,2826.7235,2715.1375,2841.9824,-43.2129,-1.5205,49061355.0,8.281450e+07,1.028728,1.028728,-2.063536
3935,20200316,000016.SH,2694.0229,2799.9951,2800.1779,2689.8974,2798.7695,-104.7466,-3.7426,46971206.0,7.646031e+07,0.962153,0.962153,-1.985259
3936,20200317,000016.SH,2685.5907,2696.6972,2734.7985,2629.1822,2694.0229,-8.4322,-0.3130,41532694.0,7.008388e+07,0.995881,0.995881,-2.011955


In [5]:
x_order = ['f1', 'f2', 'f3']
ddf = df[x_order]
ddf

Unnamed: 0,f1,f2,f3
0,1.014394,1.014394,-1.925743
1,1.052091,1.052091,-1.933838
2,1.015593,1.015593,-1.940838
3,1.009986,1.009986,-1.934248
4,1.013774,1.013774,-1.934814
...,...,...,...
3933,0.994218,0.994218,-2.050934
3934,1.028728,1.028728,-2.063536
3935,0.962153,0.962153,-1.985259
3936,0.995881,0.995881,-2.011955


In [6]:
X_train, X_test, y_train, y_test = ddf[:split], ddf[split:], y[:split], y[split:]

In [7]:
# checking imbalanced data
change = list(np.where(df['change'] > 0, 1, -1 ))
change_series = pd.Series({'Up' : change.count(1), 'Down' : change.count(-1)}, name = 'change_table')
imbalanced_threshold = 0.1
change_series['Imbalance'] = not (-imbalanced_threshold < (change.count(1) - change.count(-1)) / (change.count(1) + change.count(-1)) < imbalanced_threshold)

# imbalanced data checking
if change_series['Imbalance'] == 0:
    print('Imbalanced data do not exist.')
else:
    print('Imbalanced data exists.')

Imbalanced data do not exist.


In [8]:
models = {
    'RF': RandomForestClassifier(),
    'GBDT': XGBClassifier(),
}

RF = 'RF'
GBDT = 'GBDT'
scores = ['precision', 'recall']

In [9]:
# Set the parameters by cross-validation
tuned_parameters = {
     'RF':[{'n_estimators':[1, 2, 3], 'max_depth': [3, 5]}],
     'GBDT':[{'n_estimators':[1, 2, 3], 'max_depth': [3, 5], 'learning_rate': [0.1, 0.05, 0.01]}]
}

In [10]:
for model in models:    
    for score in scores:
        clf_ncv = GridSearchCV(
            models[model], 
            tuned_parameters[model],
            cv = 10,
            scoring = '%s_macro' % score)

        cur_cv = TimeSeriesSplit(n_splits = 10).split(X_train)
        clf_tcv = GridSearchCV(
            models[model], 
            tuned_parameters[model], 
            cv = cur_cv,
            scoring = '%s_macro' % score)
        clf_ncv.fit(X_train, y_train)
        clf_tcv.fit(X_train, y_train)
    print(f'The best parameters for {model} cv are {clf_ncv.best_params_}')
    print(f'The best parameters for {model} time series cv are {clf_tcv.best_params_}')

The best parameters for RF cv are {'max_depth': 3, 'n_estimators': 3}
The best parameters for RF time series cv are {'max_depth': 3, 'n_estimators': 2}
The best parameters for GBDT cv are {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1}
The best parameters for GBDT time series cv are {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 3}


In [11]:
rf_cv = RandomForestClassifier(n_estimators = 1, max_depth= 3, random_state = 1)
rf_cv.fit(X_train, y_train)
y_pred = rf_cv.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.90      0.60      0.72       479
           1       0.71      0.94      0.81       506

    accuracy                           0.77       985
   macro avg       0.81      0.77      0.77       985
weighted avg       0.80      0.77      0.77       985

0.7700495927781033


In [12]:
rf_tscv = RandomForestClassifier(n_estimators = 3, max_depth = 3, random_state = 2)
rf_tscv.fit(X_train, y_train)
y_pred = rf_tscv.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.86      0.88      0.87       479
           1       0.88      0.86      0.87       506

    accuracy                           0.87       985
   macro avg       0.87      0.87      0.87       985
weighted avg       0.87      0.87      0.87       985

0.8692434006947939


In [13]:
gbdt_cv = XGBClassifier(learning_rate = 0.1, n_estimators = 1, max_depth = 3, random_state = 3)
gbdt_cv.fit(X_train, y_train)
y_pred = gbdt_cv.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))
print(gbdt_cv.feature_importances_)

              precision    recall  f1-score   support

          -1       0.85      0.88      0.86       479
           1       0.88      0.85      0.87       506

    accuracy                           0.86       985
   macro avg       0.86      0.86      0.86       985
weighted avg       0.86      0.86      0.86       985

0.8643026892323433
[0.9700973  0.         0.02990275]


In [14]:
gbdt_tscv = XGBClassifier(learning_rate = 0.05, n_estimators = 3, max_depth = 3, random_state = 4)
gbdt_tscv.fit(X_train, y_train)
y_pred = gbdt_tscv.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.roc_auc_score(y_test, y_pred))
print(gbdt_tscv.feature_importances_)

              precision    recall  f1-score   support

          -1       0.85      0.88      0.86       479
           1       0.88      0.85      0.87       506

    accuracy                           0.86       985
   macro avg       0.86      0.86      0.86       985
weighted avg       0.86      0.86      0.86       985

0.8643026892323433
[0.9641862  0.         0.03581378]


# Result

In [15]:
print('The data set is sz50')
print('The feature of the models are calculated with the function from genetic algorithm, and the functions are in the picture in the zip file. I call them f1, f2 and f3')
print('GBDT performs the best for my dataset (daily, sz50, from 20040102 to 20200318) with the acuracy 0.86, f1-score 0.86, precision 0.86, ROC/AUC 0.86.')

The data set is sz50
The feature of the models are calculated with the function from genetic algorithm, and the functions are in the picture in the zip file. I call them f1, f2 and f3
GBDT performs the best for my dataset (daily, sz50, from 20040102 to 20200318) with the acuracy 0.86, f1-score 0.86, precision 0.86, ROC/AUC 0.86.
