In [1]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("final_v5_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,time,ticker,abnormal_return,y,volatility,reaction_positive,reaction_negative,volume_adi,volume_mfi,...,item7_polarity,item7_subjectivity,industry_B,industry_C,industry_D,industry_E,industry_F,industry_G,industry_H,industry_I
0,0,2006,nwpx,0.635924,1,0.062822,5,6,-351593.8,50.359009,...,0.519824,0.313428,0,0,1,0,0,0,0,0
1,1,2007,nwpx,0.026142,1,0.147784,5,6,-1440436.0,57.280553,...,0.487179,0.326236,0,0,1,0,0,0,0,0
2,2,2008,nwpx,-0.762775,0,0.180493,3,8,-1414003.0,53.844115,...,0.49005,0.307105,0,0,1,0,0,0,0,0
3,3,2010,nwpx,0.210231,1,0.075963,10,1,-4214262.0,50.947005,...,0.429119,0.321627,0,0,1,0,0,0,0,0
4,4,2011,nwpx,-0.013295,0,0.161627,6,5,-4860406.0,49.96121,...,0.5,0.309386,0,0,1,0,0,0,0,0


In [128]:
y=data['y']
X=data.iloc[:,5:]

In [129]:
# data processing for imbalanced data 
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from collections import Counter
from imblearn.over_sampling import SMOTE 
standardizer = StandardScaler()
sm = SMOTE(random_state=42) 
X_sm, y_sm = sm.fit_resample(X, y) 
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm,test_size = 0.2,random_state = 150)
standardizer.fit(X_train) 
X_train = standardizer.transform(X_train) 
X_test = standardizer.transform(X_test)


In [47]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=5, scoring_fit='accuracy', 
                       scoring_test=accuracy_score):
    
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        # n_jobs=4, 
        scoring=scoring_fit,
        verbose=1
    )    
    fitted_model = gs.fit(X_train_data, y_train_data)
    best_model = fitted_model.best_estimator_
    pred = fitted_model.predict(X_test_data)
    score = scoring_test(y_test_data, pred)
    
    return [best_model, pred, score]

In [48]:
grid_parameters = [
    { # xgb  
        #'n_estimators': [200,220,240,260],
        #'max_depth': [8,9,10,11,12], 
        #'min_child_weight':range(1,10,2)
        #'gamma':[i/10.0 for i in range(0,5)]
        #'subsample':[0.9,1],
        #'colsample_bytree':[0.9,1],
        #'reg_alpha':[0,0.01,0.015,0.02],
        #'eta':[0.15,0.2,0.25]
    }, 
    { # lgbm
        #'n_estimators': [i*20 for i in range(5,11)],
        #'max_depth': [i for i in range(6,10)],
        #'min_child_weight':range(1,10,2)
        #'reg_alpha': [0,0.05,0.1],
        #'learning_rate':[0.05,0.1,0.15]
    }, 
    
    { # randomforest
        #'n_estimators': [200,220,240,260],
        #'max_depth': [9,10,11,12], 
        #'max_features': [3,4,5,6],
        #'min_samples_leaf': [3, 4, 5],
        #'min_samples_split': [3,4,5,6]
    },
    { # dt
        #'criterion': ['gini', 'entropy'],
        #'max_depth': [9,10,11,12],
        #'splitter': ['best', 'random'],
        #'min_samples_leaf': [5, 10, 20, 50, 100],
        
    },
    { # gradient boost
        #'n_estimators': [200,220,240,260],
        #'max_depth': [8,9,10,11,12],
        #'min_samples_split':np.linspace(0.1, 1.0, 10, endpoint=True),
        #'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
        #'max_features':list(range(1,35,3)),
        #'learning_rate':[0.15,0.2,0.25]
    },
 
]

In [None]:
models_to_train = [xgb.XGBClassifier(eval_metric=['logloss','auc','error'],use_label_encoder=False,random_state = 88), 
                   lgb.LGBMClassifier(random_state = 88), RandomForestClassifier(random_state = 88),
                   DecisionTreeClassifier(random_state = 88),  GradientBoostingClassifier(random_state = 88) ]
#best_model, pred, score
models_preds_scores = []

# loop through each model, 4 in this case
for i, model in enumerate(models_to_train):
    params = grid_parameters[i]
    
    result = algorithm_pipeline(X_train, X_test, y_train, y_test,
                                 model, params, cv=5)
    models_preds_scores.append(result)

In [None]:
for result in models_preds_scores:
    print('Model: {0}, Score: {1}'.format(result[0], result[2]))
xgb_classifier = models_preds_scores[0][0]
lgbm_classifier = models_preds_scores[1][0]
rf_classifier = models_preds_scores[2][0]
tree_classifier = models_preds_scores[3][0]
gb_classifier= models_preds_scores[4][0]


In [130]:
xgb_classifier = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.15,
              eval_metric=['logloss', 'auc', 'error'], gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.150000006, max_delta_step=0, max_depth=11,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=260, n_jobs=10, num_parallel_tree=1, random_state=88,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
xgb_classifier.fit(X_train,y_train)
lgbm_classifier = lgb.LGBMClassifier(max_depth=7, min_child_weight=1, n_estimators=120,
               random_state=88, reg_alpha=0)
lgbm_classifier.fit(X_train,y_train)
rf_classifier = RandomForestClassifier(max_depth=12, max_features=5, min_samples_leaf=4,
                       n_estimators=200, random_state=88)
rf_classifier.fit(X_train,y_train)
tree_classifier =  DecisionTreeClassifier(min_samples_leaf=10,max_depth=12,random_state = 88,splitter='random')
tree_classifier.fit(X_train,y_train)
gb_classifier = GradientBoostingClassifier(max_depth=12, n_estimators=260,random_state = 88)
gb_classifier.fit(X_train,y_train)


GradientBoostingClassifier(max_depth=12, n_estimators=260, random_state=88)

In [153]:
def cal_score(model, X_test,y_test):
    pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, pred)
    return accuracy
modelscore = []
modelscore.append(cal_score(xgb_classifier,X_test,y_test))
modelscore.append(cal_score(lgbm_classifier,X_test,y_test))
modelscore.append(cal_score(rf_classifier,X_test,y_test))
modelscore.append(cal_score(tree_classifier,X_test,y_test))
modelscore.append(cal_score(gb_classifier,X_test,y_test))
print(modelscore)

[0.7522123893805309, 0.7367256637168141, 0.7020648967551623, 0.6039823008849557, 0.7654867256637168]


method 1

In [131]:
from sklearn.ensemble import StackingClassifier
def get_model(model_level1):
    
    level0 = []
    level0.append(('xgb', xgb_classifier))
    level0.append(('lgbm', lgbm_classifier))
    level0.append(('rf', rf_classifier))
    level0.append(('cart', tree_classifier))
    level0.append(('gb',gb_classifier))
   
    level1 = model_level1
   
    model = StackingClassifier(estimators=level0, final_estimator=level1)
    return model


def train_model(model, X, y):
    model.fit(X, y)
    return model


def evaluate_model(model, X_test, y_test):    
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    return score

stacking_model_1 = get_model(LogisticRegression())
stacking_model_1 = train_model(stacking_model_1, X_train, y_train)

score_1 = evaluate_model(stacking_model_1, X_test, y_test)
#print('Score of stacking model:', score_1)

In [132]:
print('Score of stacking model:', score_1)
# 0.7728613569321534

Score of stacking model: 0.7728613569321534


In [133]:
#'kernel': ['poly', 'rbf', 'sigmoid'], # you shoul duse only rbf, here is for illustration purpose 
#'C': [50, 10, 1.0, 0.1, 0.01], # There are two parameters, tune C carefully. (Gamma is decided by 'scale' option)
#'gamma': ['scale']

from sklearn.svm import SVC
score = []
for i in [10,1,0.1]:
    stacking_model_2 = get_model(SVC(kernel = 'rbf',C=i, gamma = 'scale'))
    stacking_model_2 = train_model(stacking_model_2, X_train, y_train)

    score_2 = evaluate_model(stacking_model_2, X_test, y_test)
    score.append(score_2)


In [134]:
print('Score of stacking model:', score)
#0.7706489675516224

Score of stacking model: [0.7706489675516224, 0.7662241887905604, 0.7684365781710915]


In [135]:
from mlxtend.classifier import StackingClassifier
stack_classifier = StackingClassifier(classifiers=[xgb_classifier, lgbm_classifier, rf_classifier,tree_classifier,gb_classifier], 
                                     meta_classifier=LogisticRegression())

stack_classifier.fit(X_train, y_train)
y_pred = stack_classifier.predict(X_test)
print('Final prediction score: %f' % accuracy_score(y_test, y_pred))
#0.772124

Final prediction score: 0.772124


In [146]:
from mlxtend.classifier import StackingClassifier
stack_classifier = StackingClassifier(classifiers=[xgb_classifier, lgbm_classifier,rf_classifier,tree_classifier,gb_classifier], 
                                     meta_classifier=SVC(kernel = 'rbf',C=10, gamma = 'scale'))

stack_classifier.fit(X_train, y_train)
y_pred = stack_classifier.predict(X_test)
print('Final prediction score: %f' % accuracy_score(y_test, y_pred))
#0.771386

Final prediction score: 0.771386


In [137]:
from vecstack import stacking
level0_models = [xgb_classifier, lgbm_classifier, rf_classifier,tree_classifier,gb_classifier]
S_train, S_test = stacking(level0_models, X_train, y_train, X_test, regression=False,
                           mode='oof_pred_bag', needs_proba=False, save_dir=None,
                           metric=accuracy_score, n_folds=5,  stratified=True,
                           shuffle=True, random_state=0, verbose=0)

In [147]:
level1_model = LogisticRegression()

level1_model = level1_model.fit(S_train, y_train)
y_pred = level1_model.predict(S_test)
print('Final prediction score: %f' % accuracy_score(y_test, y_pred))
#0.769912

Final prediction score: 0.769912


In [150]:
level1_model = SVC(kernel = 'rbf',C=1, gamma = 'scale')

level1_model = level1_model.fit(S_train, y_train)
y_pred = level1_model.predict(S_test)
print('Final prediction score: %f' % accuracy_score(y_test, y_pred))

Final prediction score: 0.771386


method 2

In [139]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import numpy as np

#olds = StratifiedKFold(n_splits = 5)
def get_oof(clf ,x_train, y_train, x_test, n_folds = 5):
    """K-fold stacking"""
    ntrain = X_train.shape[0]
    ntest =  X_test.shape[0]
    classnum = len(np.unique(y_train))
    oof_test_skf = np.empty((5, ntest))
    kf = KFold(n_splits=n_folds)
    oof_train = np.zeros((ntrain,classnum))
    oof_test = np.zeros((ntest,classnum))
        
    for i,(train_index,test_index) in enumerate(kf.split(X_train)):
        #print(type(train_index))
        
        kf_X_train = X_train[train_index] # 数据
        kf_y_train = y_train[train_index] # 标签
        
        kf_X_test = X_train[test_index]  # k-fold的验证集
        
        clf.fit(kf_X_train, kf_y_train)
        
        oof_train[test_index] = clf.predict_proba(kf_X_test)
        #oof_test_skf[i,:]=clf.predict_proba(X_test)[:,0]
        oof_test += clf.predict_proba(X_test)
    oof_test = oof_test/float(n_folds)
    return oof_train, oof_test

In [140]:
# new feature input level1
import functools 
newfeature_list_1 = []
newtestdata_list_1 = []
xgb_oof_train_1, xgb_oof_test_1 = get_oof(xgb_classifier,X_train, y_train.values, X_test)
newfeature_list_1.append(xgb_oof_train_1)
newtestdata_list_1.append(xgb_oof_test_1)

lgbm_oof_train_1, xgb_oof_test_1 = get_oof(lgbm_classifier,X_train, y_train.values, X_test)
newfeature_list_1.append(lgbm_oof_train_1)
newtestdata_list_1.append(xgb_oof_test_1)

rf_oof_train_1, rf_oof_test_1 = get_oof(rf_classifier,X_train, y_train.values, X_test)
newfeature_list_1.append(rf_oof_train_1)
newtestdata_list_1.append(rf_oof_test_1)

dt_oof_train_1, dt_oof_test_1 = get_oof(tree_classifier,X_train, y_train.values, X_test)
newfeature_list_1.append(dt_oof_train_1)
newtestdata_list_1.append(dt_oof_test_1)

gb_oof_train_1, gb_oof_test_1 = get_oof(gb_classifier,X_train, y_train.values, X_test)
newfeature_list_1.append(gb_oof_train_1)
newtestdata_list_1.append(gb_oof_test_1)


In [141]:
newfeature_1 = functools.reduce(lambda x,y:np.concatenate((x,y),axis=1),newfeature_list_1)    
newtestdata_1 =functools.reduce(lambda x,y:np.concatenate((x,y),axis=1),newtestdata_list_1)

In [142]:
lr = LogisticRegression()
lr.fit(newfeature_1, y_train)
pred_1 = lr.predict(newtestdata_1)
accuracy_1 = accuracy_score(y_test, pred_1)
print(accuracy_1)
#0.7802359882005899

0.7802359882005899


In [152]:
from sklearn.svm import SVC
svr = SVC(kernel = 'rbf',C=0.1, gamma = 'scale',probability=True)
svr.fit(newfeature_1, y_train)
pred_2 = svr.predict(newtestdata_1)
accuracy_2 = accuracy_score(y_test, pred_2)
print(accuracy_2)


0.7721238938053098
