In [1]:
import numpy as np 
import pandas as pd 
import gc
import utils as u
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from plotnine import (element_blank, scale_color_manual, scale_x_continuous, ggplot, aes, geom_line ,geom_bar, geom_point, theme, element_text, labs, ggtitle, scale_y_continuous, coord_flip, ggsave)
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

In [2]:
# Locations of the model artefacts and of the stacking layer inputs/outputs.

output = 'model/5.Stacking/'
path_rf = 'model/3.Random Forest/'
stacking_layer2 = 'model/5.Stacking/stacking layer2 output/'
stacking_layer3 = 'model/5.Stacking/stacking layer3 output/'

# Read the layer-2 train/test tables; every NaN is swapped for the sentinel -9999.
tr = pd.read_csv(stacking_layer2 + 'train_layer2.csv').replace([np.nan], -9999)
te = pd.read_csv(stacking_layer2 + 'test_layer2.csv').replace([np.nan], -9999)
gc.collect()

0

In [3]:
# Feature matrix: every column of tr except 'index', 'Id' and the target 'Response'.
x = tr.drop(columns=['index', 'Id', 'Response'])
# Target vector.
y = tr['Response']
# Show both shapes, one per line.
print(x.shape, y.shape, sep='\n')

(1183747, 68)
(1183747,)


In [4]:
# Read the layer-3 train/test tables from the layer-3 output directory.
# NOTE(review): the following cell rebinds tr_layer3/te_layer3 to copies of tr/te,
# which discards these loads on a top-to-bottom run — confirm which start point is intended.
tr_layer3, te_layer3 = (
    pd.read_csv(stacking_layer3 + file_name)
    for file_name in ('train_layer3.csv', 'test_layer3.csv')
)

In [11]:
# Start the layer-3 tables as independent copies of the layer-2 tables.
# NOTE(review): the preceding cell loads the same two names from CSV — confirm
# that only one of the two initialisations is meant to run.
tr_layer3, te_layer3 = tr.copy(), te.copy()

In [12]:
# Load the stored hyper-parameters as a nested dict ({column -> {row label -> value}}).
# NOTE(review): the variable is named *_lgb although the file is read from path_rf;
# the name is kept because a later cell references it.
best_params_lgb = (
    pd.read_csv(path_rf + 'best_parameters.csv', index_col=0)
    .to_dict()
)
best_params_lgb['parameters']

{'max_depth': 30.0,
 'min_samples_leaf': 2.0,
 'min_samples_split': 4.0,
 'n_estimators': 650.0}

# Output: random forest

In [15]:
# Name under which this model's predictions are stored.
layer_name = 'rf_tree_medium'

# Assemble the argument dict passed to u.rf_model in the next cell.
# A fresh dict is built instead of aliasing best_params_lgb['parameters'], so the
# loaded hyper-parameters are not mutated with the data frames added here.
# Key order is the same as before: loaded parameters first, then the extras.
params = {
    **best_params_lgb['parameters'],
    'var_name': layer_name,
    'n_estimators': 650,  # integer override of the loaded value (displayed as 650.0)
    'x': x,
    'y': y,
    'te': te,
}

In [16]:
# Run the project's random-forest helper and unpack its four return values.
clf, scores, tr_stacking, te_stacking = u.rf_model(params)

# Pass the results to the merge helper and rebind the layer-3 tables to what it returns.
# NOTE(review): tr_layer3/te_layer3 are both input and output here, so re-running
# this cell may not be idempotent — check u.submit_and_layer_merge.
tr_layer3, te_layer3 = u.submit_and_layer_merge(
    output=output,
    csv_name=layer_name,
    te_stacking=te_stacking,
    tr_stacking=tr_stacking,
    te=te,
    scores=scores,
    tr_layer=tr_layer3,
    te_layer=te_layer3,
)

# Render the scores as a table.
pd.DataFrame(scores)

fold 1
mcc value: 0.444
g_means value: 0.496
auc value: 0.623
f1_score value: 0.378
threshold: 0.464
fold 2
mcc value: 0.458
g_means value: 0.511
auc value: 0.631
f1_score value: 0.395
threshold: 0.421


Unnamed: 0,fold,mcc,g_means,auc_scores,f1_scores,threshold
0,1,0.44424,0.496413,0.623082,0.37756,0.464286
1,2,0.458064,0.511477,0.630669,0.395257,0.421429


# Output

In [11]:
# Persist both layer-3 tables as CSV without writing the row index.
tr_layer3.to_csv(stacking_layer3 + 'train_layer3.csv', index=False)
te_layer3.to_csv(stacking_layer3 + 'test_layer3.csv', index=False)

In [8]:
# Preview the first five rows of the layer-3 training table.
tr_layer3.head(n=5)

Unnamed: 0,index,Id,L1_S24_F1523,L1_S24_F1525,L1_S24_F1582,L1_S24_F1585,L2_S26_F3038,L2_S26_F3099,L3_S32_F3851,L3_S32_F3854,...,xgb_depth_high_tree_medium,xgb_depth_high_tree_high,lgb_depth_low_tree_low,lgb_depth_low_tree_high,lgb_depth_high_tree_low,lgb_depth_high_tree_high,rf_tree_low,rf_tree_medium,rf_tree_high,logistic
0,0,4,0,0,0,0,0,0,0,0,...,0.006533,0.001162,0.001721,0.001366,0.002427,0.001732,0.003682,0.00798,0.009535,0.003111
1,1,6,0,0,0,0,0,0,0,0,...,0.006765,0.002759,0.003299,0.002546,0.003469,0.003527,0.005148,0.009025,0.007399,0.003201
2,2,7,0,0,0,0,0,0,0,0,...,0.008755,0.002348,0.002736,0.00216,0.003508,0.00297,0.009079,0.012793,0.010229,0.003189
3,3,9,0,0,0,0,0,0,0,0,...,0.005985,0.000723,0.000742,0.00029,0.002451,0.000818,0.008208,0.005309,0.004118,0.00307
4,4,11,0,0,0,0,0,0,0,0,...,0.028806,0.024032,0.024498,0.017744,0.018089,0.022893,0.098235,0.114918,0.113526,0.005713


In [10]:
# Preview the first five rows of the layer-3 test table.
te_layer3.head(n=5)

Unnamed: 0,Id,L1_S24_F1523,L1_S24_F1525,L1_S24_F1582,L1_S24_F1585,L2_S26_F3038,L2_S26_F3099,L3_S32_F3851,L3_S32_F3854,L3_S35_F3902,...,xgb_depth_high_tree_medium,xgb_depth_high_tree_high,lgb_depth_low_tree_low,lgb_depth_low_tree_high,lgb_depth_high_tree_low,lgb_depth_high_tree_high,rf_tree_low,rf_tree_medium,rf_tree_high,logistic
0,1,1,3,0,0,0,0,0,0,0,...,0.007828,0.00292,0.002038,0.002447,0.003132,0.002137,0.004891,0.006688,0.008507,0.003209
1,2,0,0,0,0,0,0,0,0,0,...,0.00617,0.001005,0.000803,0.001255,0.002312,0.000843,0.001668,0.00334,0.002472,0.003091
2,3,1,2,0,0,1,2,0,0,0,...,0.008135,0.003432,0.002496,0.002368,0.003239,0.002371,0.010664,0.008948,0.006549,0.003233
3,5,0,0,0,0,1,1,0,0,0,...,0.00885,0.00238,0.002016,0.001985,0.002982,0.001909,0.003437,0.004754,0.004262,0.003184
4,8,1,2,0,0,1,2,0,0,0,...,0.008285,0.003216,0.002275,0.002291,0.003146,0.002119,0.00866,0.00876,0.009671,0.003224


# TEST

In [12]:
# Small three-column frame used to try out the row-wise geometric mean.
data = dict(
    a=[1, 2, 3],
    b=[3, 5, 1],
    c=[45, 98, 12],
)
a = pd.DataFrame(data)


Unnamed: 0,a,b,c
0,1,3,45
1,2,5,98
2,3,1,12


In [17]:
def geometric_mean(data):
    """Return the geometric mean of the numbers in ``data``.

    The geometric mean is the n-th root of the product of the n values.

    Parameters
    ----------
    data : iterable of numbers
        Values to average. Any iterable is accepted (list, tuple, Series,
        generator, ...); it is consumed once.

    Returns
    -------
    number
        ``(v1 * v2 * ... * vn) ** (1 / n)``. Note that Python's ``pow``
        returns a complex number when the product is negative and the
        exponent is fractional.

    Raises
    ------
    ValueError
        If ``data`` contains no values (previously this surfaced as an
        unexplained ``ZeroDivisionError``).
    """
    # Materialise once so generators work and len() is always available.
    values = list(data)
    if not values:
        raise ValueError("geometric_mean requires at least one value")

    total = 1
    for value in values:
        total *= value  # running product of all values
    return pow(total, 1 / len(values))

In [18]:
# Row-wise geometric mean of the input columns.
# An existing 'mean' column is excluded from the calculation: otherwise re-running
# this cell (or running it after something else created 'mean') folds the old
# value into the new result and the cell is no longer idempotent.
a['mean'] = a.drop(columns='mean', errors='ignore').apply(geometric_mean, axis=1)
a

Unnamed: 0,a,b,c,mean
0,1,3,45,6.852549
1,2,5,98,13.608916
2,3,1,12,3.722419


# TEST