In [1]:
import pandas as pd
import numpy as np
import copy, time, os
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
print(os.listdir('data/'))

['enron61702insiderpay.pdf', 'sample_submission.csv', 'test_features.csv', 'train_data.csv']


In [3]:
data_path = 'data/'
df_train = pd.read_csv(data_path + 'train_data.csv')
df_test = pd.read_csv(data_path + 'test_features.csv')
df_sample_submission = pd.read_csv(data_path + 'sample_submission.csv')
df_train.shape, df_test.shape, df_sample_submission.shape

((113, 22), (33, 21), (33, 2))

In [4]:
df_train.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,1617011.0,174839.0,True,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,1920000.0,22122.0,True,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,1573324.0,True,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,602671.0,907502.0,True,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,375304.0,486.0,True,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [5]:
train_Y = pd.DataFrame(LabelEncoder().fit_transform(df_train['poi']))
df_train['poi'] = train_Y
df_train.pop('poi')
train_Y

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
108,0
109,0
110,0
111,0


In [6]:
#合併資料 一起做特徵工程
train_num = train_Y.shape[0]
all_data = pd.concat([df_train, df_test])
all_data.shape

(146, 21)

In [7]:
name = df_test['name']
all_data.pop('name')

0           RICE KENNETH D
1       SKILLING JEFFREY K
2               SHELBY REX
3         KOPPER MICHAEL J
4     CALGER CHRISTOPHER F
              ...         
28         BIBI PHILIPPE A
29         SHERRIFF JOHN R
30            GIBBS DANA R
31          LINDHOLM TOD A
32         MCMAHON JEFFREY
Name: name, Length: 146, dtype: object

In [8]:
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(all_data)

Unnamed: 0,Missing Ratio
loan_advances,97.260274
director_fees,88.356164
restricted_stock_deferred,87.671233
deferral_payments,73.287671
deferred_income,66.438356
long_term_incentive,54.794521
bonus,43.835616
from_messages,41.09589
from_poi_to_this_person,41.09589
from_this_person_to_poi,41.09589


In [11]:
all_data['have_mail'] = all_data['email_address'].fillna(0).map(lambda x:0 if x==0 else 1)

In [12]:
#all_data['email_address'] = pd.DataFrame(LabelEncoder().fit_transform(all_data['email_address'].fillna('NAN')))
all_data.pop('email_address')

0               ken.rice@enron.com
1          jeff.skilling@enron.com
2             rex.shelby@enron.com
3         michael.kopper@enron.com
4     christopher.calger@enron.com
                  ...             
28         philippe.bibi@enron.com
29         john.sherriff@enron.com
30            dana.gibbs@enron.com
31          tod.lindholm@enron.com
32       jeffrey.mcmahon@enron.com
Name: email_address, Length: 146, dtype: object

In [10]:
#MAIL特徵 ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']
#for c in ['to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']:
#    all_data[c] = all_data[c].fillna(0)

In [11]:
#all_data.pop('total_payments')
#all_data.pop('total_stock_value')

In [13]:
for c in ['loan_advances','director_fees','restricted_stock_deferred','deferral_payments','deferred_income','long_term_incentive','bonus']:
    all_data[c] = all_data[c].fillna(0)
    all_data[c] = np.log1p(all_data[c]).fillna(0)
na_check(all_data)

Unnamed: 0,Missing Ratio
to_messages,41.09589
shared_receipt_with_poi,41.09589
from_this_person_to_poi,41.09589
from_poi_to_this_person,41.09589
from_messages,41.09589
other,36.30137
salary,34.931507
expenses,34.931507
exercised_stock_options,30.136986
restricted_stock,24.657534


In [14]:
for c in ['to_messages','shared_receipt_with_poi','from_this_person_to_poi','from_poi_to_this_person','from_messages','other','salary','expenses','exercised_stock_options','restricted_stock','total_payments','total_stock_value']:
    all_data[c] = all_data[c].fillna(0)
    all_data[c] = np.log1p(all_data[c]).fillna(0)
na_check(all_data)

Unnamed: 0,Missing Ratio


In [25]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# corr = all_data.corr()
# sns.heatmap(corr)
# plt.show()

In [26]:
# corr

In [27]:
# high_list = list(corr[(corr['poi']>0.25) | (corr['poi']<-0.25)].index)
# high_list.pop(5)
# print(high_list)

In [15]:
MMEncoder = MinMaxScaler()
train_X = MMEncoder.fit_transform(all_data)
# estimator = GradientBoostingClassifier()
# cross_val_score(estimator, train_X, train_Y, cv=5).mean()

In [18]:
#財務特徵  ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 'restricted_stock', 'director_fees']


In [14]:
#all_data['bonus'] = all_data['bonus'].fillna(0)

# for c in ['total_payments', 'salary', 'deferral_payments', 'loan_advances', 'deferred_income',  'expenses', 'other', 'long_term_incentive', 'director_fees']:
#     all_data[c] = all_data[c].fillna(all_data[c].median())
#na_check(all_data)

In [15]:
#for c in ['total_stock_value','restricted_stock_deferred','exercised_stock_options','restricted_stock']:
#    all_data[c] = all_data[c].fillna(0)
#na_check(all_data)

In [12]:
# MMEncoder = MinMaxScaler()
# for col in all_data.columns:
#     all_data[col] = MMEncoder.fit_transform(all_data[col].values.reshape(-1,1))


In [16]:
all_data.describe()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value,have_mail
count,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0,146.0
mean,7.648413,3.308122,0.0,1.318645,9.729579,6.789197,2.597995,1.906663,1.297594,0.437464,5.933559,6.39556,9.892364,0.186627,8.070261,3.702992,4.134479,11.599808,12.058717,0.760274
std,6.825948,5.672042,0.0,3.666848,6.565177,5.121815,2.560611,2.048469,1.722164,2.644036,6.597591,5.518466,5.886797,1.625863,5.988279,3.346903,3.584626,5.126997,5.152227,0.428386
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.596949,0.0,0.0,0.0,0.0,11.449142,12.339272,1.0
50%,12.611541,0.0,0.0,0.0,13.318414,9.910361,2.861793,1.151293,0.0,0.0,0.0,6.867369,12.795328,0.0,12.257702,4.63336,5.666726,13.754724,13.780819,1.0
75%,13.592368,9.172615,0.0,0.0,14.354348,10.891923,3.955725,3.731645,2.690802,0.0,12.834856,11.922432,13.606767,0.0,12.509319,6.796185,7.369169,14.492625,14.657031,1.0
max,18.393758,17.283849,0.0,14.150924,19.557757,15.470915,9.572828,6.270988,6.413459,18.245434,17.697526,17.56895,18.685521,16.553527,17.100333,8.616495,9.625756,19.551717,19.889728,1.0


In [21]:
# ['salary', 'deferral_payments', 'loan_advances', 'bonus', 'deferred_income',  'expenses', 'other', 'long_term_incentive', 'director_fees']
#all_data['total_payments']



In [17]:
all_data_train = all_data[:train_num]
all_data_test = all_data[train_num:]
all_data_train.shape, all_data_test.shape , train_Y.shape

((113, 20), (33, 20), (113, 1))

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(all_data_train, train_Y, test_size=0.2, random_state=22)

In [46]:
#LogisticRegression
lr = LogisticRegression(tol=0.0001, penalty='l2', fit_intercept=True, C=1.0)
lr.fit(X_train, Y_train)
lr_pred = lr.predict(X_test)
lr_pred_proba = lr.predict_proba(X_test)

from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(Y_test, lr_pred_proba[:,1]) 
print('AUC:',metrics.auc(fpr, tpr)) 

AUC: 0.8833333333333333


In [47]:
#RandomForest
rfc = RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=4,max_features='sqrt', max_depth=3, bootstrap=True)
rfc.fit(X_train, Y_train)
rfc_pred = rfc.predict(X_test)
rfc_pred_proba = rfc.predict_proba(X_test)

fpr, tpr, thresholds = metrics.roc_curve(Y_test, rfc_pred_proba[:,1]) 
print('AUC:',metrics.auc(fpr, tpr)) 

AUC: 0.75


In [48]:
#GradientBoosting
gdbc = GradientBoostingClassifier(tol=[100], subsample=0.8, n_estimators=100, max_features=0.1,max_depth=5, learning_rate=0.03)
gdbc.fit(X_train, Y_train)
gdbc_pred = gdbc.predict(X_test)
gdbc_pred_proba = gdbc.predict_proba(X_test)

fpr, tpr, thresholds = metrics.roc_curve(Y_test, gdbc_pred_proba[:,1]) 
print('AUC:',metrics.auc(fpr, tpr)) 

AUC: 0.8333333333333333


In [49]:
blending_pred = lr_pred_proba[:,1]*0.05 + rfc_pred_proba[:,1]*0.2 + gdbc_pred_proba[:,1]*0.75
fpr, tpr, thresholds = metrics.roc_curve(Y_test, blending_pred) 
print('AUC:',metrics.auc(fpr, tpr)) 

AUC: 0.8333333333333333


In [None]:
# lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
# gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250, max_features=20,
#                                   max_depth=6, learning_rate=0.03)
# rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
#                             max_features='sqrt', max_depth=6, bootstrap=True)

In [50]:
from mlxtend.classifier import StackingClassifier
meta_estimator = GradientBoostingClassifier(tol=[100], subsample=0.8, n_estimators=100, max_features=0.1,max_depth=5, learning_rate=0.03)
stacking = StackingClassifier(classifiers=[lr, gdbc, rfc], meta_classifier=meta_estimator)
stacking.fit(X_train, Y_train)
stacking_pred = stacking.predict_proba(X_test)

fpr, tpr, thresholds = metrics.roc_curve(Y_test, stacking_pred[:,1]) 
print('AUC:',metrics.auc(fpr, tpr)) 

AUC: 0.8083333333333332


In [35]:
#調參數
from sklearn.model_selection import KFold, GridSearchCV

#logistic
param_grid = dict(tol=[0.0001, 0.0005, 0.001], penalty=['l1','l2'], fit_intercept=[True, False], C=[0.5, 1.0, 1.5])
grid_search = GridSearchCV(lr, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_result = grid_search.fit(X_train, Y_train)
print("Best logistic Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#RF
param_grid = dict(n_estimators=[100,150,200,250,300], min_samples_split=[2], min_samples_leaf=[1,2,4,8,16],max_features=['sqrt'], max_depth=[3,4,5,6], bootstrap=[True,False])
grid_search = GridSearchCV(rfc, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_result = grid_search.fit(X_train, Y_train)
print("Best RF Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


#GDBC
param_grid = dict(tol=[100], subsample=[0.7,0.75,0.8], n_estimators=[100,150,200,250,300], max_features=[0.1,0.2],max_depth=[3,4,5,6], learning_rate=[0.02,0.03])
grid_search = GridSearchCV(gdbc, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_result = grid_search.fit(X_train, Y_train)
print("Best GDBC Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 149 out of 180 | elapsed:    1.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Best logistic Accuracy: 0.887500 using {'C': 1.0, 'fit_intercept': True, 'penalty': 'l2', 'tol': 0.0001}
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   17.3s finished


Best RF Accuracy: 0.831250 using {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 804 tasks      | elapsed:    8.2s


Best GDBC Accuracy: 0.806250 using {'learning_rate': 0.03, 'max_depth': 5, 'max_features': 0.1, 'n_estimators': 100, 'subsample': 0.8, 'tol': 100}


[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:   12.7s finished


In [51]:
#LogisticRegression
# lr_pred = lr.predict(all_data_test)
lr_pred_proba = lr.predict_proba(all_data_test)
sub = pd.DataFrame({'name': name, 'poi': lr_pred_proba[:,1]})
sub.to_csv('Midterm_exam_lr.csv', index=False)

In [52]:
#RandomForest
# rfc_pred = rfc.predict(all_data_test)
rfc_pred_proba = rfc.predict_proba(all_data_test)
sub = pd.DataFrame({'name': name, 'poi': rfc_pred_proba[:,1]})
sub.to_csv('Midterm_exam_rfc.csv', index=False)

In [53]:
#GradientBoosting
# gdbc_pred = gdbc.predict(all_data_test)
gdbc_pred_proba = gdbc.predict_proba(all_data_test)
sub = pd.DataFrame({'name': name, 'poi': gdbc_pred_proba[:,1]})
sub.to_csv('Midterm_exam_gdbc.csv', index=False)

In [55]:
#Stacking
stacking_pred = stacking.predict_proba(all_data_test)
sub = pd.DataFrame({'name': name, 'poi': stacking_pred[:,1]})
sub.to_csv('Midterm_exam_stacking.csv', index=False)

In [56]:
#Blending
blending_pred = lr_pred_proba[:,1]*0.05 + rfc_pred_proba[:,1]*0.2 + gdbc_pred_proba[:,1]*0.75
sub = pd.DataFrame({'name': name, 'poi': blending_pred})
sub.to_csv('Midterm_exam_blending.csv', index=False)

In [43]:
# 設定要訓練的超參數組合
n_estimators = [100, 150, 200, 250, 300]
max_depth = [1, 2, 3, 4, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(rfc, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(X_train, Y_train)

print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  94 out of 125 | elapsed:    2.0s remaining:    0.6s


Best Accuracy: 0.756250 using {'max_depth': 1, 'n_estimators': 200}


[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    2.6s finished


In [85]:
#df_sample_submission['poi'].map(lambda x:1 if x>0.5 else  0)