In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Read the labelled training table and the unlabelled test features.
data_path = './Data/'
df_train = pd.read_csv(f'{data_path}train_data.csv')
df_test = pd.read_csv(f'{data_path}test_features.csv')

# Set aside the target column and the test identifiers before dropping them.
train_Y = df_train['poi']
ids = df_test['name']
train_num = len(train_Y)  # number of training rows; used later to split `df` back apart

# Keep only feature columns and stack train on top of test so both parts
# can go through the same preprocessing.
df_train = df_train.drop(columns=['name', 'poi'])
df_test = df_test.drop(columns=['name'])
df = pd.concat([df_train, df_test])
df.head(10)


Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,4.0,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,30.0,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,14.0,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,800000.0,,,,michael.kopper@enron.com,,118134.0,,,,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,25.0,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0
5,,10259.0,,,joe.hirko@enron.com,30766064.0,77978.0,,,,,,2856.0,,,,,,91093.0,30766064.0
6,,,,,scott.yeager@enron.com,8308552.0,53947.0,,,,,,147950.0,3576206.0,,158403.0,,,360300.0,11884758.0
7,1200000.0,27610.0,-144062.0,,wes.colwell@enron.com,,16514.0,40.0,240.0,11.0,,,101740.0,698242.0,,288542.0,1132.0,1758.0,1490344.0,698242.0
8,7000000.0,202911.0,-300000.0,,kenneth.lay@enron.com,34348384.0,99832.0,36.0,123.0,16.0,81525000.0,3600000.0,10359729.0,14761694.0,,1072321.0,2411.0,4273.0,103559793.0,49110078.0
9,600000.0,,,,ben.glisan@enron.com,384728.0,125978.0,16.0,52.0,6.0,,71023.0,200308.0,393818.0,,274975.0,874.0,873.0,1272284.0,778546.0


In [42]:
# Select the numeric features (float64 / int64 columns) -- currently disabled (commented out)
# num_features = []
# for dtype, feature in zip(df.dtypes, df.columns):
#     if (dtype == 'float64') | (dtype == 'int64'):
#         num_features.append(feature)
# print(f'{len(num_features)} Numeric Features: {num_features}')


In [13]:
# LabelEncoder + missing values (column mean) + MinMaxScaler + RandomForestClassifier
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for c in df.columns:
    if df[c].dtype == 'object':
        # Text columns: missing entries become their own 'None' category,
        # then every category is mapped to an integer code.
        df[c] = df[c].fillna('None')
        df[c] = LEncoder.fit_transform(df[c])
    else:
        # Numeric columns: impute with the mean taken over train + test rows.
        df[c] = df[c].fillna(df[c].mean())
    # Rescale the column to [0, 1]; the scaler is refit for each column.
    # ravel() turns the (n, 1) result back into a 1-D column.
    df[c] = MMEncoder.fit_transform(df[c].values.reshape(-1, 1)).ravel()

# Split the combined frame back into its train and test parts by position.
train_X = df[:train_num]
test_X = df[train_num:]

# Previously noted score: auc = 0.8 (not re-measured after the changes in this cell).
# A fixed seed makes the forest, and therefore the submission file, reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_X, train_Y)
pred = clf.predict_proba(test_X)
# predict_proba columns follow clf.classes_, which is sorted, so for a binary
# target column 0 is the negative class and column 1 the positive class.
# The submission needs the probability of being a POI, i.e. column 1
# (assumes `poi` is a boolean / 0-1 label -- confirm against train_data.csv).
sub = pd.DataFrame({'name':ids, 'poi':pred[:, 1]})
sub.to_csv('Submit.csv', index = False)



In [32]:
# Stacking: logistic regression + random forest + gradient boosting as base
# models, with gradient boosting as the meta-classifier.
# Previously noted score: auc = 0.44. An AUC below 0.5 is what reversed scores
# produce; the submission used to write column 0 of predict_proba (see below).
from mlxtend.classifier import StackingClassifier
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
# Fixed seeds on the stochastic estimators keep the submission reproducible.
rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                            max_features='log2', max_depth=6, bootstrap=True,
                            random_state=42)
# NOTE(review): tol=100 looks like it only matters when n_iter_no_change is set,
# which it is not here -- confirm against the scikit-learn docs.
gdbt = GradientBoostingClassifier(tol=100, subsample=0.75, n_estimators=250,
                                  max_depth=6, learning_rate=0.03,
                                  random_state=42)
meta_estimator = gdbt
# NOTE(review): presumably, with mlxtend's defaults, the meta-classifier is
# trained on the base models' predicted labels rather than their probabilities;
# consider use_probas=True -- verify against the mlxtend documentation.
stacking = StackingClassifier(classifiers = [lr, rf, gdbt], meta_classifier = meta_estimator)
stacking.fit(train_X, train_Y)
stacking_pred = stacking.predict_proba(test_X)
# Column 1 is the positive class for a binary target with sorted class labels
# (assumes `poi` is a boolean / 0-1 label); column 0 would submit P(not POI).
sub = pd.DataFrame({'name':ids, 'poi':stacking_pred[:, 1]})
sub.to_csv('kaggle_stacking.csv', index = False)


