# Enron Fruad

In [2]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib notebook
from scipy.stats import trim_mean
import seaborn as sns
import numpy as np
import pandas as pd
import sys
import pickle
from feature_format import featureFormat
from feature_format import targetFeatureSplit

from sklearn.ensemble import GradientBoostingClassifier

features_list = ["poi"]

data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

clf = GradientBoostingClassifier()   

pickle.dump(clf, open("my_classifier.pkl", "w") )
pickle.dump(data_dict, open("my_dataset.pkl", "w") )
pickle.dump(features_list, open("my_feature_list.pkl", "w") )



## 数据探索
- 数据点总数
- 类之间的分配（POI/非 POI）
- 使用的特征数量
- 是否有哪些特征有很多缺失值？等。

In [3]:
df = pd.DataFrame(data_dict).T
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442,1729541
BADUM JAMES P,,178980.0,,,,257817.0,3486,,,,...,,,False,,,,,,182466,257817
BANNANTINE JAMES M,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301,29.0,39.0,0.0,...,,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197,5243487
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343,10623258
BAY FRANKLIN R,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142,,,,...,,69.0,False,145796.0,-82782.0,239671.0,,,827696,63014


### 数据点总数
- 146嫌疑犯，21個特征，包含「财务特征」，「邮件特征」，「POI 标签」

In [4]:
df.shape


(146, 21)

### 类之间的分配

- 「财务特征」為 Numerical

- 「邮件特征」為 Numerical / Text

- 「POI标签」為 Categorical


In [37]:
df.dtypes

bonus                        object
deferral_payments            object
deferred_income              object
director_fees                object
email_address                object
exercised_stock_options      object
expenses                     object
from_messages                object
from_poi_to_this_person      object
from_this_person_to_poi      object
loan_advances                object
long_term_incentive          object
other                        object
poi                          object
restricted_stock             object
restricted_stock_deferred    object
salary                       object
shared_receipt_with_poi      object
to_messages                  object
total_payments               object
total_stock_value            object
dtype: object

In [17]:
df.columns.values
flt_col = ['bonus', 'deferral_payments', 'deferred_income', 'director_fees',
       'exercised_stock_options', 'expenses',
       'from_messages', 'from_poi_to_this_person',
       'from_this_person_to_poi', 'loan_advances', 'long_term_incentive',
       'other', 'restricted_stock', 'restricted_stock_deferred',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value']

### 把「email_address」，「poi」以外做轉換型態

In [18]:
df[flt_col] = df[flt_col].apply(pd.to_numeric, errors='coerce')
df.dtypes

bonus                        float64
deferral_payments            float64
deferred_income              float64
director_fees                float64
email_address                 object
exercised_stock_options      float64
expenses                     float64
from_messages                float64
from_poi_to_this_person      float64
from_this_person_to_poi      float64
loan_advances                float64
long_term_incentive          float64
other                        float64
poi                           object
restricted_stock             float64
restricted_stock_deferred    float64
salary                       float64
shared_receipt_with_poi      float64
to_messages                  float64
total_payments               float64
total_stock_value            float64
dtype: object

### 使用的特征数量
### 是否有哪些特征有很多缺失值？等

In [5]:
df_miss = df.select_dtypes(include = ['float64', 'int64'])#.iloc[:, 1:]
total = df_miss.isnull().sum().sort_values(ascending=False)
pd.DataFrame(total, columns=['Missing Values'])

Unnamed: 0,Missing Values
loan_advances,142
director_fees,129
restricted_stock_deferred,128
deferral_payments,107
deferred_income,97
long_term_incentive,80
bonus,64
from_this_person_to_poi,60
from_poi_to_this_person,60
from_messages,60


### 遺失值過多的Feature，丟失過多數據，不適用填充值方式取代
- loan_advances	142
- director_fees	129
- restricted_stock_deferred	128
- deferral_payments	107
- deferred_income	97
- long_term_incentive	80

### 遺失值比較少的Feature
- exercised_stock_options	44
- restricted_stock	36
- total_payments	21
- total_stock_value	20


#### 針對遺失值較少的處理方式
- 透過相關性，檢查是跟否存在規律

In [127]:
dropf = ['loan_advances','director_fees','restricted_stock_deferred',
         'deferral_payments','deferred_income','long_term_incentive']

# 移除遺失過多的feature
df_less = df.drop(dropf ,axis=1).copy()

corrmat = df_less.select_dtypes(include = ['float64', 'int64']).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, annot=True, cmap='viridis', square=True);
plt.xticks(rotation=90) 
plt.yticks(rotation=0) 

<IPython.core.display.Javascript object>

(array([  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,
          9.5,  10.5,  11.5,  12.5]), <a list of 13 Text yticklabel objects>)

高度相關性的特徵值
**bonus, exercised_stock_options, expense, other, restricted_stock, salary, total_payments, total_stock_value**

### 處理遺失值
 - 嘗試用**Regression**方式，能否預測出遺失值少的 **total_stock_value，total_payments**
     1. 先把以上Features，移除NaN，看剩下多少樣本數
     2. 如果數量夠，就採用Train/test split，去做訓練，驗證評估：R Square數值。


In [128]:
candidate = [ 'bonus', 'exercised_stock_options', 'expenses', 'other', 'restricted_stock', 'salary', 'total_payments', 'total_stock_value']
df_less = df_less[candidate].fillna(0)

In [129]:
df_less = df_less[df_less['bonus']>0]
df_less = df_less[df_less['salary']>0]
df_less = df_less[df_less['total_stock_value']>0]
df_less = df_less[df_less['total_payments']>0]
df_less.shape

(79, 8)

- 樣本數只有79個，先做數據處理

In [130]:
from sklearn.model_selection import train_test_split
train = df_less[['bonus','salary','total_stock_value']]
y_train = df_less.total_payments
Xtrain, Xtest, ytrain, ytest = train_test_split(train, y_train, test_size = 0.4, random_state = None)

In [131]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
Stand = StandardScaler()
Xtrain = Stand.fit_transform(Xtrain)
Xtest = Stand.fit_transform(Xtest)
#ytrain = np.log10(ytrain) 
#ytest = np.log10(ytest)

### Regressor 回歸模型
1. GradientBoostingRegressor
2. RandomForestRegressor

In [124]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [132]:
gb_reg = GradientBoostingRegressor()
rf_reg = RandomForestRegressor()

gb_reg.fit(Xtrain, ytrain)
rf_reg.fit(Xtrain, ytrain)

ygb_pred = gb_reg.predict(Xtest)
yrf_pred = rf_reg.predict(Xtest)
print('GBR R Square: %d'%(r2_score(ytest, ygb_pred)))
print('RF R Square: %d'%(r2_score(ytest, yrf_pred)))
print('GBR MAE: %d'%(mean_squared_error(ytest, ygb_pred)))
print('RF MAE: %d'%(mean_squared_error(ytest, yrf_pred)))

GBR R Square: -1
RF R Square: 0
GBR MAE: 693093869392141
RF MAE: 444794522542109


### 結論：Regression 插值，全部不可行，Std 過大

### 處理遺失值，把NaN全部改成0
- Std過大，不適合用interpolate內插法
- 樣本過少，不適合插入中位數，或是平均值

In [135]:
df = df.fillna(0)
df.head()

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,0.0,phillip.allen@enron.com,1729541.0,13868.0,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0
BADUM JAMES P,0.0,178980.0,0.0,0.0,,257817.0,3486.0,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,182466.0,257817.0
BANNANTINE JAMES M,0.0,0.0,-5104.0,0.0,james.bannantine@enron.com,4046157.0,56301.0,29.0,39.0,0.0,...,0.0,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197.0,5243487.0
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,0.0,,6680544.0,11200.0,0.0,0.0,0.0,...,1586055.0,2660303.0,False,3942714.0,0.0,267102.0,0.0,0.0,5634343.0,10623258.0
BAY FRANKLIN R,400000.0,260455.0,-201641.0,0.0,frank.bay@enron.com,0.0,129142.0,0.0,0.0,0.0,...,0.0,69.0,False,145796.0,-82782.0,239671.0,0.0,0.0,827696.0,63014.0


### Email Address 遺失值處理
- 邮件特征得知，最高訊息含量的，會跟poi有關，郵箱地址沒有什麼訊息含量

In [140]:
df = df.drop('email_address', axis = 1)

### 特征Poi處理
- 沒有存在遺失值
- 進行LabelEncoder編碼處理

In [141]:
df['poi'].unique()

array([False,  True], dtype=bool)

In [143]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
df['poi'] = label.fit_transform(df['poi'])
df['poi'].tail()

WINOKUR JR. HERBERT S    0
WODRASKA JOHN            0
WROBEL BRUCE             0
YEAGER F SCOTT           1
YEAP SOON                0
Name: poi, dtype: int64

## 异常值调查

In [63]:
for i in flt_col:
    fig, ax = plt.subplots(figsize=(5, 5))
    df.boxplot(i, showfliers=True)
    #sns.boxplot(i, data=df)
    #sns.stripplot(i, data=df, jitter=True);
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
plt.scatter(df['salary'], df['bonus'], alpha=.5, edgecolors='face')
plt.xlabel("Salary")
plt.ylabel("Bonus")
plt.show()

<IPython.core.display.Javascript object>

### 找出 outlier

In [58]:
df[df['salary']>2.5e7]

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
TOTAL,97343619.0,32083396.0,-27992891.0,1398517.0,,311764000.0,5235198.0,,,,...,48521928.0,42667589.0,False,130322299.0,-7576788.0,26704229.0,,,309886585.0,434509511.0


In [60]:
df.drop('TOTAL', axis = 0, inplace = True)

In [61]:
df[df['salary']>2.5e7]

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value


In [62]:
plt.scatter(df['salary'], df['bonus'], alpha=.5, edgecolors='face')
plt.xlabel("Salary")
plt.ylabel("Bonus")
plt.show()

<IPython.core.display.Javascript object>

In [66]:
plt.scatter(df['salary'], df['total_payments'], alpha=.5, edgecolors='face')
plt.xlabel("Salary")
plt.ylabel("Total Payments")
plt.show()

In [71]:
corrmat = df.select_dtypes(include = ['float64', 'int64']).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
plt.xticks(rotation=90) 
plt.yticks(rotation=0) 

<IPython.core.display.Javascript object>

(array([  0.5,   1.5,   2.5,   3.5,   4.5,   5.5,   6.5,   7.5,   8.5,
          9.5,  10.5,  11.5,  12.5,  13.5,  14.5,  15.5,  16.5,  17.5,  18.5]),
 <a list of 19 Text yticklabel objects>)

- 檢視NaN

In [78]:
df[df['total_stock_value'].isnull()==False]

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
ALLEN PHILLIP K,4175000.0,2869717.0,-3081055.0,,phillip.allen@enron.com,1729541.0,13868.0,2195.0,47.0,65.0,...,304805.0,152.0,False,126027.0,-126027.0,201955.0,1407.0,2902.0,4484442.0,1729541.0
BADUM JAMES P,,178980.0,,,,257817.0,3486.0,,,,...,,,False,,,,,,182466.0,257817.0
BANNANTINE JAMES M,,,-5104.0,,james.bannantine@enron.com,4046157.0,56301.0,29.0,39.0,0.0,...,,864523.0,False,1757552.0,-560222.0,477.0,465.0,566.0,916197.0,5243487.0
BAXTER JOHN C,1200000.0,1295738.0,-1386055.0,,,6680544.0,11200.0,,,,...,1586055.0,2660303.0,False,3942714.0,,267102.0,,,5634343.0,10623258.0
BAY FRANKLIN R,400000.0,260455.0,-201641.0,,frank.bay@enron.com,,129142.0,,,,...,,69.0,False,145796.0,-82782.0,239671.0,,,827696.0,63014.0
BAZELIDES PHILIP J,,684694.0,,,,1599641.0,,,,,...,93750.0,874.0,False,,,80818.0,,,860136.0,1599641.0
BECK SALLY W,700000.0,,,,sally.beck@enron.com,,37172.0,4343.0,144.0,386.0,...,,566.0,False,126027.0,,231330.0,2639.0,7315.0,969068.0,126027.0
BELDEN TIMOTHY N,5249999.0,2144013.0,-2334434.0,,tim.belden@enron.com,953136.0,17355.0,484.0,228.0,108.0,...,,210698.0,True,157569.0,,213999.0,5521.0,7991.0,5501630.0,1110705.0
BELFER ROBERT,,-102500.0,,3285.0,,3285.0,,,,,...,,,False,,44093.0,,,,102500.0,-44093.0
BERBERIAN DAVID,,,,,david.berberian@enron.com,1624396.0,11892.0,,,,...,,,False,869220.0,,216582.0,,,228474.0,2493616.0
