In [1]:
import pickle

In [2]:
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

In [3]:
import pandas as pd

In [4]:
pdata = pd.DataFrame.from_dict(data_dict, orient='index')

In [5]:
import numpy as np

In [6]:
pdata = pdata.replace('NaN', np.nan, regex=True)

In [7]:
# New Nan related features
temp = []
for el in pdata['email_address'].values:
    if type(el) == float:
        temp.append(False)
    else:
        temp.append(True)
pdata['has_email_data'] = temp
pdata['total_navalue_count'] = pdata.isnull().sum(axis=1)
pdata['sal_navalue_count'] = pdata[['salary', 'deferral_payments', 'long_term_incentive', 'total_payments', 'bonus', 'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income']].isnull().sum(axis=1)

In [8]:
# Dropping text columns and converting Nan to zero
pdata = pdata.drop('email_address', 1)
pdata = pdata.replace(np.nan, 0, regex=True)

In [9]:
# Removing outliers
pdata = pdata[pdata.index != 'TOTAL']
pdata = pdata[pdata.index != 'SKILLING JEFFREY K']
pdata = pdata[pdata.index != 'BELFER ROBERT']
pdata = pdata[pdata.index != 'BHATNAGAR SANJAY']

In [10]:
# New non-Nan related features
pdata['short_gain'] = pdata['exercised_stock_options'] - pdata['long_term_incentive']
pdata['sal_ratio'] = (pdata['bonus'] + pdata['long_term_incentive'] + pdata['expenses']) / pdata['total_payments']
pdata['stock_ratio'] = pdata['restricted_stock'] / pdata['total_stock_value']
pdata['em_ratio'] = (pdata['shared_receipt_with_poi'] + pdata['from_this_person_to_poi'] + pdata['from_poi_to_this_person'] ) / (pdata['to_messages'] + pdata['from_messages'])
pdata['total_total'] = pdata['total_payments'] + pdata['total_stock_value']
pdata['from_ratio'] = pdata['from_this_person_to_poi'] / pdata['from_messages']
pdata['sent_ratio'] = (pdata['from_poi_to_this_person'] + pdata['shared_receipt_with_poi']) / pdata['to_messages']

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
# sns.set(color_codes=True)

In [12]:
# # First look at monetary relationships...
# gr = pdata.copy()
# gr['total_total'] = map(lambda x: np.sqrt(x), gr['total_total'])
# gr['sal_ratio'] = map(lambda x: np.sqrt(x), gr['sal_ratio'])
# gr['stock_ratio'] = map(lambda x: np.sqrt(x), gr['stock_ratio'])
# gr['to_messages'] = map(lambda x: np.sqrt(x), gr['to_messages'])
# els1 = ['total_total', 'sal_ratio', 'stock_ratio', 'to_messages']
# scatter_matrix = gr
# g = sns.pairplot(scatter_matrix[:], vars=els1, hue="poi", size=1.5)

In [13]:
# Back into dictionary
pdata = pdata.replace(np.nan, 0, regex=True)
pdata = pdata.replace(np.inf, 0, regex=True)
pdata = pdata.replace(-np.inf, 0, regex=True)
pdict = pdata.T.to_dict()

In [14]:
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

In [18]:
features_list = ['poi', 'salary', 'to_messages', 'deferral_payments', 
                 'total_payments', 'exercised_stock_options', 'bonus', 
                 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 
                 'total_stock_value', 'expenses', 'from_messages', 
                 'other', 'from_this_person_to_poi', 'director_fees',
                 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person',
                 'from_ratio', 'sent_ratio', 'sal_ratio', 'stock_ratio', 
                 'em_ratio', 'has_email_data', 'total_navalue_count', 'sal_navalue_count',
                 'total_total', 'short_gain']

In [19]:
my_dataset = pdict
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

In [21]:
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [22]:
from sklearn.feature_selection import SelectKBest
x_new = SelectKBest(k=12)
x_new.fit_transform(features_train, labels_train)
best = []
for i in range(len(x_new.get_support())):
    if x_new.get_support()[i]:
        best.append(features_list[i+1])

In [23]:
features_list = ['poi']
for el in best:
    features_list.append(el)
features_list

['poi',
 'salary',
 'bonus',
 'shared_receipt_with_poi',
 'expenses',
 'deferred_income',
 'from_ratio',
 'sent_ratio',
 'sal_ratio',
 'em_ratio',
 'total_navalue_count',
 'sal_navalue_count',
 'total_total']

In [24]:
my_dataset = pdict
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [25]:
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [26]:
# Logistic Regression

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(random_state=1, 
                                               penalty='l2',
                                               C=1.0))])
pipe_lr.fit(features_train, labels_train)
print('Test Accuracy: %.3f' % pipe_lr.score(features_test, labels_test))

Test Accuracy: 0.884


In [28]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score( estimator = pipe_lr, X = features_train, y = labels_train, cv = 10, n_jobs = 1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [ 0.81818182  0.90909091  0.8         1.          0.8         0.9         0.9
  0.88888889  0.88888889  0.88888889]
CV accuracy: 0.879 +/- 0.057


In [697]:
# SVM

In [29]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'clf__C': param_range,
               'clf__kernel': ['linear']},
              {'clf__C': param_range,
               'clf__gamma': param_range,
               'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid = param_grid,
                  scoring = 'f1_weighted',
                  n_jobs=-1)
gs = gs.fit(features_train, labels_train)
print(gs.best_score_)
print(gs.best_params_)

0.87145222719
{'clf__gamma': 1.0, 'clf__C': 1000.0, 'clf__kernel': 'rbf'}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [30]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score( estimator = pipe_svc, X = features_train, y = labels_train, cv = 10, n_jobs = 1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
clf = gs.best_estimator_
clf.fit(features_train, labels_train)
print('precision: ', precision_score(labels_test, clf.predict(features_test)))
print('recall: ', recall_score(labels_test, clf.predict(features_test)))
print('f1: ', f1_score(labels_test, clf.predict(features_test)))

CV accuracy scores: [ 0.81818182  0.81818182  0.9         0.9         0.9         0.9         0.9
  0.88888889  0.88888889  0.88888889]
CV accuracy: 0.880 +/- 0.031
('precision: ', 0.33333333333333331)
('recall: ', 0.40000000000000002)
('f1: ', 0.36363636363636359)


In [615]:
# Random Forest

In [32]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
pipe_rfc = Pipeline([('scl', MinMaxScaler()),
                     ('pca', PCA(n_components=2)),
                     ('clf', RandomForestClassifier(n_estimators=100,
                                                    random_state=1))])
cv = StratifiedShuffleSplit(labels_train, n_iter=10, random_state = 42)
mss_range = [2, 10, 20, 30, 40]
param_grid = [{'clf__min_samples_split': mss_range,
               'clf__criterion': ['entropy', 'gini']}]
gs = GridSearchCV(estimator=pipe_rfc,
                  param_grid = param_grid,
                  scoring = 'f1_weighted',
                  cv = cv,
                  n_jobs=-1)
gs = gs.fit(features_train, labels_train)
print(gs.best_score_)
print(gs.best_params_)

0.933271413829
{'clf__criterion': 'entropy', 'clf__min_samples_split': 2}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [33]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
clf = gs.best_estimator_
clf.fit(features_train, labels_train)
print('Test accuracy: %.3f' % clf.score(features_test, labels_test))
print('Test precision: %.3f' % precision_score(clf.predict(features_test), labels_test))
print('Test recall: %.3f' % recall_score(clf.predict(features_test), labels_test))
print('Test F1: %.3f' % f1_score(clf.predict(features_test), labels_test))

Test accuracy: 0.884
Test precision: 0.200
Test recall: 0.500
Test F1: 0.286


In [None]:
# Adaboost

In [34]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
pipe_rfc = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA(n_components=3)),
                     ('clf', AdaBoostClassifier(RandomForestClassifier(
                                                n_estimators=100,
                                                criterion='entropy',
                                                min_samples_split=40,
                                                random_state=1)))])
clf = pipe_rfc.fit(features_train, labels_train)

In [37]:
# scores = cross_val_score( estimator = pipe_rfc, X = features_train, y = labels_train, cv = 10, n_jobs = 1)
# print('CV accuracy scores: %s' % scores)
# print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

scores = cross_val_score(clf, features_train, labels_train, cv=10, scoring='accuracy')
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

print('Test precision: %.3f' % precision_score(clf.predict(features_test), labels_test))
print('Test recall: %.3f' % recall_score(clf.predict(features_test), labels_test))
print('Test F1: %.3f' % f1_score(clf.predict(features_test), labels_test))

CV accuracy scores: [ 0.72727273  0.72727273  0.8         0.9         0.8         0.9         0.9
  0.88888889  0.88888889  0.88888889]
CV accuracy: 0.842 +/- 0.068
Test precision: 0.200
Test recall: 0.333
Test F1: 0.250


In [None]:
# K Nearest Neighbors

In [39]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
clf = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5, max_features=0.5)
pipe_knn = Pipeline([('scl', StandardScaler()),
                     ('clf', clf)])
pipe_knn.fit(features_train, labels_train)
print('Test accuracy: %.3f' % clf.score(features_test, labels_test))
print('Test precision: %.3f' % precision_score(clf.predict(features_test), labels_test))
print('Test recall: %.3f' % recall_score(clf.predict(features_test), labels_test))
print('Test F1: %.3f' % f1_score(clf.predict(features_test), labels_test))

Test accuracy: 0.879
Test precision: 0.000
Test recall: 0.000
Test F1: 0.000


In [None]:
# Decision Tree

In [40]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
pipe_dtc = Pipeline([('scl', MinMaxScaler()),
                     ('pca', PCA(n_components=5)),
                     ('clf', DecisionTreeClassifier(random_state=1))])
cv = StratifiedShuffleSplit(labels_train, n_iter=10, random_state = 42)
mss_range = [2, 10, 20, 30, 40]
param_grid = [{'clf__min_samples_split': mss_range,
               'clf__criterion': ['entropy', 'gini']}]
gs = GridSearchCV(estimator=pipe_dtc,
                  param_grid = param_grid,
                  scoring = 'f1_weighted',
                  cv = cv,
                  n_jobs=-1)
clf = gs.fit(features_train, labels_train)
print(clf.best_score_)
print(clf.best_params_)

0.933271413829
{'clf__criterion': 'entropy', 'clf__min_samples_split': 20}


In [41]:
print('Test accuracy: %.3f' % clf.score(features_test, labels_test))
print('Test precision: %.3f' % precision_score(clf.predict(features_test), labels_test))
print('Test recall: %.3f' % recall_score(clf.predict(features_test), labels_test))
print('Test F1: %.3f' % f1_score(clf.predict(features_test), labels_test))

Test accuracy: 0.878
Test precision: 0.400
Test recall: 0.500
Test F1: 0.444


