In [1]:
import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from fancyimpute import KNN
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit

# ---------------------------------------------------------------------------
# Load the dataset, clean it, impute missing values and build the
# labels/features arrays consumed by the grid-search cells below.
# ---------------------------------------------------------------------------

# Pickle streams are binary, so the file must be opened with "rb" (text mode
# breaks on Python 3 and on Windows).
# NOTE: pickle.load can execute arbitrary code -- only load a trusted file.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Remove two records by key; the default of 0 makes a missing key a no-op.
# NOTE(review): presumably these are non-person rows (a totals row and an
# agency entry) -- confirm against the raw data.
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)

# One row per record, indexed by the record's key (the employee name).
enron = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))
enron = enron.set_index(employees)

# The raw data encodes missing values as the string 'NaN'.
enron = enron.replace(to_replace='NaN', value=np.nan)

# Columns excluded from the analysis altogether.
dropfeatures = ['loan_advances', 'director_fees', 'restricted_stock_deferred',
                'deferral_payments', 'deferred_income', 'long_term_incentive',
                'bonus']
enron = enron.drop(dropfeatures + ['email_address'], axis=1)

# Rows where *every* e-mail count is missing get zeros instead of NaN.
email_cols = ['from_poi_to_this_person', 'from_this_person_to_poi',
              'from_messages', 'to_messages', 'shared_receipt_with_poi']
all_nans = enron[email_cols].isnull().all(axis=1)
enron.loc[all_nans, email_cols] = 0

# Engineered ratios. Assign the filled result back instead of calling
# fillna(inplace=True) on a column selection: the chained in-place form does
# not update the frame under pandas Copy-on-Write.
enron['ratio_from_poi'] = (
    enron['from_poi_to_this_person'] / enron['to_messages']).fillna(0)
enron['ratio_to_poi'] = (
    enron['from_this_person_to_poi'] / enron['from_messages']).fillna(0)

print(enron.columns)

# Columns handed to the imputer, in the order used for the imputed frame.
# This is a flat list: a nested list ([[...]]) makes recent pandas versions
# build a MultiIndex, which breaks the column lookups further down.
col = ['exercised_stock_options', 'expenses', 'from_messages',
       'from_poi_to_this_person', 'from_this_person_to_poi',
       'other', 'poi', 'restricted_stock',
       'salary', 'shared_receipt_with_poi', 'to_messages',
       'total_payments', 'total_stock_value',
       'ratio_from_poi', 'ratio_to_poi']

enron_copy = enron[col].copy()

# Fill the remaining NaNs with a 2-nearest-neighbour imputer.
# NOTE(review): the 'poi' label is part of the matrix given to the imputer,
# so it takes part in the imputation of the features -- confirm this is
# intended (possible label leakage).
# NOTE(review): newer fancyimpute releases may expose fit_transform() instead
# of complete() -- confirm against the installed version.
enron = pd.DataFrame(KNN(k=2).complete(enron_copy),
                     columns=col, index=enron_copy.index)

# 'poi' (the label) comes first, followed by the features. The two ratio
# columns are not included in the list.
featurelist = ['poi', 'exercised_stock_options', 'expenses', 'from_messages',
               'from_poi_to_this_person', 'from_this_person_to_poi', 'other',
               'restricted_stock', 'salary', 'shared_receipt_with_poi',
               'to_messages', 'total_payments', 'total_stock_value']

# Back to a {name: {feature: value}} dict, the format passed to featureFormat.
enronml = enron[featurelist].copy().to_dict(orient="index")
dataset = enronml

data = featureFormat(dataset, featurelist, sort_keys=True)
labels, features = targetFeatureSplit(data)





Index([u'exercised_stock_options', u'expenses', u'from_messages',
       u'from_poi_to_this_person', u'from_this_person_to_poi', u'other',
       u'poi', u'restricted_stock', u'salary', u'shared_receipt_with_poi',
       u'to_messages', u'total_payments', u'total_stock_value',
       u'ratio_from_poi', u'ratio_to_poi'],
      dtype='object')
Imputing row 1/144 with 1 missing, elapsed time: 0.014
Imputing row 101/144 with 2 missing, elapsed time: 0.182


In [2]:
from sklearn.naive_bayes import GaussianNB

# Pipeline: min-max scaling -> SelectKBest -> PCA -> Gaussian naive Bayes.
pca = PCA()
gnbc = GaussianNB()
steps = [('scaler', MinMaxScaler()),
         ('best', SelectKBest()),
         ('pca', pca),
         ('gnbc', gnbc)]

pipeline = Pipeline(steps)

# One sub-grid per k in 3..12; for each k, n_components runs over 1..k-1,
# i.e. always strictly below the number of features SelectKBest keeps.
parameters = [
    {'best__k': [k], 'pca__n_components': list(range(1, k))}
    for k in range(3, 13)
]

cv = StratifiedShuffleSplit(test_size=0.2, random_state=42)
gnbforc = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring="f1")
gnbforc.fit(features, labels)

means = gnbforc.cv_results_['mean_test_score']
stds = gnbforc.cv_results_['std_test_score']

# Mean F1 and twice its standard deviation for every parameter combination.
for mean, std, params in zip(means, stds, gnbforc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Single-argument print(...) calls produce the same output on Python 2 and
# are valid on Python 3, unlike the bare print statement.
print("\n")
print(gnbforc.best_estimator_)
print("\n")
print(gnbforc.best_score_)
print("\n")
print(gnbforc.best_params_)

feature_step = gnbforc.best_estimator_.named_steps['best']

# SelectKBest scores formatted to 2 decimals and p-values to 3 decimals.
feature_scores = ['%.2f' % elem for elem in feature_step.scores_]
feature_scores_pvalues = ['%.3f' % elem for elem in feature_step.pvalues_]

# get_support(indices=True) yields positions in the feature matrix; the
# +1 skips 'poi', which sits at position 0 of featurelist.
features_selected_tuple = [
    (featurelist[i + 1], feature_scores[i], feature_scores_pvalues[i])
    for i in feature_step.get_support(indices=True)
]

# Highest score first.
features_selected_tuple = sorted(features_selected_tuple,
                                 key=lambda feature: float(feature[1]),
                                 reverse=True)

print(' ')
print('Selected Features, Scores, P-Values')
print(features_selected_tuple)


  'precision', 'predicted', average, warn_for)


0.240 (+/-0.435) for {'best__k': 3, 'pca__n_components': 1}
0.248 (+/-0.466) for {'best__k': 3, 'pca__n_components': 2}
0.195 (+/-0.437) for {'best__k': 4, 'pca__n_components': 1}
0.251 (+/-0.495) for {'best__k': 4, 'pca__n_components': 2}
0.244 (+/-0.493) for {'best__k': 4, 'pca__n_components': 3}
0.215 (+/-0.358) for {'best__k': 5, 'pca__n_components': 1}
0.251 (+/-0.495) for {'best__k': 5, 'pca__n_components': 2}
0.234 (+/-0.463) for {'best__k': 5, 'pca__n_components': 3}
0.250 (+/-0.486) for {'best__k': 5, 'pca__n_components': 4}
0.282 (+/-0.512) for {'best__k': 6, 'pca__n_components': 1}
0.305 (+/-0.471) for {'best__k': 6, 'pca__n_components': 2}
0.306 (+/-0.496) for {'best__k': 6, 'pca__n_components': 3}
0.277 (+/-0.446) for {'best__k': 6, 'pca__n_components': 4}
0.291 (+/-0.450) for {'best__k': 6, 'pca__n_components': 5}
0.180 (+/-0.470) for {'best__k': 7, 'pca__n_components': 1}
0.305 (+/-0.471) for {'best__k': 7, 'pca__n_components': 2}
0.282 (+/-0.443) for {'best__k': 7, 'pca

In [3]:
# Pipeline: min-max scaling -> SelectKBest -> PCA -> k-nearest neighbours.
pca = PCA()
steps = [('scaler', MinMaxScaler()),
         ('best', SelectKBest()),
         ('pca', pca),
         ('knn', KNeighborsClassifier())]

pipeline = Pipeline(steps)

# One sub-grid per k in 3..12; for each k, n_components runs over 1..k-1,
# i.e. always strictly below the number of features SelectKBest keeps.
# Every sub-grid tries 1..10 neighbours.
parameters = [
    {'best__k': [k],
     'pca__n_components': list(range(1, k)),
     'knn__n_neighbors': list(range(1, 11))}
    for k in range(3, 13)
]

cv = StratifiedShuffleSplit(test_size=0.2, random_state=42)
knnforc = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring="f1")
knnforc.fit(features, labels)

means = knnforc.cv_results_['mean_test_score']
stds = knnforc.cv_results_['std_test_score']

# Mean F1 and twice its standard deviation for every parameter combination.
for mean, std, params in zip(means, stds, knnforc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Single-argument print(...) calls produce the same output on Python 2 and
# are valid on Python 3, unlike the bare print statement.
print("\n")
print(knnforc.best_estimator_)
print("\n")
print(knnforc.best_score_)
print("\n")
print(knnforc.best_params_)

feature_step = knnforc.best_estimator_.named_steps['best']

# SelectKBest scores formatted to 2 decimals and p-values to 3 decimals.
feature_scores = ['%.2f' % elem for elem in feature_step.scores_]
feature_scores_pvalues = ['%.3f' % elem for elem in feature_step.pvalues_]

# get_support(indices=True) yields positions in the feature matrix; the
# +1 skips 'poi', which sits at position 0 of featurelist.
features_selected_tuple = [
    (featurelist[i + 1], feature_scores[i], feature_scores_pvalues[i])
    for i in feature_step.get_support(indices=True)
]

# Highest score first.
features_selected_tuple = sorted(features_selected_tuple,
                                 key=lambda feature: float(feature[1]),
                                 reverse=True)

print(' ')
print('Selected Features, Scores, P-Values')
print(features_selected_tuple)

0.310 (+/-0.451) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 1}
0.120 (+/-0.298) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 1}
0.120 (+/-0.367) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 2}
0.040 (+/-0.240) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 2}
0.187 (+/-0.480) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 3}
0.164 (+/-0.447) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 3}
0.080 (+/-0.320) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 4}
0.040 (+/-0.240) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 4}
0.073 (+/-0.295) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 5}
0.067 (+/-0.400) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 5}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 1, 'knn__n_neighbors': 6}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 2, 'knn__n_neighbors': 6}
0.00

In [4]:
from sklearn.linear_model import LogisticRegression

# Pipeline: min-max scaling -> SelectKBest -> PCA -> logistic regression.
pca = PCA()

steps = [('scaler', MinMaxScaler()),
         ('best', SelectKBest()),
         ('pca', pca),
         ('logreg', LogisticRegression())]

pipeline = Pipeline(steps)

# Inverse regularisation strengths tried in every sub-grid.
# NOTE(review): the sequence jumps from 0.01 to 1 (no 0.1) -- confirm this
# gap is intentional.
logreg_c_values = [0.0001, 0.001, 0.01, 1, 10, 100]

# One sub-grid per k in 3..12; for each k, n_components runs over 1..k-1,
# i.e. always strictly below the number of features SelectKBest keeps.
parameters = [
    {'best__k': [k],
     'pca__n_components': list(range(1, k)),
     'logreg__C': list(logreg_c_values)}
    for k in range(3, 13)
]

cv = StratifiedShuffleSplit(test_size=0.2, random_state=42)
logregforc = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring="f1")
logregforc.fit(features, labels)

means = logregforc.cv_results_['mean_test_score']
stds = logregforc.cv_results_['std_test_score']

# Mean F1 and twice its standard deviation for every parameter combination.
for mean, std, params in zip(means, stds, logregforc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Single-argument print(...) calls produce the same output on Python 2 and
# are valid on Python 3, unlike the bare print statement.
print("\n")
print(logregforc.best_estimator_)
print("\n")
print(logregforc.best_score_)
print("\n")
print(logregforc.best_params_)

feature_step = logregforc.best_estimator_.named_steps['best']

# SelectKBest scores formatted to 2 decimals and p-values to 3 decimals.
feature_scores = ['%.2f' % elem for elem in feature_step.scores_]
feature_scores_pvalues = ['%.3f' % elem for elem in feature_step.pvalues_]

# get_support(indices=True) yields positions in the feature matrix; the
# +1 skips 'poi', which sits at position 0 of featurelist.
features_selected_tuple = [
    (featurelist[i + 1], feature_scores[i], feature_scores_pvalues[i])
    for i in feature_step.get_support(indices=True)
]

# Highest score first.
features_selected_tuple = sorted(features_selected_tuple,
                                 key=lambda feature: float(feature[1]),
                                 reverse=True)

print(' ')
print('Selected Features, Scores, P-Values')
print(features_selected_tuple)

0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 0.0001}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 0.0001}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 0.001}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 0.001}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 0.01}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 0.01}
0.000 (+/-0.000) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 1}
0.040 (+/-0.240) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 1}
0.160 (+/-0.392) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 10}
0.120 (+/-0.367) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 10}
0.233 (+/-0.383) for {'best__k': 3, 'pca__n_components': 1, 'logreg__C': 100}
0.260 (+/-0.456) for {'best__k': 3, 'pca__n_components': 2, 'logreg__C': 100}
0.000 (+/-0.000) for {'best__k': 4, 'pca__n_components': 1

In [9]:
# Hand the best pipeline found by the GaussianNB grid search, together with
# the prepared dataset and feature list, to the project's dump helper.
best_gnb_pipeline = gnbforc.best_estimator_
dump_classifier_and_data(best_gnb_pipeline, dataset, featurelist)