# Importing libraries

In [229]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import matplotlib.pyplot as plt
%matplotlib inline
import scipy

import missingno as msno
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Features by data type

In [165]:
### features_list selects which features to include.
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                ]

# Identifying columns with financial values
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees'
                     ]

# Identfying columns with numerical values
features_with_count = ['to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                      ]

In [166]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# Removing the 'TOTAL' value in data_dict because it is a column sum of salaries and doesn't belong to any single employee.
del data_dict['TOTAL']

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# EDA


In [167]:
# Transforming features into a df so that I won't have to remember to transform both features train and test.
df_features = pd.DataFrame(features)
print "Any null-values present in the features chosen? \n", df_features.isnull().any()
df_features.head()

Any null-values present in the features chosen? 
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
dtype: bool


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,201955.0,2869717.0,4484442.0,0.0,4175000.0,-126027.0,-3081055.0,1729541.0,13868.0,1729541.0,152.0,304805.0,126027.0,0.0,2902.0,47.0,2195.0,65.0,1407.0
1,0.0,178980.0,182466.0,0.0,0.0,0.0,0.0,257817.0,3486.0,257817.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,477.0,0.0,916197.0,0.0,0.0,-560222.0,-5104.0,5243487.0,56301.0,4046157.0,864523.0,0.0,1757552.0,0.0,566.0,39.0,29.0,0.0,465.0
3,267102.0,1295738.0,5634343.0,0.0,1200000.0,0.0,-1386055.0,10623258.0,11200.0,6680544.0,2660303.0,1586055.0,3942714.0,0.0,0.0,0.0,0.0,0.0,0.0
4,239671.0,260455.0,827696.0,0.0,400000.0,-82782.0,-201641.0,63014.0,129142.0,0.0,69.0,0.0,145796.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
# Splitting dataset into train and test for features and labels
features_train, features_test, labels_train, labels_test = \
    train_test_split(df_features, labels, test_size=0.3, random_state=42)

# Feature importances using Decision Tree

In [169]:
################
# DecisionTree #
################

# Fitting the model
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)

# Feature Importances to identify which features have a high variance to be included in final model and which to exclude.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print "Feature Ranking: "
for i in range(len(importances)):
    print "  {}.  feature: {} ({})".format(i+1, features_list[i+1], importances[indices[i]])

Feature Ranking: 
  1.  feature: salary (0.220426513942)
  2.  feature: deferral_payments (0.21197488041)
  3.  feature: total_payments (0.132625994695)
  4.  feature: loan_advances (0.106100795756)
  5.  feature: bonus (0.105863661155)
  6.  feature: restricted_stock_deferred (0.0757862826828)
  7.  feature: deferred_income (0.0736811081639)
  8.  feature: total_stock_value (0.0620731020005)
  9.  feature: expenses (0.0114676611954)
  10.  feature: exercised_stock_options (0.0)
  11.  feature: other (0.0)
  12.  feature: long_term_incentive (0.0)
  13.  feature: restricted_stock (0.0)
  14.  feature: director_fees (0.0)
  15.  feature: to_messages (0.0)
  16.  feature: from_poi_to_this_person (0.0)
  17.  feature: from_messages (0.0)
  18.  feature: from_this_person_to_poi (0.0)
  19.  feature: shared_receipt_with_poi (0.0)


# Feature engineering

In [242]:
# New feature is bonus/salary and null-values replaced by 0
features_train['bonus_salary_ratio'] = features_train.loc[:, 5] / features_train.loc[:, 1]
features_train['bonus_salary_ratio'] = np.nan_to_num(features_train['bonus_salary_ratio'])

# Repeat same feature engineering for test data
features_test['bonus_salary_ratio'] = features_test.loc[:, 5] / features_test.loc[:, 1]
features_test['bonus_salary_ratio'] = np.nan_to_num(features_test['bonus_salary_ratio'])

features_train.head()

# New feature proved to make my models perform worse. See explanation at the Q&A section at the end

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,bonus_salary_ratio
28,314288.0,0.0,1101393.0,0.0,800000.0,0.0,-41250.0,495633.0,27861.0,117551.0,494.0,0.0,378082.0,0.0,102.0,0.0,33.0,4.0,71.0,0.0
40,182245.0,0.0,2692324.0,0.0,200000.0,0.0,0.0,1008941.0,21530.0,601438.0,53775.0,2234774.0,407503.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,758931.0,0.0,664375.0,0.0,0.0,94556.0,0.0,1433.0,25.0,215.0,2.0,508.0,0.0
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6077885.0,0.0,5127155.0,0.0,0.0,950730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,-472568.0,0.0,189518.0,0.0,0.0,0.0,0.0,662086.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.797693e+308


## Refining feature_list based on feature importances

In [237]:
feature_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 
                'loan_advances', 'bonus', 'restricted_stock_deferred',
                'deferred_income', 'total_stock_value', 'expenses', 'bonus_salary_ratio'
               ]

In [243]:
######################
# KNearest Neighbour #
######################

clf = KNeighborsClassifier(p=1, weights='distance', leaf_size=19, algorithm='ball_tree', n_neighbors=2)

# clf_tree_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=70, n_iter=100, scoring='recall', n_jobs=-1, verbose=2)
# clf_tree_rscv.fit(features_train, labels_train)
# print clf_tree_rscv.best_params_
# print clf_tree_rscv.best_score_

clf.fit(features_train_scaled, labels_train)
pred = clf.predict(features_test_scaled)

# Classifier scores
precision_score_knn = precision_score(pred, labels_test)
recall_score_knn = recall_score(pred, labels_test)
accuracy_score_knn = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_tree
print "Precision score: ", precision_score_tree
print "Recall score: ", recall_score_tree

# {'p': 1, 'weights': 'distance', 'leaf_size': 19, 'algorithm': 'ball_tree', 'n_neighbors': 2}
# 0.09

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Accuracy score:  0.613636363636
Precision score:  0.6
Recall score:  0.166666666667


In [228]:
#################
# Decision Tree #
#################

clf = tree.DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=17, max_features=18, class_weight='balanced', splitter='random', min_samples_leaf=14, min_samples_split=63)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)


# clf_tree_rscv = RandomizedSearchCV(clf, param_distributions=parameters, cv=70, n_iter=80, scoring='precision', n_jobs=-1, verbose=2)
# clf_tree_rscv.fit(features_train, labels_train)

# print clf_tree_rscv.best_params_
# print clf_tree_rscv.best_score_

# {'splitter': 'random', 'min_samples_leaf': 14, 'max_features': 18, 'criterion': 'gini', 'min_samples_split': 49, 'max_depth': 17, 'class_weight': 'balanced'}
# 0.25

# {'min_samples_split': 63}
# 0.25

# Classifier scores
precision_score_tree = precision_score(pred, labels_test)
recall_score_tree = recall_score(pred, labels_test)
accuracy_score_tree = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_tree
print "Precision score: ", precision_score_tree
print "Recall score: ", recall_score_tree

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Accuracy score:  0.613636363636
Precision score:  0.6
Recall score:  0.166666666667


In [244]:
###############
# Naive Bayes #
###############

clf = GaussianNB()
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

# Classifier scores
precision_score_gnb = precision_score(pred, labels_test)
recall_score_gnb = recall_score(pred, labels_test)
accuracy_score_gnb = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_gnb
print "Precision score: ", precision_score_gnb
print "Recall score: ", recall_score_gnb

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Accuracy score:  0.886363636364
Precision score:  0.0
Recall score:  0.0


In [246]:
clf_gbrt = GradientBoostingClassifier(random_state=42, 
                                      learning_rate=0.042548288288288286,
                                      min_samples_split=10,
                                     min_samples_leaf=1)
parameters = {'learning_rate':np.linspace(0.1, 0.00001, num=1000),
              'n_estimators':np.arange(1, 100, 1),
              'max_depth':np.arange(1, 13, 1),
              'max_features':np.arange(1, 20, 1),
              'min_samples_split':np.arange(2, 20, 1),
              'min_samples_leaf':np.arange(1, 20, 1)
             }
clf_gbrt_rscv = RandomizedSearchCV(clf_gbrt, param_distributions=parameters, cv=70, n_iter=100, scoring='precision', n_jobs=-1, verbose=2)
clf_gbrt_rscv.fit(features_train_scaled, labels_train)

print clf_gbrt_rscv.best_params_
print clf_gbrt_rscv.best_score_

# {'learning_rate': 0.042548288288288286, 'min_samples_leaf': 1, 'n_estimators': 99, 'max_features': 3, 'min_samples_split': 10, 'max_depth': 1}
# 0.06

Fitting 70 folds for each of 100 candidates, totalling 7000 fits
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s


  'precision', 'predicted', average, warn_for)


[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 


  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.2s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_sa

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s


[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9 
[CV]  learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_samples_split=19, max_features=8, max_depth=9, total=   0.1s
[CV] learning_rate=0.027835045045, min_samples_leaf=1, n_estimators=83, min_sam

  'precision', 'predicted', average, warn_for)


[CV]  learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3, total=   0.1s
[CV] learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3 
[CV]  learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3, total=   0.1s
[CV] learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3 
[CV]  learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3, total=   0.1s
[CV] learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3 
[CV] learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8, max_features=8, max_depth=3 
[CV]  learning_rate=0.0967971171171, min_samples_leaf=4, n_estimators=83, min_samples_split=8,

[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:    4.1s


[CV]  learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4, total=   0.1s
[CV] learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4 
[CV]  learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4, total=   0.1s
[CV] learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4 
[CV]  learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4, total=   0.1s
[CV] learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4 
[CV]  learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_samples_split=3, max_features=2, max_depth=4, total=   0.1s
[CV]  learning_rate=0.0254328828829, min_samples_leaf=8, n_estimators=55, min_s

[Parallel(n_jobs=-1)]: Done 872 tasks      | elapsed:    9.7s


[CV] learning_rate=0.00011009009009, min_samples_leaf=5, n_estimators=64, min_samples_split=6, max_features=3, max_depth=1 
[CV]  learning_rate=0.00011009009009, min_samples_leaf=5, n_estimators=64, min_samples_split=6, max_features=3, max_depth=1, total=   0.0s
[CV] learning_rate=0.0946952252252, min_samples_leaf=14, n_estimators=87, min_samples_split=12, max_features=14, max_depth=4 
[CV]  learning_rate=0.00011009009009, min_samples_leaf=5, n_estimators=64, min_samples_split=6, max_features=3, max_depth=1, total=   0.0s
[CV] learning_rate=0.00011009009009, min_samples_leaf=5, n_estimators=64, min_samples_split=6, max_features=3, max_depth=1 
[CV]  learning_rate=0.00011009009009, min_samples_leaf=5, n_estimators=64, min_samples_split=6, max_features=3, max_depth=1, total=   0.0s
[CV]  learning_rate=0.0946952252252, min_samples_leaf=14, n_estimators=87, min_samples_split=12, max_features=14, max_depth=4, total=   0.1s
[CV] learning_rate=0.0946952252252, min_samples_leaf=14, n_estimator

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    169     pkg_name = mod_name.rpartition('.')[0]
    170     main_globals = sys.modules["__main__"].__dict__
    171     if alter_argv:
    172         sys.argv[0] = fname
    173     return _run_code(code, main_globals, None,
--> 174                      "__main__", fname, loader, pkg_name)
        fname = '/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = ''
    175 
    176 def run_module(mod_name, init_globals=None,
    177                run_name=None, alter_sys=False):
    178     """Execute a module's code without importing it

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x1099b5ab0, file "/Use...2.7/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': '', 'app': <module 'ipykernel.kernelapp' from '/Users/ammar...python2.7/site-packages/ipykernel/kernelapp.pyc'>, 'sys': <module 'sys' (built-in)>}, init_globals=None, mod_name='__main__', mod_fname='/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x1099b5ab0, file "/Use...2.7/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': '', 'app': <module 'ipykernel.kernelapp' from '/Users/ammar...python2.7/site-packages/ipykernel/kernelapp.pyc'>, 'sys': <module 'sys' (built-in)>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'date': datetime.datetime(2017, 12, 1, 15, 50, 27, 147670, tzinfo=tzutc()), u'msg_id': u'9e93572dde6d3247615e714cc833fce5', u'msg_type': u'execute_request', u'session': u'a0f7cb6fa624335755a81f1af21991d7', u'username': u'', u'version': u'5.2'}, 'metadata': {}, 'msg_id': u'9e93572dde6d3247615e714cc833fce5', 'msg_type': u'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['a0f7cb6fa624335755a81f1af21991d7']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'date': datetime.datetime(2017, 12, 1, 15, 50, 27, 147670, tzinfo=tzutc()), u'msg_id': u'9e93572dde6d3247615e714cc833fce5', u'msg_type': u'execute_request', u'session': u'a0f7cb6fa624335755a81f1af21991d7', u'username': u'', u'version': u'5.2'}, 'metadata': {}, 'msg_id': u'9e93572dde6d3247615e714cc833fce5', 'msg_type': u'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['a0f7cb6fa624335755a81f1af21991d7'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {u'date': datetime.datetime(2017, 12, 1, 15, 50, 27, 147670, tzinfo=tzutc()), u'msg_id': u'9e93572dde6d3247615e714cc833fce5', u'msg_type': u'execute_request', u'session': u'a0f7cb6fa624335755a81f1af21991d7', u'username': u'', u'version': u'5.2'}, 'metadata': {}, 'msg_id': u'9e93572dde6d3247615e714cc833fce5', 'msg_type': u'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=(u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06",), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = (u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06",)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u"clf_gbrt = GradientBoostingClassifier(random_s...'min_samples_split': 10, 'max_depth': 1}\n# 0.06", store_history=True, silent=False, shell_futures=True)
   2713                 self.displayhook.exec_result = result
   2714 
   2715                 # Execute the user code
   2716                 interactivity = "none" if silent else self.ast_node_interactivity
   2717                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2718                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2719                 
   2720                 self.last_execution_succeeded = not has_raised
   2721 
   2722                 # Reset this so later displayed values do not modify the

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Print object>, <_ast.Print object>], cell_name='<ipython-input-246-087e7a98c8fa>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<ExecutionResult object at 1a11c9b850, execution..._before_exec=None error_in_exec=None result=None>)
   2817 
   2818         try:
   2819             for i, node in enumerate(to_run_exec):
   2820                 mod = ast.Module([node])
   2821                 code = compiler(mod, cell_name, "exec")
-> 2822                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x1a11fb5630, file "<ipython-input-246-087e7a98c8fa>", line 13>
        result = <ExecutionResult object at 1a11c9b850, execution..._before_exec=None error_in_exec=None result=None>
   2823                     return True
   2824 
   2825             for i, node in enumerate(to_run_interactive):
   2826                 mod = ast.Interactive([node])

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x1a11fb5630, file "<ipython-input-246-087e7a98c8fa>", line 13>, result=<ExecutionResult object at 1a11c9b850, execution..._before_exec=None error_in_exec=None result=None>)
   2877         outflag = 1  # happens in more places, so it's easier as default
   2878         try:
   2879             try:
   2880                 self.hooks.pre_run_code_hook()
   2881                 #rprint('Running code', repr(code_obj)) # dbg
-> 2882                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x1a11fb5630, file "<ipython-input-246-087e7a98c8fa>", line 13>
        self.user_global_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'import sys\nimport pickle\nsys.path.append(".....election import RandomizedSearchCV, GridSearchCV', u"### features_list selects which features to in...hared_receipt_with_poi'\n                      ]", u'### Load the dictionary containing the dataset...ue)\nlabels, features = targetFeatureSplit(data)', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'################\n# DecisionTree #\n##########...+1, features_list[i+1], importances[indices[i]])', u'################\n# DecisionTree #\n##########...+1, features_list[i+1], importances[indices[i]])', ...], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'MinMaxScaler': <class 'sklearn.preprocessing.data.MinMaxScaler'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'Out': {8:      salary  total_stock_value  deferral_payment...                   0.0                      0.0  , 11:      salary  deferral_payments  total_payments  ...                   0.0                      0.0  , 15:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 55:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 65:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 73: <function show>, 76: <function show>, 78: <function show>, 79: <function show>, 80: <function show>, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, ...}
        self.user_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'import sys\nimport pickle\nsys.path.append(".....election import RandomizedSearchCV, GridSearchCV', u"### features_list selects which features to in...hared_receipt_with_poi'\n                      ]", u'### Load the dictionary containing the dataset...ue)\nlabels, features = targetFeatureSplit(data)', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Transforming features into a df so that I wo..., df_features.isnull().any()\ndf_features.head()', u'# Splitting dataset into train and test for fe...eatures, labels, test_size=0.3, random_state=42)', u'################\n# DecisionTree #\n##########..._tree\nprint "Recall score: ", recall_score_tree', u'################\n# DecisionTree #\n##########...+1, features_list[i+1], importances[indices[i]])', u'################\n# DecisionTree #\n##########...+1, features_list[i+1], importances[indices[i]])', ...], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'MinMaxScaler': <class 'sklearn.preprocessing.data.MinMaxScaler'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'Out': {8:      salary  total_stock_value  deferral_payment...                   0.0                      0.0  , 11:      salary  deferral_payments  total_payments  ...                   0.0                      0.0  , 15:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 55:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 65:          0          1          2    3          4...    0.0  
4     0.0   0.0     0.0   0.0     0.0  , 73: <function show>, 76: <function show>, 78: <function show>, 79: <function show>, 80: <function show>, ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, ...}
   2883             finally:
   2884                 # Reset our crash handler in place
   2885                 sys.excepthook = old_excepthook
   2886         except SystemExit as e:

...........................................................................
/Users/ammarjawaddoosh/ud120-projects/final_project/<ipython-input-246-087e7a98c8fa> in <module>()
      8               'max_features':np.arange(1, 20, 1),
      9               'min_samples_split':np.arange(2, 20, 1),
     10               'min_samples_leaf':np.arange(1, 20, 1)
     11              }
     12 clf_gbrt_rscv = RandomizedSearchCV(clf_gbrt, param_distributions=parameters, cv=70, n_iter=100, scoring='precision', n_jobs=-1, verbose=2)
---> 13 clf_gbrt_rscv.fit(features_train_scaled, labels_train)
     14 
     15 print clf_gbrt_rscv.best_params_
     16 print clf_gbrt_rscv.best_score_
     17 

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py in fit(self=RandomizedSearchCV(cv=70, error_score='raise',
 ...ain_score='warn', scoring='precision', verbose=2), X=array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), y=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...d(n_splits=70, random_state=None, shuffle=False)>
        X = array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]])
        y = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...]
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Fri Dec  1 15:50:44 2017
PID: 14851       Python 2.7.14: /Users/ammarjawaddoosh/anaconda2/bin/python
...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], {'score': make_scorer(precision_score)}, array([ 2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), array([0, 1, 9]), 2, {'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
        self.items = [(<function _fit_and_score>, (GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], {'score': make_scorer(precision_score)}, array([ 2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), array([0, 1, 9]), 2, {'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}), (<function _fit_and_score>, (GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], {'score': make_scorer(precision_score)}, array([ 0,  1,  4,  5,  6,  7,  8,  9, 10, 11, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), array([ 2,  3, 12]), 2, {'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}), (<function _fit_and_score>, (GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], {'score': make_scorer(precision_score)}, array([ 0,  1,  2,  3,  6,  7,  8,  9, 10, 11, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), array([ 4,  5, 18]), 2, {'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}), (<function _fit_and_score>, (GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], {'score': make_scorer(precision_score)}, array([ 0,  1,  2,  3,  4,  5,  8,  9, 10, 11, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), array([ 6,  7, 19]), 2, {'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), X=array([[  1.56980101e-02,   1.06353341e-02,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]]), y=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...], scorer={'score': make_scorer(precision_score)}, train=array([ 2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 1... 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]), test=array([0, 1, 9]), verbose=2, parameters={'learning_rate': 0.076478828828828838, 'max_depth': 2, 'max_features': 19, 'min_samples_leaf': 13, 'min_samples_split': 4, 'n_estimators': 69}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    453 
    454     try:
    455         if y_train is None:
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method GradientBoostingClassifier.fit of ...=1.0, verbose=0,
              warm_start=False)>
        X_train = array([[  1.56980101e-02,   0.00000000e+00,   0....000000e+00,   0.00000000e+00,   0.00000000e+00]])
        y_train = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...]
        fit_params = {}
    459 
    460     except Exception as e:
    461         # Note fit time as time until error
    462         fit_time = time.time() - start_time

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.py in fit(self=GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), X=array([[  1.56980101e-02,   0.00000000e+00,   0.....00000000e+00,   0.00000000e+00]], dtype=float32), y=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0]), sample_weight=array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  ...     1.,  1.,  1.,  1.,  1.,  1.], dtype=float32), monitor=None)
   1029                 X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
   1030                                                  dtype=np.int32)
   1031 
   1032         # fit the boosting stages
   1033         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-> 1034                                     begin_at_stage, monitor, X_idx_sorted)
        begin_at_stage = 0
        monitor = None
        X_idx_sorted = array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32)
   1035         # change shape of arrays after fit (early-stopping or additional ests)
   1036         if n_stages != self.estimators_.shape[0]:
   1037             self.estimators_ = self.estimators_[:n_stages]
   1038             self.train_score_ = self.train_score_[:n_stages]

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stages(self=GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), X=array([[  1.56980101e-02,   0.00000000e+00,   0.....00000000e+00,   0.00000000e+00]], dtype=float32), y=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0]), y_pred=array([[-1.95774461],
       [-1.95774461],
    ...61],
       [-1.95774461],
       [-1.95774461]]), sample_weight=array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  ...     1.,  1.,  1.,  1.,  1.,  1.], dtype=float32), random_state=<mtrand.RandomState object>, begin_at_stage=0, monitor=None, X_idx_sorted=array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32))
   1084                                       sample_weight[~sample_mask])
   1085 
   1086             # fit next stage of trees
   1087             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
   1088                                      sample_mask, random_state, X_idx_sorted,
-> 1089                                      X_csc, X_csr)
        X_csc = None
        X_csr = None
   1090 
   1091             # track deviance (= loss)
   1092             if do_oob:
   1093                 self.train_score_[i] = loss_(y[sample_mask],

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.py in _fit_stage(self=GradientBoostingClassifier(criterion='friedman_m...e=1.0, verbose=0,
              warm_start=False), i=0, X=array([[  1.56980101e-02,   0.00000000e+00,   0.....00000000e+00,   0.00000000e+00]], dtype=float32), y=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0]), y_pred=array([[-1.95774461],
       [-1.95774461],
    ...61],
       [-1.95774461],
       [-1.95774461]]), sample_weight=array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  ...     1.,  1.,  1.,  1.,  1.,  1.], dtype=float32), sample_mask=array([ True,  True,  True,  True,  True,  True,...,  True,  True,  True,  True,  True], dtype=bool), random_state=<mtrand.RandomState object>, X_idx_sorted=array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32), X_csc=None, X_csr=None)
    783             if X_csc is not None:
    784                 tree.fit(X_csc, residual, sample_weight=sample_weight,
    785                          check_input=False, X_idx_sorted=X_idx_sorted)
    786             else:
    787                 tree.fit(X, residual, sample_weight=sample_weight,
--> 788                          check_input=False, X_idx_sorted=X_idx_sorted)
        X_idx_sorted = array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32)
    789 
    790             # update tree leaves
    791             if X_csr is not None:
    792                 loss.update_terminal_regions(tree.tree_, X_csr, y, residual, y_pred,

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeRegressor(criterion='friedman_mse', ...ect at 0x1a11d027d0>,
           splitter='best'), X=array([[  1.56980101e-02,   0.00000000e+00,   0.....00000000e+00,   0.00000000e+00]], dtype=float32), y=array([-0.12371134, -0.12371134, -0.12371134, -0...4, -0.12371134,
       -0.12371134, -0.12371134]), sample_weight=array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  ...     1.,  1.,  1.,  1.,  1.,  1.], dtype=float32), check_input=False, X_idx_sorted=array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32))
   1119 
   1120         super(DecisionTreeRegressor, self).fit(
   1121             X, y,
   1122             sample_weight=sample_weight,
   1123             check_input=check_input,
-> 1124             X_idx_sorted=X_idx_sorted)
        X_idx_sorted = array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32)
   1125         return self
   1126 
   1127 
   1128 class ExtraTreeClassifier(DecisionTreeClassifier):

...........................................................................
/Users/ammarjawaddoosh/anaconda2/lib/python2.7/site-packages/sklearn/tree/tree.py in fit(self=DecisionTreeRegressor(criterion='friedman_mse', ...ect at 0x1a11d027d0>,
           splitter='best'), X=array([[  1.56980101e-02,   0.00000000e+00,   0.....00000000e+00,   0.00000000e+00]], dtype=float32), y=array([[-0.12371134],
       [-0.12371134],
    ...34],
       [-0.12371134],
       [-0.12371134]]), sample_weight=array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  ...     1.,  1.,  1.,  1.,  1.,  1.], dtype=float32), check_input=False, X_idx_sorted=array([[40,  0,  0, ..., 96, 96, 96],
       [68...     [13, 60, 60, ..., 47, 49, 33]], dtype=int32))
    237         if not 0 <= self.min_weight_fraction_leaf <= 0.5:
    238             raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    239         if max_depth <= 0:
    240             raise ValueError("max_depth must be greater than zero. ")
    241         if not (0 < max_features <= self.n_features_):
--> 242             raise ValueError("max_features must be in (0, n_features]")
    243         if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
    244             raise ValueError("max_leaf_nodes must be integral number but was "
    245                              "%r" % max_leaf_nodes)
    246         if -1 < max_leaf_nodes < 2:

ValueError: max_features must be in (0, n_features]
___________________________________________________________________________

In [177]:
#######
# SVC #
#######

clf = SVC(random_state=42, kernel='sigmoid', C=57.9, gamma=0.059, class_weight='balanced')

# Scaling both train and test features skipping 'poi' since it's a bool and the rest are financial values
min_max_scaler = MinMaxScaler()
features_train_scaled = min_max_scaler.fit_transform(features_train.iloc[:, 1:])
features_test_scaled = min_max_scaler.fit_transform(features_test.iloc[:, 1:])


# params = {'C': scipy.stats.expon(scale=100), 
#          'gamma': scipy.stats.expon(scale=.1), 
#          'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
#          'class_weight': [None, 'balanced']
#         }
# clf_svc_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=70, n_iter=50, scoring='precision', verbose=2, n_jobs=-1)
# clf_svc_rscv.fit(features_train_scaled, labels_train)
clf.fit(features_train_scaled, labels_train)

# pred = clf.predict(features_test_scaled)

# Classifier scores
precision_score_svc = precision_score(pred, labels_test)
recall_score_svc = recall_score(pred, labels_test)
accuracy_score_svc = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_svc
print "Precision score: ", precision_score_svc
print "Recall score: ", recall_score_svc

# print clf_svc_rscv.best_params_
# print clf_svc_rscv.best_score_

# parameters that return the best recall score:
# {'kernel': 'sigmoid', 'C': 22.124794209078111, 'gamma': 0.38710403019966072, 'class_weight': 'balanced'}
# 0.33

# parameters that return the best precision score:
# {'kernel': 'sigmoid', 'C': 57.914218875973347, 'gamma': 0.059626099388516422, 'class_weight': 'balanced'}
# 0.235

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Accuracy score:  0.886363636364
Precision score:  0.4
Recall score:  0.5


In [161]:
#################
# Random Forest #
#################

# Fitting the model
clf = RandomForestClassifier(random_state=42, n_estimators=1, min_samples_leaf=2, min_samples_split=2, max_depth=31)

# RandomizedsearchCV to find optimal hyper params
#params = {
#          'n_estimators':np.arange(1, 5, 1),
#          'min_samples_leaf':np.arange(1, 5, 1),
#          'min_samples_split':np.arange(2, 20, 1),
#          'max_depth':np.arange(1, 40, 1),
#         }

#clf_rf_rscv = RandomizedSearchCV(clf, cv=70, n_iter=50, param_distributions=params, scoring='recall', verbose=2, n_jobs=-1)
#clf_rf_rscv.fit(features_train, labels_train)

clf.fit(features_train, labels_train)


# print clf_rf_rscv.best_params_
# print clf_rf_rscv.best_score_

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

# best results from randomizedsearchCV using "recall" scoring:
# {'n_estimators': 1, 'min_samples_split': 2, 'max_depth': 31, 'min_samples_leaf': 2}
# 0.21

{'n_estimators': 1, 'min_samples_split': 5, 'max_depth': 34, 'min_samples_leaf': 1}
0.21


# Q&A


1.Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

>Machine learning is powerful at predicting whether a certain outcome is likely to happen (classification) or continuous numbers (regression). In this example where we are asked to predict that a person is a POI (classification) and we have features such as salary, bonus, stock (financial) as well as how many emails they have sent/received (count) we can use such features to learn if these help us predict whether people are POIs or not. As for outliers, I have only removed one due to the scarcity of data to begin with. The one I have removed is the column sum in the PDF of Enron employees' salaries because that's not a feature of an employee.

2.What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

>As part of the EDA I looked at feature importances to understand which features were important and which weren't useful to include in my machine learning algorithms. I took all features which had an importance > 0, so that means the following 9 features: 'poi', 'salary', 'deferral_payments', 'total_payments','loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses'. As for the feature importances scores for the features I have used they are as follows:   
  1. feature: salary (0.220426513942)
  2. feature: deferral_payments (0.21197488041)
  3. feature: total_payments (0.132625994695)
  4. feature: loan_advances (0.106100795756)
  5. feature: bonus (0.105863661155)
  6. feature: restricted_stock_deferred (0.0757862826828)
  7. feature: deferred_income (0.0736811081639)
  8. feature: total_stock_value (0.0620731020005)
  9. feature: expenses (0.0114676611954)

> I have scaled the numerical features in my SVC model using MinMaxScaler to easier be able to compare the different financial measurements on a scale of 0 to 1.
> I have created my own feature bonus_salary_ratio which is bonus / salary based on the rationale that someone who has a high salary (top correlated feature with a POI) is likely also to have a high bonus however it made my models perform worse, especially GaussianNB due to the large amount of bonuses AND salaries which both were 0's so I was dividing 0 by 0, or some bonuses being negative leading to a -inf when divided by salary.

>As mentioned earlier I have decided to include any feature with a score > 0.

3.What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

> I ended it up using the GaussianNB because it returned a precision of .4 and recall of .5. which was the highest scores out of them all. I tried SVC and Random Forest, but SVC performed the worse and I suspect it's due to the scarcity of data. I fine-tuned Random Forest using RandomizedSearchCV but couldn't get a precision above .3 for both precision and recall.

4.What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

> Tuning the parameters of an algorithm is really customising the model to match your dataset. For some of the parameters such as min_samples_split there is a trade-off between performance and accuracy: a high sample_split means you are avoiding over-fitting whereas a too high value means you are underfitting. Likewise with the learning_rate, if you give it too high a value you risk it might miss the optimal point whereas if it's too low it will take too long to converge and reach the local minima. For Random Forests, I've tuned the main parameters using RandomizedSearchCV but the best model didn't make it to the .3 precisio/recall scores. Below were the params and their ranges which I used for RandomizedSearchCV for Random Forest:

>params = {
          'n_estimators':np.arange(1, 5, 1),
          'min_samples_leaf':np.arange(1, 5, 1),
          'min_samples_split':np.arange(2, 20, 1),
          'max_depth':np.arange(1, 40, 1),
         }
         
> and for SVC I have used the following params:

> params = {'C': scipy.stats.expon(scale=100), 
          'gamma': scipy.stats.expon(scale=.1), 
          'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
          'class_weight': [None, 'balanced']
         }
> if my GaussianNB classifier weren't the top performing model I would have fine-tuned my model using RandomizedSearchCV like I did for Random Forest and SVC to find the optimal parameters to use in my model.

5.What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

> Validation, also known as cross-validation. It is used to prevent over-fitting to the training data. If you don't use cross-validation you risk overfitting your model to the training dataset which means it won't be able to perform well on un-seen new data because it is unable to generalise well. CV works by splitting the training dataset into smaller sets which the model is evaluated on and for each fold it will then return the score average accuracy score from all of the folds. CV is especially important with imbalanced classes as it increases the probability that your CV folds are more representative of the data.

6.Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]
