# Importing libraries

In [1]:
import sys
import pickle
sys.path.append("../tools/")

import time
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import matplotlib.pyplot as plt
%matplotlib inline
import scipy

import missingno as msno
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV



# Features by data type

In [2]:
### features_list selects which features to include.
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                ]

# Identifying columns with financial values
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees'
                     ]

# Identfying columns with numerical values
features_with_count = ['to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                      ]

In [3]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [4]:
# Ensuring that all keys refer to Enron employees
for k, v in data_dict.iteritems():
    print k

METTS MARK
BAXTER JOHN C
ELLIOTT STEVEN
CORDES WILLIAM R
HANNON KEVIN P
MORDAUNT KRISTINA M
MEYER ROCKFORD G
MCMAHON JEFFREY
HORTON STANLEY C
PIPER GREGORY F
HUMPHREY GENE E
UMANOFF ADAM S
BLACHMAN JEREMY M
SUNDE MARTIN
GIBBS DANA R
LOWRY CHARLES P
COLWELL WESLEY
MULLER MARK S
JACKSON CHARLENE R
WESTFAHL RICHARD K
WALTERS GARETH W
WALLS JR ROBERT H
KITCHEN LOUISE
CHAN RONNIE
BELFER ROBERT
SHANKMAN JEFFREY A
WODRASKA JOHN
BERGSIEKER RICHARD P
URQUHART JOHN A
BIBI PHILIPPE A
RIEKER PAULA H
WHALEY DAVID A
BECK SALLY W
HAUG DAVID L
ECHOLS JOHN B
MENDELSOHN JOHN
HICKERSON GARY J
CLINE KENNETH W
LEWIS RICHARD
HAYES ROBERT E
MCCARTY DANNY J
KOPPER MICHAEL J
LEFF DANIEL P
LAVORATO JOHN J
BERBERIAN DAVID
DETMERING TIMOTHY J
WAKEHAM JOHN
POWERS WILLIAM
GOLD JOSEPH
BANNANTINE JAMES M
DUNCAN JOHN H
SHAPIRO RICHARD S
SHERRIFF JOHN R
SHELBY REX
LEMAISTRE CHARLES
DEFFNER JOSEPH M
KISHKILL JOSEPH G
WHALLEY LAWRENCE G
MCCONNELL MICHAEL S
PIRO JIM
DELAINEY DAVID W
SULLIVAN-SHAKLOVITZ COLLEEN
WROBEL BRUC

In [5]:
# Shape of dataset
print "Rows in the dataset:", len(data_dict)

Rows in the dataset: 146


In [6]:
# Removing the 'TOTAL' value in data_dict because it is a column sum of salaries 
# and doesn't belong to any single employee.
del data_dict['TOTAL']

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [7]:
# POIs and non-POIs in the dataset
poi_num = 0
non_poi_num = 0
for poi in labels:
    if poi == 1.0:
        poi_num += 1
    else:
        non_poi_num += 1

# Imbalanced classes of POIs - more Non-POIs than POIs.
print "POIs in the dataset: ", poi_num
print "Non-POIs in the dataset: ", non_poi_num

POIs in the dataset:  18
Non-POIs in the dataset:  126


# EDA


In [8]:
# Transforming features into a df so that I won't have to remember to transform
# both features train and test.
df_features = pd.DataFrame(features)
print "Any null-values present in the features chosen? \n", df_features.isnull().any()
df_features.head()

Any null-values present in the features chosen? 
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
dtype: bool


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,201955.0,2869717.0,4484442.0,0.0,4175000.0,-126027.0,-3081055.0,1729541.0,13868.0,1729541.0,152.0,304805.0,126027.0,0.0,2902.0,47.0,2195.0,65.0,1407.0
1,0.0,178980.0,182466.0,0.0,0.0,0.0,0.0,257817.0,3486.0,257817.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,477.0,0.0,916197.0,0.0,0.0,-560222.0,-5104.0,5243487.0,56301.0,4046157.0,864523.0,0.0,1757552.0,0.0,566.0,39.0,29.0,0.0,465.0
3,267102.0,1295738.0,5634343.0,0.0,1200000.0,0.0,-1386055.0,10623258.0,11200.0,6680544.0,2660303.0,1586055.0,3942714.0,0.0,0.0,0.0,0.0,0.0,0.0
4,239671.0,260455.0,827696.0,0.0,400000.0,-82782.0,-201641.0,63014.0,129142.0,0.0,69.0,0.0,145796.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Splitting dataset into train and test for features and labels
features_train, features_test, labels_train, labels_test = \
    train_test_split(df_features, labels, test_size=0.3, random_state=42)
    

# Feature importances using Decision Tree

In [10]:
#######################
# Feature Importances #
#######################

# Fitting the model
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)

# Feature Importances to identify which features have a high variance to be included
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print "Feature Ranking: "
for i in range(len(importances)):
    print "  {}.  feature: {} ({})".format(i+1, features_list[i+1], importances[indices[i]])

Feature Ranking: 
  1.  feature: salary (0.220426513942)
  2.  feature: deferral_payments (0.21197488041)
  3.  feature: total_payments (0.132625994695)
  4.  feature: loan_advances (0.106100795756)
  5.  feature: bonus (0.105863661155)
  6.  feature: restricted_stock_deferred (0.0757862826828)
  7.  feature: deferred_income (0.0736811081639)
  8.  feature: total_stock_value (0.0620731020005)
  9.  feature: expenses (0.0114676611954)
  10.  feature: exercised_stock_options (0.0)
  11.  feature: other (0.0)
  12.  feature: long_term_incentive (0.0)
  13.  feature: restricted_stock (0.0)
  14.  feature: director_fees (0.0)
  15.  feature: to_messages (0.0)
  16.  feature: from_poi_to_this_person (0.0)
  17.  feature: from_messages (0.0)
  18.  feature: from_this_person_to_poi (0.0)
  19.  feature: shared_receipt_with_poi (0.0)


In [11]:
# New feature is bonus/salary and null-values replaced by 0
features_train['bonus_salary_ratio'] = features_train.loc[:, 5] / features_train.loc[:, 1]
features_train['bonus_salary_ratio'] = np.nan_to_num(features_train['bonus_salary_ratio'])

# Repeat same feature engineering for test data
features_test['bonus_salary_ratio'] = features_test.loc[:, 5] / features_test.loc[:, 1]
features_test['bonus_salary_ratio'] = np.nan_to_num(features_test['bonus_salary_ratio'])

# Explore the new feature created
print features_train['bonus_salary_ratio'].head()
print "Are null-values present in new feature?", features_train['bonus_salary_ratio'].isnull().any().any()

28     0.000000e+00
40     0.000000e+00
86     0.000000e+00
24     0.000000e+00
25   -1.797693e+308
Name: bonus_salary_ratio, dtype: float64
Are null-values present in new feature? False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

# Feature engineering

In [12]:
# Feature scaling using MinMaxScaling since numerical values span positive and negative

min_max_scaler = MinMaxScaler()

# Reshaping feature because it's a 1D-array
bonus_salary_ratio_train_reshaped = features_train['bonus_salary_ratio'].values.reshape(-1, 1)
bonus_salary_ratio_test_reshaped = features_test['bonus_salary_ratio'].values.reshape(-1, 1)

# Scaling new feature due to presence of negative values
features_train['bonus_salary_ratio'] = min_max_scaler.fit_transform(bonus_salary_ratio_train_reshaped)
features_test['bonus_salary_ratio'] = min_max_scaler.fit_transform(bonus_salary_ratio_test_reshaped)

# Slicing all rows but ignoring first column since it's a bool ("poi")
features_train_scaled = min_max_scaler.fit_transform(features_train.iloc[:, 1:])
features_test_scaled = min_max_scaler.fit_transform(features_test.iloc[:, 1:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
  data_range = data_max - data_min
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [13]:
# Refining features_list to only include features with >0 importance 
#and to include the new feature in features_list

features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 
                'loan_advances', 'bonus', 'restricted_stock_deferred',
                'deferred_income', 'total_stock_value', 'expenses'
               ]
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)

# Feature Importances to see whether new feature created has any importance.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]


print "Feature Ranking: "
for i in range(len(importances)):
    print "  {}.  feature: {} ({})".format(i+1, features_list[i+1], importances[indices[i]])

Feature Ranking: 
  1.  feature: salary (0.23848965585)
  2.  feature: deferral_payments (0.220426513942)
  3.  feature: total_payments (0.136188597727)
  4.  feature: loan_advances (0.106100795756)
  5.  feature: bonus (0.0757862826828)
  6.  feature: restricted_stock_deferred (0.0757862826828)
  7.  feature: deferred_income (0.0736811081639)
  8.  feature: total_stock_value (0.0620731020005)
  9.  feature: expenses (0.0114676611954)


IndexError: list index out of range

In [14]:
# Setting up CV using stratifiedshufflesplit due to class imbalance of POI.
# cross_validator to be used as cv parameter for grid/randomizedsearchcv.
cv = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=42, )

In [18]:
############
# AdaBoost #
############

clf = AdaBoostClassifier(random_state=42, 
                         n_estimators=100, 
                         learning_rate=0.47499999999999998, 
                         algorithm='SAMME.R'
                        )

#params = {
#    'n_estimators':np.arange(1, 200, 1),
#    'learning_rate':np.linspace(1.0, 0.001, num=1000),
#    'algorithm':['SAMME', 'SAMME.R']
#}

#clf_ada_rscv = RandomizedSearchCV(clf, param_distributions=params, n_iter=10, cv=cv, scoring='recall', n_jobs=-1, verbose=2)
#clf_ada_rscv.fit(features_train_scaled, labels_train)

# Sleeping so that the final print in the console are the scores

#time.sleep(2)
#print clf_ada_rscv.best_params_
#print clf_ada_rscv.best_score_

clf.fit(features_train_scaled, labels_train)
pred = clf.predict(features_test_scaled)

# Classifier scores
precision_score_ada = precision_score(pred, labels_test)
recall_score_ada = recall_score(pred, labels_test)

print "Precision score: ", precision_score_ada
print "Recall score: ", recall_score_ada

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

# tester.py scores: 	Accuracy: 0.86587	Precision: 0.49559	Recall: 0.33700	F1: 0.40119	F2: 0.36004

 Precision score:  0.4
Recall score:  0.4


In [None]:
######################
# KNearest Neighbour #
######################

clf = KNeighborsClassifier()

params = {
    'p':[1, 2],
    'n_neighbors':np.arange(1, 50, 1),
    'weights':['uniform', 'distance'],
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size':np.arange(10, 50, 1),
}

clf_knn_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv, n_iter=30, scoring='recall', n_jobs=-1, verbose=2)
clf_knn_rscv.fit(features_train_scaled, labels_train)

# Sleeping so that the final print in the console are the scores
time.sleep(2)
print clf_knn_rscv.best_params_
print clf_knn_rscv.best_score_

# {'p': 1, 'weights': 'distance', 'leaf_size': 19, 'algorithm': 'ball_tree', 'n_neighbors': 2}
# 0.09

#clf.fit(features_train_scaled, labels_train)
#pred = clf.predict(features_test_scaled)

# Classifier scores
#precision_score_knn = precision_score(pred, labels_test)
#recall_score_knn = recall_score(pred, labels_test)
#accuracy_score_knn = accuracy_score(pred, labels_test)

#print "Accuracy score: ", accuracy_score_knn
#print "Precision score: ", precision_score_knn
#print "Recall score: ", recall_score_knn

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Fitting 500 folds for each of 30 candidates, totalling 15000 fits
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, alg

[Parallel(n_jobs=-1)]: Done  70 tasks      | elapsed:    0.6s


[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uni

[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    3.4s


[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=23, algorithm=brute, n_neighbors=41 
[CV]  p=2, weights=uni

[Parallel(n_jobs=-1)]: Done 2014 tasks      | elapsed:    8.5s


[CV]  p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV]  p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36, total=   0.0s
[CV]  p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV]  p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV]  p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_neighbors=36 
[CV] p=1, weights=uniform, leaf_size=36, algorithm=ball_tree, n_nei

[Parallel(n_jobs=-1)]: Done 3712 tasks      | elapsed:   14.4s


[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV] p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48 
[CV]  p=1, weights=uniform, leaf_size=38, algorithm=auto, n_neighbors=48, total=   0.0s
[CV] p=1, weights=uniform, leaf_si

[Parallel(n_jobs=-1)]: Done 5902 tasks      | elapsed:   22.1s


[CV] p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17 
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17 
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17 
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV] p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17 
[CV] p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17 
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV]  p=2, weights=uniform, leaf_size=47, algorithm=auto, n_neighbors=17, total=   0.0s
[CV] p=2, weights=u

[Parallel(n_jobs=-1)]: Done 8422 tasks      | elapsed:   40.9s


[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV] p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14 
[CV]  p=1, weights=uniform, leaf_size=12, algorithm=auto, n_neighbors=14, total=   0.0s
[CV]  p=1, weights=uniform, leaf_s

[Parallel(n_jobs=-1)]: Done 11503 tasks      | elapsed:  1.0min


[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV]  p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36, total=   0.0s
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV]  p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36, total=   0.0s
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV]  p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36, total=   0.0s
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV] p=1, weights=distance, leaf_size=32, algorithm=brute, n_neighbors=36 
[CV] p=1, weights=distanc

[Parallel(n_jobs=-1)]: Done 15000 out of 15000 | elapsed:  1.2min finished


[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[CV]  p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31, total=   0.0s
[CV]  p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31, total=   0.0s
[CV] p=1, weights=distance, leaf_size=42, algorithm=kd_tree, n_neighbors=31 
[

In [91]:
#################
# Decision Tree #
#################

clf = tree.DecisionTreeClassifier(random_state=42, 
                                  criterion='gini', 
                                  max_depth=17, 
                                  max_features=18, 
                                  class_weight='balanced', 
                                  splitter='random', 
                                  in_samples_leaf=14, 
                                  min_samples_split=63
                                 )

# clf_tree_rscv = RandomizedSearchCV(clf, param_distributions=parameters, cv=70, n_iter=80, scoring='precision', n_jobs=-1, verbose=2)
# clf_tree_rscv.fit(features_train, labels_train)

# print clf_tree_rscv.best_params_
# print clf_tree_rscv.best_score_

# {'splitter': 'random', 'min_samples_leaf': 14, 'max_features': 18, 'criterion': 'gini', 'min_samples_split': 49, 'max_depth': 17, 'class_weight': 'balanced'}
# 0.25

clf.fit(features_train_scaled, labels_train)
pred = clf.predict(features_test_scaled)

# Classifier scores
precision_score_tree = precision_score(pred, labels_test)
recall_score_tree = recall_score(pred, labels_test)
accuracy_score_tree = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_tree
print "Precision score: ", precision_score_tree
print "Recall score: ", recall_score_tree

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Accuracy score:  0.886363636364
Precision score:  0.0
Recall score:  0.0


In [27]:
###############
# Naive Bayes #
###############

clf = GaussianNB()
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

# Classifier scores
precision_score_gnb = precision_score(pred, labels_test)
recall_score_gnb = recall_score(pred, labels_test)
accuracy_score_gnb = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_gnb
print "Precision score: ", precision_score_gnb
print "Recall score: ", recall_score_gnb

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

# tester py scores: Accuracy: 0.79093	Precision: 0.29194	Recall: 0.39850	F1: 0.33700	F2: 0.37139

Accuracy score:  0.886363636364
Precision score:  0.4
Recall score:  0.5


In [38]:
#######
# GBM #
#######

clf = GradientBoostingClassifier(random_state=42,
                                 learning_rate=0.77800000000000002,
                                 n_estimators=61,
                                 max_features=19,
                                 max_depth=9,
                                 min_samples_split=9,
                                 min_samples_leaf=17
                                )

#params = {'learning_rate':np.linspace(1.0, 0.001, num=100),
#              'n_estimators':np.arange(1, 100, 1),
#              'max_depth':np.arange(1, 50, 1),
#              'max_features':np.arange(1, 20, 1),
#              'min_samples_split':np.arange(2, 20, 1),
#              'min_samples_leaf':np.arange(1, 20, 1)
#             }
#clf_gbrt_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv, n_iter=30, scoring='recall', n_jobs=-1, verbose=2)
#clf_gbrt_rscv.fit(features_train_scaled, labels_train)

# Sleeping so that the final print in the console are the scores
#time.sleep(2)
#print clf_gbrt_rscv.best_params_
#print clf_gbrt_rscv.best_score_

# {'learning_rate': 0.77800000000000002, 'min_samples_leaf': 17, 'n_estimators': 61, 'max_features': 19, 'min_samples_split': 9, 'max_depth': 9}
# 0.2265

clf.fit(features_train_scaled, labels_train)
pred = clf.predict(features_test_scaled)

# Classifier scores
precision_score_gbm = precision_score(pred, labels_test)
recall_score_gbm = recall_score(pred, labels_test)

print "Precision score: ", precision_score_gbm
print "Recall score: ", recall_score_gbm

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

# tester.py score using default params Precision: 0.46282	Recall: 0.33300

Precision score:  0.2
Recall score:  1.0


In [39]:
#######
# SVC #
#######

clf = SVC(random_state=42, 
          kernel='sigmoid', 
          C=57.9, 
          gamma=0.059, 
          class_weight='balanced'
         )
params = {'C': scipy.stats.expon(scale=100), 
          'gamma': scipy.stats.expon(scale=.1), 
          'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
          'class_weight': [None, 'balanced']
         }
clf_svc_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv, n_iter=30, scoring='recall', verbose=2, n_jobs=-1)
clf_svc_rscv.fit(features_train_scaled, labels_train)

#clf.fit(features_train_scaled, labels_train)
#pred = clf.predict(features_test_scaled)

# Classifier scores
#precision_score_svc = precision_score(pred, labels_test)
#recall_score_svc = recall_score(pred, labels_test)

#print "Precision score: ", precision_score_svc
#print "Recall score: ", recall_score_svc

sleep.time(2)
print clf_svc_rscv.best_params_
print clf_svc_rscv.best_score_

# parameters that return the best recall score:
# {'kernel': 'sigmoid', 'C': 22.124794209078111, 'gamma': 0.38710403019966072, 'class_weight': 'balanced'}
# 0.33

# parameters that return the best precision score:
# {'kernel': 'sigmoid', 'C': 57.914218875973347, 'gamma': 0.059626099388516422, 'class_weight': 'balanced'}
# 0.235

# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

Fitting 500 folds for each of 30 candidates, totalling 15000 fits
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    1.0s


[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=

[Parallel(n_jobs=-1)]: Done 1356 tasks      | elapsed:    4.7s


[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel

[Parallel(n_jobs=-1)]: Done 3386 tasks      | elapsed:   10.7s


[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV] kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced 
[CV]  kernel=sigmoid, C=59.3388986004, gamma=0.286591982341, class_weight=balanced, total=   0.0s
[CV]  kernel

[Parallel(n_jobs=-1)]: Done 6216 tasks      | elapsed:   19.5s


[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV]  kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced, total=   0.0s
[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV]  kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced, total=   0.0s
[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV]  kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced, total=   0.0s
[CV]  kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced, total=   0.0s
[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV] kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced 
[CV]  kernel=poly, C=44.8527007812, gamma=0.0347986383793, class_weight=balanced, total=   0.0s
[CV] kernel=poly, C=44.8527007812,

[Parallel(n_jobs=-1)]: Done 9031 tasks      | elapsed:   43.2s


[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV] kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None 
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV] kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None 
[CV] kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None 
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV] kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None 
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV] kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None 
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=None, total=   0.0s
[CV]  kernel=rbf, C=255.691629257, gamma=0.0763214777545, class_weight=Non

[Parallel(n_jobs=-1)]: Done 11256 tasks      | elapsed:   52.9s


[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, total=   0.0s
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, total=   0.0s
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, total=   0.0s
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, total=   0.0s
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, total=   0.0s
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV] kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None 
[CV]  kernel=poly, C=49.7261549347, gamma=0.0901795121942, class_weight=None, 

[Parallel(n_jobs=-1)]: Done 14221 tasks      | elapsed:  1.1min


[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV] kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced 
[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV] kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced 
[CV] kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced 
[CV] kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced 
[CV]  kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced, total=   0.0s
[CV] kernel=linear, C=35.7639928904, gamma=0.373127926846, class_weight=balanced 
[CV] ker

[Parallel(n_jobs=-1)]: Done 15000 out of 15000 | elapsed:  1.1min finished


[CV] kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced 
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV] kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced 
[CV] kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced 
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV] kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced 
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV] kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced 
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV]  kernel=rbf, C=64.3214916664, gamma=0.00885327540898, class_weight=balanced, total=   0.0s
[CV] kernel=rbf, C=

In [161]:
#################
# Random Forest #
#################

# Fitting the model
clf = RandomForestClassifier(random_state=42, n_estimators=1, min_samples_leaf=2, min_samples_split=2, max_depth=31)

# RandomizedsearchCV to find optimal hyper params
# params = {
#          'n_estimators':np.arange(1, 5, 1),
#          'min_samples_leaf':np.arange(1, 5, 1),
#          'min_samples_split':np.arange(2, 20, 1),
#          'max_depth':np.arange(1, 40, 1),
#         }

# clf_rf_rscv = RandomizedSearchCV(clf, cv=70, n_iter=50, param_distributions=params, scoring='recall', verbose=2, n_jobs=-1)
# clf_rf_rscv.fit(features_train, labels_train)

# print clf_rf_rscv.best_params_
# print clf_rf_rscv.best_score_

# {'n_estimators': 1, 'min_samples_split': 2, 'max_depth': 31, 'min_samples_leaf': 2}
# 0.21

clf.fit(features_train, labels_train)




# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

{'n_estimators': 1, 'min_samples_split': 5, 'max_depth': 34, 'min_samples_leaf': 1}
0.21


# Q&A


1.Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

>Machine learning is powerful at predicting whether a certain outcome is likely to happen (classification) or continuous numbers (regression). In this example where we are asked to predict that a person is a POI (classification) and we have features such as salary, bonus, stock (financial) as well as how many emails they have sent/received (count) we can use such features to learn if these help us predict whether people are POIs or not. As for outliers, I have only removed one due to the scarcity of data to begin with. The one I have removed is the column sum in the PDF of Enron employees' salaries (employee name "TOTAL") because that's not a feature of an employee.

> There were 146 rows of data pre-cleaning and 18 POIs and 126 Non-POIs in the dataset.

2.What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

>As part of the EDA I looked at feature importances to understand which features were important and which weren't useful to include in my machine learning algorithms. I took all features which had an importance > 0, so that means the following 9 features: 'poi', 'salary', 'deferral_payments', 'total_payments','loan_advances', 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 'expenses'. As for the feature importances scores for the features I have used they are as follows:   
  1. feature: salary (0.220426513942)
  2. feature: deferral_payments (0.21197488041)
  3. feature: total_payments (0.132625994695)
  4. feature: loan_advances (0.106100795756)
  5. feature: bonus (0.105863661155)
  6. feature: restricted_stock_deferred (0.0757862826828)
  7. feature: deferred_income (0.0736811081639)
  8. feature: total_stock_value (0.0620731020005)
  9. feature: expenses (0.0114676611954)

> I have scaled the numerical features in my SVC model using MinMaxScaler to easier be able to compare the different financial measurements on a scale of 0 to 1.
> I have created my own feature bonus_salary_ratio which is bonus / salary based on the rationale that someone who has a high salary (top correlated feature with a POI) is likely also to have a high bonus however it made my models perform worse, especially GaussianNB due to the large amount of bonuses AND salaries which both were 0's so I was dividing 0 by 0, or some bonuses being negative leading to a -inf when divided by salary.

>As mentioned earlier I have decided to include any feature with a score > 0 because it is preferable to keep a model as simple as possible to avoid overfitting, i.e. if a model can use fewer features without dropping in its accuracy metrics then that's preferable to a model that has more features with the same score since that won't generalise as well on new data.

3.What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

> I ended it up using AdaBoost with default parameters but with features scaled using the MinMaxScaler because it returned a precision of .4 and recall of .3 which was the highest scores out of them all. I tried SVC, GBM, KNN, DecisionTree, GaussianNB and Random Forest but none returned scores above .3 for both precision and recall. Even when I fine-tuned the parameters using RanomizedSearchCV for all of the above models it was still the default parameters for AdaBoost that performed the best on tester.py.

4.What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

> Tuning the parameters of an algorithm is really customising the model to match your dataset. For some of the parameters such as min_samples_split there is a trade-off between performance and accuracy: a high sample_split means you are avoiding over-fitting whereas a too high value means you are underfitting. Likewise with the learning_rate, if you give it too high a value you risk it might miss the optimal point whereas if it's too low it will take too long to converge and reach the local minima. For Random Forests, I've tuned the main parameters using RandomizedSearchCV but the best model didn't make it to the .3 precisio/recall scores. Below were the params and their ranges which I used for RandomizedSearchCV for Random Forest:

>params = {
          'n_estimators':np.arange(1, 5, 1),
          'min_samples_leaf':np.arange(1, 5, 1),
          'min_samples_split':np.arange(2, 20, 1),
          'max_depth':np.arange(1, 40, 1),
         }
         
> and for SVC I have used the following params:

> params = {'C': scipy.stats.expon(scale=100), 
          'gamma': scipy.stats.expon(scale=.1), 
          'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
          'class_weight': [None, 'balanced']
         }
> if my AdaBoost classifier with default parameters weren't the top performing model I would have fine-tuned my model using RandomizedSearchCV like I did for Random Forest and SVC to find the optimal parameters to use in my model.

5.What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

> Validation, also known as cross-validation. It is used to prevent over-fitting to the training data. If you don't use cross-validation you risk overfitting your model to the training dataset which means it won't be able to perform well on un-seen new data because it is unable to generalise well. CV works by splitting the training dataset into smaller sets which the model is evaluated on and for each fold it will then return the score average accuracy score from all of the folds. CV is especially important with imbalanced classes as it increases the probability that your CV folds are more representative of the data.

6.Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]

> As mentioned above my best performing model was an AdaBoost with default parameters. When I ran the tester.py I got Precision score of 0.40000 and a Recall score 0.30300. For precision score, it measures how accurate our model was at predicting a POI when in fact they weren't. Such a metric is important as we do not want to label an employee for being a POI when in fact they weren't - we may shame them unjustly in the public when they haven't done anything wrong. On the other hand, recall score measures how accurate our model was at predicting whether someone wasn't a POI when they in fact were. I would say this metric, at least when compared to the precision score in this project, is less important because someone is innocent until proven otherwise.