# Importing libraries

In [1]:
import sys
import pickle
sys.path.append("../tools/")

import time
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import matplotlib.pyplot as plt
%matplotlib inline
import scipy

import missingno as msno
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV



# Features by data type

In [2]:
### features_list selects which features to include.
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                ]

# Identifying columns with financial values
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees'
                     ]

# Identfying columns with numerical values
features_with_count = ['to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                      ]

In [3]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [4]:
# Ensuring that all keys refer to Enron employees
for k, v in data_dict.iteritems():
    print k

METTS MARK
BAXTER JOHN C
ELLIOTT STEVEN
CORDES WILLIAM R
HANNON KEVIN P
MORDAUNT KRISTINA M
MEYER ROCKFORD G
MCMAHON JEFFREY
HORTON STANLEY C
PIPER GREGORY F
HUMPHREY GENE E
UMANOFF ADAM S
BLACHMAN JEREMY M
SUNDE MARTIN
GIBBS DANA R
LOWRY CHARLES P
COLWELL WESLEY
MULLER MARK S
JACKSON CHARLENE R
WESTFAHL RICHARD K
WALTERS GARETH W
WALLS JR ROBERT H
KITCHEN LOUISE
CHAN RONNIE
BELFER ROBERT
SHANKMAN JEFFREY A
WODRASKA JOHN
BERGSIEKER RICHARD P
URQUHART JOHN A
BIBI PHILIPPE A
RIEKER PAULA H
WHALEY DAVID A
BECK SALLY W
HAUG DAVID L
ECHOLS JOHN B
MENDELSOHN JOHN
HICKERSON GARY J
CLINE KENNETH W
LEWIS RICHARD
HAYES ROBERT E
MCCARTY DANNY J
KOPPER MICHAEL J
LEFF DANIEL P
LAVORATO JOHN J
BERBERIAN DAVID
DETMERING TIMOTHY J
WAKEHAM JOHN
POWERS WILLIAM
GOLD JOSEPH
BANNANTINE JAMES M
DUNCAN JOHN H
SHAPIRO RICHARD S
SHERRIFF JOHN R
SHELBY REX
LEMAISTRE CHARLES
DEFFNER JOSEPH M
KISHKILL JOSEPH G
WHALLEY LAWRENCE G
MCCONNELL MICHAEL S
PIRO JIM
DELAINEY DAVID W
SULLIVAN-SHAKLOVITZ COLLEEN
WROBEL BRUC

In [8]:
# Shape of dataset
print "Rows in the dataset:", len(data_dict)

Rows in the dataset: 146


In [52]:
# POIs and non-POIs in the dataset
poi_num = 0
non_poi_num = 0
for poi in labels:
    if poi == 1.0:
        poi_num += 1
    else:
        non_poi_num += 1

# Imbalanced classes of POIs - more Non-POIs than POIs.
print "POIs in the dataset: ", poi_num
print "Non-POIs in the dataset: ", non_poi_num

POIs in the dataset:  18
Non-POIs in the dataset:  123


In [9]:
mat_dict = {}
for key in data_dict:
    mat_dict[key] = data_dict[key]['salary']
    
for key, value in sorted(mat_dict.iteritems(), key=lambda (k,v): (v, k), reverse=True):
   print key, ": ", value

YEAP SOON :  NaN
WROBEL BRUCE :  NaN
WODRASKA JOHN :  NaN
WINOKUR JR. HERBERT S :  NaN
WHALEY DAVID A :  NaN
WALTERS GARETH W :  NaN
WAKEHAM JOHN :  NaN
URQUHART JOHN A :  NaN
THE TRAVEL AGENCY IN THE PARK :  NaN
SHERRICK JEFFREY B :  NaN
SCRIMSHAW MATTHEW :  NaN
SAVAGE FRANK :  NaN
PRENTICE JAMES :  NaN
POWERS WILLIAM :  NaN
PIRO JIM :  NaN
PEREIRA PAULO V. FERRAZ :  NaN
NOLES JAMES L :  NaN
MORAN MICHAEL P :  NaN
MEYER ROCKFORD G :  NaN
MEYER JEROME J :  NaN
MENDELSOHN JOHN :  NaN
MCDONALD REBECCA :  NaN
MCCARTY DANNY J :  NaN
LOWRY CHARLES P :  NaN
LOCKHART EUGENE E :  NaN
LEWIS RICHARD :  NaN
LEMAISTRE CHARLES :  NaN
JAEDICKE ROBERT :  NaN
HUGHES JAMES A :  NaN
HORTON STANLEY C :  NaN
HIRKO JOSEPH :  NaN
HAYSLETT RODERICK J :  NaN
HAYES ROBERT E :  NaN
HAUG DAVID L :  NaN
GRAMM WENDY L :  NaN
GILLIS JOHN :  NaN
GIBBS DANA R :  NaN
GATHMANN WILLIAM D :  NaN
FUGH JOHN L :  NaN
FOY JOE :  NaN
FOWLER PEGGY :  NaN
DUNCAN JOHN H :  NaN
CORDES WILLIAM R :  NaN
CLINE KENNETH W :  NaN
CHRIS

In [10]:
# Removing keys 'TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'BANNANTINE JAMES M' and 'GRAY RODNEY' in data_dict because they are either not employees or in the case of Gray and Bannantine they are outliers.
del data_dict['TOTAL']
del data_dict['THE TRAVEL AGENCY IN THE PARK']
del data_dict['BANNANTINE JAMES M']
del data_dict['GRAY RODNEY']

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# EDA


In [11]:
# Transforming features into a df so that I won't have to remember to transform
# both features train and test.
df_features = pd.DataFrame(features)
print "Any null-values present in the features chosen? \n", df_features.isnull().any()
df_features.head()

Any null-values present in the features chosen? 
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
dtype: bool


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,201955.0,2869717.0,4484442.0,0.0,4175000.0,-126027.0,-3081055.0,1729541.0,13868.0,1729541.0,152.0,304805.0,126027.0,0.0,2902.0,47.0,2195.0,65.0,1407.0
1,0.0,178980.0,182466.0,0.0,0.0,0.0,0.0,257817.0,3486.0,257817.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,267102.0,1295738.0,5634343.0,0.0,1200000.0,0.0,-1386055.0,10623258.0,11200.0,6680544.0,2660303.0,1586055.0,3942714.0,0.0,0.0,0.0,0.0,0.0,0.0
3,239671.0,260455.0,827696.0,0.0,400000.0,-82782.0,-201641.0,63014.0,129142.0,0.0,69.0,0.0,145796.0,0.0,0.0,0.0,0.0,0.0,0.0
4,80818.0,684694.0,860136.0,0.0,0.0,0.0,0.0,1599641.0,0.0,1599641.0,874.0,93750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Splitting dataset into train and test for features and labels
features_train, features_test, labels_train, labels_test = \
    train_test_split(df_features, labels, test_size=0.3, random_state=42)
    

# Feature importances using Decision Tree

In [15]:
#######################
# Feature Importances #
#######################

# Fitting the model
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)

# Feature Importances to identify which features have a high variance to be included
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

print len(importances), "features in Feature Importances." 
print "Feature ranking: "
for i in range(1, len(importances), 1):
    print "  {}.  feature: {} ({})".format(i, features_list[i], importances[indices[i]])

19 features in Feature Importances.
Feature ranking: 
  1.  feature: salary (0.21274406496)
  2.  feature: deferral_payments (0.189106111008)
  3.  feature: total_payments (0.177991452991)
  4.  feature: loan_advances (0.0751856363847)
  5.  feature: bonus (0.0660231271996)
  6.  feature: restricted_stock_deferred (0.0)
  7.  feature: deferred_income (0.0)
  8.  feature: total_stock_value (0.0)
  9.  feature: expenses (0.0)
  10.  feature: exercised_stock_options (0.0)
  11.  feature: other (0.0)
  12.  feature: long_term_incentive (0.0)
  13.  feature: restricted_stock (0.0)
  14.  feature: director_fees (0.0)
  15.  feature: to_messages (0.0)
  16.  feature: from_poi_to_this_person (0.0)
  17.  feature: from_messages (0.0)
  18.  feature: from_this_person_to_poi (0.0)


In [16]:
# New feature is bonus/salary and null-values replaced by 0
features_train['bonus_salary_ratio'] = features_train.loc[:, 5] / features_train.loc[:, 1]
features_train['bonus_salary_ratio'] = np.nan_to_num(features_train['bonus_salary_ratio'])

# Repeat same feature engineering for test data
features_test['bonus_salary_ratio'] = features_test.loc[:, 5] / features_test.loc[:, 1]
features_test['bonus_salary_ratio'] = np.nan_to_num(features_test['bonus_salary_ratio'])

# Explore the new feature created
print features_train['bonus_salary_ratio'].head()
print "Are null-values present in new feature?", features_train['bonus_salary_ratio'].isnull().any().any()

40      0.000000e+00
24    -1.797693e+308
25      0.000000e+00
127     0.000000e+00
76      0.000000e+00
Name: bonus_salary_ratio, dtype: float64
Are null-values present in new feature? False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://panda

# Feature engineering

In [48]:
# Feature scaling using MinMaxScaling since numerical values span positive and negative

min_max_scaler = MinMaxScaler()

# Reshaping feature because it's a 1D-array
#bonus_salary_ratio_train_reshaped = features_train['bonus_salary_ratio'].values.reshape(-1, 1)
#bonus_salary_ratio_test_reshaped = features_test['bonus_salary_ratio'].values.reshape(-1, 1)

# Scaling new feature due to presence of negative values
#features_train = min_max_scaler.fit_transform(features_train)
#features_test = min_max_scaler.fit_transform(features_test)

# Slicing all rows but ignoring first column since it's a bool ("poi")
features_train_scaled = min_max_scaler.fit_transform(features_train.iloc[:, 1:])
features_test_scaled = min_max_scaler.fit_transform(features_test.iloc[:, 1:])

# Transforming both scaled train and test from np arrays to pandas dataframes
features_train_scaled = pd.DataFrame(features_train_scaled)
features_test_scaled = pd.DataFrame(features_test_scaled)

features_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.015698,0.002044,0.0,0.04375,0.914658,0.880984,0.13677,0.343377,0.142375,0.001251,0.0,0.121151,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.015698,0.0,0.0,0.0,0.0,1.0,0.004753,0.0,0.0,0.0,0.0,0.044852,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.019927,0.014391,0.0,0.15,0.914658,0.957214,0.015102,0.072188,0.0,0.009821,0.0,0.047301,0.0,0.137839,0.454545,0.002784,0.018062,0.205035,1.0
3,0.015698,0.002208,0.0,0.0,0.914658,0.98911,0.000897,0.999532,0.0,0.0,0.0,0.0,0.322242,0.0,0.0,0.0,0.0,0.0,1.0
4,0.015698,0.100674,0.0,1.0,0.914658,1.0,0.106018,0.216543,0.121083,0.00015,0.565383,0.068295,0.0,0.569155,1.0,0.179914,0.674877,0.717624,1.0


In [102]:
# Refining features_list to only include features with >0 importance 
#and to include the new feature in features_list

features_list = ['poi', 'salary', 'deferral_payments','total_payments', 
                'loan_advances', 'bonus']
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)

# Feature Importances to see whether new feature created has any importance.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]


print "Feature ranking: "
for i in range(1, len(importances), 1):
    print "  {}.  feature: {} ({})".format(i, features_list[i], importances[indices[i]])
    
    # 6.  feature: bonus_salary_ratio (0.0660231271996)

Feature ranking: 
  1.  feature: salary (0.21274406496)
  2.  feature: deferral_payments (0.177991452991)
  3.  feature: total_payments (0.165550527903)
  4.  feature: loan_advances (0.0826807716413)
  5.  feature: bonus (0.0751856363847)
  6.  feature: bonus_salary_ratio (0.0660231271996)


IndexError: list index out of range

In [50]:
# Setting up CV using stratifiedshufflesplit due to class imbalance of POI.
# cross_validator to be used as cv parameter for grid/randomizedsearchcv.
cv = StratifiedShuffleSplit(n_splits=100, test_size=0.1, random_state=60)

In [110]:
###############
# Naive Bayes #
###############

def fitGNB():
    clf = GaussianNB()
    clf.fit(features_train_scaled, labels_train)

    pred = clf.predict(features_test_scaled)

    # Classifier scores
    precision_score_gnb = precision_score(pred, labels_test, average='weighted')
    recall_score_gnb = recall_score(pred, labels_test, average='weighted')

    print "Precision score: ", precision_score_gnb
    print "Recall score: ", recall_score_gnb

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

fitGNB()

Precision score:  1.0
Recall score:  0.883720930233


In [101]:
#######
# GBM #
#######
    
def fitGBM():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''
    
    clf = GradientBoostingClassifier(random_state=42,
                                     min_samples_leaf=6,
                                     min_samples_split=20,
                                     n_estimators=98,
                                     max_features=5,
                                     max_depth=5
                                    )
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)

    # Classifier scores
    f1_score_gbm = f1_score(pred, labels_test)
    precision_score_gbm = precision_score(pred, labels_test, average='weighted')
    recall_score_gbm = recall_score(pred, labels_test, average='weighted')
    print "F1-score", f1_score_gbm
    print "Precision score: ", precision_score_gbm
    print "Recall score: ", recall_score_gbm
    
    # returns the following in tester.py: Accuracy: 0.82931	Precision: 0.42724	Recall: 0.32150	F1: 0.36690	F2: 0.33824
	# Total predictions: 13000	True positives:  643	False positives:  862	False negatives: 1357	True negatives: 10138

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

def tuneGBM():
    '''Identifies optimal params for GBM to be used in fit function'''
    
    clf = GradientBoostingClassifier(random_state=42)

    params = {'learning_rate':np.linspace(0.1, 0.001, num=100),
                  'n_estimators':np.arange(1, 150, 1),
                  'max_depth':np.arange(1, 50, 1),
                  'max_features':np.arange(1, 20, 1),
                  'min_samples_split':np.arange(2, 20, 1),
                  'min_samples_leaf':np.arange(1, 20, 1)
             }
    clf_gbrt_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv, n_iter=100, scoring='f1', n_jobs=-1, verbose=1)
    clf_gbrt_rscv.fit(features_train_scaled, labels_train)

    print clf_gbrt_rscv.best_params_
    print clf_gbrt_rscv.best_score_    
    
fitGBM()
#tuneGBM()

#Fitting GBM with the followingn features:

#features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                'loan_advances', 'bonus', 'restricted_stock_deferred',
#                'deferred_income', 'total_stock_value', 'expenses']
# Precision: 0.42724	Recall: 0.32150	F1: 0.36690

#features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                'loan_advances', 'bonus', 'restricted_stock_deferred',
#                'deferred_income', 'total_stock_value']
# Precision: 0.55044	Recall: 0.34650	F1: 0.42528

# features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                'loan_advances', 'bonus', 'restricted_stock_deferred',
#                'deferred_income']
# Precision: 0.48804	Recall: 0.27550	F1: 0.35219

# features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                  'loan_advances', 'bonus', 'restricted_stock_deferred']
# Precision: 0.44126	Recall: 0.31550	F1: 0.36793

# features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                  'loan_advances', 'bonus']
# Precision: 0.42724	Recall: 0.32150	F1: 0.36690

# features_list = ['poi', 'salary', 'deferral_payments', 'total_payments',
#                'loan_advances']

F1-score 0.222222222222
Precision score:  0.853977968176
Recall score:  0.837209302326


In [103]:
############
# AdaBoost #
############

def fitAda():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''
    
    clf = AdaBoostClassifier(random_state=42, 
                             n_estimators=95, 
                             learning_rate=0.11399999999999999, 
                             algorithm='SAMME.R'
                            )
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Classifier scores
    precision_score_ada = precision_score(pred, labels_test, average='weighted')
    recall_score_ada = recall_score(pred, labels_test, average='weighted')

    print "Precision score: ", precision_score_ada
    print "Recall score: ", recall_score_ada

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

def tuneAda():
    '''Identifies optimal params for AdaBoost to be used in fit function'''
    
    clf = AdaBoostClassifier(random_state=42)
    params = {
        'n_estimators':np.arange(1, 200, 1),
        'learning_rate':np.linspace(1.0, 0.001, num=1000),
        'algorithm':['SAMME', 'SAMME.R']
    }

    clf_ada_rscv = RandomizedSearchCV(clf, param_distributions=params, n_iter=50, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
    clf_ada_rscv.fit(features_train_scaled, labels_train)
    
    print clf_ada_rscv.best_params_
    print clf_ada_rscv.best_score_
    
    # f1 score using all features > 0 feature importances:
    # {'n_estimators': 120, 'learning_rate': 0.18599999999999994, 'algorithm': 'SAMME.R'}
    # 0.30261037296

    # f1 score using top 7 features with > 0 feature importances:
    # {'n_estimators': 71, 'learning_rate': 0.18199999999999994, 'algorithm': 'SAMME.R'}
    # 0.307384648685

    # f1 score using top 4 features with > 0 feature importances:
    # {'n_estimators': 95, 'learning_rate': 0.11399999999999999, 'algorithm': 'SAMME.R'}
    # 0.309828543679

    # f1 score using top 3 features with > 0 feature importances:
    # {'n_estimators': 102, 'learning_rate': 0.125, 'algorithm': 'SAMME.R'}
    # 0.309473709624


fitAda()
#tuneAda()

Precision score:  0.846756425949
Recall score:  0.860465116279


In [105]:
#################
# Decision Tree #
#################

def fitTree():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''
    
    clf = tree.DecisionTreeClassifier(random_state=42, 
                                      criterion='gini', 
                                      max_depth=17, 
                                      max_features=18, 
                                      class_weight='balanced', 
                                      splitter='random', 
                                      min_samples_leaf=14, 
                                      min_samples_split=63
                                     )
    
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Classifier scores
    precision_score_tree = precision_score(pred, labels_test, average='weighted')
    recall_score_tree = recall_score(pred, labels_test, average='weighted')

    print "Precision score: ", precision_score_tree
    print "Recall score: ", recall_score_tree

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)
    
def tuneTree():
    '''Identifies optimal params for Tree to be used in fit function'''   
    
    clf = tree.DecisionTreeClassifier(random_state=42)
    clf_tree_rscv = RandomizedSearchCV(clf, param_distributions=parameters, cv=70, n_iter=80, scoring='precision', n_jobs=-1, verbose=1)
    clf_tree_rscv.fit(features_train, labels_train)

    print clf_tree_rscv.best_params_
    print clf_tree_rscv.best_score_

    # {'splitter': 'random', 'min_samples_leaf': 14, 'max_features': 18, 'criterion': 'gini', 'min_samples_split': 49, 'max_depth': 17, 'class_weight': 'balanced'}
    # 0.25

fitTree()
#tuneTree()

Precision score:  1.0
Recall score:  0.883720930233


In [106]:
######################
# KNearest Neighbors #
######################

def fitKNN():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''
    
    clf = KNeighborsClassifier()
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Classifier scores
    precision_score_knn = precision_score(pred, labels_test, average='weighted')
    recall_score_knn = recall_score(pred, labels_test, average='weighted')

    print "Precision score: ", precision_score_knn
    print "Recall score: ", recall_score_knn

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

def tuneKNN():    
    '''Identifies optimal params for KNN to be used in fit function'''   
    
    clf = KNeighborsClassifier()
    params = {
        'p':[1, 2],
        'n_neighbors':np.arange(1, 50, 1),
        'weights':['uniform', 'distance'],
        'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size':np.arange(10, 50, 1),
    }

    clf_knn_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv, n_iter=50, scoring='f1', n_jobs=-1, verbose=1)
    clf_knn_rscv.fit(features_train_scaled, labels_train)


    print clf_knn_rscv.best_params_
    print clf_knn_rscv.best_score_

fitKNN()
#tuneKNN()

Precision score:  1.0
Recall score:  0.883720930233


In [108]:
#######
# SVC #
#######

def fitSVC():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''    
    
    clf = SVC(random_state=42, 
              kernel='sigmoid', 
              C=188.24569042635585, 
              gamma=0.045051338986761162, 
              class_weight=None
             )
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Classifier scores
    precision_score_svc = precision_score(pred, labels_test, average='weighted')
    recall_score_svc = recall_score(pred, labels_test, average='weighted')
    
    print "Precision score: ", precision_score_svc
    print "Recall score: ", recall_score_svc
    
    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

def tuneSVC():
    '''Identifies optimal params for SVC to be used in fit function'''
    
    clf = SVC(random_state=42)
    params = {'C': scipy.stats.expon(scale=100), 
              'gamma': scipy.stats.expon(scale=.1), 
              'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
              'class_weight': [None, 'balanced']
             }
    
    clf_svc_rscv = RandomizedSearchCV(clf, param_distributions=params, cv=cv,n_iter=20, scoring='f1_weighted', verbose=2, n_jobs=-1)
    clf_svc_rscv.fit(features_train_scaled, labels_train)

    print clf_svc_rscv.best_params_
    print clf_svc_rscv.best_score_

    # {'kernel': 'sigmoid', 'C': 188.24569042635585, 'gamma': 0.045051338986761162, 'class_weight': None}
    # 0.839949126647

fitSVC()
#tuneSVC()

Precision score:  0.933659730722
Recall score:  0.906976744186


In [109]:
#################
# Random Forest #
#################

def fitRF():
    '''Fit, predicts, prints scores and dump clf to pickle files for tester.py'''    
    
    clf = RandomForestClassifier(random_state=42, 
                                 n_estimators=1, 
                                 min_samples_leaf=2, 
                                 min_samples_split=2, 
                                 max_depth=31
                                )
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)

    precision_score_rf = precision_score(pred, labels_test, average='weighted')
    recall_score_rf = recall_score(pred, labels_test, average='weighted')

    print "Precision score: ", precision_score_rf
    print "Recall score: ", recall_score_rf

    # Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
    dump_classifier_and_data(clf, my_dataset, features_list)

def tuneRF():
    '''Identifies optimal params for RF to be used in fit function'''

    clf = RandomForestClassifier(random_state=42)
    params = {
        'n_estimators':np.arange(1, 200, 1),
        'min_samples_leaf':np.arange(1, 20, 1),
        'min_samples_split':np.arange(2, 20, 1),
        'max_depth':np.arange(1, 40, 1),
    }
    clf_rf_rscv = RandomizedSearchCV(clf, cv=cv, n_iter=50, param_distributions=params, scoring='precision', verbose=1, n_jobs=-1)
    clf_rf_rscv.fit(features_train, labels_train)

    print clf_rf_rscv.best_params_
    print clf_rf_rscv.best_score_

    # {'n_estimators': 1, 'min_samples_split': 2, 'max_depth': 31, 'min_samples_leaf': 2}
    # 0.21

fitRF()
#tuneRF()

Precision score:  0.701223990208
Recall score:  0.744186046512


# Q&A


1.Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

>Machine learning is powerful at predicting whether a certain outcome is likely to happen (classification) or continuous numbers (regression). In this example where we are asked to predict that a person is a POI (classification) and we have features such as salary, bonus, stock (financial) as well as how many emails they have sent/received (count) we can use such features to learn if these help us predict whether people are POIs or not.

> There were 146 rows of data pre-cleaning and 18 POIs and 126 Non-POIs in the dataset.

> I have removed 4 outliers from data_dict and they are as follows: 
'TOTAL', 'THE TRAVEL AGENCY IN THE PARK','BANNANTINE JAMES M','GRAY RODNEY'. I have removed the first two because they don't correspond to an employee, one is a column sum while the other looks to be a company they may have used its services at some point. The final two are outliers when studying salary: perhaps they were consultants or office cleaners because their salaries were 477 USD per year for Bannantine while Gray earned 6k USD over a year. Those are nowhere near what Enron pays for its employees and only add noise to any model.

2.What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

>As part of the EDA I looked at feature importances to understand which features were important and which weren't useful to include in my machine learning algorithms. I took all features which had an importance > 0 because if a feature has no variance it will only make my model more complex and therefore overfit to the training data at hand and won't generalise well. In machine learning, a simple model is preferred over a more complex one for that reason. Below are the feature importances for all features   
>
  1.  feature: salary (0.21274406496)
  2.  feature: deferral_payments (0.189106111008)
  3.  feature: total_payments (0.177991452991)
  4.  feature: loan_advances (0.0751856363847)
  5.  feature: bonus (0.0660231271996)
  6.  feature: restricted_stock_deferred (0.0)
  7.  feature: deferred_income (0.0)
  8.  feature: total_stock_value (0.0)
  9.  feature: expenses (0.0)
  10.  feature: exercised_stock_options (0.0)
  11.  feature: other (0.0)
  12.  feature: long_term_incentive (0.0)
  13.  feature: restricted_stock (0.0)
  14.  feature: director_fees (0.0)
  15.  feature: to_messages (0.0)
  16.  feature: from_poi_to_this_person (0.0)
  17.  feature: from_messages (0.0)
  18.  feature: from_this_person_to_poi (0.0)

> In the GBM cell in this notebook I have commented out the precision, recall and f1 scores of several feature selections and identified the followingn features to give the highest scores: #features_list = 'poi', 'salary','deferral_payments','total_payments','loan_advances','bonus','restricted_stock_deferred','deferred_income', 'total_stock_value'
Precision: 0.55044	Recall: 0.34650	F1: 0.42528

> I have created my own feature bonus_salary_ratio which is bonus / salary based on the rationale that someone who has a high salary is likely also to have a high bonus. The feature itself had a feature importance greater than 0 (0.0660231271996) but when I ran GBM using the new feature my recall score dropped below the .3 threshold so I have removed it. Interestingly, if I remove both salary and bonus the feature importance of bonus_salary_ratio jumps up significantly but still doesn't improve my models performance compared to using salary and bonus.

> I have trained my models on both scaled as well as unscaled training data and had the best precision and recall score when I trained on unscaled features - at least for my GBM which has the highest accuracy scores.

>As mentioned earlier I have decided to include any feature with a score > 0 because it is preferable to keep a model as simple as possible to avoid overfitting, i.e. if a model can use fewer features without dropping in its accuracy metrics then that's preferable to a model that has more features with the same score since that won't generalise as well on new data.

3.What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

> I ended it up using a GBM with the following params found through RandomizedSearchCV: 
> GradientBoostingClassifier(criterion='friedman_mse', init=None,
>              learning_rate=0.1, loss='deviance', max_depth=5,
>              max_features=5, max_leaf_nodes=None,
>              min_impurity_decrease=0.0, min_impurity_split=None,
>              min_samples_leaf=6, min_samples_split=20,
>              min_weight_fraction_leaf=0.0, n_estimators=98,
>              presort='auto', random_state=42, subsample=1.0, verbose=0,
>              warm_start=False). 

> I tried SVC, AdaBoost, KNN, DecisionTree, GaussianNB and Random Forest but none returned scores above .3 for both precision and recall. Even when I fine-tuned the parameters using RanomizedSearchCV for all of the above models. For SVC, I would often get an error due to trying to divide by 0 and struggled with even getting the model to work.

4.What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

> Tuning the parameters of an algorithm is really customising the model to match your dataset. For some of the parameters such as min_samples_split there is a trade-off between performance and accuracy: a high sample_split means you are avoiding over-fitting whereas a too high value means you are underfitting. Likewise with the learning_rate, if you give it too high a value you risk it might miss the optimal point whereas if it's too low it will take too long to converge and reach the local minima. For GBM I have tuned the following metrics: n_estimators, max_depth, max_features, learning_rate, min_samples_split, min_samples_leaf and of course set random_state to 42 so my results can be reproduced.
         
> and for SVC I have used the following params:

> params = {'C': scipy.stats.expon(scale=100), 
          'gamma': scipy.stats.expon(scale=.1), 
          'kernel':['rbf', 'linear', 'poly', 'sigmoid'],
          'class_weight': [None, 'balanced']
         }

5.What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

> Validation, also known as cross-validation. It is used to prevent over-fitting to the training data. If you don't use cross-validation you risk overfitting your model to the training dataset which means it won't be able to perform well on un-seen new data because it is unable to generalise well. CV works by splitting the training dataset into smaller sets which the model is evaluated on and for each fold it will then return the score average accuracy score from all of the folds. CV is especially important with imbalanced classes as it increases the probability that your CV folds are more representative of the data.

> I have used StratifiedShuffleSplit due to the imbalanced classes present in labels (i.e. only 18 POIs). SSS takes random rows from the dataset to increase the likelihood of including the imbalanced classes.

6.Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]

> As mentioned above my best performing model was a GBM model. When I ran the tester.py I got the following scores:Precision: 0.47601	Recall: 0.31250. For precision score, it measures how accurate our model was at predicting a POI out of both POIs + those that weren't. Such a metric is important as we do not want to label an employee for being a POI when in fact they weren't - we may shame them unjustly in the public when they haven't done anything wrong. On the other hand, recall score measures how accurate our model was at predicting a POI out of both POI + whether someone wasn't a POI when they in fact were. This is less important compared to precision score due to the concept of being innocent until proven guilty.