# Importing libraries

In [36]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import missingno as msno
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Features by data type

In [23]:
### features_list selects which features to include.
features_list = ['poi', 'salary', 'total_stock_value'
                ]

# Identifying columns with financial values
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances','bonus', 
                      'restricted_stock_deferred', 'deferred_income', 'total_stock_value', 
                      'expenses', 'exercised_stock_options', 'other', 'long_term_incentive', 
                      'restricted_stock', 'director_fees'
                     ]

# Identfying columns with numerical values
features_with_count = ['to_messages', 'from_poi_to_this_person', 'from_messages', 
                       'from_this_person_to_poi', 'shared_receipt_with_poi'
                      ]

In [24]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

# Removing the 'TOTAL' value in data_dict because it is a column sum of salaries and doesn't belong to any single employee.
del data_dict['TOTAL']

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

# EDA


In [30]:
# Transforming features into a df so that I won't have to remember to transform both features train and test.
df_features = pd.DataFrame(features)
df_features.columns = ['salary', 'total_stock_value'
                      ]
print "Any null-values present in the features chosen? \n", df_features.isnull().any()
df_features.head()

Any null-values present in the features chosen? 
salary               False
total_stock_value    False
dtype: bool


Unnamed: 0,salary,total_stock_value
0,201955.0,1729541.0
1,0.0,257817.0
2,477.0,5243487.0
3,267102.0,10623258.0
4,239671.0,63014.0


In [31]:
# Splitting dataset into train and test for features and labels
features_train, features_test, labels_train, labels_test = \
    train_test_split(df_features, labels, test_size=0.3, random_state=42)

# Initialising classifiers using default values

In [32]:
##############
# GaussianNB #
##############

clf = GaussianNB()
clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

# Classifier scores
precision_score_gnb = precision_score(pred, labels_test)
recall_score_gnb = recall_score(pred, labels_test)
accuracy_score_gnb = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_gnb
print "Precision score: ", precision_score_gnb
print "Recall score: ", recall_score_gnb

Accuracy score:  0.923076923077
Precision score:  0.333333333333
Recall score:  0.5


In [33]:
#######
# SVC #
#######

clf = SVC(random_state=42)

# Scaling both train and test features due to salary variance
min_max_scaler = MinMaxScaler()
features_train_scaled = min_max_scaler.fit_transform(features_train)
features_test_scaled = min_max_scaler.fit_transform(features_test)

clf.fit(features_train, labels_train)

pred = clf.predict(features_test)

# Classifier scores
precision_score_svc = precision_score(pred, labels_test)
recall_score_svc = recall_score(pred, labels_test)
accuracy_score_svc = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_svc
print "Precision score: ", precision_score_svc
print "Recall score: ", recall_score_svc

Accuracy score:  0.923076923077
Precision score:  0.0
Recall score:  0.0


In [34]:
################
# DecisionTree #
################

# Fitting the model
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# Feature Importances to identify which features have a high variance to be included in final model and which to exclude.
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print "Feature Ranking: "
for i in range(len(importances)):
    print "  {} feature {} ({})".format(i+1, features_list[i+1], importances[indices[i]])

# Classifier scores
precision_score_tree = precision_score(pred, labels_test)
recall_score_tree = recall_score(pred, labels_test)
accuracy_score_tree = accuracy_score(pred, labels_test)

print "Accuracy score: ", accuracy_score_tree
print "Precision score: ", precision_score_tree
print "Recall score: ", recall_score_tree

Feature Ranking: 
  1 feature salary (0.603190174167)
  2 feature total_stock_value (0.396809825833)
Accuracy score:  0.820512820513
Precision score:  0.0
Recall score:  0.0


In [None]:
#################
# Random Forest #
#################

# Fitting the model
clf = RandomForestClassifier(random_state=42)
# clf.fit(features_train, labels_train)
#pred = clf.predict(features_test)

# GridsearchCV to find optimal hyper params
params = {'n_estimators':np.arange(1, 20, 5),
           'max_features':np.arange(1, 20, 5),
           'max_depth':np.arange(1, 20, 5),
           'min_samples_leaf':np.arange(1, 10, 5)
         }

clf_rf_grid = GridSearchCV(clf, cv=20, param_grid=params, scoring='precision', verbose=2, n_jobs=-1)
clf_rf_grid.fit(features_train, labels_train)

print clf_rf_grid.best_params_
print clf_rf_grid.best_score_

Fitting 20 folds for each of 128 candidates, totalling 2560 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    8.1s


In [None]:
# Dumping classifier, my_dataset and features_list as .pkl files to be used in tester.py
dump_classifier_and_data(clf, my_dataset, features_list)

# Q&A


1. Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

2. What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “intelligently select features”, “properly scale features”]

3. What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

4. What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? What parameters did you tune? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric items: “discuss parameter tuning”, “tune the algorithm”]

5. What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric items: “discuss validation”, “validation strategy”]

6. Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]
