In [None]:
#!/usr/bin/python
import sys
import pickle
sys.path.append("../tools/")

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
import pprint
from IPython.display import display

# ref: https://github.com/Corvids/ud120-projects (includes all module 5 starter code)
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Load the Dataset and Initial Features

#### Question 1:

**Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]**

The goal of this project is to identify Enron employees who may have committed fraud, based on the available financial and email data, using machine learning.  Each person carries a 'poi' (person of interest) label, and we train a model on the available data to predict whether a person is a POI.

A major outlier in the dataset was the "TOTAL" data point, which was removed, as seen in the following code.  No other data points were removed because this is a fairly small data set.

In [None]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# Payment and stock related columns.
financial_features = [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances',
    'bonus', 'restricted_stock_deferred', 'deferred_income',
    'total_stock_value', 'expenses', 'exercised_stock_options', 'other',
    'long_term_incentive', 'restricted_stock', 'director_fees',
]

# Email-count columns.
email_features = [
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]

# Label first, then every candidate feature.
features_list = ['poi'] + financial_features + email_features

### Task 2: Understanding the Dataset and Question

In [None]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

if len(data_dict) > 0:
    print 'data loaded!'
    print 'Number of initial data points: ', len(data_dict)
    print 'Number of initial features used: ', len(features_list)

In [None]:
# get number of POI in data
poi_count = []
for key in data_dict.keys():
    poi_count.append(data_dict[key]['poi'])

print 'Number of POI: ', poi_count.count(1)
print 'Number of non-POI: ', poi_count.count(0)

In [None]:
print 'list of all people in the dataset: '
print data_dict.keys()

In [None]:
# Count, for each entry of features_list, how many records hold the
# placeholder string 'NaN' for it. missing_feature[i] belongs to
# features_list[i].
features_in_data = data_dict.values()
missing_feature = [
    sum(1 for record in features_in_data if record[name] == 'NaN')
    for name in features_list
]

In [None]:
print 'Number of missing from each feature: '
for feature, num_missing in zip(features_list, missing_feature):
    print feature, ' -- ', num_missing

In [None]:
# Pretty-print one record to show which fields each entry of data_dict holds.
# The printer `pp` is reused by later cells.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(data_dict["SKILLING JEFFREY K"])

##### Removing the outlier

In [None]:
import matplotlib.pyplot

%matplotlib inline

# Two-feature selection used for the outlier scatter plots below.
features = ["salary", "bonus"]
# featureFormat is a course-provided helper; the plotting cell reads
# point[0] as salary and point[1] as bonus, so rows presumably follow the
# order of `features` -- TODO confirm against feature_format.py.
data = featureFormat(data_dict, features)

In [None]:
# Draw one marker per row of `data` (salary on x, bonus on y).
for salary, bonus in data:
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

In [None]:
### Task 2: Remove outliers
##following outlier cleaner adapted
## from outliers mini-project

# remove total -- top right point separate from bottom left cluster
print 'Now removing outliers . . . '
print 'Removing point named "Total" from data...'
data_dict.pop('TOTAL')
print 'Number of data points after removal: ', len(data_dict)

In [None]:
# Re-extract salary/bonus from the current data_dict and redraw the scatter.
data = featureFormat(data_dict, features)

for salary, bonus in data:
    matplotlib.pyplot.scatter(salary, bonus)

matplotlib.pyplot.xlabel("salary")
matplotlib.pyplot.ylabel("bonus")
matplotlib.pyplot.show()

### Task 3: Optimize Feature Selection/Engineering

#### Question 2:

**What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “properly scale features”, “intelligently select feature”]**

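I started from the 20 entries of features_list (the 'poi' label plus 19 features) and created two features of my own: the fraction of a person's sent messages that went to persons of interest (to_poi_message_ratio) and the fraction of a person's received messages that came from persons of interest (from_poi_message_ratio). I figured these two extra features may be useful because I believe it's likely that a person of interest would frequently receive emails from and send emails to other persons of interest.  I scaled the financial features, which span a large range, using the min-max formula (x - min) / (max - min).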

In [None]:
### Task 3: Create new feature(s)

### Store to my_dataset for easy export below.
# my_dataset is a second name for the same dict object as data_dict (no copy
# is made), so changes made through either name are visible through both.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
# featureFormat / targetFeatureSplit are course-provided helpers; this
# extraction happens before the ratio features are added and is repeated
# after scaling.
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [None]:
def _message_ratio(record, poi_key, total_key):
    """Return record[poi_key] / record[total_key] as a float, or 'NaN'.

    'NaN' (the dataset's missing-value placeholder) is returned when either
    count is missing or when the total is zero, so the division can never
    raise TypeError or ZeroDivisionError.
    """
    poi_messages = record[poi_key]
    total_messages = record[total_key]
    if poi_messages == 'NaN' or total_messages == 'NaN' or total_messages == 0:
        return 'NaN'
    return 1.0 * poi_messages / total_messages


# Add both engineered ratio features to every record, in place.
for key in my_dataset:
    person = my_dataset[key]
    # messages to POI: share of this person's sent mail addressed to a POI
    person['to_poi_message_ratio'] = _message_ratio(
        person, 'from_this_person_to_poi', 'from_messages')
    # messages from POI: share of this person's received mail sent by a POI
    person['from_poi_message_ratio'] = _message_ratio(
        person, 'from_poi_to_this_person', 'to_messages')

In [None]:
# update the feature list: original columns plus the two engineered ratios
original_features = [
    'salary', 'deferral_payments', 'total_payments', 'loan_advances',
    'bonus', 'restricted_stock_deferred', 'deferred_income',
    'total_stock_value', 'expenses', 'exercised_stock_options', 'other',
    'long_term_incentive', 'restricted_stock', 'director_fees',
    'to_messages', 'from_poi_to_this_person', 'from_messages',
    'from_this_person_to_poi', 'shared_receipt_with_poi',
]
engineered_features = ['to_poi_message_ratio', 'from_poi_message_ratio']

# The 'poi' label stays in first position.
features_list = ['poi'] + original_features + engineered_features


In [None]:
# Print the same sample record again to check that the two ratio fields exist.
pp.pprint(my_dataset["SKILLING JEFFREY K"])

In [None]:
if len(data_dict) > 0:
    print 'After adding two new features: to_poi_message_ratio, from_poi_message_ratio'
    print 'Number of initial data points: ', len(my_dataset)
    print 'Number of initial features used: ', len(features_list)

##### More descriptive statistics

In [None]:
# Load the dict-of-dicts into a DataFrame for summary statistics.
df_pd = pd.DataFrame(my_dataset)

# Coerce values to numbers where possible, then transpose the frame.
# NOTE(review): convert_objects is deprecated in newer pandas releases --
# confirm the installed version before upgrading.
df_pd = df_pd.convert_objects(convert_numeric=True).transpose()
# Move the index into an ordinary column.
df_pd.reset_index(level=0, inplace=True)

columns = list(df_pd.columns)

# Re-assigns the same labels that were just read; the columns are unchanged.
df_pd.columns = columns

# Show describe() with one row per column of df_pd.
display( df_pd.describe().transpose() )

##### Scaling our features

In [None]:
def scaleFeatures(my_dataset, feature):
    """Min-max scale one feature of the dataset in place.

    Every value of ``feature`` that is not the placeholder string 'NaN' is
    replaced by ``(value - min) / (max - min)``, where min and max are taken
    over the non-'NaN' values. 'NaN' entries are left untouched.

    If all present values are equal (including the single-value case) the
    range is zero; those values are set to 0.0 instead of dividing by zero.
    If no value is present, nothing is modified.

    Args:
        my_dataset: dict mapping a key to a per-record dict that contains
            ``feature``; the per-record dicts are mutated.
        feature: name of the field to scale.

    Returns:
        None. The observed min/max and a completion message are printed.
    """
    present = [record[feature] for record in my_dataset.values()
               if record[feature] != 'NaN']

    # Same sentinels as before when there is nothing to scale.
    min_scale = min(present) if present else np.inf
    max_scale = max(present) if present else -np.inf

    # Single-argument print(...) prints the same text whether print is a
    # statement or a function.
    print('min ' + str(feature) + ' is: ' + str(min_scale))
    print('max ' + str(feature) + ' is: ' + str(max_scale))

    span = max_scale - min_scale
    for record in my_dataset.values():
        value = record[feature]
        if value == 'NaN':
            continue
        if span == 0:
            # Constant feature: avoid ZeroDivisionError, map to the low end.
            record[feature] = 0.0
        else:
            record[feature] = 1.0 * (value - min_scale) / span

    print('scaled feature ' + str(feature) + '!')
    print('==========')

In [None]:
# Fields that are rescaled in place with scaleFeatures.
features_to_scale = [
    'bonus',
    'salary',
    'restricted_stock',
    'long_term_incentive',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'loan_advances',
    'other',
    'restricted_stock_deferred',
    'total_payments',
    'total_stock_value',
    'exercised_stock_options',
    'expenses',
]

for name in features_to_scale:
    scaleFeatures(my_dataset, name)

In [None]:
# Print the sample record once more to inspect the values after scaling.
pp.pprint(my_dataset["SKILLING JEFFREY K"])

In [None]:
# Re-extract labels and features so they reflect the scaled values and the
# extended features_list (the earlier extraction predates both).
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

##### Feature Selection

In [None]:
from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier(random_state = 42)
clf = clf.fit(features_train, labels_train)

features_importance = zip(map(lambda x: round(x, 6), clf.feature_importances_), features_list)
features_importance.sort(key = lambda t: t[0], reverse = True)

print 'feature importances: ', features_importance

In [None]:
n_most = features_importance[0:10] # all points with feature importance > 0.0

selected_features = [x[1] for x in n_most]
print selected_features

### Task 4: Pick and Tune an Algorithm

#### Question 3:

**What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]**

I used the Ada Boost Classifier out of the following models I tried: Gaussian Naive-Bayes, Decision Tree, Ada Boost, and a PCA + SVC pipeline.  Since the un-tuned Ada Boost model had precision and recall both above 0.3, I chose this model and decided to try improving upon these results.  I couldn't quite get the PCA pipeline to work out with the test_classifier function but I left the code in below nonetheless.

The Decision Tree model had the worst precision/recall at 0.30508/0.29150.  The Gaussian Naive Bayes model had good precision at 0.35890 but worse recall at 0.24800.  The Ada Boost model had a precision score of 0.40507 and recall score of 0.31150, which makes this the best performing model when we use precision and recall as the scoring criteria.

In [None]:
from sklearn.cross_validation import train_test_split

# Same 70/30 split (same arguments and random_state) as in the feature
# selection section; repeated here so this section can be run on its own.
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


# Provided to give you a starting point. Try a variety of classifiers.

# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from time import time
from tester import test_classifier

clf_GNB = GaussianNB()

t0_GNB = time()
clf_GNB.fit(features_train, labels_train)
print "training time:", round(time()-t0_GNB, 3), "s"

t1_GNB = time()
pred_GNB = clf_GNB.predict(features_test)
print "predicting time:", round(time()-t1_GNB, 3), "s"

accuracy_GNB = accuracy_score(labels_test, pred_GNB)
print 'accuracy of Naive Bayes: ' + str(accuracy_GNB)
print '\n'
test_classifier(clf_GNB, my_dataset, features_list, folds = 1000)

In [None]:
# Decision Tree

from sklearn import tree

clf_tree = tree.DecisionTreeClassifier(random_state=42)

t0_tree = time()
clf_tree.fit(features_train, labels_train)
print "training time:", round(time()-t0_tree, 3), "s"

t1_tree = time()
pred_tree = clf_tree.predict(features_test)
print "predicting time:", round(time()-t1_tree, 3), "s"

accuracy_tree = accuracy_score(labels_test, pred_tree)
print 'accuracy of Decision Tree: ' + str(accuracy_tree)
print '\n'
test_classifier(clf_tree, my_dataset, features_list, folds = 1000)

In [None]:
# Ada Boost
from sklearn.ensemble import AdaBoostClassifier

clf_Ada = AdaBoostClassifier(random_state=42)

t0_Ada = time()
clf_Ada.fit(features_train, labels_train)
print "training time:", round(time()-t0_Ada, 3), "s"

t1_Ada = time()
pred_Ada = clf_Ada.predict(features_test)
print "predicting time:", round(time()-t1_Ada, 3), "s"

accuracy_Ada = accuracy_score(labels_test, pred_Ada)
print 'accuracy of Ada Boost: ' + str(accuracy_Ada)
print '\n'
test_classifier(clf_Ada, my_dataset, features_list, folds = 1000)

In [None]:
# pipeline -- PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

estimators = [('reduce_dim', PCA()), ('svm', SVC(random_state=42))]
clf_pipe = Pipeline(estimators)

t0_pipe = time()
clf_pipe.fit(features_train, labels_train)
print "training time:", round(time()-t0_pipe, 3), "s"

t1_pipe = time()
pred_pipe = clf_pipe.predict(features_test)
print "predicting time:", round(time()-t1_pipe, 3), "s"

accuracy_pipe = accuracy_score(labels_test, pred_pipe)
print 'accuracy of pipeline: ' + str(accuracy_pipe)

print '\n'
test_classifier(clf_pipe, my_dataset, features_list, folds = 1000)

### Task 5

#### Question 4:

**What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric item: “tune the algorithm”]**

We tune a model by modifying the inputs to a model's parameters; for example, we can modify the n_estimators of the Ada Boost Classifier to be different from the default value of 50.  We tune models for the primary reason of improving model accuracy, precision, and/or recall.  Tuning a model badly could decrease all or one of these.

I tuned the n_estimators, learning_rate, and algorithm parameters of the Ada Boost Classifier and used GridSearchCV to obtain the optimal inputs for each of these using some initial values (as seen in the function fit_model below).  I also used make_scorer to make sure that the f_beta score was maximized (the f_beta score being the weighted harmonic mean of precision and recall).

In [None]:
# List the parameter names of a default AdaBoostClassifier, i.e. the
# candidates for tuning.
AdaBoostClassifier().get_params().keys()

In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from tester import test_classifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import grid_search
from sklearn.metrics import fbeta_score, make_scorer

def fit_model(X, y):
    """Grid-search AdaBoost hyper-parameters and return the best estimator.

    Candidates vary n_estimators, learning_rate and algorithm. Each one is
    scored with the F-beta measure (beta=2) over 10 stratified shuffle
    splits of ``y`` (test_size=0.333, random_state=42).

    Args:
        X: training feature rows.
        y: training labels, also used to build the stratified splits.

    Returns:
        The ``best_estimator_`` of the fitted GridSearchCV.
    """
    search_space = {
        'n_estimators': list(range(10, 101, 10)),
        'learning_rate': [0.01, 0.1, 0.5, 0.75, 1.0],
        'algorithm': ['SAMME', 'SAMME.R'],
    }
    f2_scorer = make_scorer(fbeta_score, beta=2)
    splitter = StratifiedShuffleSplit(y, n_iter=10, test_size=0.333,
                                      random_state=42)

    search = grid_search.GridSearchCV(
        AdaBoostClassifier(random_state=42),
        search_space,
        scoring=f2_scorer,
        cv=splitter,
    )
    search.fit(X, y)

    return search.best_estimator_

In [None]:
"""
default ada boost classifier:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=42)
"""

t0 = time()
clf = fit_model(features_train, labels_train)
print "training time:", round(time()-t0, 3), "s"
print clf

t1 = time()
pred = clf.predict(features_test)
print "predicting time:", round(time()-t1, 3), "s"

accuracy = accuracy_score(labels_test, pred)
print 'accuracy of tuned Ada Boost model: ' + str(accuracy)


In [None]:
# tuned
print 'tuned model:'
t3 = time()
test_classifier(clf, my_dataset, features_list, folds = 1000)
print round(time()-t3, 3), "s"

In [None]:
# not tuned
print 'non-tuned model:'
t3 = time()
test_classifier(clf_Ada, my_dataset, features_list, folds = 1000)
print round(time()-t3, 3), "s"

#### Question 5:

**What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric item: “validation strategy”]**

Validation is important for machine learning because we want to avoid overfitting, which is when a model predicts well on the data that we have but would do poorly on data that we haven't seen yet.  We can avoid overfitting by splitting our data into testing and training sets; I used sklearn's train_test_split function and later the StratifiedShuffleSplit function to split the data as randomly as possible.

#### Question 6:

**Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]**

The two evaluation metrics I used were precision and recall.  In human-understandable speech, good precision means that when a POI is flagged from the data, it's likely to be a true POI and not a false positive. The tradeoff of this means that sometimes the model will miss real POIs and instead flag them as non-POIs.  The precision of my tuned model was 0.42436.

Good recall means that whenever a POI shows up in the data, the model is able to recognize that data point as a POI.  The tradeoff of a good recall score, however, is that sometimes, a non-POI gets flagged as a POI (i.e. there are false positives).  The recall score of the tuned model was 0.33100.

### Task 6

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# Export the tuned classifier together with the dataset (scaled, including
# the two ratio features) and the 22-entry features_list.
dump_classifier_and_data(clf, my_dataset, features_list)