## Person of Interest Identifier

Building a supervised classification algorithm that can tell, based on the Enron dataset, who at Enron was a suspected POI in the fraud case.

In [22]:
import numpy as np
import pickle

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this dataset from a trusted source.
# The context manager closes the file handle as soon as the dict is loaded.
with open('../5_datasets_and_questions/tools/final_project_dataset_unix.pkl', "rb") as data_dict_file:
    data_dict = pickle.load(data_dict_file)

In [3]:
from feature_format import featureFormat, targetFeatureSplit

In [4]:
import inspect

In [11]:
#inspect.getsourcelines(featureFormat) 

In [12]:
#inspect.getsourcelines(targetFeatureSplit) 

In [7]:
### The first entry ("poi") is the label; every entry after it is a
### predictor feature. Keep this list as-is for the mini-project; the
### final project uses a different feature list.
features_list = ["poi", "salary"]

In [8]:
# build the dataset for the selected features (helper imported from feature_format)
data = featureFormat(data_dict, features_list)
# separate the label from the predictor features
# NOTE(review): presumably the label is the first column ("poi") — confirm in targetFeatureSplit
labels, features = targetFeatureSplit(data)

### #1 First Overfit POI Identifier

* Create a decision tree classifier (just use the default parameters), 
* train it on all the data (you will fix this in the next part!), 
* and print out the accuracy. 

In [18]:
# import decision tree algorithm for classification
from sklearn.tree import DecisionTreeClassifier 

In [154]:
# instantiate the classifier with default parameters; only the seed is pinned
# so that re-running the notebook reproduces the same fitted tree
clf = DecisionTreeClassifier(random_state=42)
# train the classifier with all of the data (deliberately overfit)
clf.fit(features, labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

### #2 THIS IS AN OVERFIT TREE, DO NOT TRUST THIS NUMBER! Nonetheless, what’s the accuracy?

In [155]:
# score the tree on the very same data it was trained on
clf.score(features, labels)

0.9894736842105263

Yet another case where testing on the training data would make you think you were doing amazingly well, but as you already know, that's exactly what holdout test data is for...

### #3 Deploying a Training/Testing Regime

Now you’ll add in training and testing, so that you get a trustworthy accuracy number. 

* Use the train_test_split validation available in sklearn.model_selection; 
* hold out 30% of the data for testing and set the random_state parameter to 42 (random_state controls which points go into the training set and which are used for testing). 

### #4 What’s your updated accuracy?

In [156]:
from sklearn.model_selection import train_test_split

In [157]:
# hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [158]:
# train the classifier with the training data only
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [159]:
# predict class labels (not class probabilities) for the held-out test set,
# so the result can be compared directly against y_test
pred = clf.predict(X_test)

In [160]:
from sklearn.metrics import accuracy_score

In [161]:
# score the classifier on the held-out test split
clf.score(X_test, y_test)

0.7241379310344828

Aaaand the testing data brings us back down to earth after that 99% accuracy.

#### #5 Using cross_val_score

In [165]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation over the full dataset; display the mean fold score
scores = cross_val_score(estimator=clf, X=features, y=labels, cv=5)
scores.mean()

0.6757309941520468

#### #6 Using ShuffleSplit cv

In [167]:
from sklearn.model_selection import ShuffleSplit
# five random train/test partitions with 30% held out each time, seeded for reproducibility
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.3,
    random_state=0,
)
scores = cross_val_score(clf, features, labels, cv=cv)
scores.mean()

0.6827586206896552

### #7 Data Transformation with held out data

In [169]:
from sklearn import preprocessing

# learn the scaling statistics from the training split only
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
# apply the training-derived scaling to both splits
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)
# refit the existing classifier on the scaled training features
clf.fit(X_train_transformed, y_train)
# score on the scaled held-out split
clf.score(X_test_transformed, y_test)

0.7241379310344828

In [174]:
# inspect the hyperparameters the current classifier was configured with
clf.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 42,
 'splitter': 'best'}

### #8 Using Stratified k-fold validation

In [181]:
from sklearn.model_selection import StratifiedKFold
# random_state only matters when shuffle=True; with the default shuffle=False
# it is ignored (and newer scikit-learn releases reject the combination), so
# it is left out. The resulting folds are the same deterministic, unshuffled ones.
skf = StratifiedKFold(n_splits=3)
# stratified 3-fold cross-validation; display the mean fold score
scores = cross_val_score(clf, features, labels, cv=skf)
scores.mean()

0.726478494623656

In [183]:
from sklearn.model_selection import GridSearchCV
# set the parameters
# Each hyperparameter must appear exactly once: a repeated key in a dict
# literal silently keeps only its last value, so a second 'min_samples_split'
# entry would replace the first instead of extending the grid.
# TODO: if larger split thresholds (e.g. 20-50) or 'min_samples_leaf' should
# be searched as well, add them here explicitly.
parameters = {'criterion': ('gini', 'entropy'),
              'max_depth': [3, 5, 6, 10],
              'min_samples_split': [2, 4, 6, 8]}
# instantiate the tree
tree = DecisionTreeClassifier(random_state=42)
# instantiate the grid search with cross validation
gr_src = GridSearchCV(tree, parameters, cv=skf)

In [184]:
# run the grid search: fit the tree for each parameter combination using the cv splitter
gr_src.fit(features, labels)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ('gini', 'entropy'), 'max_depth': [3, 5, 6, 10], 'min_samples_split': [2, 4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [185]:
# parameter combination selected by the grid search
gr_src.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 4}

In [186]:
# cross-validated score achieved by the selected parameter combination
gr_src.best_score_

0.7684210526315789