# IMPORTS AND OPTIONS

In [8]:
#ORIGINAL CODE, IMPORTS
import sys
import pickle
import itertools
from time import time

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data


#My imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.stats import pearsonr
import math

from collections import defaultdict

import mpld3
from mpld3 import plugins

from IPython.display import display, HTML

#Plot all figures within the html page
%pylab inline

mpld3.enable_notebook()

#Make display better, like printing dataframes as tables
pd.set_option('display.notebook_repr_html', True)

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.cross_validation  import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn import svm, grid_search
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Imputer, MinMaxScaler


# HELPER FUNCTIONS

A function that takes a Pandas dataframe, and strategies about how to deal with missing values and if scaling the input data is desired. It returns numpy arrays of the features (X) and labels (y)
Possible options for imputing missing data are:<br>
 - 'impute_mean': replaces missing values with the mean of the feature<br>
 - 'impute_media': uses the median value. Can be better in case of large valued outliers<br>
 - 'fill_zeros': replaces them with zeros
 
The only supported value for scaling is 'min_max'. Basically scales the feature between zero and one. Any other value passed will not raise an error, only the function will ignore scaling and will return the data as is.

In [3]:
def preprocess_df_for_ML(df, missing_strategy, scaler_type):
    #Build X and y
    features_list = list(df.columns.values)
    features_list.remove('poi')
    X = df[features_list].values
    y = df['poi'].values
    
    #Handling Missing Data
    #Should we use sklearn imputer
    if missing_strategy == 'impute_mean':
        imputer = Imputer(strategy="mean")
        X = imputer.fit_transform(X)
    elif missing_strategy == 'impute_median':
        imputer = Imputer(strategy = "median")
        X = imputer.fit_transform(X)
    elif missing_strategy == 'fill_zeros':
        #Fill NaN with zeros
        df.fillna(0)
        X = df[features_list].values
    
    #Scaling
    if scaler_type == 'min_max':
        min_max_scaler = MinMaxScaler()
        X = min_max_scaler.fit_transform(X)
        
    return [X, y]

This function is a reimplementation of GridSearchCV. I did that because I wanted to see intermediate results while looking for the best parameters. Later on I found out that I could have found what I wanted by using the verbose parameter, but it was too late, so I continued using my own function since I undertsood much more its internals. So it was not an attempt to reinvent the wheel, it was more not knowing that a wheel already existed.

The function will take a set of features, and will generate all combinations of these different features, from a minimum length of the set of features all the way in using all the features together for training the estimator. So for example if we pass it a list of 5 features [1, 2, 3, 4, 5] and a minimum number of 3, the function will start by training the estimator over features [1, 2, 3], then [1, 2, 4], ... [2, 4, 5]... [1, 3, 4, 5]......... [1, 2, 3, 4, 5]. 

WARNING: This function is exhaustive, and it can take a really long time to run.

The function goes one extra step though, that it will test the data immediately over the estimator if its performance was deemed acceptable. This acceptable criteria is judged by its performance during the cross validation: if it was able to identify both true positives and true negatives for POIs for a majority of the iteration, then it is considered as a good indicator, regardless of the actual scores. This came after being hit-hard by the problem that the estimators at first had good scores, but this was because its bias towards Non-POIs. Because the imbalance in numbers of samples, a biased estimator for non-POI gave an overall good score, and there was no way for me to know that using GridSearchCV.

The function takes the following argument:<br>
 - param_dict: dictionary of different parameters to try. Same as param_grid in GridSearchCV.
 - estimator: just pass the constructor of the estimator we wish to use.
 - training_data, training_labels, testing_data, testing_labels: The split of the features and labels to be used in training and testing.
 - features_list, minimum_number_of_features_to_use: Used to generate the different combinations of features to use for training the estimator
 - poi_min_acceptable_precision, poi_min_acceptable_recall: The part where these parameters are used is commented. But these control if we should display the result of a certain estimator if its performance for identifying POIs is greater than these values.

In [19]:
#This function is really brutal, so use it with care. Especially with the number of features to try, it can take forever.
def get_high_scores(param_dict, 
                    estimator, 
                    trainin_data, 
                    training_labels,
                    testing_data,
                    testing_labels,
                    features_list, 
                    minimum_number_of_features_to_use,#Must be less than the length of the features_list
                    poi_min_acceptable_precision = 0.3, 
                    poi_min_acceptable_recall = 0.3):
    
    high_scoring_features_count = defaultdict(int)
    
    t0 = time()
    parameters_list = sorted(param_dict)
    params_combination_list = [dict(zip(parameters_list, value)) for value in itertools.product(*(param_dict[param] for param in parameters_list))]
    #params_combination_list = [ [ {parameters_list: value} for parameters_list, value in zip(parameters_list, value) ] for value in itertools.product(*(param_dict[param] for param in parameters_list))]
    
    best_score = 0.0
    target_names = ['NON POI', 'POI']
    
    #Loop over all different combinations of estimator's parameters
    for current_params in params_combination_list:
        #Loop over all different combinations of featues        
        indexes = range(0, len(features_list)) #Indexes used to point to the features within X
                    
        #Loop over the ranges of number of features per combination
        for i in range(minimum_number_of_features_to_use, len(features_list) + 1):
            
            
            #Build the current combination of features
            features_to_use = list(itertools.combinations(indexes, i))
            
            
            for current_combination in features_to_use:
                params_positive_score_count = 0
                
                #print current_params
                clf = estimator()
                clf.set_params(**current_params) #Unpack the dictionary, it wouldn't work otherwise
                #print clf
                #print **current_params
                #Test split
                skf = StratifiedKFold(training_labels, n_folds = 7)
                
                #A string to hold all crossfold training result. This string will only be displayed in case we find a good result.
                result_str = "__________________________________________________________________________________________________\n"
                
                #Loop over all different validation sets
                for train_index, test_index in skf:
                    
                    #Fit the classifier over the current training set
                    current_training_set = trainin_data[[ [x] for x in train_index ], current_combination]
                    clf.fit( current_training_set, training_labels[train_index])
                    
                    #predict the current test set
                    current_testing_set = trainin_data[[ [x] for x in test_index ], current_combination]
                    prediction_result = clf.predict(current_testing_set)
                    
                    cm = confusion_matrix(training_labels[test_index], prediction_result) #Confusion matrix
                    
                    result_str = result_str + (classification_report(training_labels[test_index], prediction_result, target_names=target_names))
                    result_str = result_str + "------------------------------------------------------------------------------------------------\n"
                    
                    #Make sure that we have all possibilities, false positives and negatives, true positives and negatives. A realistic model
                    #if cm[0][0] != 0 and cm[0][1] != 0 and cm[1][0] != 0 and cm[1][1] != 0:
                    if cm[0][0] != 0 and cm[1][1] != 0:
                        params_positive_score_count += 1
                        #UNCOMMENT THIS PART IF YOU WANT TO MAKE USE OF THE LAST PASSED PARAMETERS.  IT CAN BE USEFUL AT FIRST WHEN TRYING
                        #TO BUILD AND INTUITION ABOUT THE DATA, I DID USE IT. BUT AFTER A WHILE, THE AMOUNT OF TEXT GENERATED BECOMES
                        #OVERWHELMING, SO COMMENTING IT CAN BE BETTER
                        
                        #if params_positive_score_count > 3:
                        #    print "__________________________________________________________________________________________________"
                        #    print params_positive_score_count
                        #print features_to_use[current_combination]
                        #print current_combination
                        #print current_params
                        #print (classification_report(training_labels[test_index], prediction_result, target_names=target_names))
                        
                        
                        #non_poi_precision = float(cm[0][0])/(cm[0][0] + cm[1][0] )
                        #non_poi_recall = float(cm[0][0])/(cm[0][0] + cm[0][1] )

                        #poi_precision = float(cm[1][1])/(cm[1][1] + cm[0][1] )
                        #poi_recall = float(cm[1][1])/(cm[1][1] + cm[1][0] )
                        
                        

                        #if poi_precision >= poi_min_acceptable_precision and poi_recall >= poi_min_acceptable_recall:
                            #print "------------------------------------------------------------------------------------------------"
                            #print "\t\t\tHIGH SCORE FOUND"
                            #print current_params
                            #print "PARAMETERS:", current_params
                            #print "CONFUSION MATRIX:\n", cm
                            #print (classification_report(training_labels[test_index], prediction_result, target_names=target_names))
                            #print "------------------------------------------------------------------------------------------------"
                            
                #After doing the cross validation, if the model was promising, ie showed more than 4 good cofusion matrix,
                #fit and check results over the testing set.
                #Train over test data
                test_prediction_result = clf.predict(testing_data[:, current_combination])
                cm = confusion_matrix(testing_labels, test_prediction_result) #Confusion matrix
                if cm[0][0] != 0 and cm[1][1] != 0 and params_positive_score_count > 5:
                    #Do the testing of over the testing set
                    print result_str
                    print "########################################################################################################"
                    print "##########                          GOOD SCORES FOUND FOR TESTING                            ###########"
                    print "########################################################################################################"
                    print "Number of positive iterations in training:", params_positive_score_count
                    print current_combination
                    high_scoring_features_count[current_combination] += 1
                    print current_params
                    print (classification_report(testing_labels, test_prediction_result, target_names=target_names))
                    print cm
                    print "########################################################################################################"
                    
    print "Function ran in %0.1fs" % (time() - t0)
    print high_scoring_features_count #How many times a specific combination of features appeared to perform well

# GET AND PREPROCESS DATA

In [5]:
#Load the data from pickled dictionary and transform it into a dataframe
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )
enron_df = pd.DataFrame.from_dict(data_dict, orient = 'index')
enron_df = enron_df.reset_index()
enron_df = enron_df.rename(columns = {'index':'name'} )

#categorize features
numerical_columns = ['salary', 'to_messages', 'deferral_payments', 'total_stock_value', 'exercised_stock_options', 'bonus', \
                     'restricted_stock', 'shared_receipt_with_poi', 'loan_advances', 'from_messages', 'other', \
                     'from_this_person_to_poi', 'restricted_stock_deferred', 'director_fees', 'deferred_income', \
                     'long_term_incentive', 'from_poi_to_this_person', 'total_payments', 'expenses' ]

#Typecast features into their correct datatypes
enron_df[numerical_columns] = enron_df[numerical_columns].astype(float)
enron_df['poi'] = enron_df['poi'].astype(bool)

#remove outliers and totally corrupt entries
enron_df = enron_df[enron_df.name != 'TOTAL']
enron_df = enron_df[enron_df.name != 'THE TRAVEL AGENCY IN THE PARK']
enron_df = enron_df[enron_df.name != 'LOCKHART EUGENE E']

#Reset index. The second line deletes the original index. For some reason, pandas does not remove the old index automatically upon reset,
#it needs to be done manually
enron_df = enron_df.reset_index()
del enron_df['index']

financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred', 
                      'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other', 
                      'long_term_incentive', 'restricted_stock', 'director_fees']

email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
                 'shared_receipt_with_poi']

email_numerical_features = ['to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']

# TO DROP THE SPARSE FEATURES, PREPARE A LIST OF THEM
features_to_remove = ['restricted_stock_deferred', 'director_fees', 'loan_advances']

all_num_features = email_numerical_features + financial_features

#ALL df
enron_full_features_df = enron_df[list(all_num_features) + ['poi']]

#Add my features
enron_full_features_df['lti_ratio'] = enron_df['long_term_incentive']/enron_df['total_payments']
#('from_this_person_to_poi', 'from_messages')
enron_full_features_df['sent_to_poi_ratio'] = enron_df['from_this_person_to_poi']/enron_df['from_messages']


enron_full_features_df = enron_full_features_df.drop(features_to_remove, axis=1)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
#Preprocess for sklearn
X_and_y = preprocess_df_for_ML(enron_full_features_df, 'impute_median', 'min_max')
#All features, imputed and scaled
X = X_and_y[0]
#Labels, as a numpy array
y = X_and_y[1]

# MACHINE LEARNING

## Split data into training and testing sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42, stratify=y)

## Features to use for training

In [11]:
features_to_use = ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary', 'exercised_stock_options',
                 'total_payments', 'long_term_incentive', 'lti_ratio', 'sent_to_poi_ratio', 'other']

## Logistic Regression

In [12]:
logistic_regression_grid_param = {'penalty' : ['l1'],# ('l1', 'l2'),
                                  'dual' : [False],
                                  'C': [10, 100, 1000], 
                                  'solver' : ['liblinear'],
                                  'class_weight': [{0: 1, 1: poi_weight} for poi_weight in [5,6,7] ]   , # in range(1,7)],
                                  'n_jobs' : [-1]
                                 }

In [20]:
get_high_scores(logistic_regression_grid_param,#param_dict, 
                LogisticRegression,#estimator, 
                X_train,#trainin_data, 
                y_train,#training_labels, 
                X_test,
                y_test,
                features_to_use,#features_list, 
                8,#minimum_number_of_features_to_use,#Must be less than the length of the features_list
                0.6,#poi_min_acceptable_precision, 
                0.7)#poi_min_acceptable_recall)

__________________________________________________________________________________________________
             precision    recall  f1-score   support

    NON POI       1.00      0.77      0.87        13
        POI       0.40      1.00      0.57         2

avg / total       0.92      0.80      0.83        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.87      1.00      0.93        13
        POI       0.00      0.00      0.00         2

avg / total       0.75      0.87      0.80        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       1.00      0.92      0.96        13
        POI       0.67      1.00      0.80         2

avg / total       0.96      0.93      0.94        15
-----------------------------------------------------------------

## Support Vector Machines, non-Poly kernels

In [21]:
svm_grid_params = {'kernel':('linear', 'rbf', 'sigmoid'), 
                   'C': [0.0001, 0.01, 0.1, 1, 10, 100, 1000], 
                   'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                   'class_weight': [{0: 1, 1: poi_weight} for poi_weight in range(1,7)]
                 }

In [22]:
get_high_scores(svm_grid_params,#param_dict, 
                svm.SVC,#estimator, 
                X_train,#trainin_data, 
                y_train,#training_labels, 
                X_test,
                y_test,
                features_to_use,#features_list, 
                len(features_to_use),#minimum_number_of_features_to_use,#Must be less than the length of the features_list
                0.6,#poi_min_acceptable_precision, 
                0.7)#poi_min_acceptable_recall)

__________________________________________________________________________________________________
             precision    recall  f1-score   support

    NON POI       1.00      0.54      0.70        13
        POI       0.25      1.00      0.40         2

avg / total       0.90      0.60      0.66        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.85      0.88        13
        POI       0.33      0.50      0.40         2

avg / total       0.84      0.80      0.82        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       1.00      0.85      0.92        13
        POI       0.50      1.00      0.67         2

avg / total       0.93      0.87      0.88        15
-----------------------------------------------------------------

## Support Vector Machines, Poly Kernel

In [23]:
svm_grid_params = {'kernel':['poly'],
                   'degree':[2,3],
                   'C': [0.0001, 0.01, 0.1, 1, 10, 100, 1000],
                   'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                   'class_weight': [{0: 1, 1: poi_weight} for poi_weight in range(1,7)]
                 }

In [24]:
get_high_scores(svm_grid_params,#param_dict, 
                svm.SVC,#estimator, 
                X_train,#trainin_data, 
                y_train,#training_labels, 
                X_test,
                y_test,
                features_to_use,#features_list, 
                len(features_to_use),#minimum_number_of_features_to_use,#Must be less than the length of the features_list
                0.6,#poi_min_acceptable_precision, 
                0.7)#poi_min_acceptable_recall)

__________________________________________________________________________________________________
             precision    recall  f1-score   support

    NON POI       0.89      0.62      0.73        13
        POI       0.17      0.50      0.25         2

avg / total       0.79      0.60      0.66        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.92      0.92        13
        POI       0.50      0.50      0.50         2

avg / total       0.87      0.87      0.87        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.92      0.92        13
        POI       0.50      0.50      0.50         2

avg / total       0.87      0.87      0.87        15
-----------------------------------------------------------------

## k Nearest Neighbours

In [25]:
knn_grid_params = {'n_neighbors' : range(1,31,2),#Only odd numbers
                   'metric': ['euclidean'], #minkowski p=1 is manhatten, p=2 is eucledian
                   'n_jobs' : [-1],
                   'weights': ['distance'] #range(1,7)]
                  }

In [26]:
get_high_scores(knn_grid_params,#param_dict, 
                KNeighborsClassifier,#estimator, 
                X_train,#trainin_data, 
                y_train,#training_labels, 
                X_test,
                y_test,
                features_to_use,#features_list, 
                4,#minimum_number_of_features_to_use,#Must be less than the length of the features_list
                0.6,#poi_min_acceptable_precision, 
                0.7)#poi_min_acceptable_recall)

__________________________________________________________________________________________________
             precision    recall  f1-score   support

    NON POI       0.86      0.92      0.89        13
        POI       0.00      0.00      0.00         2

avg / total       0.74      0.80      0.77        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.92      0.92        13
        POI       0.50      0.50      0.50         2

avg / total       0.87      0.87      0.87        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.92      0.92        13
        POI       0.50      0.50      0.50         2

avg / total       0.87      0.87      0.87        15
-----------------------------------------------------------------

## Decision Trees

In [29]:
decision_tree_grid_param = {'criterion' : ['gini'], #Discard entropy, since these are just numerical data, not classes.
                            'max_depth' : [None], #1, 2, 3, 4, 5, 6, 7, 8, 9],
                            'max_features' : [None, 'sqrt', 'log2'] ,
                            'presort' : [True], #this is a small dataset, so let's speed things up,
                            'class_weight': [{0: 1, 1: poi_weight} for poi_weight in [5, 6, 7]]#range(1,7)]
                           }

In [28]:
get_high_scores(decision_tree_grid_param,#param_dict, 
                DecisionTreeClassifier,#estimator, 
                X_train,#trainin_data, 
                y_train,#training_labels, 
                X_test,
                y_test,
                features_to_use,#features_list, 
                4,#minimum_number_of_features_to_use,#Must be less than the length of the features_list
                0.6,#poi_min_acceptable_precision, 
                0.7)#poi_min_acceptable_recall)

__________________________________________________________________________________________________
             precision    recall  f1-score   support

    NON POI       0.91      0.77      0.83        13
        POI       0.25      0.50      0.33         2

avg / total       0.82      0.73      0.77        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.93      1.00      0.96        13
        POI       1.00      0.50      0.67         2

avg / total       0.94      0.93      0.92        15
------------------------------------------------------------------------------------------------
             precision    recall  f1-score   support

    NON POI       0.92      0.85      0.88        13
        POI       0.33      0.50      0.40         2

avg / total       0.84      0.80      0.82        15
-----------------------------------------------------------------

______________________________

# OLDER WORK

By here ends the work needed to explain my final conclusions for my submitted work. But of course this was not all the work, and how I came up with these was after trying a lot of things, a lot of them are failed attempts. If I would include them, the work will be very cluttered, so I have decided to include them at the end just as a reference, but they are not needed at all.

Very Important: This is not everything, not even half of it. Also, I have not cleaned what is coming in, so the code is not well commented. The amount of experimentation I tried is simply overwhelming to be included in a single file, and unnecessary as well to see how I came up with my conclusions. I do have everything I have tried in seperate notebooks (like 5 or 6 notebooks), in case that Udacity wants to take a look, but it is simply too much work to clean them to become presentable.

## Helper Functions

In [30]:
def score_by_order(fsl):
    max_score = len(fsl)
    scores = {}
    for item in fsl:
        scores[item] = max_score - fsl.index(item)
        
    return scores

def add_dicos(dic1, dic2):
    result = {}
    for key, val in dic2.iteritems():
        
        if key in dic1.keys():
            result[key] = dic1[key] + val
        else:
            result[key] = val
            
    return result

# Data Exploration

There were lots of graphs, but considering how huge is this file already, I have decided to just include the summary of the data as a whole, POI data and non-POI data only.

In [60]:
enron_df.describe()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,director_fees,deferred_income,long_term_incentive,from_poi_to_this_person
count,94.0,86.0,38.0,123.0,101.0,81.0,109.0,86.0,17.0,125.0,94.0,3.0,86.0,91.0,86.0,16.0,48.0,65.0,86.0
mean,284087.5,2073.860465,841602.5,2641806.0,2959559.0,1201773.0,1147424.0,1176.465116,621892.8,3352073.0,54192.010638,27975000.0,608.790698,466410.5,41.232558,89822.875,-581049.8,746491.2,64.895349
std,177131.1,2582.700981,1289323.0,9524694.0,5499450.0,1441679.0,2249770.0,1178.317641,3845528.0,6532883.0,46108.377454,46382560.0,1841.033949,1397376.0,100.073111,41112.700735,942076.4,862917.4,86.979244
min,477.0,57.0,-102500.0,148.0,3285.0,70000.0,-2604490.0,2.0,-1787380.0,-44093.0,148.0,400000.0,12.0,2.0,0.0,3285.0,-3504386.0,69223.0,0.0
25%,211802.0,541.25,79644.5,396934.0,506765.0,425000.0,252055.0,249.75,-329825.0,494136.0,22479.0,1200000.0,22.75,1203.0,1.0,83674.5,-611209.2,275000.0,10.0
50%,258741.0,1211.0,221063.5,1101393.0,1297049.0,750000.0,441096.0,740.5,-140264.0,1095040.0,46547.5,2000000.0,41.0,51587.0,8.0,106164.5,-151927.0,422158.0,35.0
75%,308606.5,2634.75,867211.2,2087530.0,2542813.0,1200000.0,985032.0,1888.25,-72419.0,2606763.0,78408.5,41762500.0,145.5,331983.0,24.75,112815.0,-37926.0,831809.0,72.25
max,1111258.0,15149.0,6426990.0,103559800.0,34348380.0,8000000.0,14761690.0,5521.0,15456290.0,49110080.0,228763.0,81525000.0,14368.0,10359730.0,609.0,137864.0,-833.0,5145434.0,528.0


In [61]:
enron_df[enron_df['poi'] == True].describe()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,director_fees,deferred_income,long_term_incentive,from_poi_to_this_person
count,17.0,14.0,5.0,18.0,12.0,16.0,17.0,14.0,0.0,18.0,18.0,1.0,14.0,18.0,14.0,0.0,11.0,12.0,14.0
mean,383444.9,2417.142857,519894.2,7913590.0,10463790.0,2075000.0,2318621.0,1783.0,,9165671.0,59873.833333,81525000.0,300.357143,802997.4,66.714286,,-1035313.0,1204862.0,97.785714
std,278359.7,1961.858101,912889.5,23965490.0,12382590.0,2047437.0,3620811.0,1264.996625,,13841170.0,37524.658812,,805.844574,2417568.0,158.289622,,1334972.0,991658.3,76.058862
min,158403.0,225.0,10259.0,91093.0,384728.0,200000.0,126027.0,91.0,,126027.0,16514.0,81525000.0,16.0,486.0,4.0,,-3504386.0,71023.0,13.0
25%,240189.0,1115.75,27610.0,1142396.0,1456581.0,775000.0,393818.0,1059.25,,1016450.0,31323.25,81525000.0,33.0,4979.5,12.5,,-1860244.0,368978.0,44.5
50%,278601.0,1875.0,202911.0,1754028.0,3914557.0,1275000.0,985032.0,1589.0,,2206836.0,50448.5,81525000.0,44.5,149204.0,15.5,,-262500.0,1134637.0,62.0
75%,415189.0,2969.25,214678.0,2665345.0,19386040.0,2062500.0,2502063.0,2165.25,,10511330.0,84125.0,81525000.0,101.5,260772.5,28.75,,-122031.0,1646772.0,135.75
max,1111258.0,7991.0,2144013.0,103559800.0,34348380.0,7000000.0,14761690.0,5521.0,,49110080.0,127017.0,81525000.0,3069.0,10359730.0,609.0,,-833.0,3600000.0,240.0


In [62]:
enron_df[enron_df['poi'] == False].describe()

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,director_fees,deferred_income,long_term_incentive,from_poi_to_this_person
count,77.0,72.0,33.0,105.0,89.0,65.0,92.0,72.0,17.0,107.0,76.0,2.0,72.0,73.0,72.0,16.0,37.0,53.0,72.0
mean,262151.5,2007.111111,890346.2,1738072.0,1947752.0,986824.9,931007.3,1058.527778,621892.8,2374085.0,52846.315789,1200000.0,668.763889,383416.5,36.277778,89822.875,-445998.5,642709.0,58.5
std,139231.7,2693.165955,1341381.0,2627417.0,2547068.0,1173880.0,1843451.0,1132.503757,3845528.0,3535017.0,48036.089983,1131371.0,1978.997801,1012686.0,85.13969,41112.700735,762791.4,805590.4,87.995198
min,477.0,57.0,-102500.0,148.0,3285.0,70000.0,-2604490.0,2.0,-1787380.0,-44093.0,148.0,400000.0,12.0,2.0,0.0,3285.0,-3367011.0,69223.0,0.0
25%,206121.0,513.75,85430.0,319941.0,436515.0,400000.0,211999.5,191.5,-329825.0,424684.5,18254.0,800000.0,20.5,947.0,0.0,83674.5,-575000.0,256191.0,10.0
50%,251654.0,944.0,260455.0,1057548.0,1030329.0,700000.0,413586.5,594.0,-140264.0,1030329.0,44601.0,1200000.0,41.0,12961.0,6.0,106164.5,-121284.0,375304.0,26.5
75%,288589.0,2590.75,875307.0,2014835.0,2165172.0,1000000.0,909759.0,1635.5,-72419.0,2307584.0,76764.75,1600000.0,216.5,374689.0,23.25,112815.0,-36666.0,694862.0,61.75
max,1060932.0,15149.0,6426990.0,17252530.0,15364170.0,8000000.0,13847070.0,4527.0,15456290.0,23817930.0,228763.0,2000000.0,14368.0,7427621.0,411.0,137864.0,-1042.0,5145434.0,528.0


## Feature Selection

Feature selection will be performed automatically, and on all data. The way features will be scored is as follows:

1- Dataset will be split-stratified into 10 folds, the same way as cross validation works

2- For each fold (ie 9 for training, the one for testing will not be used), we will use SelectKBest (k = 10) twice to compute the features score:

 2a- Once using Chi-square as a uni-variate analysis

 2b- Once using F scores

3- After each iteration, we will score the features as follows: Most important one gets a score = 10, second's score = 9, third = 8..etc.

4- We sum the scores of both metrics for all folds to get a final score

In [31]:
features_only_df = enron_full_features_df.copy()
features_only_df = features_only_df.drop('poi', 1)

stratified_kfold = StratifiedKFold(y, n_folds=10)

feature_ranking_metrics = [chi2, f_classif]

chi2_feature_dict = {}
f_classif_feature_dict = {}

for train_index, test_index in stratified_kfold:
    X_train = X_all[train_index]
    y_train = y[train_index]
    
    for ranking_metric in feature_ranking_metrics:
        temp_dict = {}
        fs = SelectKBest(ranking_metric, k=10)
        fs.fit_transform(X_train, y_train)
        
        ordered_fsl = list(features_only_df.columns[fs.get_support()]) #fsl = feature selection list
        temp_dict = score_by_order(ordered_fsl)
                
        if ranking_metric == chi2:
            chi2_feature_dict = add_dicos(chi2_feature_dict, temp_dict)
        else:
            f_classif_feature_dict = add_dicos(f_classif_feature_dict, temp_dict)
        
        
print "F-Score for Features:"
print f_classif_feature_dict
print "\nChi2 for Features:"
print chi2_feature_dict
print "\nOverall Score for Features:"
print add_dicos(f_classif_feature_dict, chi2_feature_dict)
    #X = SelectKBest(chi2, k=5).fit_transform(X_all, y)

F-Score for Features:
{'salary': 91, 'lti_ratio': 10, 'total_payments': 25, 'bonus': 73, 'total_stock_value': 53, 'shared_receipt_with_poi': 20, 'exercised_stock_options': 43, 'sent_to_poi_ratio': 10, 'deferred_income': 63, 'restricted_stock': 3}

Chi2 for Features:
{'salary': 9, 'lti_ratio': 10, 'total_payments': 24, 'bonus': 68, 'total_stock_value': 58, 'shared_receipt_with_poi': 81, 'exercised_stock_options': 48, 'sent_to_poi_ratio': 10, 'other': 12, 'long_term_incentive': 29}

Overall Score for Features:
{'salary': 100, 'lti_ratio': 20, 'total_payments': 49, 'bonus': 141, 'total_stock_value': 111, 'shared_receipt_with_poi': 101, 'exercised_stock_options': 91, 'sent_to_poi_ratio': 20, 'other': 12, 'long_term_incentive': 29}


## Correlations between different features

In [32]:
def build_all_permutations(cols):
    import itertools
    perm_list = list(itertools.permutations(cols, 2))
    ret_list = []
    for i in perm_list:
        i = ( str(i[0]) + " vs " + str(i[1]), ) + i
        ret_list.append(i)
    return ret_list

In [36]:
financial_columns = ['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock',\
                     'restricted_stock_deferred','total_stock_value', 'expenses', 'loan_advances', 'director_fees', \
                     'deferred_income', 'long_term_incentive']

email_columns = ['to_messages', 'shared_receipt_with_poi', 'from_messages', 'other', 'from_this_person_to_poi','from_poi_to_this_person' ]

In [37]:
#EMAIL FEATURES CORRELATIONS
all_email_perm = build_all_permutations(email_columns)    

#for feature in new_features_list:
for feature in all_email_perm:
    feature_poi_count = 0
    feature_non_poi_count = 0
    temp_df = pd.DataFrame(columns = (feature, 'poi'))
    
    for index, employee in enron_df.iterrows():
        if not (math.isnan(employee[feature[1]]) or math.isnan(employee[feature[2]])):
            #Avoid division by zero
            if employee[feature[2]] != 0:
                temp_df = temp_df.append( {feature : (employee[feature[1]] / employee[feature[2]]),\
                                 'poi':employee['poi']}, ignore_index=True)
                if employee['poi'] == True:
                    feature_poi_count += 1
                else:
                    feature_non_poi_count += 1

    corr_coeff = pearsonr(temp_df[feature], temp_df['poi'])[0]
    if abs(corr_coeff) > 0.1:
        print feature_poi_count, feature_non_poi_count
        print "Pearson Correlation between POI and " + str(feature[0]) + ":", corr_coeff
        #print temp_df.corr()
        print "___________________________________________________________________"

14 72
Pearson Correlation between POI and to_messages vs shared_receipt_with_poi: -0.13940372251
___________________________________________________________________
14 72
Pearson Correlation between POI and to_messages vs from_messages: 0.16792327818
___________________________________________________________________
14 52
Pearson Correlation between POI and to_messages vs from_this_person_to_poi: -0.16265432207
___________________________________________________________________
14 60
Pearson Correlation between POI and to_messages vs from_poi_to_this_person: -0.137433578028
___________________________________________________________________
14 72
Pearson Correlation between POI and shared_receipt_with_poi vs to_messages: 0.265224936669
___________________________________________________________________
14 72
Pearson Correlation between POI and shared_receipt_with_poi vs from_messages: 0.17922713095
___________________________________________________________________
14 52
Pearson Corre

In [38]:
all_perm = build_all_permutations(email_columns + financial_columns)    

#for feature in new_features_list:
for feature in all_perm:
    feature_poi_count = 0
    feature_non_poi_count = 0
    temp_df = pd.DataFrame(columns = (feature, 'poi'))
    
    for index, employee in enron_df.iterrows():
        if not (math.isnan(employee[feature[1]]) or math.isnan(employee[feature[2]])):
            #Avoid division by zero
            if employee[feature[2]] != 0:
                temp_df = temp_df.append( {feature : (employee[feature[1]] / employee[feature[2]]),\
                                 'poi':employee['poi']}, ignore_index=True)
                if employee['poi'] == True:
                    feature_poi_count += 1
                else:
                    feature_non_poi_count += 1

    if (feature_non_poi_count + feature_poi_count) > 50:
        corr_coeff = pearsonr(temp_df[feature], temp_df['poi'])[0]
        if abs(corr_coeff) > 0.10:
            print feature_poi_count, feature_non_poi_count
            print "Pearson Correlation between POI and " + str(feature[0]) + ":", corr_coeff
            #print temp_df.corr()
            print "___________________________________________________________________"
        
        
"""
Criterias:

1- Correlation greater than 20
2- Either absolutely enough points (ie more than 50% of both sides is present)
3- OR
    Over 70% of the available data points were used. For example, suppose that a parameter has only 30 points available and the
    rest is NaN. If we used up at least 21 points and a strong-ish correlation was present, this parameter would still be 
    displayed
4- Parameters and their inverse would be displayed one after another


Var1 Var2, Count1, Count2, %Count1, %Count2, Available1, Available2, %Available1, %Available2, Correlation
"""

14 72
Pearson Correlation between POI and to_messages vs shared_receipt_with_poi: -0.13940372251
___________________________________________________________________
14 72
Pearson Correlation between POI and to_messages vs from_messages: 0.16792327818
___________________________________________________________________
14 52
Pearson Correlation between POI and to_messages vs from_this_person_to_poi: -0.16265432207
___________________________________________________________________
14 60
Pearson Correlation between POI and to_messages vs from_poi_to_this_person: -0.137433578028
___________________________________________________________________
14 47
Pearson Correlation between POI and to_messages vs bonus: -0.187943736217
___________________________________________________________________
14 46
Pearson Correlation between POI and to_messages vs expenses: -0.106232261329
___________________________________________________________________
14 72
Pearson Correlation between POI and shared_re

  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):


'\nCriterias:\n\n1- Correlation greater than 20\n2- Either absolutely enough points (ie more than 50% of both sides is present)\n3- OR\n    Over 70% of the available data points were used. For example, suppose that a parameter has only 30 points available and the\n    rest is NaN. If we used up at least 21 points and a strong-ish correlation was present, this parameter would still be \n    displayed\n4- Parameters and their inverse would be displayed one after another\n\n\nVar1 Var2, Count1, Count2, %Count1, %Count2, Available1, Available2, %Available1, %Available2, Correlation\n'

## Dimensionality Reduction

Principle Component Analysis will be used to further reduce the data. The way I am thinking about it is as follows: The data that we have is divided into two main categories:<br>
1- Financial<br>
2- Email Data<br>
<br>
The financial data is further divided into two categories:<br>
A) Income<br>
B) Stocks data (Will not be used, see explanation below)<br>
<br>
So when thinking about latent variables, I will try to find the PCA of the income-part variables alone, stocks-part variables alone and email part alone, i.e. I will not try to find a PCA for all the data combined. I think doing it all combined does not make sense, even if it yields good results (I have not tried it). In that hypothetical situation that a global PCA gives a good intuition, I think this would be only due to chance, as I cannot think of a hidden logic about a relationship between email counts and the amount of stocks owned for example. 

For the stocks data, we do not have all that much variables to use from the first place, so we would try to reduce them. There are some variables that were discarded already (due to their sparcity), a variable that is already chosen to be used as is so we are left with only one variable, the restricted stock. So, no PCA here.

Also, I will discard the totals variables just to avoid its dominance, and variables that have scored high in the feature selection because I don't want redunduncy with the features I will be feeding to the estimator. These high scoring features will be given to the model as is, without any reduction in the amount of information they carry.

The last thing to mention is that I will try an exhautive combination of related variables, get their PCA and then see if their fitted data correlates well with being a POI or not. If they correlate well, these PCA will be used with the model, otherwise no PCA will be used in training the model.

In [34]:
income_features = ['deferral_payments', 'deferred_income', 'expenses', 'other', 'long_term_incentive']

email_features = ['to_messages', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
                  'shared_receipt_with_poi']

In [39]:
def features_combinations_pca_corr(feature_list, df):
    for i in range(2, len(feature_list)+1):
        all_features_combinations = list(itertools.combinations (feature_list, i))

        for current_combination in all_features_combinations:
            features_pca = PCA(n_components=len(current_combination))
            features_np_arr = get_features_as_numpy(df, current_combination)
            features_pca.fit(features_np_arr)
            transformed_features_pca = features_pca.transform(features_np_arr)

            print current_combination, i
            print features_pca.explained_variance_ratio_
            print "Correlation with POI:", pearsonr(transformed_features_pca[:,0], enron_full_features_df['poi'])[0]
            print "____________________"
        print "__________________________________________________________________"


In [41]:
def get_features_as_numpy(df, feature_list):
    extracted_df = df[list(feature_list)].copy()
    features_np_array = extracted_df.values
    mean_imputer = Imputer(strategy="mean")
    features_np_array = mean_imputer.fit_transform(features_np_array)
    return features_np_array

In [42]:
features_combinations_pca_corr(income_features, enron_full_features_df)

('deferral_payments', 'deferred_income') 2
[ 0.76388292  0.23611708]
Correlation with POI: -0.0423361435926
____________________
('deferral_payments', 'expenses') 2
[ 0.99679582  0.00320418]
Correlation with POI: 0.0517073904483
____________________
('deferral_payments', 'other') 2
[ 0.77758592  0.22241408]
Correlation with POI: -0.100967303632
____________________
('deferral_payments', 'long_term_incentive') 2
[ 0.56629226  0.43370774]
Correlation with POI: 0.0251578007385
____________________
('deferred_income', 'expenses') 2
[ 0.99530602  0.00469398]
Correlation with POI: -0.195024272169
____________________
('deferred_income', 'other') 2
[ 0.81661286  0.18338714]
Correlation with POI: -0.1247655413
____________________
('deferred_income', 'long_term_incentive') 2
[ 0.5970841  0.4029159]
Correlation with POI: 0.254405140704
____________________
('expenses', 'other') 2
[ 0.99887945  0.00112055]
Correlation with POI: -0.115218927613
____________________
('expenses', 'long_term_incenti

In [43]:
features_combinations_pca_corr(email_features, enron_full_features_df)

('to_messages', 'from_poi_to_this_person') 2
[  9.99180408e-01   8.19592106e-04]
Correlation with POI: -0.0509533912028
____________________
('to_messages', 'from_messages') 2
[ 0.77765891  0.22234109]
Correlation with POI: -0.0226813734024
____________________
('to_messages', 'from_this_person_to_poi') 2
[ 0.99898589  0.00101411]
Correlation with POI: -0.0509415317924
____________________
('to_messages', 'shared_receipt_with_poi') 2
[ 0.95819255  0.04180745]
Correlation with POI: -0.0753257592387
____________________
('from_poi_to_this_person', 'from_messages') 2
[ 0.99785071  0.00214929]
Correlation with POI: -0.0640695487359
____________________
('from_poi_to_this_person', 'from_this_person_to_poi') 2
[ 0.73110939  0.26889061]
Correlation with POI: -0.135047466465
____________________
('from_poi_to_this_person', 'shared_receipt_with_poi') 2
[ 0.99694332  0.00305668]
Correlation with POI: -0.197108235937
____________________
('from_messages', 'from_this_person_to_poi') 2
[ 0.99807693

There was not any interesting correlations to be used, but still there were a few insights for the email variables that are worth to mention. A lot of these combinations have a **very high** variance explanation of their first component. For example, 'from_poi_to_this_person', 'from_messages' and 'from_this_person_to_poi' can be compacted into a single variable and still hold 99% of the variance. I am not sure about how to interpret this, is it a latent variable or not, but for this project I am going to leave this as is and move on with my model building.

## Algorithm Selection

As my first Machine Learning project, I am tempted to survery a few algorithms, and not just pick one or two to compare. The agorithms that I am going to survery are the following:<br>
1) Support Vector Machine<br>
2) k-Nearest-Neighbors<br>
3) Ensemble Metods<br>
  3a) AdaBoost<br>
  3b) Random Forest<br>
4) Logistic Regression, with a few kernels<br>
5) Decision Tree<br>
<br>
The plan of action is as follows:<br>
1) Build a few configuartions for each algorithm, so they can be used with GridSearchCV<br>
2) For each configuration grid, I will iteratively use KBest features ascendingly and see how the algorithm scores<br>
3) Cross Validate the scores within the training set<br>
4) Measure the time it took to train the algorithm<br>
5) See which configuration scored best, and with how many features was this score<br>

In [46]:
best_features = ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary', 'exercised_stock_options',
                 'total_payments', 'long_term_incentive', 'lti_ratio', 'sent_to_poi_ratio', 'other']

### SVM

In [44]:
# SEARCH FOR EACH ESTIMATOR GRID PARAMS
svm_grid_params = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 
                   'degree':[1,2,3],
                   'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
                   'gamma': ['auto', 0.001, 0.0001],
                   'class_weight': [{0: 1, 1: poi_weight} for poi_weight in range(1,7)]
                 }

svm_estimator = svm.SVC()

svm_grid_clf = grid_search.GridSearchCV(svm_estimator, 
                                        svm_grid_params, cv=10)

In [47]:
# CROSS VALIDATE OVER ALL PARAMETERS AND FEATURES
svm_grids = []
for i in range(0, len(best_features) ):
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    svm_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", svm_grid_clf.best_score_ 
    print "Best Parameters: ", svm_grid_clf.best_params_
    svm_grids.append(svm_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 37.0s
Best Score:  0.869230769231
Best Parameters:  {'kernel': 'linear', 'C': 0.001, 'gamma': 'auto', 'degree': 1, 'class_weight': {0: 1, 1: 1}}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 39.5s
Best Score:  0.869230769231
Best Parameters:  {'kernel': 'linear', 'C': 0.001, 'gamma': 'auto', 'degree': 1, 'class_weight': {0: 1, 1: 1}}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 40.5s
Best Score:  0.869230769231
Best Parameters:  {'kernel': 'linear', 'C': 0.001, 'gamma': 'auto', 'degree': 1, 'class_weight': {0: 1, 1: 1}}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 35.6s
Best Score:  0.869230769231
Best Parameters:  {'kernel': 'linear', 'C': 0.001, 'gamma': 'auto', 'degree'

### kNN

In [48]:
knn_grid_params = {'n_neighbors' : range(1,30, 2),
                   'metric': ('euclidean', 'manhattan', 'chebyshev'), #minkowski p=1 is manhatten, p=2 is eucledian
                   'n_jobs' : [-1]
                  }

knn_estimator = KNeighborsClassifier()

knn_grid_clf = grid_search.GridSearchCV(knn_estimator, 
                                        knn_grid_params, cv=10)

In [49]:
knn_grids = []
for i in range(0, len(best_features) ):
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    knn_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", knn_grid_clf.best_score_ 
    print "Best Parameters: ", knn_grid_clf.best_params_
    knn_grids.append(knn_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 54.7s
Best Score:  0.869230769231
Best Parameters:  {'n_neighbors': 7, 'metric': 'euclidean', 'n_jobs': -1}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 54.5s
Best Score:  0.869230769231
Best Parameters:  {'n_neighbors': 9, 'metric': 'euclidean', 'n_jobs': -1}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 54.4s
Best Score:  0.869230769231
Best Parameters:  {'n_neighbors': 9, 'metric': 'euclidean', 'n_jobs': -1}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 54.4s
Best Score:  0.869230769231
Best Parameters:  {'n_neighbors': 11, 'metric': 'euclidean', 'n_jobs': -1}
_______________________________________________________
5 best feature(s): ['bonus', 'total_stock_value', 'share

### Ensembles Methods

#### Ada-Boost

In [50]:
ada_boost_grid_params = {#'base_estimator' : ['DecisionTreeClassifier', 'BernoulliNB', 'Perceptron'],
                         'n_estimators' : range(10,110,10),
                         'learning_rate' : [0.1, 0.5, 1.0]
                        }

ada_boost_estimator = AdaBoostClassifier()

ada_boost_grid_clf = grid_search.GridSearchCV(ada_boost_estimator, 
                                              ada_boost_grid_params, cv=10)

In [51]:
ada_boost_grids = []
for i in range(0, len(best_features) ):
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    ada_boost_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", ada_boost_grid_clf.best_score_ 
    print "Best Parameters: ", ada_boost_grid_clf.best_params_
    ada_boost_grids.append(ada_boost_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 29.2s
Best Score:  0.869230769231
Best Parameters:  {'n_estimators': 10, 'learning_rate': 0.1}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 25.9s
Best Score:  0.869230769231
Best Parameters:  {'n_estimators': 10, 'learning_rate': 0.1}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 25.8s
Best Score:  0.869230769231
Best Parameters:  {'n_estimators': 10, 'learning_rate': 0.1}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 26.0s
Best Score:  0.876923076923
Best Parameters:  {'n_estimators': 40, 'learning_rate': 0.1}
_______________________________________________________
5 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary', 'exercised_stock_optio

#### Random Forests

In [52]:
random_forest_grid_params = {'criterion' : ['gini'],
                             'n_estimators' : range(10,100,10),
                             'n_jobs' : [-1]
                            }

random_forest_estimator = RandomForestClassifier()

random_forest_grid_clf = grid_search.GridSearchCV(random_forest_estimator, 
                                                  random_forest_grid_params, cv=10)

In [53]:
random_forest_grids = []
for i in range(0, len(best_features) ):
    random_forest_grid_params = {'criterion' : ['gini'],
                                 'n_estimators' : range(10,100,10),
                                 'n_jobs' : [-1]
                                }
    random_forest_grid_clf = grid_search.GridSearchCV(random_forest_estimator, 
                                                  random_forest_grid_params, cv=10)
    
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    random_forest_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", random_forest_grid_clf.best_score_ 
    print "Best Parameters: ", random_forest_grid_clf.best_params_
    random_forest_grids.append(random_forest_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 30.1s
Best Score:  0.8
Best Parameters:  {'n_estimators': 10, 'n_jobs': -1, 'criterion': 'gini'}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 29.9s
Best Score:  0.846153846154
Best Parameters:  {'n_estimators': 10, 'n_jobs': -1, 'criterion': 'gini'}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 30.1s
Best Score:  0.846153846154
Best Parameters:  {'n_estimators': 40, 'n_jobs': -1, 'criterion': 'gini'}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 30.0s
Best Score:  0.853846153846
Best Parameters:  {'n_estimators': 70, 'n_jobs': -1, 'criterion': 'gini'}
_______________________________________________________
5 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_wi

### Loigistic Regression

#### LIBLINEAR

In [54]:
logistic_regression_grid_param = {'penalty' : ('l1', 'l2'),
                                  'dual' : [False],
                                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
                                  'solver' : ['liblinear'],
                                  'n_jobs' : [-1]
                                 }

logistic_regression_estimator = LogisticRegression()

logistic_regression_grid_clf = grid_search.GridSearchCV(logistic_regression_estimator, 
                                                        logistic_regression_grid_param,
                                                        scoring = 'f1_weighted',
                                                        cv=10)

In [55]:
logistic_regression_grids = []
for i in range(0, len(best_features) ):
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    logistic_regression_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", logistic_regression_grid_clf.best_score_ 
    print "Best Parameters: ", logistic_regression_grid_clf.best_params_
    logistic_regression_grids.append(logistic_regression_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']


  'precision', 'predicted', average, warn_for)


Trained in 0.5s
Best Score:  0.80868707658
Best Parameters:  {'penalty': 'l1', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'liblinear'}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 0.3s
Best Score:  0.80868707658
Best Parameters:  {'penalty': 'l1', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'liblinear'}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 0.5s
Best Score:  0.80868707658
Best Parameters:  {'penalty': 'l1', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'liblinear'}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 0.4s
Best Score:  0.853504759455
Best Parameters:  {'penalty': 'l1', 'C': 100, 'n_jobs': -1, 'dual': False, 'solver': 'liblinear'}
_______________________________________________

#### L2 Only Logisitic Regressors

In [56]:
l2_logistic_regression_grid_param = {'penalty' :  ['l2'],
                                  'dual' : [False],
                                  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
                                  'solver' : ('newton-cg', 'lbfgs', 'sag'),
                                  'n_jobs' : [-1]
                                 }

l2_logistic_regression_estimator = LogisticRegression()

l2_logistic_regression_grid_clf = grid_search.GridSearchCV(l2_logistic_regression_estimator, 
                                                           l2_logistic_regression_grid_param, cv=10)

In [57]:
l2_logistic_regression_grids = []
for i in range(0, len(best_features) ):
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    l2_logistic_regression_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", l2_logistic_regression_grid_clf.best_score_ 
    print "Best Parameters: ", l2_logistic_regression_grid_clf.best_params_
    l2_logistic_regression_grids.append(l2_logistic_regression_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 129.0s
Best Score:  0.869230769231
Best Parameters:  {'penalty': 'l2', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'newton-cg'}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 109.6s
Best Score:  0.869230769231
Best Parameters:  {'penalty': 'l2', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'newton-cg'}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']




Trained in 117.0s
Best Score:  0.869230769231
Best Parameters:  {'penalty': 'l2', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'newton-cg'}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 130.7s
Best Score:  0.869230769231
Best Parameters:  {'penalty': 'l2', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'newton-cg'}
_______________________________________________________
5 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary', 'exercised_stock_options']
Trained in 114.2s
Best Score:  0.869230769231
Best Parameters:  {'penalty': 'l2', 'C': 0.001, 'n_jobs': -1, 'dual': False, 'solver': 'newton-cg'}
_______________________________________________________
6 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary', 'exercised_stock_options', 'total_payments']
Trained in 114.2s
Best Score:  0.869230769231
Best Parameters

### Decision Trees

In [58]:
decision_tree_grid_param = {'criterion' : ('gini', 'entropy'),
                            'max_depth' : [None, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                            'max_features' : [None, 'sqrt', 'log2'] + range(1, len(X[0])) ,
                            'presort' : [True] #this is a small dataset, so let's speed things up,
                           }

decision_tree_estimator = DecisionTreeClassifier()

decision_tree_grid_clf = grid_search.GridSearchCV(decision_tree_estimator, 
                                                  decision_tree_grid_param, cv=10)

In [59]:
decision_tree_grids = []
for i in range(0, len(best_features) ):
    decision_tree_grid_param = {'criterion' : ['gini'],
                            'max_depth' : [None, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                            'max_features' : [None, 'sqrt', 'log2'] + range(1, len((X_train[:, 0: (i+1)][0]) ) ) ,
                            'presort' : [True] #this is a small dataset, so let's speed things up,
                           }
    decision_tree_grid_clf = grid_search.GridSearchCV(decision_tree_estimator, 
                                                  decision_tree_grid_param, cv=10)
    
    print str(i+1) + " best feature(s):", best_features[:i+1]
    t0 = time()
    decision_tree_grid_clf.fit(X_train[:, 0: (i+1)], y_train)
    print "Trained in %0.1fs" % (time() - t0) 
    print "Best Score: ", decision_tree_grid_clf.best_score_ 
    print "Best Parameters: ", decision_tree_grid_clf.best_params_
    decision_tree_grids.append(decision_tree_grid_clf)
    print "_______________________________________________________"

1 best feature(s): ['bonus']
Trained in 0.5s
Best Score:  0.876923076923
Best Parameters:  {'max_features': None, 'presort': True, 'criterion': 'gini', 'max_depth': 2}
_______________________________________________________
2 best feature(s): ['bonus', 'total_stock_value']
Trained in 0.7s
Best Score:  0.869230769231
Best Parameters:  {'max_features': None, 'presort': True, 'criterion': 'gini', 'max_depth': 1}
_______________________________________________________
3 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi']
Trained in 0.8s
Best Score:  0.869230769231
Best Parameters:  {'max_features': None, 'presort': True, 'criterion': 'gini', 'max_depth': 1}
_______________________________________________________
4 best feature(s): ['bonus', 'total_stock_value', 'shared_receipt_with_poi', 'salary']
Trained in 1.0s
Best Score:  0.869230769231
Best Parameters:  {'max_features': None, 'presort': True, 'criterion': 'gini', 'max_depth': 1}
_________________________________

There is a very interesting thing in here. For a given set of features, all algorithms more or less perform the same. So I had the idea to brute-force search for the best features combinations and find the highest scoring features. Of course I know that this is not a practical thing to do with larger datasets, but it is pretty doable on a 2 years old machine like mine. I will use the linear regression since it was the fastest one to train.