In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = [
    'poi', 'salary', 'bonus', 'long_term_incentive', 'deferred_income',
    'deferral_payments', 'loan_advances', 'other', 'expenses', 'director_fees',
    'total_payments', 'exercised_stock_options', 'restricted_stock',
    'restricted_stock_deferred', 'total_stock_value',
    'from_poi_to_this_person', 'shared_receipt_with_poi', 'to_messages',
    'from_this_person_to_poi', 'from_messages'
]


In [3]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    enron_data = pickle.load(data_file)

enron_data = pd.DataFrame.from_dict(enron_data)
enron_data = enron_data.T
enron_data = enron_data[features_list]
#Data Cleaning
enron_data.replace(to_replace='NaN', value=np.nan, inplace=True)
#enron_data.count().sort_values()
enron_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, METTS MARK to GLISAN JR BEN F
Data columns (total 20 columns):
poi                          146 non-null bool
salary                       95 non-null float64
bonus                        82 non-null float64
long_term_incentive          66 non-null float64
deferred_income              49 non-null float64
deferral_payments            39 non-null float64
loan_advances                4 non-null float64
other                        93 non-null float64
expenses                     95 non-null float64
director_fees                17 non-null float64
total_payments               125 non-null float64
exercised_stock_options      102 non-null float64
restricted_stock             110 non-null float64
restricted_stock_deferred    18 non-null float64
total_stock_value            126 non-null float64
from_poi_to_this_person      86 non-null float64
shared_receipt_with_poi      86 non-null float64
to_messages                  86 non-null floa

In [4]:
temp_frame = enron_data.drop(["poi"], axis=1)
temp_frame[temp_frame.isnull().all(axis=1)]

#LOCKHART EUGENE E has all values NaN, so we remove him
enron_data = enron_data.drop(["LOCKHART EUGENE E"], axis=0)

print ("Number of Data Points (People):", len(enron_data['bonus']))
print ("Number of features: ", enron_data.shape[1])
type(enron_data)

Number of Data Points (People): 145
Number of features:  20


pandas.core.frame.DataFrame

In [5]:
#Printing no. of Poi and Non-poi
poi_nonpoi = enron_data.poi.value_counts()
print (poi_nonpoi)
type(enron_data)
print ("Amount of NaN values in the dataset: ", enron_data.isnull().sum().sum())

False    127
True      18
Name: poi, dtype: int64
Amount of NaN values in the dataset:  1304


In [21]:
#According to the financial data from FindLaw, NaN values represent values of 0 but not the missing value. 
#Replace all NaNs with 0.
f1 = ['poi', 'salary', 'bonus', 'long_term_incentive', 'deferred_income',
    'deferral_payments', 'loan_advances', 'other', 'expenses', 'director_fees',
    'total_payments', 'exercised_stock_options', 'restricted_stock',
    'restricted_stock_deferred', 'total_stock_value']
df1 = enron_data[f1]
df1.fillna(value = 0, inplace = True)

#NaN values in email features mean the information is missing. 
#Impute the missing values with median of each class.

f2 = ['from_poi_to_this_person', 'shared_receipt_with_poi', 'to_messages',
    'from_this_person_to_poi', 'from_messages']
df2 = enron_data[f2]
from sklearn.impute import SimpleImputer
#imp = SimpleImputer(missing_values=np.nan, strategy='median')
#imp.fit_transform(df2)

imp = SimpleImputer(missing_values=np.nan, strategy='median', copy=False)
imp.fit_transform(df2)

result = pd.concat([df1, df2], axis=1, sort=False)
enron_data = result
enron_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,poi,salary,bonus,long_term_incentive,deferred_income,deferral_payments,loan_advances,other,expenses,director_fees,total_payments,exercised_stock_options,restricted_stock,restricted_stock_deferred,total_stock_value,from_poi_to_this_person,shared_receipt_with_poi,to_messages,from_this_person_to_poi,from_messages
METTS MARK,False,365788.0,600000.0,0.0,0.0,0.0,0.0,1740.0,94299.0,0.0,1061827.0,0.0,585062.0,0.0,585062.0,38.0,702.0,807.0,1.0,29.0
BAXTER JOHN C,False,267102.0,1200000.0,1586055.0,-1386055.0,1295738.0,0.0,2660303.0,11200.0,0.0,5634343.0,6680544.0,3942714.0,0.0,10623258.0,26.5,594.0,944.0,6.0,41.0
ELLIOTT STEVEN,False,170941.0,350000.0,0.0,-400729.0,0.0,0.0,12961.0,78552.0,0.0,211725.0,4890344.0,1788391.0,0.0,6678735.0,26.5,594.0,944.0,6.0,41.0
CORDES WILLIAM R,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,651850.0,386335.0,0.0,1038185.0,10.0,58.0,764.0,0.0,12.0
HANNON KEVIN P,True,243293.0,1500000.0,1617011.0,-3117011.0,0.0,0.0,11350.0,34039.0,0.0,288682.0,5538001.0,853064.0,0.0,6391065.0,32.0,1035.0,1045.0,21.0,32.0


In [22]:
### Task 2: Remove outliers


In [8]:
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
#my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html


NameError: name 'my_dataset' is not defined

In [8]:
# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html


In [14]:
# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

#dump_classifier_and_data(clf, my_dataset, features_list)