In [1]:

import sys
import pickle
import pprint
from collections import OrderedDict
import pandas as pd
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from tester import test_classifier


### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = [
    # label -- has to stay in position 0
    'poi',
    # payment and stock fields
    'salary',
    'deferral_payments',
    'total_payments',
    'loan_advances',
    'bonus',
    'restricted_stock_deferred',
    'deferred_income',
    'total_stock_value',
    'expenses',
    'exercised_stock_options',
    'other',
    'long_term_incentive',
    'restricted_stock',
    'director_fees',
    # email count fields
    'to_messages',
    'from_poi_to_this_person',
    'from_messages',
    'from_this_person_to_poi',
    'shared_receipt_with_poi',
]
# 'email_address' is deliberately left out: it is a text value (it contains
# an "@" character) and is not needed as a feature.


### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

#Converting the dataset from a python dictionary to a pandas dataframe
data_df = pd.DataFrame.from_dict(data_dict, orient='index')
data_df.shape
print data_df.head()

                    salary to_messages deferral_payments total_payments  \
ALLEN PHILLIP K     201955        2902           2869717        4484442   
BADUM JAMES P          NaN         NaN            178980         182466   
BANNANTINE JAMES M     477         566               NaN         916197   
BAXTER JOHN C       267102         NaN           1295738        5634343   
BAY FRANKLIN R      239671         NaN            260455         827696   

                   exercised_stock_options    bonus restricted_stock  \
ALLEN PHILLIP K                    1729541  4175000           126027   
BADUM JAMES P                       257817      NaN              NaN   
BANNANTINE JAMES M                 4046157      NaN          1757552   
BAXTER JOHN C                      6680544  1200000          3942714   
BAY FRANKLIN R                         NaN   400000           145796   

                   shared_receipt_with_poi restricted_stock_deferred  \
ALLEN PHILLIP K                       1407  



In [2]:
print "Take a look at the data set."
print
print "Number of individuals in dataset: ", len(data_dict)
print
print "Single example of dataset, Ken Lay: "
print
pprint.pprint(data_dict["LAY KENNETH L"], width =1)    
print

Take a look at the data set.

Number of individuals in dataset:  146

Single example of dataset, Ken Lay: 

{'bonus': 7000000,
 'deferral_payments': 202911,
 'deferred_income': -300000,
 'director_fees': 'NaN',
 'email_address': 'kenneth.lay@enron.com',
 'exercised_stock_options': 34348384,
 'expenses': 99832,
 'from_messages': 36,
 'from_poi_to_this_person': 123,
 'from_this_person_to_poi': 16,
 'loan_advances': 81525000,
 'long_term_incentive': 3600000,
 'other': 10359729,
 'poi': True,
 'restricted_stock': 14761694,
 'restricted_stock_deferred': 'NaN',
 'salary': 1072321,
 'shared_receipt_with_poi': 2411,
 'to_messages': 4273,
 'total_payments': 103559793,
 'total_stock_value': 49110078}



In [3]:
#find number of poi's in data set
print 'Persons of Interest (POI):'
count_poi = 0

for key, value in data_dict.items():
    if value['poi']:
        count_poi = count_poi + 1
        print key

print "number of persons of interest (poi): ", count_poi
print

### Task 2: Remove outliers

print 'Observed Outlier # 1, "Total"...this is not a person, it is the sum \
line of the data which lists, for example, a salary of ', \
data_dict["TOTAL"]['salary']
print
print 'Observed Outlier # 2, "THE TRAVEL AGENCY IN THE PARK"...this is not a \
person. '
print
del data_dict["TOTAL"] #remove Outlier # 1
del data_dict["THE TRAVEL AGENCY IN THE PARK"] #remove Outlier # 2

print "Number of individuals in dataset after removing the outliers: ", \
len(data_dict)
print

Persons of Interest (POI):
HANNON KEVIN P
COLWELL WESLEY
RIEKER PAULA H
KOPPER MICHAEL J
SHELBY REX
DELAINEY DAVID W
LAY KENNETH L
BOWEN JR RAYMOND M
BELDEN TIMOTHY N
FASTOW ANDREW S
CALGER CHRISTOPHER F
RICE KENNETH D
SKILLING JEFFREY K
YEAGER F SCOTT
HIRKO JOSEPH
KOENIG MARK E
CAUSEY RICHARD A
GLISAN JR BEN F
number of persons of interest (poi):  18

Observed Outlier # 1, "Total"...this is not a person, it is the sum line of the data which lists, for example, a salary of  26704229

Observed Outlier # 2, "THE TRAVEL AGENCY IN THE PARK"...this is not a person. 

Number of individuals in dataset after removing the outliers:  144



In [4]:
### Task 3: Create new feature(s)

# Create a new feature, 'pct_poi_emails': the share of each individual's
# emails (sent + received) that involved POIs.  Despite the "pct" name the
# value is a fraction (e.g. 0.03), not a number scaled to 0-100.
email_count_fields = ('from_messages', 'to_messages',
                      'from_poi_to_this_person', 'from_this_person_to_poi')

for person_record in data_dict.values():
    # Fallback used when any count is missing or the person has no emails.
    pct_poi_emails = 0.0
    counts = [person_record[field] for field in email_count_fields]
    # Missing values are stored as the string 'NaN'; all four counts are
    # checked because adding 'NaN' to an int would raise a TypeError.
    if 'NaN' not in counts:
        total_emails = (person_record['from_messages']
                        + person_record['to_messages'])
        total_poi_emails = (person_record['from_poi_to_this_person']
                            + person_record['from_this_person_to_poi'])
        person_record['total_emails'] = total_emails
        person_record['total_poi_emails'] = total_poi_emails
        # Guard against a zero denominator instead of raising
        # ZeroDivisionError.
        if total_emails > 0:
            pct_poi_emails = float(total_poi_emails) / float(total_emails)
    person_record['pct_poi_emails'] = pct_poi_emails
print "Show newly created feature for Ken Lay that gives the percentage of \
POI emails, 'pct_poi_emails', including both from and to:"
print

pprint.pprint(data_dict["LAY KENNETH L"], width =1) 
print

#sort dict descending by new feature
data_dict_descending = OrderedDict(sorted(data_dict.items(), 
                                          key=lambda kv: 
                                              kv[1]['pct_poi_emails'], 
                                              reverse = True))

print "Five highest pct_poi_emails values: "
print

for key, value in data_dict_descending.items()[:5]:
    print key, "percentage of poi emails = ", value['pct_poi_emails']

print
print "None of these five are in the POI list, but will test further."
print

### add new feature to features list
print 'Revised features list:'
features_list.append('pct_poi_emails')

print features_list
### Store to my_dataset for easy export below.
my_dataset = data_dict

Show newly created feature for Ken Lay that gives the percentage of POI emails, 'pct_poi_emails', including both from and to:

{'bonus': 7000000,
 'deferral_payments': 202911,
 'deferred_income': -300000,
 'director_fees': 'NaN',
 'email_address': 'kenneth.lay@enron.com',
 'exercised_stock_options': 34348384,
 'expenses': 99832,
 'from_messages': 36,
 'from_poi_to_this_person': 123,
 'from_this_person_to_poi': 16,
 'loan_advances': 81525000,
 'long_term_incentive': 3600000,
 'other': 10359729,
 'pct_poi_emails': 0.03225806451612903,
 'poi': True,
 'restricted_stock': 14761694,
 'restricted_stock_deferred': 'NaN',
 'salary': 1072321,
 'shared_receipt_with_poi': 2411,
 'to_messages': 4273,
 'total_emails': 4309,
 'total_payments': 103559793,
 'total_poi_emails': 139,
 'total_stock_value': 49110078}

Five highest pct_poi_emails values: 

DONAHUE JR JEFFREY M percentage of poi emails =  0.224351747463
HUMPHREY GENE E percentage of poi emails =  0.186206896552
DEFFNER JOSEPH M percentage of

In [5]:
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)
    
clf = DecisionTreeClassifier()
scaler = MinMaxScaler()

select = SelectKBest(k=10)

steps = [('scaler', scaler),
        ('feature_selection', select),
        ('classifier', clf)]

pipeline = Pipeline(steps)

pipeline.fit(features_train, labels_train)

print
support = pipeline.named_steps['feature_selection'].get_support(indices=True)

print "Selected features indices: ",support
#X = pd.DataFrame(features_train)
#y = pd.DataFrame(labels_train)

selected_indices = support.tolist()

selected_features = [features_list[i+1] for i in selected_indices]
#https://stackoverflow.com/questions/18272160/access-multiple-elements-of-list-knowing-their-index
print "Selected Features: ",selected_features


Selected features indices:  [ 0  2  3  4  6  7  9 11 12 18]
Selected Features:  ['salary', 'total_payments', 'loan_advances', 'bonus', 'deferred_income', 'total_stock_value', 'exercised_stock_options', 'long_term_incentive', 'restricted_stock', 'shared_receipt_with_poi']
