In [1]:
#!/usr/bin/python

import sys
import pickle
import json
import pandas as pd
import numpy as np
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [3]:
### Load the dictionary containing the dataset
# This pickle file is the Python 3 compatible version of the dataset.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open this file when it comes from a trusted source.
with open("final_project_dataset_python3.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Tarefa 2: Análise das Características e Remoção de Outliers

Para manter os dados compatíveis com o `tester.py`, a análise será
realizada sobre um dataframe e, posteriormente, os ajustes serão aplicados
no dicionário.

In [4]:
# Build the analysis table and transpose it so that each dictionary key
# becomes a row and each feature becomes a column (see the printed shape).
data_df = pd.DataFrame.from_dict(data_dict).T

# Quick sanity check of the table size and of the inferred column types.
print(data_df.shape)
print(data_df.dtypes)

(146, 21)
bonus                        object
deferral_payments            object
deferred_income              object
director_fees                object
email_address                object
exercised_stock_options      object
expenses                     object
from_messages                object
from_poi_to_this_person      object
from_this_person_to_poi      object
loan_advances                object
long_term_incentive          object
other                        object
poi                          object
restricted_stock             object
restricted_stock_deferred    object
salary                       object
shared_receipt_with_poi      object
to_messages                  object
total_payments               object
total_stock_value            object
dtype: object


Todos os campos são importados com o tipo `object` (os valores ausentes vêm
como a string 'NaN'); assim, para avaliar os outliers, primeiro temos que
converter os campos para numéricos.

In [5]:
# Columns that hold text or labels rather than numeric measurements
cols_string = ['email_address', 'poi']
cols = data_df.columns.tolist()

# Every remaining column is treated as numeric
cols_numeric = np.setdiff1d(cols, cols_string)

# Coerce to numbers; anything that cannot be parsed (such as the 'NaN'
# strings) becomes a real NaN
data_df[cols_numeric] = data_df[cols_numeric].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
data_df['poi'] = data_df['poi'].astype(bool)
print(data_df.describe().transpose())

                           count          mean           std         min  \
bonus                       82.0  2.374235e+06  1.071333e+07     70000.0   
deferral_payments           39.0  1.642674e+06  5.161930e+06   -102500.0   
deferred_income             49.0 -1.140475e+06  4.025406e+06 -27992891.0   
director_fees               17.0  1.668049e+05  3.198914e+05      3285.0   
exercised_stock_options    102.0  5.987054e+06  3.106201e+07      3285.0   
expenses                    95.0  1.087289e+05  5.335348e+05       148.0   
from_messages               86.0  6.087907e+02  1.841034e+03        12.0   
from_poi_to_this_person     86.0  6.489535e+01  8.697924e+01         0.0   
from_this_person_to_poi     86.0  4.123256e+01  1.000731e+02         0.0   
loan_advances                4.0  4.196250e+07  4.708321e+07    400000.0   
long_term_incentive         66.0  1.470361e+06  5.942759e+06     69223.0   
other                       93.0  9.190650e+05  4.589253e+06         2.0   
restricted_s

Pode-se notar que algumas das variáveis não podem ser utilizadas pela grande
quantidade de informação faltante (por exemplo, `loan_advances` tem apenas 4
valores preenchidos e `director_fees` apenas 17).

In [6]:
def tag_outlier(df, columns, iqr_factor=5):
    """Identify rows that hold an extreme value in at least one column.

    A value is considered extreme when it lies more than ``iqr_factor``
    interquartile ranges above the third quartile or below the first
    quartile of its own column. NaN values are never flagged because both
    comparisons against the fences evaluate to False for them.

    Args:
        df: pandas DataFrame containing the columns to inspect.
        columns: iterable of column names of ``df`` to evaluate.
        iqr_factor: number of interquartile ranges beyond the quartiles
            at which a value is flagged. Defaults to 5, the threshold this
            function always used.

    Returns:
        numpy array with the sorted, de-duplicated index labels of every
        row flagged in at least one of ``columns``.
    """
    def column_outlier_eval(variable):
        # Fences are computed per column from that column's own quartiles.
        q1, q3 = variable.quantile([.25, .75])
        inter_q = q3 - q1
        max_val = q3 + iqr_factor * inter_q
        min_val = q1 - iqr_factor * inter_q
        return variable[(variable > max_val) | (variable < min_val)].index

    flagged_labels = [
        label
        for var in columns
        for label in column_outlier_eval(df[var])
    ]

    # np.unique both removes labels flagged by several columns and sorts them.
    return np.unique(flagged_labels)

In [7]:
# Show the index labels that have an extreme value in at least one numeric column.
tag_outlier(data_df, cols_numeric)

array(['ALLEN PHILLIP K', 'BAXTER JOHN C', 'BECK SALLY W',
       'BELDEN TIMOTHY N', 'BELFER ROBERT', 'BHATNAGAR SANJAY',
       'BUY RICHARD B', 'DELAINEY DAVID W', 'DERRICK JR. JAMES V',
       'FREVERT MARK A', 'HAEDICKE MARK E', 'HAYSLETT RODERICK J',
       'HIRKO JOSEPH', 'HORTON STANLEY C', 'KAMINSKI WINCENTY J',
       'KEAN STEVEN J', 'KITCHEN LOUISE', 'LAVORATO JOHN J',
       'LAY KENNETH L', 'MARTIN AMANDA K', 'MCCONNELL MICHAEL S',
       'PAI LOU L', 'POWERS WILLIAM', 'RICE KENNETH D',
       'SHANKMAN JEFFREY A', 'SHAPIRO RICHARD S', 'SKILLING JEFFREY K',
       'TOTAL', 'WHITE JR THOMAS E'],
      dtype='<U19')

Observando os dados, nota-se o aparecimento de uma pessoa chamada
`TOTAL`, o que aponta para um provável erro de digitação que podemos
eliminar dos dados.

In [8]:
# Drop the entries that should not be treated as people. A default is passed
# to pop so that a missing key (e.g. when this cell is re-run) is ignored.
# NOTE(review): 'THE TRAVEL AGENCY IN THE PARK' is presumably not an
# individual either; confirm against the source data.
for outlier in ('TOTAL', 'THE TRAVEL AGENCY IN THE PARK'):
    data_dict.pop(outlier, None)

### Criação de Novas Variáveis

In [None]:
# TODO: placeholder for feature engineering. The loop body is empty, so no
# new features are added to data_dict yet.
for item in data_dict:
    pass

### Escala das variáveis

Os dados apresentam grandes variações de magnitude; assim, será aplicado um
escalonamento das variáveis para que elas não afetem os modelos sensíveis a
essas variações.

### Seleção das Variáveis

In [2]:
### Names of the features to use, as plain strings.
### "poi" has to stay in the first position.
features_list = [
    "poi",
    "salary",
]

In [None]:
### Store to my_dataset for easy export below.
# Plain assignment: my_dataset is a second name for the same dict object as
# data_dict (no copy is made), so later changes to one are visible in the other.
my_dataset = data_dict

In [None]:
### Build the array used for local testing, then separate labels from features.
# NOTE(review): presumably the first column (the "poi" entry of features_list)
# becomes the label; confirm in feature_format.py.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Aplicação de Classificadores

In [None]:
### Keep the classifier bound to the name `clf`: the export cell at the end
### of the notebook passes `clf` to dump_classifier_and_data.

# Starting point: Gaussian Naive Bayes with its default settings. Other
# classifiers can be swapped in here.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Melhora do Modelo pelos Parâmetros

In [88]:
### Goal: reach better than .3 precision and recall under the testing script.
### See the test_classifier function in tester.py (final project folder)
### for details on how the evaluation is performed.

# Starting point: a single hold-out split (30% test) with a fixed seed.
# This cell needs `features` and `labels` from the extraction cell above.
from sklearn.model_selection import train_test_split

(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features, labels, test_size=0.3, random_state=42
)



NameError: name 'features' is not defined

### Salvar os Resultados 

In [None]:
# Hand the classifier, the dataset and the feature list to the tester helper.
# NOTE(review): presumably this persists them for tester.py to evaluate;
# confirm in tester.py.
dump_classifier_and_data(clf, my_dataset, features_list)