# Importações para o programa

In [1]:
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Carga dos dados

In [129]:
dados = read_csv('census.csv')

# Análise Exploratória

In [77]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
final-weight      32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loos      32561 non-null int64
hour-per-week     32561 non-null int64
native-country    32561 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [78]:
dados.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [79]:
dados.describe()

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loos,hour-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [80]:
dados.columns

Index(['age', 'workclass', 'final-weight', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loos', 'hour-per-week', 'native-country',
       'income'],
      dtype='object')

In [81]:
dados.isnull().values.any()

False

In [82]:
dados.income.unique()

array([' <=50K', ' >50K'], dtype=object)

# Pré-processamento

## Divisão das variáveis

In [130]:
previsores = dados.iloc[:, :-1].values

In [131]:
classe = dados.iloc[:, -1].values

In [85]:
type(classe)

numpy.ndarray

## mudança de label dos atributos previsores


In [132]:
labelEncoder = LabelEncoder()

In [133]:
previsores[:, 1] = labelEncoder.fit_transform(previsores[:,1])
previsores[:, 3] = labelEncoder.fit_transform(previsores[:,3])
previsores[:, 5] = labelEncoder.fit_transform(previsores[:,5])
previsores[:, 6] = labelEncoder.fit_transform(previsores[:,6])
previsores[:, 7] = labelEncoder.fit_transform(previsores[:,7])
previsores[:, 8] = labelEncoder.fit_transform(previsores[:,8])
previsores[:, 9] = labelEncoder.fit_transform(previsores[:,9])
previsores[:, 13] = labelEncoder.fit_transform(previsores[:,13])

In [88]:
label_classe = LabelEncoder()
classe = label_classe.fit_transform(classe)

## Variáveis dummy

In [134]:
oneHotEncoder = OneHotEncoder(categorical_features=[1,3,5,6,7,8,9,13])

In [135]:
previsores = oneHotEncoder.fit_transform(previsores).toarray()

In [91]:
len(previsores)

32561

## Padronização

In [106]:
scaler = StandardScaler()

In [107]:
previsores = scaler.fit_transform(previsores)



## Divisão treino-teste

In [136]:
x_train, x_test, y_train, y_test = train_test_split(previsores, classe, test_size=0.3)

# Modelo

In [137]:
modelo = GaussianNB()

In [138]:
modelo.fit(x_train, y_train)

GaussianNB(priors=None)

In [139]:
previsao = modelo.predict(x_test)

# Métricas
## Usando variáveis dummy e padronização

In [98]:
score = accuracy_score(y_test, previsao)

In [99]:
print('Acerto de %s%%' % (score))

Acerto de 0.4405773364725151%


In [100]:
confusion_matrix(y_test, previsao)

array([[2008, 5385],
       [  80, 2296]], dtype=int64)

## Usando apenas a padronização e label da classe

In [74]:
score = accuracy_score(y_test, previsao)
print('Acerto de ', score)
print('Matriz de confusão\n', confusion_matrix(y_test, previsao))

Acerto de  0.8053024874603337
Matriz de confusão
 [[7116  346]
 [1556  751]]


## Usando apenas a padronização

In [112]:
score = accuracy_score(y_test, previsao)
print('Acerto de ', score)
print('Matriz de confusão\n', confusion_matrix(y_test, previsao))

Acerto de  0.8068379568021292
Matriz de confusão
 [[7067  346]
 [1541  815]]


## Apenas com as variáveis dummy

In [140]:
score = accuracy_score(y_test, previsao)
print('Acerto de ', score)
print('Matriz de confusão\n', confusion_matrix(y_test, previsao))

Acerto de  0.7970109530146381
Matriz de confusão
 [[7031  388]
 [1595  755]]


## Sem tratamento

In [128]:
score = accuracy_score(y_test, previsao)
print('Acerto de ', score)
print('Matriz de confusão\n', confusion_matrix(y_test, previsao))

Acerto de  0.7985464223564336
Matriz de confusão
 [[7056  376]
 [1592  745]]
