In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Base Census

##### Objetivo: prever se a renda de uma pessoa é maior ou menor que 50 mil dólares.

### Exploração dos Dados

In [None]:
# Read the census CSV (path relative to the notebook's working directory) into a DataFrame.
base_census = pd.read_csv(filepath_or_buffer='../database/census.csv')

In [None]:
# Show the loaded DataFrame as the cell output.
base_census

In [None]:
# Summary statistics of base_census as computed by DataFrame.describe().
base_census.describe()

In [None]:
# Count missing values per column (isna is the same check as isnull).
base_census.isna().sum()

In [None]:
# Distinct values of the 'income' column together with how often each one occurs.
np.unique(base_census.loc[:, 'income'], return_counts=True)

#### Visualização dos dados

In [None]:
# Bar chart with one bar per 'income' value; the trailing ';' hides the Axes repr.
sns.countplot(data=base_census, x='income');

In [None]:
# Histogram of the 'age' column. Axis labels are set explicitly so the figure
# can be understood on its own when the notebook is skimmed.
plt.hist(x=base_census['age'])
plt.xlabel('age')
plt.ylabel('count');  # trailing ';' hides the Text repr of the last call

In [None]:
# Histogram of the 'hour-per-week' column. Axis labels are set explicitly so
# the figure can be understood on its own when the notebook is skimmed.
plt.hist(x=base_census['hour-per-week'])
plt.xlabel('hour-per-week')
plt.ylabel('count');  # trailing ';' hides the Text repr of the last call

In [None]:
# Treemap nesting 'age' inside each 'workclass' value.
grafico = px.treemap(
    data_frame=base_census,
    path=['workclass', 'age'],
)
grafico.show()

In [None]:
# Treemap nesting 'relationship' inside each 'occupation' value.
grafico = px.treemap(
    data_frame=base_census,
    path=['occupation', 'relationship'],
)
grafico.show()

In [None]:
# Parallel-categories diagram linking 'occupation' to 'relationship'.
grafico = px.parallel_categories(
    data_frame=base_census,
    dimensions=['occupation', 'relationship'],
)
grafico.show()

In [None]:
# Same diagram as a three-way view: 'occupation' -> 'relationship' -> 'income'.
grafico = px.parallel_categories(
    data_frame=base_census,
    dimensions=['occupation', 'relationship', 'income'],
)
grafico.show()

In [None]:
# Parallel-categories diagram linking 'education' to 'income'.
grafico = px.parallel_categories(
    data_frame=base_census,
    dimensions=['education', 'income'],
)
grafico.show()

#### Divisão entre Previsores e Classe

In [None]:
# List the column labels of base_census (useful for the positional slicing in the next cells).
base_census.columns

In [None]:
# Predictors: every row, columns at positions 0 through 13, as a NumPy array.
x_census = base_census.iloc[:, :14].values

In [None]:
# Show the predictor array as the cell output.
x_census

In [None]:
# Class (target): the column at position 14 of base_census, as a NumPy array.
y_census = base_census.iloc[:, 14].values

In [None]:
# Show the class array as the cell output.
y_census

#### Tratamento dos Atributos Categóricos

##### LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder 

In [None]:
# Column labels again, for reference when picking the columns to encode below.
base_census.columns

In [None]:
# One independent LabelEncoder per categorical attribute, so each keeps its
# own fitted classes.
(
    label_encoder_workclass,
    label_encoder_education,
    label_encoder_marital_status,
    label_encoder_occupation,
    label_encoder_relationship,
    label_encoder_race,
    label_encoder_sex,
    label_encoder_native_country,
) = (LabelEncoder() for _ in range(8))

In [None]:
# Fit each encoder on its own column of x_census and overwrite that column
# in place with the encoded labels.
# Column positions are assumed to match the attribute named by each encoder
# -- verify against base_census.columns.
for column_index, encoder in (
    (1, label_encoder_workclass),
    (3, label_encoder_education),
    (5, label_encoder_marital_status),
    (6, label_encoder_occupation),
    (7, label_encoder_relationship),
    (8, label_encoder_race),
    (9, label_encoder_sex),
    (13, label_encoder_native_country),
):
    x_census[:, column_index] = encoder.fit_transform(x_census[:, column_index])


In [None]:
# Show x_census after the label-encoding step.
x_census

##### OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# One-hot encode the columns at the listed positions; every other column is
# passed through unchanged (remainder='passthrough').
one_hot_enconder_census = ColumnTransformer(
    transformers=[
        ("OneHot", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13]),
    ],
    remainder='passthrough',
)

In [None]:
# Fit the column transformer on x_census and replace x_census with the
# encoded result converted to a dense array.
# NOTE(review): .toarray() assumes fit_transform returns a SciPy sparse matrix
# here; a dense ndarray has no such method -- confirm for this dataset.
# NOTE(review): this cell overwrites its own input, so re-running it without
# re-running the cells above would encode already-encoded data.
x_census = one_hot_enconder_census.fit_transform(x_census).toarray()

In [None]:
# Inspect the first row of x_census after one-hot encoding.
x_census[0]

In [None]:
# Dimensions of x_census after one-hot encoding.
x_census.shape

##### Escalonamento dos Atributos

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of x_census with a single StandardScaler.
# NOTE(review): the scaler is fitted on the full matrix, before the train/test
# split performed later in this notebook, so the rows that end up in the test
# set also contribute to the fitted statistics. Consider fitting on the
# training rows only.
scaler_census = StandardScaler()
scaler_census.fit(x_census)
x_census = scaler_census.transform(x_census)

In [None]:
# Inspect the first row of x_census after scaling.
x_census[0]

#### Divisão de Treinamento e Teste

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows as the test set; random_state=0 fixes the shuffle
# so the split is reproducible.
(
    x_census_treinament,
    x_census_test,
    y_census_treinament,
    y_census_test,
) = train_test_split(
    x_census,
    y_census,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Shapes of the four arrays produced by the split, shown as one tuple.
x_census_treinament.shape, x_census_test.shape, y_census_treinament.shape, y_census_test.shape

In [None]:
import pickle 

In [None]:
# Persist the split to disk with pickle.
# Stored order is [x_train, y_train, x_test, y_test] -- note this differs from
# the order in which train_test_split returned the arrays.
with open('../database/census.pkl', 'wb') as f:
    pickle.dump(
        [x_census_treinament, y_census_treinament, x_census_test, y_census_test],
        f,
    )