# Prepare Dataset for Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import pickle
from sklearn.model_selection import train_test_split

## Import and Join Datasets

In [4]:
# Load the grid table and the label table from the compacted-data pickles.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
grid = pd.read_pickle("../data/compacted_data/grade.pkl")

# Keep only the join key and the label, renaming the label column "nuis" to "y".
y = pd.read_pickle("../data/compacted_data/y.pkl")
y = y.rename(columns={"nuis": "y"})
y = y[["gid", "y"]]

# Left join on "gid": every row of `grid` is kept, and rows without a match
# in the label table end up with NaN in "y".
df = grid.merge(y, on="gid", how="left")

# Remove the columns that are not carried forward into the modeling table.
df = df.drop(columns=["gid", "ID", "UF", "Row", "Col", "geometry"])

In [5]:
# Preview the first five rows of the joined table.
df.head(n=5)

Unnamed: 0,Polo,Declividade,Curvatura,APP30m,UCIntegral,AltaTensao,Vias50m,Dutovias,IndiceForma,DomSIden,...,t_vulner_mais1h,t_renda_trab,t_carteira_18m,t_scarteira_18m,t_setorpublico_18m,t_contapropria_18m,t_empregador_18m,t_formal_18m,t_atividade10a15,y
0,Porto Alegre,4.919002,6.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
1,Porto Alegre,4.900816,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2,Porto Alegre,4.152381,4.666667,0.333333,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
3,Porto Alegre,3.205212,7.777778,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
4,Porto Alegre,5.218916,4.166667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0


## Handle Missing Values

In [6]:
# Percentage of missing values in each column of the joined table.
percent_missing = df.isnull().sum() * 100 / len(df)

# Columns missing in more than this percentage of rows are removed outright
# instead of being imputed.
MISSING_THRESHOLD_PCT = 50
drop_cols = percent_missing.loc[percent_missing > MISSING_THRESHOLD_PCT].index.tolist()

# Rebind `df` rather than dropping in place, so the frame object shown by
# earlier cells is not mutated behind the reader's back.
df = df.drop(columns=drop_cols)

In [7]:
# Fill the remaining gaps in every numeric column with that column's median.
# numeric_only=True is required because `df` still contains the string column
# "Polo": without it, pandas < 2.0 emits a FutureWarning and pandas >= 2.0
# raises TypeError when computing the median.
# NOTE(review): "y" is numeric, so any label left NaN by the left join is
# imputed here as well -- confirm that is intended.
# NOTE(review): medians are computed on the full table, before the train/test
# split, so test rows influence the imputed training values.
df = df.fillna(df.median(numeric_only=True))

  df = df.fillna(df.median())


In [8]:
# Recompute the per-column missing percentage after dropping and imputing,
# and display it sorted from lowest to highest.
percent_missing = df.isna().sum() * 100 / len(df)
percent_missing.sort_values()

Polo            0.0
LixoRio         0.0
LixoJogado      0.0
LixoAterrado    0.0
LixoQueimado    0.0
               ... 
DomCLixAc       0.0
DomSRedeEsg     0.0
DomSEsg         0.0
NResp30NAlf     0.0
y               0.0
Length: 72, dtype: float64

## One-Hot Encode

In [11]:
# One-hot encode the categorical "Polo" column into "polo_<value>" indicators.
# The dtype is pinned to uint8 (the pre-2.0 pandas default) so the indicators
# are 0/1 integers on every pandas version; from pandas 2.0 the default is bool.
one_hot = pd.get_dummies(df["Polo"], prefix="polo", dtype="uint8")

# Replace "Polo" with its indicator columns, appended after the existing ones.
df = pd.concat([df.drop(columns=["Polo"]), one_hot], axis=1)

In [12]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)


Unnamed: 0,Declividade,Curvatura,APP30m,UCIntegral,AltaTensao,Vias50m,Dutovias,IndiceForma,DomSIden,DomSIlu,...,NRespIdade,NResp30,NResp30NAlf,y,polo_Belo Horizonte,polo_Brasília,polo_Juazeiro do Norte,polo_Marabá,polo_Porto Alegre,polo_Recife
0,4.919002,6.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,47.122677,0.828996,0.838565,0.0,0,0,0,0,1,0
1,4.900816,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,47.122677,0.825195,0.83472,0.0,0,0,0,0,1,0
2,4.152381,4.666667,0.333333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,47.122677,0.527855,0.533947,0.0,0,0,0,0,1,0
3,3.205212,7.777778,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,47.122677,0.828996,0.838565,0.0,0,0,0,0,1,0
4,5.218916,4.166667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,47.122677,0.828996,0.838565,0.0,0,0,0,0,1,0


In [13]:
# Number of columns in the encoded table (features plus the "y" column).
df.shape[1]

77

## Split into Training and Test Sets

In [14]:
# Separate the target column from the feature matrix.
y = df["y"]
X = df.drop(columns=["y"])

In [16]:
# Persist the full (unsplit) feature matrix and target for downstream notebooks.
_full_outputs = {
    "../data/model_input/X.pkl": X,
    "../data/model_input/y.pkl": y,
}
for _path, _obj in _full_outputs.items():
    _obj.to_pickle(_path)

In [69]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [71]:
# Persist each train/test partition, written in the same order as before.
_split_outputs = {
    "../data/model_input/X_train.pkl": X_train,
    "../data/model_input/y_train.pkl": y_train,
    "../data/model_input/X_test.pkl": X_test,
    "../data/model_input/y_test.pkl": y_test,
}
for _path, _obj in _split_outputs.items():
    _obj.to_pickle(_path)