# Prepare Dataset for Modeling

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import pickle
from sklearn.model_selection import train_test_split

## Import and Join Datasets

In [3]:
# Load the grid table and the target table from the compacted-data pickles.
grid = pd.read_pickle("../../data/compacted_data/grade.pkl")
y = pd.read_pickle("../../data/compacted_data/y.pkl")

# Keep only the join key and the target, exposing the "nuis" column as "y".
y = y.rename(columns={"nuis": "y"})[["gid", "y"]]

# Left-join the target onto the grid by "gid", then remove the listed columns
# from the joined frame.
df = grid.merge(y, on="gid", how="left")
df = df.drop(columns=["gid", "ID", "UF", "Row", "Col", "geometry"])

In [5]:
# Display the rows of the target table whose "y" value differs from zero.
nonzero_target = y["y"].ne(0)
y[nonzero_target]

Unnamed: 0,gid,y
2239,100ME5233450N7998750,1.0
2260,100ME5233550N7998750,1.0
2261,100ME5233550N7998850,1.0
2279,100ME5233650N7998350,1.0
2280,100ME5233650N7998450,1.0
...,...,...
735045,100ME7090050N10419250,1.0
735053,100ME7090050N10420050,1.0
735054,100ME7090050N10420150,1.0
735078,100ME7090150N10420050,1.0


In [3]:
# Preview the first five rows of the joined frame.
df.head(5)

Unnamed: 0,Polo,Declividade,Curvatura,APP30m,UCIntegral,AltaTensao,Vias50m,Dutovias,IndiceForma,DomSIden,...,t_vulner_mais1h,t_renda_trab,t_carteira_18m,t_scarteira_18m,t_setorpublico_18m,t_contapropria_18m,t_empregador_18m,t_formal_18m,t_atividade10a15,y
0,Porto Alegre,4.919002,6.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
1,Porto Alegre,4.900816,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2,Porto Alegre,4.152381,4.666667,0.333333,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
3,Porto Alegre,3.205212,7.777778,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
4,Porto Alegre,5.218916,4.166667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,0.0


## Handle missing values

In [4]:
# Share of missing values per column, as a percentage of the row count.
percent_missing = df.isnull().sum() * 100 / len(df)

# Columns with more than 50% of their values missing are removed from the frame.
drop_cols = percent_missing.loc[percent_missing > 50].index.tolist()

# Reassign instead of dropping in place, so the frame object shown by earlier
# cells is not mutated behind the reader's back.
df = df.drop(columns=drop_cols)

In [5]:
# Fill the remaining gaps with each column's median. `numeric_only=True`
# restricts the median to numeric columns: the frame still holds the string
# column "Polo", for which a median is undefined (pandas >= 2.0 raises
# TypeError without the flag; older versions only warned).
# NOTE(review): the target column "y" is part of `df` at this point, so any
# missing target values are imputed as well - confirm this is intended.
df = df.fillna(df.median(numeric_only=True))

  df = df.fillna(df.median())


In [6]:
# Recompute the per-column share of missing values (in percent) and display it
# sorted from lowest to highest.
missing_counts = df.isna().sum()
percent_missing = missing_counts * 100 / len(df)
percent_missing.sort_values()

Polo            0.0
LixoRio         0.0
LixoJogado      0.0
LixoAterrado    0.0
LixoQueimado    0.0
               ... 
DomCLixAc       0.0
DomSRedeEsg     0.0
DomSEsg         0.0
NResp30NAlf     0.0
y               0.0
Length: 72, dtype: float64

## Split training and test sets by polo

In [7]:
# Values of the "Polo" column for which feature/target files are exported.
polos = [
    'Porto Alegre',
    'Marabá',
    'Brasília',
    'Belo Horizonte',
    'Juazeiro do Norte',
    'Recife',
]

In [8]:
# Write one feature/target pickle pair per polo listed in `polos`.
for p in polos:
    # Select rows by the "Polo" column name rather than by position 0, so the
    # filter does not silently compare a different column if the column order
    # changes; a missing "Polo" column now fails loudly with a KeyError.
    test = df[df["Polo"] == p]
    # Features are every column except the target "y".
    X_test = test.drop(columns="y")
    y_test = test["y"]
    X_test.to_pickle(f"../../data/model_input/X_{p}.pkl")
    y_test.to_pickle(f"../../data/model_input/y_{p}.pkl")