<a href="https://colab.research.google.com/github/Felipanjos/a3_ia_2022.2/blob/main/Projeto_IA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### D. Preparação dos dados

In [946]:
import pandas as pd
import numpy as np
import random
import string
import sklearn
from itertools import groupby

In [947]:
# Read the listings (first CSV column becomes the index) and keep only the columns used below.
colunas_usadas = ['area', 'rooms', 'bathroom', 'parking spaces', 'floor', 'animal', 'furniture', 'hoa']
df = pd.read_csv('houses_to_rent.csv', index_col=[0])[colunas_usadas]

In [948]:
# Drop every row whose 'hoa' is exactly 'R$0'.
mascara_hoa_zero = df['hoa'] == 'R$0'
sem_aluguel = df.index[mascara_hoa_zero].tolist()
df.drop(index=sem_aluguel, inplace=True)

# Strip commas first, then every remaining run of non-digit characters, from 'hoa'.
df['hoa'] = (
    df['hoa']
    .str.replace(',', '')
    .str.replace(r'\D+', '', regex=True)
)

In [949]:
def obj_column_to_int(column):
    """Convert a column to a numeric pandas dtype, coercing bad values to missing.

    The column is first passed through ``convert_dtypes()``, then parsed with
    ``pd.to_numeric(..., errors='coerce')`` so entries that cannot be parsed
    become missing instead of raising, and finally passed through
    ``convert_dtypes()`` again so pandas picks the best-fitting dtype.

    Parameters
    ----------
    column : pandas.Series
        Column to convert. It is not modified.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the numeric values.
    """
    # The earlier version also called convert_dtypes() and to_numeric() on
    # their own and discarded the results; those calls had no effect.
    return pd.to_numeric(column.convert_dtypes(), errors='coerce').convert_dtypes()

# Run the textual 'floor' and 'hoa' columns through obj_column_to_int.
for nome_coluna in ('floor', 'hoa'):
    df[nome_coluna] = obj_column_to_int(df[nome_coluna])


##### a) Selecionando 10% das colunas

Após a seleção de colunas feita acima, o DataFrame possui somente 8 colunas, dessa forma:

*    10% de 8 = 0.8
*    Aproximando para 1, fica somente uma coluna a ser selecionada como referência para a modificação dos dados



In [950]:
# 10% of the column count, rounded to the nearest integer.
qtd_colunas = df.shape[1]
p_colunas = round(qtd_colunas * 10 / 100)

# Column labels split by kind, as reported by describe().
colunas_numericas = df.describe().columns
colunas_categoricas = df.describe(exclude=np.number).columns

# Draw p_colunas random columns and keep the label of the first one drawn.
coluna_aleatoria = df.sample(n=p_colunas, axis=1).columns[0]
coluna_aleatoria

'area'

##### a) Selecionando 3% dos dados

In [951]:
# Number of rows corresponding to 3% of the dataset (fractional part discarded).
p_3 = int(len(df) * 3 / 100)
p_3

143

##### a) Excluindo 3% dos dados aleatoriamente

In [952]:
# Rows that are already null in the chosen column, captured before any tampering.
mascara_nulos = df[coluna_aleatoria].isna()
reg_nulos = df.loc[mascara_nulos]
qtd_reg_nulos = len(reg_nulos)

# Pick p_3 random rows and blank out the chosen column for them.
reg_excluidos = df.sample(n=p_3)
reg_exc_index = reg_excluidos.index.array
df.loc[reg_exc_index, coluna_aleatoria] = np.nan

##### a) Alterando 3% dos dados aleatoriamente

In [953]:
def gerar_string_aleatoria():
  """Return a 10-character string of randomly chosen lowercase ASCII letters."""
  alfabeto = string.ascii_lowercase
  caracteres = []
  for _ in range(10):
    # One random.choice call per character, in order.
    caracteres.append(random.choice(alfabeto))
  return ''.join(caracteres)

def gerar_numero_aleatorio():
  """Return a random integer between 0 and 999, both ends included."""
  limite_superior = 1000  # exclusive bound, so the largest value is 999
  return random.randrange(0, limite_superior)

In [954]:
# Among the rows where the chosen column is still filled, draw p_3 rows to overwrite.
mascara_preenchidos = df[coluna_aleatoria].notna()
df_selecao_nao_nulo = df.loc[mascara_preenchidos]
reg_alterados = df_selecao_nao_nulo.sample(n=p_3)
reg_alter_index = reg_alterados.index.array

In [955]:
# Numeric columns receive random integers; any other column receives random strings.
if coluna_aleatoria in colunas_numericas:
    gerar_valor = gerar_numero_aleatorio
else:
    gerar_valor = gerar_string_aleatoria

# Overwrite the chosen column one selected row at a time, with a fresh value per row.
for index in reg_alter_index:
    df.loc[[index], [coluna_aleatoria]] = gerar_valor()

#### b) Limpeza dos dados

i. Codificação One-Hot

pos_encoded.toarray()

A atribuição da média geral (`SimpleImputer(strategy="mean")`) foi utilizada para lidar com os valores nulos das colunas numéricas em que existiam, pelos seguintes motivos:

*   Número relativamente pequeno de linhas (cerca de 4.780, já que 3% dos registros correspondem a 143), o que ocasionaria numa maior escassez de dados caso as linhas com valores nulos fossem removidas
*   Número pequeno de colunas, logo a remoção completa de cada coluna em que houvesse valor nulo acarretaria na perda desnecessária de informações
*   Facilidade na implementação em comparação com o treinamento de predição e atribuição
*   Coesão com o dataset, visto que logo no primeiro quartil a coluna 'Meta_score' apresenta valor próximo à média

Após implementação da correção, o dataset não conta mais com valores nulos.


In [956]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing: a mean-strategy SimpleImputer followed by a StandardScaler.
etapas_numericas = [
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=etapas_numericas)

In [957]:
from sklearn.preprocessing import OneHotEncoder

# Numeric feature names come from describe(); the categorical ones are listed by hand.
num_attribs = df.describe().columns.tolist()
cat_attribs = ['animal', 'furniture']

In [958]:
from sklearn.compose import ColumnTransformer

# Route numeric columns through num_pipeline and categorical columns through a OneHotEncoder.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

In [959]:
# Move 'hoa' from its current position to column position 5.
hoa = df['hoa']
df.drop('hoa', axis=1, inplace=True)
df.insert(5, 'hoa', hoa.values)
# Insert copies of the 'animal' and 'furniture' values under new labels,
# which adds two columns to df.
df.insert(6, 'no_animal', df['animal'].values)
df.insert(9, 'no_furniture', df['furniture'].values)
# Swap the two labels: the copy inserted at position 6 becomes 'animal'
# and the original column becomes 'no_animal' (both hold the same values).
df.rename({'no_animal': 'animal', 'animal':'no_animal'}, axis=1, inplace=True)

# NOTE(review): the extra columns presumably exist so df.columns can be reused
# as labels for the transformed output — confirm; this depends on the order
# and number of columns the encoder emits.
# Fit the column transformer on df and transform it in a single step.
df_prepared = full_pipeline.fit_transform(df)

In [960]:
# Wrap the transformed array in a DataFrame that borrows df's row index and column labels.
scaled_df = pd.DataFrame(data=df_prepared, columns=df.columns, index=df.index)

In [961]:
# Display df as it stands after the column inserts and rename.
df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,hoa,animal,no_animal,furniture,no_furniture
1,64.0,2,1,1,10,540,acept,acept,not furnished,not furnished
2,443.0,5,5,4,3,4172,acept,acept,furnished,furnished
3,,2,2,1,12,700,acept,acept,not furnished,not furnished
6,55.0,1,1,1,2,1950,acept,acept,furnished,furnished
7,55.0,2,2,1,2,600,acept,acept,not furnished,not furnished
...,...,...,...,...,...,...,...,...,...,...
6073,88.0,2,2,1,1,1200,acept,acept,not furnished,not furnished
6075,50.0,2,1,1,2,420,acept,acept,not furnished,not furnished
6076,91.0,2,2,1,16,768,not acept,not acept,furnished,furnished
6077,48.0,1,1,0,13,250,acept,acept,not furnished,not furnished


In [962]:
# Display the transformed values, labelled with df's column names.
scaled_df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,hoa,animal,no_animal,furniture,no_furniture
1,-1.964962e-01,-0.374044,-0.919321,-0.443862,0.388718,-0.203734,1.0,0.0,0.0,1.0
2,7.074122e-01,2.496623,2.044414,1.804716,-0.823438,0.615993,1.0,0.0,1.0,0.0
3,-6.778528e-17,-0.374044,-0.178387,-0.443862,0.735049,-0.167623,1.0,0.0,0.0,1.0
6,-2.179610e-01,-1.330933,-0.919321,-0.443862,-0.996603,0.114497,1.0,0.0,1.0,0.0
7,-2.179610e-01,-0.374044,-0.178387,-0.443862,-0.996603,-0.190192,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
6073,-1.392566e-01,-0.374044,-0.178387,-0.443862,-1.169769,-0.054775,1.0,0.0,0.0,1.0
6075,-2.298859e-01,-0.374044,-0.919321,-0.443862,-0.996603,-0.230818,1.0,0.0,0.0,1.0
6076,-1.321016e-01,-0.374044,-0.178387,-0.443862,1.427709,-0.152275,0.0,1.0,1.0,0.0
6077,-2.346559e-01,-1.330933,-0.919321,-1.193388,0.908214,-0.269186,1.0,0.0,0.0,1.0
