<a href="https://colab.research.google.com/github/Felipanjos/a3_ia_2022.2/blob/main/Projeto_IA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### D. Preparação dos dados

In [92]:
import pandas as pd
import numpy as np
import random
import string
import sklearn
from itertools import groupby

In [93]:
df = pd.read_csv('houses_to_rent.csv', index_col=[0])
df = df[['area', 'rooms', 'bathroom', 'parking spaces', 'floor', 'animal', 'furniture', 'hoa']]

In [94]:
np.unique(df['floor'])

array(['-', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18',
       '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28',
       '29', '3', '31', '32', '35', '4', '5', '51', '6', '68', '7', '8',
       '85', '9', '99'], dtype=object)

In [95]:
sem_aluguel = df.index[df['hoa'] == 'R$0'].tolist()
terreo = df.index[df['floor'] == '-'].tolist()
df['hoa'] = df['hoa'].str.replace(',', '')
df['hoa'] = df['hoa'].str.replace(r'\D+','', regex=True)

for index in sem_aluguel:
    df['hoa'][index] = float("nan")
for index in terreo:
    df['floor'][index] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hoa'][index] = float("nan")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['floor'][index] = 0


In [96]:
df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa
0,240,3,3,4,0,acept,furnished,
1,64,2,1,1,10,acept,not furnished,540
2,443,5,5,4,3,acept,furnished,4172
3,73,2,2,1,12,acept,not furnished,700
4,19,1,1,0,0,not acept,not furnished,
...,...,...,...,...,...,...,...,...
6075,50,2,1,1,2,acept,not furnished,420
6076,84,2,2,1,16,not acept,furnished,768
6077,48,1,1,0,13,acept,not furnished,250
6078,160,3,2,2,0,not acept,not furnished,


In [97]:
def obj_column_to_int(column):
    column.convert_dtypes()
    pd.to_numeric(column, errors='coerce')
    return pd.to_numeric(column.convert_dtypes(), errors='coerce').convert_dtypes() 

df['floor'] = obj_column_to_int(df['floor'])
df['hoa'] = obj_column_to_int(df['hoa'])


##### a) Selecionando 10% das colunas

No dataset utilizado existem somente 9 colunas, dessa forma:

*    10% de 9 = 0.9
*    Aproximando para 1, fica somente uma coluna a ser selecionada como referência para a modificação dos dados



In [98]:
qtd_colunas = len(df.columns)
p_colunas = round(qtd_colunas * 10 / 100)
coluna_aleatoria = df.sample(n=p_colunas, axis='columns').keys()[0]
colunas_numericas = df.describe().columns
colunas_categoricas = df.describe(exclude=np.number).columns
coluna_aleatoria # escolhendo uma coluna aleatoriamente

'bathroom'

##### a) Selecionando 3% dos dados

In [99]:
p_3 = int(df.shape[0] * 3 / 100)
p_3 # 3% de 1000 é igual a 30

182

##### a) Excluindo 3% dos dados aleatoriamente

In [100]:
reg_nulos = df[df[coluna_aleatoria].isnull()]
qtd_reg_nulos = reg_nulos.shape[0]
reg_excluidos = df.sample(n=p_3)
reg_exc_index = reg_excluidos.index.array
df.loc[reg_exc_index, coluna_aleatoria] = np.nan

##### a) Alterando 3% dos dados aleatoriamente

In [101]:
def gerar_string_aleatoria():
  letters = string.ascii_lowercase
  return ''.join(random.choice(letters) for i in range(10)) 

def gerar_numero_aleatorio():
  return random.randint(0, 999)

In [102]:
df_selecao_nao_nulo = df[~df[coluna_aleatoria].isnull()]
reg_alterados = df_selecao_nao_nulo.sample(n=p_3)
reg_alter_index = reg_alterados.index.array

In [103]:
if coluna_aleatoria in colunas_numericas:
    for index in reg_alter_index:
        df.loc[[index],[coluna_aleatoria]] = gerar_numero_aleatorio()
else:
    for index in reg_alter_index:
        df.loc[[index],[coluna_aleatoria]] = gerar_string_aleatoria()

#### b) Limpeza dos dados

i. Codificação One-Hot

pos_encoded.toarray()

A atribuição da mediana geral foi utilizada para lidar com os valores nulos das colunas em que existiam, pelos seguintes motivos:

*   Número relativamente pequeno de linhas (1000), o que ocasionaria numa maior escassez de dados caso as linhas com valores nulos fossem removidas
*   Número pequeno de colunas, logo a remoção completa de cada coluna em que houvesse valor nulo acarretaria na perda desnecessária de informações
*   Facilidade na implementação em comparação com o treinamento de predição e atribuição
*   Coesão com o dataset, visto que logo no primeiro quartil a coluna 'Meta_score' apresenta valor próximo à média

Após implementação da correção, o dataset não conta mais com valores nulos.


In [104]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),        
        ('std_scaler', StandardScaler()),
    ])

In [105]:
from sklearn.preprocessing import OneHotEncoder

num_attribs = list(df.describe().columns)
cat_attribs = ['animal', 'furniture']

In [106]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

In [107]:
hoa = df['hoa']
df.drop('hoa', axis=1, inplace=True)
df.insert(5, 'hoa', hoa.values)
df.insert(6, 'no_animal', df['animal'].values)
df.insert(9, 'no_furniture', df['furniture'].values)
df.rename({'no_animal': 'animal', 'animal':'no_animal'}, axis=1, inplace=True)

df_prepared = full_pipeline.fit_transform(df)

In [108]:
scaled_df = pd.DataFrame(df_prepared, index=df.index, columns=df.columns)

In [109]:
df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,hoa,animal,no_animal,furniture,no_furniture
0,240,3,3.0,4,0,,acept,acept,furnished,furnished
1,64,2,1.0,1,10,540,acept,acept,not furnished,not furnished
2,443,5,5.0,4,3,4172,acept,acept,furnished,furnished
3,73,2,140.0,1,12,700,acept,acept,not furnished,not furnished
4,19,1,1.0,0,0,,not acept,not acept,not furnished,not furnished
...,...,...,...,...,...,...,...,...,...,...
6075,50,2,1.0,1,2,420,acept,acept,not furnished,not furnished
6076,84,2,2.0,1,16,768,not acept,not acept,furnished,furnished
6077,48,1,1.0,0,13,250,acept,acept,not furnished,not furnished
6078,160,3,2.0,2,0,,not acept,not acept,not furnished,not furnished


In [110]:
scaled_df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,hoa,animal,no_animal,furniture,no_furniture
0,0.236616,0.449198,-0.150305,1.392098,-0.919557,-5.782201e-17,1.0,0.0,1.0,0.0
1,-0.232057,-0.436093,-0.171133,-0.469203,0.701606,-2.295584e-01,1.0,0.0,0.0,1.0
2,0.777187,2.219781,-0.129476,1.392098,-0.433208,6.940734e-01,1.0,0.0,1.0,0.0
3,-0.208090,-0.436093,1.276421,-0.469203,1.025839,-1.888697e-01,1.0,0.0,0.0,1.0
4,-0.351888,-1.321385,-0.171133,-1.089636,-0.919557,-5.782201e-17,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
6075,-0.269337,-0.436093,-0.171133,-0.469203,-0.595324,-2.600748e-01,1.0,0.0,0.0,1.0
6076,-0.178798,-0.436093,-0.160719,-0.469203,1.674304,-1.715771e-01,0.0,1.0,1.0,0.0
6077,-0.274663,-1.321385,-0.171133,-1.089636,1.187955,-3.033065e-01,1.0,0.0,0.0,1.0
6078,0.023583,0.449198,-0.160719,0.151231,-0.919557,-5.782201e-17,0.0,1.0,0.0,1.0
