<a href="https://colab.research.google.com/github/Felipanjos/a3_ia_2022.2/blob/main/Projeto_IA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [320]:
import pandas as pd
import numpy as np
import random
import string
import sklearn
from itertools import groupby

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

In [321]:
df = pd.read_csv('houses_to_rent.csv', index_col=[0])
df = df[['area', 'rooms', 'bathroom', 'parking spaces', 'floor', 'animal', 'furniture', 'total']]

In [322]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6080 entries, 0 to 6079
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   area            6080 non-null   int64 
 1   rooms           6080 non-null   int64 
 2   bathroom        6080 non-null   int64 
 3   parking spaces  6080 non-null   int64 
 4   floor           6080 non-null   object
 5   animal          6080 non-null   object
 6   furniture       6080 non-null   object
 7   total           6080 non-null   object
dtypes: int64(4), object(4)
memory usage: 427.5+ KB


In [323]:
def obj_column_to_int(column):
    column.convert_dtypes()
    pd.to_numeric(column, errors='coerce')
    return pd.to_numeric(column.convert_dtypes(), errors='coerce').convert_dtypes() 

terreo = df.index[df['floor'] == '-'].tolist()
df['total'] = df['total'].str.replace(',', '')
df['total'] = df['total'].str.replace(r'\D+','', regex=True)

for index in terreo:
    df['floor'][index] = 0

df['floor'] = obj_column_to_int(df['floor'])
df['total'] = obj_column_to_int(df['total'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['floor'][index] = 0


In [324]:
num_attribs = list(df.describe().columns)
cat_attribs = ['animal', 'furniture']

### C. Divisão 

In [325]:
ranges = [df.min()['total'], df['total'].describe()['25%'], 4500, 9000, 10000, 25000, 100000, 200000, np.inf]
tiers = ['F', 'E', 'D', 'C', 'B', 'A', 'S', 'S+']

df['total_cat'] = pd.cut(df['total'], bins = ranges, labels = tiers)

In [326]:
# df_num = pd.DataFrame(df_num, index=df.index, columns=df.columns)

In [327]:
# splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in splitter.split(df, df['total_cat']):
#   strat_train_set = df.loc[train_index]
#   strat_test_set = df.loc[test_index]

In [328]:
# strat_train_set.drop(columns=['income_cat'], inplace=True)
# strat_test_set.drop(columns=['income_cat'], inplace=True)

In [329]:
# strat_test_set['total_cat'].value_counts()

### D. Preparação

##### a) Selecionando 10% das colunas

No dataset utilizado existem somente 9 colunas, dessa forma:

*    10% de 9 = 0.9
*    Aproximando para 1, fica somente uma coluna a ser selecionada como referência para a modificação dos dados



In [330]:
qtd_colunas = len(df.columns)
p_colunas = round(qtd_colunas * 10 / 100)
coluna_aleatoria = df.sample(n=p_colunas, axis='columns').keys()[0]
colunas_numericas = df.describe().columns
colunas_categoricas = df.describe(exclude=np.number).columns
coluna_aleatoria # escolhendo uma coluna aleatoriamente

'rooms'

##### a) Selecionando 3% dos dados

In [331]:
p_3 = int(df.shape[0] * 3 / 100)
p_3 # 3% de 1000 é igual a 30

182

##### a) Excluindo 3% dos dados aleatoriamente

In [332]:
reg_nulos = df[df[coluna_aleatoria].isnull()]
qtd_reg_nulos = reg_nulos.shape[0]
reg_excluidos = df.sample(n=p_3)
reg_exc_index = reg_excluidos.index.array
df.loc[reg_exc_index, coluna_aleatoria] = np.nan

##### a) Alterando 3% dos dados aleatoriamente

In [333]:
def gerar_string_aleatoria():
  letters = string.ascii_lowercase
  return ''.join(random.choice(letters) for i in range(10)) 

def gerar_numero_aleatorio():
  return random.randint(0, 999)

In [334]:
df_selecao_nao_nulo = df[~df[coluna_aleatoria].isnull()]
reg_alterados = df_selecao_nao_nulo.sample(n=p_3)
reg_alter_index = reg_alterados.index.array

In [335]:
if coluna_aleatoria in colunas_numericas:
    for index in reg_alter_index:
        df.loc[[index],[coluna_aleatoria]] = gerar_numero_aleatorio()
else:
    for index in reg_alter_index:
        df.loc[[index],[coluna_aleatoria]] = gerar_string_aleatoria()

#### b) Limpeza dos dados

i. Codificação One-Hot

pos_encoded.toarray()

A atribuição da mediana geral foi utilizada para lidar com os valores nulos das colunas em que existiam, pelos seguintes motivos:

*   Número relativamente pequeno de linhas (1000), o que ocasionaria numa maior escassez de dados caso as linhas com valores nulos fossem removidas
*   Número pequeno de colunas, logo a remoção completa de cada coluna em que houvesse valor nulo acarretaria na perda desnecessária de informações
*   Facilidade na implementação em comparação com o treinamento de predição e atribuição
*   Coesão com o dataset, visto que logo no primeiro quartil a coluna 'Meta_score' apresenta valor próximo à média

Após implementação da correção, o dataset não conta mais com valores nulos.


In [336]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),        
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

In [337]:
total = df['total']
df.drop('total', axis=1, inplace=True)
df.insert(5, 'total', total.values)
df.insert(6, 'no_animal', df['animal'].values)
df.insert(9, 'no_furniture', df['furniture'].values)
df.rename({'no_animal': 'animal', 'animal':'no_animal'}, axis=1, inplace=True)

df_prepared = full_pipeline.fit_transform(df)

In [338]:
df.drop('total_cat', axis=1, inplace=True)

In [339]:
scaled_df = pd.DataFrame(df_prepared, index=df.index, columns=df.columns)

In [340]:
df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,total,animal,no_animal,furniture,no_furniture
0,240,3.0,3,4,0,9121,acept,acept,furnished,furnished
1,64,2.0,1,1,10,1493,acept,acept,not furnished,not furnished
2,443,5.0,5,4,3,12680,acept,acept,furnished,furnished
3,73,2.0,2,1,12,2116,acept,acept,not furnished,not furnished
4,19,1.0,1,0,0,1257,not acept,not acept,not furnished,not furnished
...,...,...,...,...,...,...,...,...,...,...
6075,50,2.0,1,1,2,1585,acept,acept,not furnished,not furnished
6076,84,2.0,2,1,16,3768,not acept,not acept,furnished,furnished
6077,48,1.0,1,0,13,1255,acept,acept,not furnished,not furnished
6078,160,3.0,2,2,0,3803,not acept,not acept,not furnished,not furnished


In [341]:
scaled_df

Unnamed: 0,area,rooms,bathroom,parking spaces,floor,total,animal,no_animal,furniture,no_furniture
0,0.236616,-0.147852,0.457614,1.392098,-0.919557,0.352867,1.0,0.0,1.0,0.0
1,-0.232057,-0.158424,-0.932489,-0.469203,0.701606,-0.518842,1.0,0.0,0.0,1.0
2,0.777187,-0.126708,1.847717,1.392098,-0.433208,0.759580,1.0,0.0,1.0,0.0
3,-0.208090,-0.158424,-0.237438,-0.469203,1.025839,-0.447647,1.0,0.0,0.0,1.0
4,-0.351888,-0.168997,-0.932489,-1.089636,-0.919557,-0.545812,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
6075,-0.269337,-0.158424,-0.932489,-0.469203,-0.595324,-0.508329,1.0,0.0,0.0,1.0
6076,-0.178798,-0.158424,-0.237438,-0.469203,1.674304,-0.258861,0.0,1.0,1.0,0.0
6077,-0.274663,-0.168997,-0.932489,-1.089636,1.187955,-0.546040,1.0,0.0,0.0,1.0
6078,0.023583,-0.147852,-0.237438,0.151231,-0.919557,-0.254861,0.0,1.0,0.0,1.0
