### Importando as bibliotecas e a base de dados

In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle


In [326]:
base = pd.read_csv('teste_indicium_precificacao.csv')
base.head(10)

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
5,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,45,49,2017-10-05,0.4,1,0
6,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
7,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
8,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188
9,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6


### Analisando os dados superficialmente

In [327]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             48894 non-null  int64  
 1   nome                           48878 non-null  object 
 2   host_id                        48894 non-null  int64  
 3   host_name                      48873 non-null  object 
 4   bairro_group                   48894 non-null  object 
 5   bairro                         48894 non-null  object 
 6   latitude                       48894 non-null  float64
 7   longitude                      48894 non-null  float64
 8   room_type                      48894 non-null  object 
 9   price                          48894 non-null  int64  
 10  minimo_noites                  48894 non-null  int64  
 11  numero_de_reviews              48894 non-null  int64  
 12  ultima_review                  38842 non-null 

In [328]:
base.isnull().sum()

id                                   0
nome                                16
host_id                              0
host_name                           21
bairro_group                         0
bairro                               0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimo_noites                        0
numero_de_reviews                    0
ultima_review                    10052
reviews_por_mes                  10052
calculado_host_listings_count        0
disponibilidade_365                  0
dtype: int64

In [329]:
base.dtypes

id                                 int64
nome                              object
host_id                            int64
host_name                         object
bairro_group                      object
bairro                            object
latitude                         float64
longitude                        float64
room_type                         object
price                              int64
minimo_noites                      int64
numero_de_reviews                  int64
ultima_review                     object
reviews_por_mes                  float64
calculado_host_listings_count      int64
disponibilidade_365                int64
dtype: object

In [330]:
# Descartando valores nulos 
base = base.dropna()

# Tentando entender a correlação das variaveis com o Preço
base.corr(numeric_only=True)

Unnamed: 0,id,host_id,latitude,longitude,price,minimo_noites,numero_de_reviews,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
id,1.0,0.59147,-0.01015,0.103321,-0.006695,-0.073952,-0.33005,0.291768,0.098451,0.006709
host_id,0.59147,1.0,0.012792,0.141268,0.006264,-0.05168,-0.141984,0.296263,0.149419,0.155412
latitude,-0.01015,0.012792,1.0,0.088244,0.031346,0.024883,-0.008576,-0.010144,0.004341,-0.021849
longitude,0.103321,0.141268,0.088244,1.0,-0.155298,-0.055417,0.054741,0.146221,-0.093348,0.102601
price,-0.006695,0.006264,0.031346,-0.155298,1.0,0.025501,-0.035924,-0.030622,0.052895,0.078278
minimo_noites,-0.073952,-0.05168,0.024883,-0.055417,0.025501,1.0,-0.069369,-0.121718,0.073474,0.101676
numero_de_reviews,-0.33005,-0.141984,-0.008576,0.054741,-0.035924,-0.069369,1.0,0.549696,-0.059796,0.193439
reviews_por_mes,0.291768,0.296263,-0.010144,0.146221,-0.030622,-0.121718,0.549696,1.0,-0.009441,0.18594
calculado_host_listings_count,0.098451,0.149419,0.004341,-0.093348,0.052895,0.073474,-0.059796,-0.009441,1.0,0.182988
disponibilidade_365,0.006709,0.155412,-0.021849,0.102601,0.078278,0.101676,0.193439,0.18594,0.182988,1.0


### Transformando dados

In [331]:
# Dropando uma coluna sem muito impacto na analise
base = base.drop(['bairro'], axis=1)


In [332]:
# Alguns hoteis estão com o preço zerado, então troquei eles pela media de preço
base['price'][base['price'] > 0].mean()
base.loc[base['price'] == 0, 'price'] = 142

Unnamed: 0,id,nome,host_id,host_name,bairro_group,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365


In [333]:
# Transformando datas em datatime 
base['ultima_review'] = pd.to_datetime(base['ultima_review'])
base['ultima_review'] = base['ultima_review'].astype('int64') // 10**9 

# Transformando os valores que são separados em categorias em category types
base['bairro_group'] = base['bairro_group'].astype('category')
base['room_type'] = base['room_type'].astype('category')

### Separando a base de treinamento e otimizando

In [334]:
# Separando a base entre variaveis e valor a ser predito (x = Variaveis, y = valor alvo)
base_x_1 = base.iloc[:, 4:8]
base_x_2 = base.iloc[:, 9:]
base_x = pd.concat([base_x_1, base_x_2], axis=1)

base_y = base.iloc[:, 8]


# Transformei as bases em vetores
base_x = base_x.values
base_y = base_y.values

In [335]:
base_x.shape

(38820, 10)

In [336]:
# Transformei os valores categoricos em valores númericos:
transformador_1hot = ColumnTransformer(transformers=[('Onehot', OneHotEncoder(), [0, 3])], remainder='passthrough')
transformador_treinado = transformador_1hot.fit(base_x)
base_x = transformador_treinado.transform(base_x)

In [367]:
base_x.shape

(38820, 16)

In [370]:
# Escalando os valores
escalador = StandardScaler(with_mean=False)
escalador_treinado = escalador.fit(base_x)

base_x = escalador_treinado.transform(base_x)
base_x.shape

In [341]:
# Separando as bases de teste e de treinamento
x_treinamento, x_teste, y_treinamento, y_teste = train_test_split(base_x, base_y, test_size = 0.1, random_state = 0)

In [343]:
# Salvei as bases por preucação
with open('base_indicium.pkl', mode = 'wb') as f:
  pickle.dump([x_treinamento, x_teste, y_treinamento, y_teste ], f)

### Criando o modelo

In [344]:
# Importando a Random Forest e o MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Treinando o modelo
regressor_rf = RandomForestRegressor( min_samples_split =2, n_estimators = 100)
regressor_rf.fit(x_treinamento, y_treinamento)

In [345]:
# Prevendo os valores
previsoes = regressor_rf.predict(x_teste)

# Calculando o MSE
mean_absolute_error(y_teste, previsoes)

53.397637815558994

In [346]:
# Calculando a precisão em porcentagem do modelo
print(regressor_rf.score(x_treinamento, y_treinamento) * 100)

87.48243602117765


#### Salvando o modelo

In [347]:
pickle.dump(regressor_rf, open('random_forest.pkl', 'wb'))

### Respondendo questões especificas:

In [348]:
base['melhores_valores'] = base['numero_de_reviews'] / base['price']
melhor_quarto = base.sort_values(by='melhores_valores', ascending=False).iloc[0]
print(melhor_quarto)


id                                               9145202
nome                             Room near JFK Queen Bed
host_id                                         47621202
host_name                                           Dona
bairro_group                                      Queens
latitude                                         40.6673
longitude                                      -73.76831
room_type                                   Private room
price                                                 47
minimo_noites                                          1
numero_de_reviews                                    629
ultima_review                                 1562284800
reviews_por_mes                                    14.58
calculado_host_listings_count                          2
disponibilidade_365                                  333
melhores_valores                               13.382979
Name: 11758, dtype: object


O melhor quarto é o "JFK Queen bed" por 47$ com o minimo de uma noite e 629 reviews

### Prevendo valores especificos

In [373]:
linha_nova = {
 'bairro_group': 'Manhattan',
 'latitude': 40.75362,
 'longitude': -73.98377,
 'room_type': 'Entire home/apt',
 'minimo_noites': 1,
 'numero_de_reviews': 45,
 'ultima_review': '2019-05-21',
 'reviews_por_mes': 0.38,
 'calculado_host_listings_count': 2,
 'disponibilidade_365': 355}

base_linha = pd.DataFrame([linha_nova])

base_linha['bairro_group'] = base_linha['bairro_group'].astype('category')
base_linha['room_type'] = base_linha['room_type'].astype('category')

base_linha['ultima_review'] = pd.to_datetime(base_linha['ultima_review'])
base_linha['ultima_review'] = base_linha['ultima_review'].astype('int64') // 10**9 

In [374]:
linha_nova = base_linha.values

In [375]:
linha_nova[0]

array(['Manhattan', 40.75362, -73.98377, 'Entire home/apt', 1, 45,
       1558396800, 0.38, 2, 355], dtype=object)

In [376]:
# Transformei os valores categoricos em valores númericos:
transformador_1hot = ColumnTransformer(transformers=[('Onehot', OneHotEncoder(), [0, 3])], remainder='passthrough')
linha_nova = transformador_treinado.transform(linha_nova)
linha_nova.shape


(1, 16)

In [377]:
# Escalei os valores da linha nova
escalador = StandardScaler(with_mean=False)
linha_nova = escalador_treinado.transform(linha_nova)

In [378]:
# Prevendo o preço
previ_linha = regressor_rf.predict(linha_nova)

In [379]:
print(f'O preço é {previ_linha} dolares')

O preço é [204.01] dolares
