# Precificação de Aluguéis em Nova York

## Data Understanding

In [8]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_DIR = "../Data"
# NOTE(review): the constant name mentions "POLLUTION" but it points at the
# rental-pricing CSV; the name is kept so any other reference keeps working.
POLLUTION_DATASET_FILE_NAME = "teste_indicium_precificacao.csv"

file_path = os.path.join(DATA_DIR, POLLUTION_DATASET_FILE_NAME)

# Fail loudly here: every later cell needs `df`, so merely printing a message
# would only postpone the failure to a confusing NameError further down.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

df = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Summary statistics, transposed so that each column is shown as one row.
df.describe().T

In [None]:
# Show a single record to inspect the fields of one listing.
df.head(1)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing-value report: absolute count and percentage of rows, restricted to
# the columns that actually contain nulls.
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]

pd.DataFrame(
    {
        "Quantidade_Dados_Faltantes": null_counts,
        "Porcentagem_Dados_Faltantes": round((null_counts / len(df) * 100), 2),
    }
)

In [None]:
# Check whether `ultima_review` and `reviews_por_mes` are missing on exactly
# the same rows. Comparing the two boolean masks works even when the columns
# have a different number of nulls (comparing the filtered indexes with `==`
# raises in that case) and yields a single True/False.
df["ultima_review"].isnull().equals(df["reviews_por_mes"].isnull())

In [None]:
# Rows that repeat an earlier row across every column.
df[df.duplicated()]

In [None]:
# Names of the columns in which at least one value repeats.
lista = [col for col in df.columns if df[col].duplicated().any()]
lista

In [11]:
# Numeric columns inspected for outliers in the cells below.
valores_continuos = [
    'price',
    'minimo_noites',
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
    'disponibilidade_365',
]

# Sub-frame holding only those columns.
outliers_df = df.loc[:, valores_continuos]

In [None]:
# Summary statistics of the continuous columns, one row per column.
outliers_df.describe().T

In [None]:
# One statistic per output column, indexed by the continuous variable name.
stat_columns = {
    "Valores únicos": outliers_df.nunique(),
    "Média": outliers_df.mean(),
    "Desvio padrão": outliers_df.std(),
    "Valor mínimo": outliers_df.min(),
    "Valor máximo": outliers_df.max(),
}
stat_df = pd.DataFrame(stat_columns)
stat_df

In [None]:
# Number of distinct values in each continuous column.
outliers_df.nunique()

In [None]:
# One boxplot per continuous variable, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(16, 6))
axes = axes.flatten()

for ax, col in zip(axes, valores_continuos):
    sns.boxplot(data=outliers_df, x=col, ax=ax)
    ax.set_title(f'boxplot {col}')

plt.tight_layout()
plt.show()


In [None]:
# One histogram per continuous variable, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(16, 6))
axes = axes.flatten()

for i, col in enumerate(valores_continuos):
    sns.histplot(data=outliers_df, x=col, ax=axes[i])
    # These panels are histograms; the title previously said "boxplot"
    # because it was copied from the boxplot cell.
    axes[i].set_title(f'histograma {col}')

plt.tight_layout()
plt.show()


### Price

In [None]:
# Listings whose price is below 1.
df.query('price < 1')

In [None]:
# Third quartile of price; `q3` is reused by later cells.
q3 = df.price.quantile(0.75)
# Listings priced at or above the third quartile.
df.query('price >= @q3')

In [None]:
# Number of listings priced at or above the third quartile.
len(df.query('price >= @q3'))

In [155]:
# Outlier fences for price: 1.5 * IQR below Q1 and above Q3.
# Both quartiles are computed here so the cell does not rely on `q3` having
# been left in the kernel by another cell.
q1 = df.price.quantile(0.25)
q3 = df.price.quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

In [None]:
# Listings priced above the upper fence (Q3 + 1.5 * IQR).
df.query('price > @upper_bound')

In [None]:
# Number of listings priced above the upper fence.
len(df.query('price > @upper_bound'))

In [None]:
# Listings whose host-listing count is above 1 and whose minimum stay is
# under 30 nights.
df.query("calculado_host_listings_count > 1 & minimo_noites < 30")

In [None]:
# Number of distinct hosts.
df['host_id'].nunique()

In [None]:
# Average number of listings per host, rounded to the nearest integer.
round(df["host_id"].value_counts().mean())

In [None]:
# Number of distinct values in `bairro_group`.
df["bairro_group"].nunique()

In [None]:
# Number of listings (non-null ids) in each `bairro_group`.
df.groupby('bairro_group')['id'].count().to_frame('Quantidade de anuncios')

In [None]:
# Listings per host, largest first.
teste = (
    df.groupby('host_id')['id']
    .count()
    .to_frame('Quantidade de anuncios')
    .sort_values('Quantidade de anuncios', ascending=False)
)
teste

In [None]:
# Ten hosts with the most listings.
(
    df["host_id"]
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .head(10)
)

In [None]:
# Price statistics per room type, ignoring listings with a price of zero.
priced_listings = df.query('price != 0')
priced_listings.groupby('room_type')['price'].agg(['mean', 'count', 'min', 'max', 'sum'])

In [None]:
# Number of distinct minimum-night values.
df.minimo_noites.nunique()

In [18]:
# Preview two rows.
# NOTE(review): the saved output below already shows `minimo_noites_grupo`,
# which is assigned in a later cell; on a fresh top-to-bottom run this preview
# would presumably not include it (unless the CSV already has that column).
df.head(2)

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,minimo_noites_grupo
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,Até 1 Semana
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365,Até 1 Semana


In [16]:
# Bucket the minimum stay into four labelled ranges using the edges
# 0, 7, 14, 30 and infinity (intervals are closed on the right, the pd.cut default).
noites_bins = [0, 7, 14, 30, float('inf')]
noites_labels = ['Até 1 Semana', 'Entre 1 e 2 Semanas', 'Entre 2 Semanas e 1 Mês', 'Mais de 1 Mês']

df['minimo_noites_grupo'] = pd.cut(df['minimo_noites'], bins=noites_bins, labels=noites_labels)


In [23]:
# Display the frame (pandas truncates the rendering) to check the new
# `minimo_noites_grupo` column.
df

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365,minimo_noites_grupo
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,Até 1 Semana
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,Até 1 Semana
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,Até 1 Semana
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,Entre 1 e 2 Semanas
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,Até 1 Semana
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48889,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,Até 1 Semana
48890,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,Até 1 Semana
48891,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,Entre 1 e 2 Semanas
48892,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,Até 1 Semana


In [24]:
# Price statistics per (minimum-stay group, room type), ignoring listings
# with a price of zero; observed=True keeps only combinations present in the data.
(
    df.query('price != 0')
    .groupby(['minimo_noites_grupo', 'room_type'], observed=True)[['price']]
    .agg(['mean', 'count', 'min', 'max', 'sum'])
    .reset_index()
)

Unnamed: 0_level_0,minimo_noites_grupo,room_type,price,price,price,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count,min,max,sum
0,Até 1 Semana,Entire home/apt,210.817349,20728,10,10000,4369822
1,Até 1 Semana,Private room,89.962283,19832,10,7500,1784132
2,Até 1 Semana,Shared room,74.072581,992,10,1800,73480
3,Entre 1 e 2 Semanas,Entire home/apt,181.206566,731,16,6000,132462
4,Entre 1 e 2 Semanas,Private room,72.053111,659,16,1000,47483
5,Entre 1 e 2 Semanas,Shared room,39.372093,43,15,235,1693
6,Entre 2 Semanas e 1 Mês,Entire home/apt,220.615854,3444,10,10000,759801
7,Entre 2 Semanas e 1 Mês,Private room,71.558401,1601,24,6500,114565
8,Entre 2 Semanas e 1 Mês,Shared room,49.622642,106,15,800,5260
9,Mais de 1 Mês,Entire home/apt,236.894841,504,12,6500,119395


In [29]:
# Number of distinct values of `calculado_host_listings_count`.
df.calculado_host_listings_count.nunique()

47

In [26]:
# Number of distinct values of `numero_de_reviews`.
df.numero_de_reviews.nunique()

394

In [None]:
# Distinct minimum-night values present in the data.
df['minimo_noites'].unique()

In [None]:
# Correlation matrix of the continuous variables. The column list is the
# same `valores_continuos` defined earlier in the notebook, reused here
# instead of repeating it by hand.
ax = sns.heatmap(df[valores_continuos].corr(), annot=True)
ax.set_title("Correlação entre as variáveis contínuas")
# plt.show() renders the figure without echoing the Axes repr as cell output.
plt.show()

**TODO:** analisar latitude e longitude para verificar a quantidade de imóveis e se existem imóveis diferentes disponíveis no mesmo endereço.

## MLflow

In [6]:
import mlflow
import mlflow.sklearn

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Select the MLflow experiment that the runs below are logged under.
# NOTE(review): the experiment name refers to a "pollution" dataset while this
# notebook loads a rental-pricing CSV — confirm the name is intended.
mlflow.set_experiment("pollution_dataset_experiment")

In [9]:
# Features: every column except the last one, restricted to numeric dtypes —
# the tree ensemble fitted below cannot consume free-text columns such as
# `nome` or `host_name`.
# NOTE(review): `reviews_por_mes` contains missing values, and if the target is
# `minimo_noites_grupo` then `minimo_noites` leaks it directly — confirm the
# intended feature set and target before trusting any metric.
X = df.iloc[:, :-1].select_dtypes(include="number")
# Target: the last column, taken as a 1-D Series (not a one-column frame) so
# the label encoder and the estimator receive the shape they expect.
y = df.iloc[:, -1]

In [None]:
# Encode the target labels as integers. LabelEncoder works on 1-D input, so
# the target is flattened first; this is a no-op when `y` is already 1-D and
# avoids the column-vector conversion when it is a single-column frame.
# NOTE: re-running this cell re-encodes the already encoded `y`; re-create `y`
# from the split cell before running it again.
encoder = LabelEncoder()
y = encoder.fit_transform(np.ravel(y))

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Random forest classifier with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
with mlflow.start_run():
    # Train the model and predict on the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation metrics; macro averaging weights every class equally.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Log the hyperparameters actually set on the estimator, so the run
    # record cannot drift from the cell that builds `model`.
    mlflow.log_params(
        {
            "n_estimators": model.n_estimators,
            "random_state": model.random_state,
        }
    )
    mlflow.log_metrics(
        {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        }
    )

    # Log the fitted model. The input example is only a sample used to infer
    # the model signature, so a few rows are enough — logging the entire test
    # set would store all of it with the model artifact.
    mlflow.sklearn.log_model(model, "classification_rf_model", input_example=X_test[:5])

In [None]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Autologging example on scikit-learn's diabetes dataset.
# NOTE: this cell rebinds X_train, X_test, y_train and y_test, replacing the
# split created from `df` earlier in the notebook.

# set the experiment id
mlflow.set_experiment(experiment_id="0")

mlflow.autolog()
db = load_diabetes()

# Fixed seed so the split (and therefore the logged metrics) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target, random_state=42)

# Create and train models (seeded for the same reason).
rf = RandomForestRegressor(n_estimators=100, max_depth=6, max_features=3, random_state=42)
rf.fit(X_train, y_train)

# Use the model to make predictions on the test dataset.
predictions = rf.predict(X_test)

In [2]:
# Store runs in an `mlruns` folder under the current working directory rather
# than a hard-coded path inside one user's home directory, so the notebook
# works on other machines.
# NOTE(review): the tracking URI only affects runs started after this call;
# this cell sits below the cells that log runs — confirm the intended order.
mlflow.set_tracking_uri(os.path.abspath("mlruns"))