# Modelo Regressão Linear Múltipla

### Carregamento de bibliotecas

In [107]:
# OS e Manipulação de dados
import os
import sys
import pandas as pd
import numpy as np

# Make the directory two levels above the current working directory importable,
# so the custom helper modules (e.g. `utils`) can be imported below.
currentdir = os.getcwd()
abs_path = os.path.abspath(os.path.join(currentdir, '../../'))
# Guard against piling up duplicate entries when this cell is re-run in the same kernel.
if abs_path not in sys.path:
    sys.path.append(abs_path)

# Tratamento dos dados
from utils import tratamento_de_dados

# Modelagem
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import dagshub
import mlflow
import pickle

# Initialise DagsHub for this repository with MLflow tracking enabled.
dagshub.init(
    repo_owner='aurelioguilherme',
    repo_name='AmbienteDeDesenvolvimento',
    mlflow=True,
)


### Leitura e tratamento dos dados

In [91]:
# Load the raw CSV from the Data directory two levels above the working directory.
file_path = os.path.join("../../Data", 'teste_indicium_precificacao.csv')
df = pd.read_csv(file_path)

# Columns used by the model, split by how they are preprocessed.
features_numericas = [
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
]

features_categoricas = [
    'room_type',
    'bairro_group',
    'minimo_noites_categorico',
    'disponibilidade_365_categorico',
    'ultima_review_semestre',
    'valor_preenchido',
]

# Run the project's data-treatment pipeline.
# NOTE(review): presumably X is the feature frame and y the target — confirm in utils.tratamento_de_dados.
cleaner_data = tratamento_de_dados.TransformData(df, features_categoricas, features_numericas)
X, y = cleaner_data.fit_transform()

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# Preprocessing: RobustScaler on the numeric columns, one-hot encoding
# (first level dropped) on the categorical columns.
transformer = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), features_numericas),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), features_categoricas),
    ])

# Fit on the training split only, then apply the fitted transformer to both splits.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

### Treinamento do modelo base

In [38]:
# Baseline model: linear regression fitted on the transformed training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [39]:
# Predict on the held-out test split; these predictions feed the metrics below.
y_pred = model.predict(X_test)

In [47]:
# Evaluation metrics on the test split.
# MSE is computed once and reused to derive RMSE.
mse = mean_squared_error(y_test, y_pred)
metricas = {'Métrica': ['R²', 'MAE', 'MSE', 'RMSE'],
            'Valor': [r2_score(y_test, y_pred),
                      mean_absolute_error(y_test, y_pred),
                      mse,
                      np.sqrt(mse)]}

In [46]:
# Render the metrics dictionary as a table (rich display of the last expression).
pd.DataFrame(metricas)

Unnamed: 0,Métrica,Valor
0,R²,0.08957
1,MAE,74.192615
2,MSE,52733.150074
3,RMSE,229.636996


In [63]:
# Rank the transformed features by the absolute size of their fitted coefficient.
# Note: np.abs discards the sign, so the table shows magnitude only, not direction.
coef_table = pd.DataFrame({
    'feature': transformer.get_feature_names_out(),
    'coeficiente': np.abs(model.coef_),
})
coef_table.sort_values(by="coeficiente", ascending=False)

Unnamed: 0,feature,coeficiente
16,cat__disponibilidade_365_categorico_1_Ano,165.447238
4,cat__room_type_Shared room,163.54244
9,cat__minimo_noites_categorico_Entre_1_Meses_e_...,124.364681
3,cat__room_type_Private room,118.44622
6,cat__bairro_group_Manhattan,92.500193
14,cat__minimo_noites_categorico_Entre_6_Meses_e_...,70.2761
23,cat__disponibilidade_365_categorico_Entre_6_Me...,66.180089
20,cat__disponibilidade_365_categorico_Entre_2_Me...,59.056563
10,cat__minimo_noites_categorico_Entre_1_a_3_Dias,54.318781
15,cat__minimo_noites_categorico_Mais_de_1_Ano,48.599096


# MLflow


In [104]:
# Select the MLflow experiment under which the following runs are logged.
mlflow.set_experiment(experiment_name='Linear Regression')

<Experiment: artifact_location='mlflow-artifacts:/a3f428b22db544078744bc002db609ca', creation_time=1738046783649, experiment_id='5', last_update_time=1738046783649, lifecycle_stage='active', name='Linear Regression', tags={}>

In [68]:
# Track the baseline model as an MLflow run: parameters, test metrics,
# the fitted model and transformer, and the coefficient ranking.
with mlflow.start_run(run_name='Execução Modelo Base'):
    # Refit inside the run so the logged model is the one trained here.
    model = LinearRegression()
    model.fit(X_train, y_train)

    mlflow.log_params({"model": "LinearRegression",
                       "numerical_features": features_numericas,
                       "categorical_features": features_categoricas,
                       "scaler": "RobustScaler",
                       "encoder": "OneHotEncoder"})
    y_pred = model.predict(X_test)

    # MSE is computed once and reused to derive RMSE.
    mse = mean_squared_error(y_test, y_pred)
    metricas = {'r2': r2_score(y_test, y_pred),
                'mean_absolute_error': mean_absolute_error(y_test, y_pred),
                'mean_squared_error': mse,
                'root_mean_squared_error': np.sqrt(mse)}

    mlflow.log_metrics(metricas)

    mlflow.sklearn.log_model(model, "LinearRegression-PrecificacaoNY")
    mlflow.sklearn.log_model(transformer, "transformer")

    # Absolute coefficient per transformed feature, largest first.
    # The CSV is written to the working directory and then uploaded as a run artifact.
    coefficients = pd.DataFrame({'feature': transformer.get_feature_names_out(),
                                 'coeficiente': np.abs(model.coef_)}).sort_values(by="coeficiente", ascending=False)
    coefficients.to_csv("coefficients.csv", index=False)
    mlflow.log_artifact("coefficients.csv")





🏃 View run Execução Modelo Base at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/5/runs/c459978202b74b84bef9b2593523f339
🧪 View experiment at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/5


# 2 Experimentação:

- Abordagem com variáveis numéricas


In [95]:
# Reload the raw CSV so this experiment starts from untouched data.
file_path = os.path.join("../../Data", 'teste_indicium_precificacao.csv')
df = pd.read_csv(file_path)

# Second experiment: availability, minimum nights and coordinates are used as
# raw numeric columns instead of their categorical versions.
features_numericas = [
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
    'disponibilidade_365',
    'latitude',
    'longitude',
    'minimo_noites',
]

features_categoricas = [
    'room_type',
    'bairro_group',
    'valor_preenchido',
]

# Run the project's data-treatment pipeline.
# NOTE(review): presumably X is the feature frame and y the target — confirm in utils.tratamento_de_dados.
cleaner_data = tratamento_de_dados.TransformData(df, features_categoricas, features_numericas)
X, y = cleaner_data.fit_transform()

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

# Preprocessing: RobustScaler on the numeric columns, one-hot encoding
# (first level dropped) on the categorical columns.
transformer = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), features_numericas),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), features_categoricas),
    ])

# Fit on the training split only, then apply the fitted transformer to both splits.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [100]:
# Linear regression fitted on the training split of the second feature set.
model = LinearRegression()
model.fit(X_train, y_train)

In [114]:
# Predict on the held-out test split and summarise the evaluation metrics.
y_pred = model.predict(X_test)

# MSE is computed once and reused to derive RMSE.
mse = mean_squared_error(y_test, y_pred)
metricas = {'Métrica': ['R²', 'MAE', 'MSE', 'RMSE'],
            'Valor': [r2_score(y_test, y_pred),
                      mean_absolute_error(y_test, y_pred),
                      mse,
                      np.sqrt(mse)]}

pd.DataFrame(metricas)

Unnamed: 0,Métrica,Valor
0,R²,0.105432
1,MAE,72.067135
2,MSE,46035.07187
3,RMSE,214.557852


In [103]:
# Rank the transformed features by the absolute size of their fitted coefficient.
# Note: np.abs discards the sign, so the table shows magnitude only, not direction.
coef_table = pd.DataFrame({
    'feature': transformer.get_feature_names_out(),
    'coeficiente': np.abs(model.coef_),
})
coef_table.sort_values(by="coeficiente", ascending=False)

Unnamed: 0,feature,coeficiente
8,cat__room_type_Shared room,149.785546
12,cat__bairro_group_Staten Island,148.378172
7,cat__room_type_Private room,109.614196
13,cat__valor_preenchido_1,49.155665
3,num__disponibilidade_365,47.222958
9,cat__bairro_group_Brooklyn,34.021825
10,cat__bairro_group_Manhattan,24.465613
5,num__longitude,24.036828
4,num__latitude,14.980752
11,cat__bairro_group_Queens,5.105063


## MLflow

In [105]:
# Track the numeric-features model as an MLflow run: parameters, test metrics,
# the fitted model and transformer, and the coefficient ranking.
with mlflow.start_run(run_name='Execução Modelo com Features Numéricas'):
    # Refit inside the run so the logged model is the one trained here.
    model = LinearRegression()
    model.fit(X_train, y_train)

    mlflow.log_params({"model": "LinearRegression",
                       "numerical_features": features_numericas,
                       "categorical_features": features_categoricas,
                       "scaler": "RobustScaler",
                       "encoder": "OneHotEncoder"})
    y_pred = model.predict(X_test)

    # MSE is computed once and reused to derive RMSE.
    mse = mean_squared_error(y_test, y_pred)
    metricas = {'r2': r2_score(y_test, y_pred),
                'mean_absolute_error': mean_absolute_error(y_test, y_pred),
                'mean_squared_error': mse,
                'root_mean_squared_error': np.sqrt(mse)}

    mlflow.log_metrics(metricas)

    mlflow.sklearn.log_model(model, "LinearRegression-PrecificacaoNY")
    mlflow.sklearn.log_model(transformer, "transformer")

    # Absolute coefficient per transformed feature, largest first.
    # The CSV is written to the working directory and then uploaded as a run artifact.
    coefficients = pd.DataFrame({'feature': transformer.get_feature_names_out(),
                                 'coeficiente': np.abs(model.coef_)}).sort_values(by="coeficiente", ascending=False)
    coefficients.to_csv("coefficients.csv", index=False)
    mlflow.log_artifact("coefficients.csv")





🏃 View run Execução Modelo com Features Numéricas at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/5/runs/6f5536a36b714199a9009be8bf857517
🧪 View experiment at: https://dagshub.com/aurelioguilherme/AmbienteDeDesenvolvimento.mlflow/#/experiments/5


In [111]:
# Persist the fitted model to a .pkl file.
# Uses a path relative to the working directory (the same "../../" convention used
# to read the data) instead of a machine-specific absolute path, so the notebook
# can run on another checkout.
# NOTE(review): assumes the Models directory sits next to Data, two levels up — confirm.
model_path = os.path.join("../../Models", 'modelo_precificacaoRL.pkl')
with open(model_path, 'wb') as file:
    pickle.dump(model, file)
     

In [113]:
# Persist the fitted ColumnTransformer (scaler + encoder) to a .pkl file.
# Uses a path relative to the working directory (the same "../../" convention used
# to read the data) instead of a machine-specific absolute path, so the notebook
# can run on another checkout.
# NOTE(review): assumes the Models directory sits next to Data, two levels up — confirm.
transformer_path = os.path.join("../../Models", 'Transformer_RL.pkl')
with open(transformer_path, 'wb') as file:
    pickle.dump(transformer, file)