En este cuaderno vamos a entrenar un modelo utilizando regresión lineal.

In [1]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Make the project root importable. `os.pardir` is the parent-directory
# marker ('..'), so this resolves to one level above the current working
# directory; adjust it if the notebook lives somewhere else.
sys.path.append(os.path.abspath(os.pardir))

from utils.data_training_utils import train_test_model_with_hyperparameter_tuning

In [2]:
# Load the cleaned dataset snapshot; the path is relative to the current
# working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/working_data/data_cleaned_20250827.csv",
)

In [3]:
# Cast every text (object) column of `df` to the pandas `category` dtype.
# This rebinds the columns on `df` itself.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].astype("category")

# Encode on a copy so `df` keeps its categorical columns while `df_numeric`
# holds integer codes in their place.
df_numeric = df.copy()
encoders = {}  # column name -> LabelEncoder fitted on that column

for col in df_numeric.select_dtypes(include="category").columns:
    le = LabelEncoder()
    # Values are cast to str before fitting. LabelEncoder numbers the labels
    # in sorted order, so the integer given to each category follows the
    # ordering of its string form. Later cells filter on these integer codes.
    # NOTE(review): missing values presumably become the literal label "nan"
    # after the str cast -- confirm this is intended.
    df_numeric[col] = le.fit_transform(df_numeric[col].astype(str))
    encoders[col] = le

In [4]:
# Hyperparameter grid for LinearRegression: 2 x 2 = 4 candidate settings.
# `positive` (available since scikit-learn 0.24) forces the coefficients to
# be positive when set to True.
param_grid = dict(
    fit_intercept=[True, False],
    positive=[True, False],
)


# Base estimator with default settings; the training cells below pass it,
# together with `param_grid`, to the hyperparameter-tuning helper.
model = LinearRegression()

### Linear Regression Model for rental prices

Probamos a entrenar el modelo primero sin escalar los datos y luego lo repetimos con los datos escalados para ver si de verdad mejoran los resultados.

### Linear Regression Model for rental prices

In [5]:
# Rental subset: rows whose label-encoded `operation` equals 0.
# NOTE(review): which category is encoded as 0 depends on the fitted encoder
# (presumably the rental one) -- confirm against the encoder for `operation`.
df_numeric_rent = df_numeric.loc[df_numeric['operation'].eq(0)]

# Tune and evaluate on the unscaled data. `operation` is constant within this
# subset and is handed to the helper via `drop_features`.
(
    best_rent_num_model,
    best_rent_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_rent,
    drop_features=['operation'],
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 597.3130857491922
Best training R2: 0.5597355922061364
·········································
MAE test: 554.9261971374838
R2 test: 0.6082816091924705


In [6]:
# Average rental price, printed as a reference scale for the MAE values.
price_mean = df_numeric_rent.loc[:, 'price'].mean()
print(f"Mean price rent data: {price_mean}")

Mean price rent data: 2188.078862660944


In [7]:
# Rental subset again (rows whose label-encoded `operation` equals 0), so this
# cell does not depend on the subset built by the unscaled run.
df_numeric_rent = df_numeric.loc[df_numeric['operation'].eq(0)]

# Same tuning run as the unscaled one, but with `scale_data=True`.
# The result names are the ones the unscaled rental run assigned, so after
# this cell they refer to the scaled run.
# NOTE(review): the saved output of this cell reports a negative R2 with the
# same hyperparameters selected as in the unscaled run; verify how the helper
# scores candidates and applies scaling -- not visible from this notebook.
(
    best_rent_num_model,
    best_rent_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_rent,
    drop_features=['operation'],
    scale_data=True,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 1429.21436406148
Best training R2: -0.5587871452592739
·········································
MAE test: 1394.8051964457868
R2 test: -0.7848557040126474


### Linear Regression Model for sales prices

In [8]:
# Sales subset: rows whose label-encoded `operation` equals 1.
# NOTE(review): which category is encoded as 1 depends on the fitted encoder
# (presumably the sale one) -- confirm against the encoder for `operation`.
df_numeric_sale = df_numeric.loc[df_numeric['operation'].eq(1)]

# Tune and evaluate on the unscaled data. `operation` is constant within this
# subset and is handed to the helper via `drop_features`.
(
    best_sale_num_model,
    best_sale_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_sale,
    drop_features=['operation'],
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 249220.86332707028
Best training R2: 0.1287119488587547
·········································
MAE test: 242789.530170478
R2 test: 0.643968888223416


In [9]:
# Average sale price, printed as a reference scale for the MAE values.
price_mean = df_numeric_sale.loc[:, 'price'].mean()
print(f"Mean price sale data: {price_mean}")

Mean price sale data: 661293.86940495


In [10]:
# Sales subset again (rows whose label-encoded `operation` equals 1), so this
# cell does not depend on the subset built by the unscaled run.
df_numeric_sale = df_numeric.loc[df_numeric['operation'].eq(1)]

# Same tuning run as the unscaled one, but with `scale_data=True`.
# The result names are the ones the unscaled sales run assigned, so after
# this cell they refer to the scaled run.
# NOTE(review): the saved output of this cell reports a negative R2 with the
# same hyperparameters selected as in the unscaled run; verify how the helper
# scores candidates and applies scaling -- not visible from this notebook.
(
    best_sale_num_model,
    best_sale_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_sale,
    drop_features=['operation'],
    scale_data=True,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 469162.59809000156
Best training R2: -0.5330607763384803
·········································
MAE test: 456713.0326633496
R2 test: -0.019749052156005753
