# 🤖 Data Modeling with Linear Regression

In [6]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Add the project root to the import path so the local `utils` package can be imported.
# '..' resolves to one level above the current working directory (normally the
# notebook's folder); adjust it if the notebook lives somewhere else.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:  # guard: re-running this cell must not append duplicates
    sys.path.append(PROJECT_ROOT)

from utils.data_training_utils import train_test_model_with_hyperparameter_tuning

In [7]:
# Load the cleaned dataset snapshot (file name dated 20250901); the path is
# relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/working_data/data_cleaned_20250901.csv")

In [8]:
# Cast every object-dtype (text) column of `df` to pandas' categorical dtype.
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].astype("category")

# Work on a copy so `df` keeps its categorical columns while `df_numeric`
# becomes an all-numeric frame for modelling.
df_numeric = df.copy()

# Column name -> fitted LabelEncoder, stored for later lookup
# (e.g. `classes_` / `inverse_transform` to recover the original labels).
encoders = {}

# Replace each categorical column with the integer codes produced by LabelEncoder.
# NOTE(review): these codes are arbitrary integers, but a linear model treats them
# as ordered magnitudes. scikit-learn documents LabelEncoder as meant for target
# labels; consider OneHotEncoder / OrdinalEncoder for features — TODO evaluate.
# NOTE(review): `.astype(str)` presumably turns missing values into a literal string
# category — confirm the cleaned data has no NaNs in these columns.
for col in df_numeric.select_dtypes(include="category").columns:
    le = LabelEncoder()
    df_numeric[col] = le.fit_transform(df_numeric[col].astype(str))
    encoders[col] = le

In [9]:
# Hyperparameter grid for the search: both boolean LinearRegression options are
# explored in every combination (2 x 2 = 4 candidates).
param_grid = dict(
    fit_intercept=[True, False],
    # `positive` (available since scikit-learn 0.24) forces the coefficients to be positive.
    positive=[True, False],
)

# Base estimator handed to the tuning helper in the cells below.
model = LinearRegression()

### Linear Regression Model for rental prices

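The model is first trained on the raw (unscaled) data; the experiment is then repeated with scaled data (`scale_data=True`) to check whether scaling actually improves predictive performance. In the runs recorded below it did not: the scaled variant yields a higher MAE and a negative R² on both the training and the test set.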

In [10]:
# Keep only the rows whose encoded 'operation' is 0 (the rental subset).
# NOTE(review): assumes the label encoder mapped the rental category to 0 —
# confirm with encoders['operation'].classes_.
df_numeric_rent = df_numeric.loc[df_numeric['operation'].eq(0)]

# Hyperparameter search + evaluation without the `scale_data` flag (the raw-data run).
# 'operation' is constant within this subset and is passed as a feature to drop.
(
    best_rent_num_model,
    best_rent_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_rent,
    drop_features=['operation'],
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 597.4172602092046
Best training R2: 0.5597123682627917
·········································
MAE test: 558.4104755834879
R2 test: 0.6062971819201288


In [11]:
# Average price over the rental subset — presumably printed as a reference scale
# for the MAE values reported by the tuning runs (confirm intent).
price_mean = df_numeric_rent.loc[:, 'price'].mean()
print(f"Mean price rent data: {price_mean}")

Mean price rent data: 2188.078862660944


In [12]:
# Re-select the rows whose encoded 'operation' is 0 (the rental subset).
# NOTE(review): assumes the label encoder mapped the rental category to 0 —
# confirm with encoders['operation'].classes_.
df_numeric_rent = df_numeric.loc[df_numeric['operation'].eq(0)]

# Same search as the raw-data run, but with scale_data=True.
# Note: this rebinds best_rent_num_model / best_rent_num_params and the four metric
# variables, replacing the values produced by the unscaled run.
# NOTE(review): in the recorded output the model selected here has a negative R² on
# both train and test; worth checking how the helper ranks candidates (e.g. that a
# lower MAE is preferred) — TODO confirm in utils.data_training_utils.
(
    best_rent_num_model,
    best_rent_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_rent,
    drop_features=['operation'],
    scale_data=True,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 1436.3349932155245
Best training R2: -0.564065811629076
·········································
MAE test: 1409.841337342733
R2 test: -0.801136096172161


### Linear Regression Model for sales prices

In [13]:
# Keep only the rows whose encoded 'operation' is 1 (the sales subset).
# NOTE(review): assumes the label encoder mapped the sale category to 1 —
# confirm with encoders['operation'].classes_.
df_numeric_sale = df_numeric.loc[df_numeric['operation'].eq(1)]

# Hyperparameter search + evaluation without the `scale_data` flag (the raw-data run).
# 'operation' is constant within this subset and is passed as a feature to drop.
(
    best_sale_num_model,
    best_sale_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_sale,
    drop_features=['operation'],
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 249209.59525448224
Best training R2: 0.12768408709478968
·········································
MAE test: 242504.40860235918
R2 test: 0.6444374908819035


In [14]:
# Average price over the sales subset — presumably printed as a reference scale
# for the MAE values reported by the tuning runs (confirm intent).
price_mean = df_numeric_sale.loc[:, 'price'].mean()
print(f"Mean price sale data: {price_mean}")

Mean price sale data: 661293.86940495


In [15]:
# Re-select the rows whose encoded 'operation' is 1 (the sales subset).
# NOTE(review): assumes the label encoder mapped the sale category to 1 —
# confirm with encoders['operation'].classes_.
df_numeric_sale = df_numeric.loc[df_numeric['operation'].eq(1)]

# Same search as the raw-data run, but with scale_data=True.
# Note: this rebinds best_sale_num_model / best_sale_num_params and the four metric
# variables, replacing the values produced by the unscaled run.
# NOTE(review): in the recorded output the model selected here has a negative R² on
# both train and test; worth checking how the helper ranks candidates (e.g. that a
# lower MAE is preferred) — TODO confirm in utils.data_training_utils.
(
    best_sale_num_model,
    best_sale_num_params,
    mae_train,
    r2_train,
    mae_test,
    r2_test,
) = train_test_model_with_hyperparameter_tuning(
    model,
    param_grid,
    df_numeric_sale,
    drop_features=['operation'],
    scale_data=True,
)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model found: LinearRegression(fit_intercept=False, positive=True)
Best hyperparameters: {'fit_intercept': False, 'positive': True}
Best training MAE: 471334.7179674739
Best training R2: -0.5444034350570609
·········································
MAE test: 456551.31784538255
R2 test: -0.0203691276524407
