In [6]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, make_scorer, root_mean_squared_error
from sklearn.pipeline import Pipeline


In [2]:
# Read the training table from the project-relative CSV into a DataFrame.
train_csv = "datas/df_1123_train.csv"
df = pd.read_csv(train_csv)

In [7]:
# Custom RMSE scoring with rounding (and optional clipping)
def rounded_rmse(y_true, y_pred, clip_min=None, clip_max=None):
    """Return the RMSE between ``y_true`` and integer-rounded predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Raw model predictions; they are rounded to the nearest integer
        before the error is computed.
    clip_min, clip_max : scalar or None, default None
        Optional lower/upper bound applied to the rounded predictions.
        When both are ``None`` no clipping is done, which is the original
        behaviour of this function.

    Returns
    -------
    float
        Root mean squared error of the rounded (and optionally clipped)
        predictions.
    """
    y_pred_rounded = np.round(y_pred)
    # np.clip rejects two None bounds, so only call it when at least one is set.
    if clip_min is not None or clip_max is not None:
        y_pred_rounded = np.clip(y_pred_rounded, clip_min, clip_max)
    return root_mean_squared_error(y_true, y_pred_rounded)

# Custom scorer for GridSearchCV.
# greater_is_better=False makes the scorer report the negated RMSE, so the
# search (which maximises the score) selects the lowest error.
# The [0, 5] bounds mirror the clipping applied to predictions in the final
# evaluation cells, so cross-validation optimises the same metric that is
# reported there.
# NOTE(review): assumes the target only takes values in [0, 5] - confirm.
rmse_scorer = make_scorer(
    rounded_rmse,
    greater_is_better=False,
    clip_min=0,
    clip_max=5,
)

In [4]:
# Separate the target column from the predictor columns
target_col = 'price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [5]:
# Hold out 20% of the rows as a test set. The split is seeded for
# reproducibility and stratified on the target values.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Ridge regression on standardized features, wrapped in a single pipeline so
# the scaler is re-fit inside every cross-validation fold.
ridge_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

# Candidate regularization strengths for the Ridge step.
# NOTE(review): the recorded run picked alpha=1000, the upper edge of this
# grid - consider whether larger values should be searched.
ridge_param_grid = {'ridge__alpha': [0.1, 1, 10, 100, 1000]}

# 5-fold grid search scored with the custom rounded-RMSE scorer,
# using all available cores.
ridge_grid = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=ridge_param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
ridge_grid.fit(X_train, y_train)

# Keep the refit best pipeline for evaluation on the held-out set.
best_ridge = ridge_grid.best_estimator_
print(f"Best Ridge parameters: {ridge_grid.best_params_}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Ridge parameters: {'ridge__alpha': 1000}


In [9]:
# Score the tuned Ridge pipeline on the held-out test set
y_pred_ridge = best_ridge.predict(X_test)

# Snap predictions to whole numbers, then bound them to the [0, 5] range
y_pred_ridge_rounded = np.round(y_pred_ridge).astype(int)
y_pred_ridge_clipped = np.clip(y_pred_ridge_rounded, a_min=0, a_max=5)

# RMSE of the rounded and clipped predictions
rmse_ridge = root_mean_squared_error(y_test, y_pred_ridge_clipped)
print(f"Final RMSE (Ridge Regression): {rmse_ridge:.4f}")

Final RMSE (Ridge Regression): 0.9681


In [10]:
# Define a pipeline with scaling and Lasso Regression.
# max_iter is raised to 10000 to give the solver more iterations to converge.
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso(max_iter=10000))
])

# Define the hyperparameter grid for Lasso Regression
lasso_param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10]
}

# Initialize GridSearchCV.
# Score with the same custom rounded-RMSE scorer as the Ridge search, so both
# models are tuned on the metric they are compared on, not on the raw
# (unrounded) mean squared error.
lasso_grid = GridSearchCV(
    lasso_pipeline,
    lasso_param_grid,
    cv=5,
    scoring=rmse_scorer,
    n_jobs=-1,
    verbose=2
)

# Fit the Lasso Regression model
lasso_grid.fit(X_train, y_train)

# Best Lasso model (pipeline refit with the best alpha)
best_lasso = lasso_grid.best_estimator_
print(f"Best Lasso parameters: {lasso_grid.best_params_}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END .....................................ridge__alpha=1; total time=   1.5s
[CV] END ...................................ridge__alpha=100; total time=   1.2s
[CV] END ..................................ridge__alpha=1000; total time=   1.0s
[CV] END ..................................lasso__alpha=0.01; total time=   2.9s
[CV] END ..................................lasso__alpha=0.01; total time=   2.4s
[CV] END ...................................lasso__alpha=0.1; total time=   0.7s
[CV] END .....................................lasso__alpha=1; total time=   0.7s
[CV] END ....................................lasso__alpha=10; total time=   0.6s
[CV] END ...................................ridge__alpha=0.1; total time=   1.7s
[CV] END ....................................ridge__alpha=10; total time=   1.2s
[CV] END ...................................ridge__alpha=100; total time=   1.1s
[CV] END ..................................lasso_

In [12]:
# Score the tuned Lasso pipeline on the held-out test set
y_pred_lasso = best_lasso.predict(X_test)

# Snap predictions to whole numbers, then bound them to the [0, 5] range
y_pred_lasso_rounded = np.round(y_pred_lasso).astype(int)
y_pred_lasso_clipped = np.clip(y_pred_lasso_rounded, a_min=0, a_max=5)

# RMSE of the rounded and clipped predictions
rmse_lasso = root_mean_squared_error(y_test, y_pred_lasso_clipped)
print(f"Final RMSE (Lasso Regression): {rmse_lasso:.4f}")


Final RMSE (Lasso Regression): 0.9755
