In [2]:
# Lasso regression to predict the `responder_6` target

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned dataset from its parquet file.
data_path = "../data/processed/day0_data_cleaned_finalcode.parquet"
day0_data_cleaned = pd.read_parquet(data_path)

# Separate the regression target (y) from the remaining predictor columns (X).
target_column = "responder_6"
y = day0_data_cleaned[target_column]
X = day0_data_cleaned.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: report the shape of each piece of the split.
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Target Shape: {y_test.shape}")

Training Features Shape: (1555368, 56)
Testing Features Shape: (388842, 56)
Training Target Shape: (1555368,)
Testing Target Shape: (388842,)


In [4]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Baseline: Lasso with alpha=1.0 (alpha sets the strength of the L1 penalty).
lasso_model = Lasso(alpha=1.0, random_state=42)

# Fit on the raw (unscaled) training split, bracketing the fit with progress messages.
print("Starting Lasso Regression Training...")
lasso_model.fit(X_train, y_train)
print("Lasso Regression Training Complete!")

# Score the held-out test split.
y_pred_lasso = lasso_model.predict(X_test)

# Error and goodness-of-fit metrics; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred_lasso)
mae = mean_absolute_error(y_test, y_pred_lasso)
mse = mean_squared_error(y_test, y_pred_lasso)
rmse = np.sqrt(mse)

# Report the baseline metrics.
print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")
print(f"Test R²: {r2}")

Starting Lasso Regression Training...
Lasso Regression Training Complete!
Test MSE: 0.7593987584114075
Test RMSE: 0.8714348847799286
Test MAE: 0.5622975826263428
Test R²: 3.933906555175781e-05


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
import numpy as np

# Candidate regularization strengths: 20 log-spaced values from 1e-4 to 1e1.
alpha_values = np.logspace(-4, 1, 20)

# 5-fold cross-validated grid search over alpha, scored by negative MSE
# (scikit-learn maximizes scores, so a lower MSE gives a higher score).
lasso = Lasso(random_state=42)
param_grid = {"alpha": alpha_values}

grid_search = GridSearchCV(lasso, param_grid, scoring="neg_mean_squared_error", cv=5, n_jobs=-1, verbose=2)

print("Starting Hyperparameter Tuning for Lasso Regression...")

# Run the grid search on the (unscaled) training split.
# NOTE(review): the recorded output for this cell contains a solver warning
# fragment (`cd_fast.enet_coordinate_descent`), which presumably is a
# convergence warning for the smallest alphas — confirm, and consider scaling
# the features or raising `max_iter` before trusting the selected alpha.
grid_search.fit(X_train, y_train)

# Get the best alpha
best_alpha = grid_search.best_params_["alpha"]
print(f"Best Alpha Found: {best_alpha}")

# GridSearchCV (default refit=True) has already refit a Lasso with the best
# alpha on the full training split, so reuse that estimator instead of paying
# for an identical second fit.
lasso_best = grid_search.best_estimator_

print("Lasso Model Training with Best Alpha Complete!")

# Predict on test set
y_pred_lasso_best = lasso_best.predict(X_test)

# Evaluate the optimized model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred_lasso_best)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_lasso_best)
r2 = r2_score(y_test, y_pred_lasso_best)

# Print results
print(f"Optimized Test MSE: {mse}")
print(f"Optimized Test RMSE: {rmse}")
print(f"Optimized Test MAE: {mae}")
print(f"Optimized Test R²: {r2}")

Starting Hyperparameter Tuning for Lasso Regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .......................................alpha=0.0001; total time= 2.3min
[CV] END .......................................alpha=0.0001; total time= 2.3min
[CV] END .......................alpha=0.00018329807108324357; total time= 2.3min
[CV] END .......................alpha=0.00018329807108324357; total time= 2.3min
[CV] END .......................alpha=0.00018329807108324357; total time= 2.3min
[CV] END .......................................alpha=0.0001; total time= 2.3min
[CV] END .......................................alpha=0.0001; total time= 2.3min
[CV] END .......................alpha=0.00018329807108324357; total time= 1.1min
[CV] END ........................alpha=0.0006158482110660267; total time=  15.8s


  model = cd_fast.enet_coordinate_descent(


[CV] END .......................................alpha=0.0001; total time= 4.1min
[CV] END ........................alpha=0.0003359818286283781; total time= 1.8min
[CV] END ........................alpha=0.0003359818286283781; total time= 1.9min
[CV] END .......................alpha=0.00018329807108324357; total time= 1.9min
[CV] END ........................alpha=0.0006158482110660267; total time=  20.4s
[CV] END ........................alpha=0.0003359818286283781; total time= 1.9min
[CV] END ........................alpha=0.0003359818286283781; total time= 1.9min
[CV] END ........................alpha=0.0003359818286283781; total time= 1.9min
[CV] END ........................alpha=0.0011288378916846883; total time=   4.8s
[CV] END ........................alpha=0.0011288378916846883; total time=   3.9s
[CV] END ........................alpha=0.0006158482110660267; total time=   9.6s
[CV] END ........................alpha=0.0006158482110660267; total time=  12.3s
[CV] END ...................

In [6]:
# Feature scaling

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data never influences them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature Scaling Applied!")

Feature Scaling Applied!


In [7]:
# Select only important features

# Refit the tuned Lasso on the standardized training features.
# Note: this overwrites the earlier fit stored in `lasso_best`.
lasso_best.fit(X_train_scaled, y_train)

# A feature counts as "important" when Lasso left its coefficient non-zero.
nonzero_mask = lasso_best.coef_ != 0
important_features = X_train.columns[nonzero_mask]
print(f"Selected {len(important_features)} important features out of {X_train.shape[1]}")

# Keep only the selected columns of the scaled train/test arrays.
X_train_selected = X_train_scaled[:, nonzero_mask]
X_test_selected = X_test_scaled[:, nonzero_mask]

Selected 56 important features out of 56


In [8]:
# Further hyperparameter search

# Finer grid: 50 log-spaced alphas between 1e-5 and 1e1.
alpha_values = np.logspace(-5, 1, num=50)
param_grid = {"alpha": alpha_values}

# Same 5-fold, negative-MSE search as before, now on the scaled/selected features.
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_selected, y_train)

best_alpha = grid_search.best_params_["alpha"]
print(f"New Best Alpha: {best_alpha}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


Python(91948) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91949) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91950) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91951) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91953) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91954) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91955) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV] END ........................................alpha=1e-05; total time=  26.3s
[CV] END ........................................alpha=1e-05; total time=  51.6s
[CV] END .......................alpha=1.3257113655901082e-05; total time=  51.5s
[CV] END .......................alpha=1.3257113655901082e-05; total time=  51.6s
[CV] END ........................................alpha=1e-05; total time=  51.6s
[CV] END .......................alpha=1.3257113655901082e-05; total time=  51.6s
[CV] END ........................................alpha=1e-05; total time=  51.6s
[CV] END ........................................alpha=1e-05; total time=  52.5s
[CV] END ........................alpha=1.757510624854793e-05; total time=  18.2s
[CV] END ........................alpha=1.757510624854793e-05; total time=  18.7s
[CV] END .......................alpha=1.3257113655901082e-05; total time=  18.9s
[CV] END .......................alpha=1.3257113655901082e-05; total time=  37.4s
[CV] END ...................

In [9]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Final model: Lasso with the alpha chosen by the refined search, fit on the
# scaled, selected training features.
lasso_optimized = Lasso(alpha=best_alpha, random_state=42)
lasso_optimized.fit(X_train_selected, y_train)

print("Optimized Lasso Model Training Complete!")

# Score the matching (scaled, selected) test features.
y_pred_lasso_optimized = lasso_optimized.predict(X_test_selected)

# Error and goodness-of-fit metrics; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred_lasso_optimized)
mae = mean_absolute_error(y_test, y_pred_lasso_optimized)
mse = mean_squared_error(y_test, y_pred_lasso_optimized)
rmse = np.sqrt(mse)

# Report the final metrics.
print("\n Optimized Lasso Model Performance ")
print(f"Test MSE: {mse}")
print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")
print(f"Test R²: {r2}")

Optimized Lasso Model Training Complete!

 Optimized Lasso Model Performance 
Test MSE: 0.748341977596283
Test RMSE: 0.8650676144650676
Test MAE: 0.5579515099525452
Test R²: 0.014598727226257324
