In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import pickle, os

In [2]:
# Load the preprocessed housing table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../../data/processed/transformed_data.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-1.327835,1.052548,0.982143,,,,,1.264411,1.140915,NEAR BAY
1,-1.322844,1.043185,-0.607019,1.202953,1.140615,0.871267,1.208457,1.264411,0.839045,NEAR BAY
2,-1.332827,1.038503,1.856182,-1.195101,,,,1.126602,0.814787,NEAR BAY
3,-1.337818,1.038503,1.856182,-1.804267,-2.762127,,-2.842005,0.747298,0.772467,NEAR BAY
4,-1.337818,1.038503,1.856182,-0.874327,-1.507164,,-1.565578,0.026897,0.776062,NEAR BAY


In [3]:
# Numeric feature columns
feature_cols = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income'
]
target_col = 'median_house_value'

# Drop rows with missing target
df = df.dropna(subset=[target_col])

# Impute missing numeric features
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(df[feature_cols])
y = df[target_col].values

# Save imputer
os.makedirs('../src/imputers', exist_ok=True)
with open('../src/imputers/lasso_imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Build and fit the Lasso regressor in one step (fit returns the estimator).
# alpha controls the regularization strength (library default is 1.0);
# max_iter bounds the number of solver iterations.
lasso_model = Lasso(max_iter=10000, alpha=0.1).fit(X_train, y_train)

# Report the learned parameters.
print("Lasso Regression trained successfully!")
print("Coefficients:", lasso_model.coef_)
print("Intercept:", lasso_model.intercept_)

Lasso Regression trained successfully!
Coefficients: [-0.         -0.01666958  0.03772148  0.          0.         -0.
  0.          0.36402891]
Intercept: -0.2405585844088464


In [6]:
# Predict the target for the held-out test rows.
y_pred = lasso_model.predict(X=X_test)

In [7]:
# Score the test-set predictions against the true target values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Emit both metrics, one per line, rounded to two decimals.
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R^2 Score: {r2:.2f}",
    sep="\n",
)

Mean Squared Error: 1.06
R^2 Score: 0.16


In [8]:
# Persist the fitted model as a pickle, creating the target directory first
# if it does not exist yet.
os.makedirs('../src/models', exist_ok=True)
with open('../src/models/lasso_model.pkl', mode='wb') as model_file:
    pickle.dump(lasso_model, model_file)

print("Lasso Regression model saved successfully!")

Lasso Regression model saved successfully!
