In [15]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import os, pickle
import pandas as pd

# Train a multiple linear regression on the processed housing data and
# persist both the fitted imputer and the fitted model as pickles.
#
# The imputer is fitted on the training partition only. Fitting it on the
# full dataset before splitting would let test-set rows influence the
# imputation medians (data leakage) and bias the hold-out metrics.

# Load data
df = pd.read_csv('../../data/processed/transformed_data.csv')

# Columns
feature_cols = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                'total_bedrooms', 'population', 'households', 'median_income']
target_col = 'median_house_value'

# Drop rows whose target is missing; a regression target cannot be imputed
df = df.dropna(subset=[target_col])

X_raw = df[feature_cols]
y = df[target_col].values

# Train-test split on the raw (not yet imputed) features so the test rows
# stay unseen by every fitted preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)

# Impute numeric features: learn medians from the training rows only, then
# apply those same medians to the test rows.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full imputed feature matrix, kept under the original name `X` in case later
# cells rely on it; it is filled with the train-only medians.
X = imputer.transform(X_raw)

# Save imputer
os.makedirs('../src/imputers', exist_ok=True)
with open('../src/imputers/multiple_linear_imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)

# Train model
mlr = LinearRegression()
mlr.fit(X_train, y_train)

# Evaluate on the held-out test partition
y_pred = mlr.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# Save model
os.makedirs('../src/models', exist_ok=True)
with open('../src/models/multiple_linear_model.pkl', 'wb') as f:
    pickle.dump(mlr, f)

MSE: 0.8247560000486654
R2: 0.34618327701618046
