In [None]:
# House Price Prediction Model - Google Colab Notebook
# Run this in Google Colab to train and download the model

# Cell 1: Install required libraries (the leading "!" is IPython/Colab shell syntax, so this cell only runs inside a notebook)
!pip install scikit-learn pandas numpy joblib

# Cell 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from google.colab import files

# Cell 3: Load dataset (using California Housing dataset as example)
from sklearn.datasets import fetch_california_housing

# Load the dataset and combine the feature matrix, feature names and target
# into a single DataFrame; the target is stored in the 'PRICE' column.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['PRICE'] = data.target

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# Cell 4: Data Preprocessing
# a. Missing values -- report the per-column count of nulls.
print("Checking for missing values...")
null_counts = df.isnull().sum()
print(null_counts)

# Had any column contained nulls, median imputation would be one option:
# df.fillna(df.median(), inplace=True)

# b. Feature selection -- 'PRICE' is the target, every other column is a feature.
y = df['PRICE']
X = df.drop(columns='PRICE')

print("\nFeatures selected:")
print(list(X.columns))

# c. Categorical encoding -- no categorical variables in this dataset.
#    For a categorical column, label encoding would look like:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df['categorical_column'] = le.fit_transform(df['categorical_column'])

# d. Feature scaling  /  Cell 5: Split data
# The split is done BEFORE the scaler is fitted so that the scaler's mean and
# variance are learned from the training rows only. Fitting on the full
# dataset would leak test-set statistics into preprocessing and bias the
# evaluation. The same test_size and random_state are used as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training data only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Kept so that any later cell still referring to X_scaled keeps working; it is
# the full feature matrix transformed with the training-fitted scaler.
X_scaled = scaler.transform(X)

print("\nFeature scaling completed")

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Cell 6: Train Random Forest Regressor
print("Training Random Forest Regressor...")

# Hyperparameters are gathered in one mapping so they can be read (or logged)
# in a single place before being handed to the estimator.
forest_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,  # fixed seed
    'n_jobs': -1,
}
model = RandomForestRegressor(**forest_params)

# Fit on the training split produced in Cell 5.
model.fit(X_train, y_train)
print("Model training completed!")

# Cell 7: Model Evaluation
# Predict on the held-out test split and compare against the true targets.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
r2 = r2_score(y_test, y_pred)

print("\n=== Model Evaluation Metrics ===")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
# The superscript two is written as an escape so the label cannot be corrupted
# by a wrong file encoding (it previously rendered as the mojibake "RÂ²").
print(f"R\u00b2 Score: {r2:.4f}")

# Feature importance
# Pair each feature name with the importance reported by the fitted forest,
# then order the rows from most to least important.
importance_table = pd.DataFrame(
    {'feature': data.feature_names, 'importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\n=== Feature Importance ===")
print(feature_importance)

# Cell 8: Save model and scaler
# Bundle the fitted estimator, the fitted scaler and the feature names into a
# single object so they are stored together in one file.
model_data = dict(
    model=model,
    scaler=scaler,
    feature_names=data.feature_names,
)

# Write the bundle to disk with joblib.
joblib.dump(model_data, 'house_price_model.pkl')
print("\nModel saved as 'house_price_model.pkl'")

# Cell 9: Test loading the model
# Read the bundle back from disk and pull out the two fitted objects.
loaded_model_data = joblib.load('house_price_model.pkl')
loaded_model, loaded_scaler = (
    loaded_model_data[key] for key in ('model', 'scaler')
)

# Test prediction
# X_test already holds scaled features, so the first row is passed to the
# model as-is (kept 2-D by slicing) without applying loaded_scaler again.
test_sample = X_test[:1]
prediction = loaded_model.predict(test_sample)
# Values are multiplied by 100000 for display as dollars.
# NOTE(review): presumably the target is in units of $100,000 -- confirm.
print(f"\nTest prediction: ${prediction[0] * 100000:.2f}")
print(f"Actual value: ${y_test.iloc[0] * 100000:.2f}")
print("\nModel loaded successfully and ready for deployment!")

# Cell 10: Download the model file
# Pass the saved artifact to the Colab download helper.
artifact_path = 'house_price_model.pkl'
files.download(artifact_path)
print("\nModel file downloaded! Upload this to your project's /model/ directory.")