# House Price Prediction Model Development

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os

In [None]:
# Input/output file locations, given relative to the working directory
# (the notebook is assumed to live in the 'model' directory).
MODEL_PATH = 'house_price_model.pkl'  # file the trained model is written to
DATA_PATH = 'train.csv'  # CSV file the dataset is read from

## 1. Load Dataset

In [None]:
# Load the dataset into `df`.
print("Loading dataset...")
if not os.path.exists(DATA_PATH):
    # Fail fast: merely printing and continuing would leave `df` undefined
    # (or stale from an earlier run), so the error would only surface later
    # and in a less obvious place.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH}. Please ensure 'train.csv' is in the current directory."
    )

df = pd.read_csv(DATA_PATH)
print("Dataset loaded successfully.")
print(df.head())  # preview the first rows as a quick sanity check

## 2. Feature Selection & Preprocessing

In [None]:
# Selected features based on assignment requirements
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'YearBuilt', 'TotalBsmtSF', 'FullBath']
target = 'SalePrice'

# Fail fast if any required column is absent: printing and carrying on would
# leave X and y undefined (or stale from an earlier run) for the cells below.
missing_cols = [col for col in features + [target] if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Work on a copy so that filling values does not modify df itself.
X = df[features].copy()
y = df[target]

# Handling missing values: for simplicity, fill gaps in each feature column
# with that column's median, reporting every column that needed filling.
for col in X.columns:
    if X[col].isnull().any():
        X[col] = X[col].fillna(X[col].median())
        print(f"Filled missing values in {col}")

print("Preprocessing complete.")

## 3. Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 4. Model Training (Random Forest)

In [None]:
print("Training Random Forest Regressor...")
# 100 trees with a fixed seed; fit() returns the estimator itself, so
# construction and training can be chained into a single expression.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
print("Model training complete.")

## 5. Evaluation

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # derive RMSE from the MSE already computed above
r2 = r2_score(y_test, y_pred)

print("Model Evaluation:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
# The superscript two is written as an escape so the label cannot be garbled
# if the file is re-saved with a different encoding.
print(f"R\u00b2 Score: {r2:.4f}")

## 6. Save Model

In [None]:
# Write the fitted model object to MODEL_PATH.
print(f"Saving model to {MODEL_PATH}...")
joblib.dump(value=model, filename=MODEL_PATH)
print("Model saved successfully.")