In [None]:
# Reload the persisted artifacts and confirm they reproduce the in-memory model.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created yourself.
print("Loading saved model...")
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)
loaded_features = joblib.load(feature_names_path)

print("Model reloaded successfully!")

# Build the verification sample through the *saved* pipeline: align the columns with
# the persisted feature list, then scale with the persisted scaler. Using the loaded
# feature names here means a wrong or stale feature file is caught by this check.
n_verify = min(5, len(X_test_encoded))  # tolerate test sets with fewer than five rows
sample_features = X_test_encoded.iloc[:n_verify].reindex(columns=loaded_features, fill_value=0)

sample_test_pred_original = model.predict(X_test_scaled[:n_verify])
sample_test_pred_loaded = loaded_model.predict(loaded_scaler.transform(sample_features))

# Compare predictions side by side
print("\nVerification - Comparing original vs reloaded model predictions:")
print("="*60)
for i, (pred_original, pred_loaded) in enumerate(
    zip(sample_test_pred_original, sample_test_pred_loaded), start=1
):
    print(f"Sample {i}: Original=${pred_original:,.2f} | Loaded=${pred_loaded:,.2f}")

# Check if predictions are identical (within floating-point tolerance)
predictions_match = np.allclose(sample_test_pred_original, sample_test_pred_loaded)
if predictions_match:
    print("\n✓ Model reloaded successfully! Predictions are identical.")
else:
    print("\n✗ Warning: Predictions differ slightly.")

# Only declare the artifacts deployable when the round-trip check actually passed.
if predictions_match:
    print("\nModel is ready for deployment!")
print(f"Features used: {loaded_features}")

## Section 10: Verify Model Reloading

In [None]:
import os

# Directory that receives every persisted artifact (created on demand).
model_dir = '../model'
os.makedirs(model_dir, exist_ok=True)

# Resolve each destination up front; the reload cell reads these path variables.
model_path = os.path.join(model_dir, 'house_price_model.pkl')
scaler_path = os.path.join(model_dir, 'scaler.pkl')
feature_names_path = os.path.join(model_dir, 'feature_names.pkl')

# (label, object, destination) triples, dumped and reported in this order:
# the fitted model, the fitted scaler, and the encoded training column names.
artifacts = [
    ("Model", model, model_path),
    ("Scaler", scaler, scaler_path),
    ("Feature names", X_train_encoded.columns.tolist(), feature_names_path),
]
for label, artifact, destination in artifacts:
    joblib.dump(artifact, destination)
    print(f"{label} saved to: {destination}")

print("\nModel artifacts saved successfully!")

## Section 9: Save the Trained Model

In [None]:
def _regression_metrics(y_true, y_pred):
    """Compute the regression metrics for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        tuple: ``(mae, mse, rmse, r2)`` in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return mae, mse, np.sqrt(mse), r2_score(y_true, y_pred)


def _print_metrics(heading, mae, mse, rmse, r2):
    """Print one labelled block of metrics.

    MAE and RMSE share the target's unit (dollars); MSE is in squared dollars,
    so it is deliberately not printed with a plain ``$`` prefix.
    """
    print(f"\n{heading}")
    print(f"  Mean Absolute Error (MAE):        ${mae:,.2f}")
    print(f"  Mean Squared Error (MSE):         {mse:,.2f} ($²)")
    print(f"  Root Mean Squared Error (RMSE):   ${rmse:,.2f}")
    print(f"  R² Score:                         {r2:.4f}")


# Make predictions on both splits
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Evaluation metrics for the training and test sets
train_mae, train_mse, train_rmse, train_r2 = _regression_metrics(y_train, y_train_pred)
test_mae, test_mse, test_rmse, test_r2 = _regression_metrics(y_test, y_test_pred)

# Display results
print("="*60)
print("MODEL PERFORMANCE EVALUATION")
print("="*60)
_print_metrics("TRAINING SET METRICS:", train_mae, train_mse, train_rmse, train_r2)
_print_metrics("TEST SET METRICS:", test_mae, test_mse, test_rmse, test_r2)

print("\n" + "="*60)
print(f"Model explains {test_r2*100:.2f}% of variance in test data")
print(f"Average prediction error: ${test_mae:,.2f}")
print("="*60)

## Section 8: Evaluate Model Performance

In [None]:
# Hyperparameters of the Random Forest Regressor, collected in one place.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
)
model = RandomForestRegressor(**rf_params)

print("Training Random Forest Regressor Model...")
model.fit(X_train_scaled, y_train)
print("Model training completed!")

# Pair each encoded column name with the importance the fitted forest reports,
# then order the table from most to least important.
feature_names = X_train_encoded.columns.tolist()
feature_importance = model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

top_features = importance_df.head(10)
print("\nTop 10 Most Important Features:")
print(top_features)

## Sections 6 & 7: Implement and Train the Model
This section uses a Random Forest Regressor for prediction.

In [None]:
# Standardize the encoded features. The scaler's statistics are learned from the
# training split only and then reused, unchanged, on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Column-wise summary of the scaled training data (only the first five are shown).
train_col_means = X_train_scaled.mean(axis=0)
train_col_stds = X_train_scaled.std(axis=0)

print("Feature Scaling Applied:")
print(f"Training set mean: {train_col_means[:5]}...")
print(f"Training set std: {train_col_stds[:5]}...")
print("\nScaling complete. Features normalized to have mean=0 and std=1")

## Section 5: Feature Scaling

In [None]:
# Identify categorical variables (object-dtype columns of the training split)
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# One-hot encode the training split; drop_first removes one baseline level per
# categorical column to avoid perfectly collinear dummy columns.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encode the test split WITHOUT drop_first. Dropping the first level independently
# on the test split could remove a category that is *not* the training baseline
# (when the baseline is absent from the test rows); the reindex below would then
# refill that column with zeros and silently encode those rows as the baseline.
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols)

# Align to the training columns: this drops the training baseline and any category
# unseen in training, and adds all-zero columns for categories missing from test.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

print(f"\nShape after encoding:")
print(f"Training set: {X_train_encoded.shape}")
print(f"Testing set: {X_test_encoded.shape}")
print(f"\nFeature columns: {X_train_encoded.columns.tolist()}")

## Section 4: Encode Categorical Variables

In [None]:
# The six predictors used by the model, each with the one-line rationale that is
# printed below. Dict order defines the feature order.
feature_rationale = {
    'OverallQual': 'Overall material and finish quality',
    'GrLivArea': 'Above grade (ground) living area',
    'TotalBsmtSF': 'Total basement area',
    'GarageCars': 'Size of garage in car capacity',
    'YearBuilt': 'Original construction year',
    'Neighborhood': 'Physical location of the property',
}
selected_features = list(feature_rationale)
target = 'SalePrice'

print("Selected Features for Model:")
print(selected_features)
print("\nRationale: These features have strong correlation with house prices:")
for feature_name, reason in feature_rationale.items():
    print(f"- {feature_name}: {reason}")

# Feature matrix and target vector, copied so later edits do not touch train_data
X = train_data[selected_features].copy()
y = train_data[target].copy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

## Section 3: Feature Selection and Data Splitting

In [None]:
# Check missing values in recommended features
print("Missing Values in Recommended Features:")
print(train_data[recommended_features].isnull().sum())

# Handle missing values
# Strategy: mode imputation for categorical (object) columns, mean for the rest
for col in recommended_features:
    if not train_data[col].isnull().any():
        continue  # nothing to fill in this column

    if train_data[col].dtype == 'object':
        fill_value, strategy = train_data[col].mode()[0], "mode"
    else:
        fill_value, strategy = train_data[col].mean(), "mean"

    # Assign the filled column back explicitly. The previous
    # `train_data[col].fillna(..., inplace=True)` operated on a temporary Series:
    # it is deprecated chained assignment and leaves train_data unchanged under
    # pandas Copy-on-Write.
    train_data[col] = train_data[col].fillna(fill_value)
    print(f"Filled missing values in {col} with {strategy}")

print("\nMissing values after handling:")
print(train_data[recommended_features].isnull().sum())

## Section 2: Handle Missing Values

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load the dataset (House Prices training CSV fetched over HTTPS)
train_data = pd.read_csv('https://raw.githubusercontent.com/awesomedata/awesome-public-datasets/master/datasets/Kaggle/train.csv')

print("Dataset Shape:", train_data.shape)
print("\nFirst few rows:")
print(train_data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
train_data.info()
print("\nRecommended Features:")
# Eight candidate predictors plus the target column (SalePrice)
recommended_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'BedroomAbvGr', 'FullBath', 'YearBuilt', 'Neighborhood', 'SalePrice']
print(train_data[recommended_features].head())

## Section 1: Load and Explore the Dataset

# House Price Prediction System - Model Development
This notebook implements a machine learning model to predict house prices using the House Prices: Advanced Regression Techniques dataset.