# House Price Prediction: Model Training Workflow
This notebook demonstrates the full workflow for training house price prediction models using the House Price India.csv dataset. It covers data loading, exploration, cleaning, preprocessing, feature engineering, and training both Linear Regression and Random Forest models.

In [None]:
# 1. Import Required Libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
def load_data(filepath):
    """Load and return the dataset."""
    return pd.read_csv(filepath)

def clean_data(df, discrete_cols, continuous_cols):
    """Clean the dataframe by handling missing values and appropriate rounding."""
    # Handle missing values
    for col in discrete_cols + continuous_cols:
        if col in df.columns and df[col].isnull().any():
            df[col].fillna(df[col].median(), inplace=True)
    
    # Round only discrete columns
    for col in discrete_cols:
        if col in df.columns:
            df[col] = df[col].round(0).astype('Int64')
    
    return df

def create_features(df, current_year=None):
    """Create engineered features."""
    if current_year is None:
        current_year = datetime.now().year
        
    df_new = df.copy()
    
    # Create time-based features
    df_new['house_age'] = current_year - df_new['Built Year']
    df_new['years_since_renovation'] = current_year - df_new['Renovation Year']
    
    # Create area-based features
    df_new['price_per_sqft'] = df_new['Price'] / df_new['living area']
    df_new['total_area'] = df_new['living area'] + df_new['lot area']
    df_new['total_rooms'] = df_new['number of bedrooms'] + df_new['number of bathrooms']
    df_new['area_per_room'] = df_new['living area'] / df_new['total_rooms']
    
    return df_new

def plot_feature_importance(model, feature_names, top_n=10):
    """Plot feature importance from the model."""
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    plt.figure(figsize=(12, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(top_n))
    plt.title(f'Top {top_n} Most Important Features')
    plt.xlabel('Feature Importance')
    plt.tight_layout()
    plt.show()
    
    return feature_importance

def evaluate_model(model, X_test, y_test):
    """Evaluate model performance."""
    predictions = model.predict(X_test)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    
    metrics = {
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae
    }
    
    for metric_name, value in metrics.items():
        print(f'{metric_name}: {value:.2f}')
    
    return metrics

## 2. Load the Dataset
Read the House Price India.csv dataset into a pandas DataFrame.

In [4]:
# 2. Loading Dataset
DATASET = 'House Price India.csv'
df = load_data(DATASET)
df.head()

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
1,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
2,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
3,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000
4,6762813105,42491,3,2.5,2600,4750,1.0,0,0,4,...,1951,0,122007,52.9133,-114.59,2380,4750,1,67,790000


## 3. Data Exploration
Explore the dataset through statistical analysis and visualizations to understand data patterns, distributions, and relationships.

In [None]:
# Data Exploration
print('Info:')
df.info()
print('\nDescription:')
display(df.describe())
print('\nMissing values:')
df.isnull().sum()

# Visualizations
plt.figure(figsize=(10, 6))
sns.histplot(df['Price'], bins=30, kde=True)
plt.title('Price Distribution')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(x='living area', y='Price', data=df)
plt.title('Price vs Living Area')
plt.xlabel('Living Area')
plt.ylabel('Price')
plt.grid(True)
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='number of bedrooms', y='Price', data=df)
plt.title('Price Distribution by Number of Bedrooms')
plt.xlabel('Number of Bedrooms')
plt.ylabel('Price')
plt.grid(True)
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 8))
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
correlation = df[numeric_features].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Numeric Features')
plt.tight_layout()
plt.show()

# Distribution of key features
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Distribution of Key Features')

sns.histplot(data=df, x='living area', kde=True, ax=axes[0,0])
axes[0,0].set_title('Living Area Distribution')
axes[0,0].grid(True)

sns.histplot(data=df, x='number of bedrooms', kde=True, ax=axes[0,1])
axes[0,1].set_title('Number of Bedrooms Distribution')
axes[0,1].grid(True)

sns.histplot(data=df, x='grade of the house', kde=True, ax=axes[1,0])
axes[1,0].set_title('House Grade Distribution')
axes[1,0].grid(True)

sns.histplot(data=df, x='condition of the house', kde=True, ax=axes[1,1])
axes[1,1].set_title('House Condition Distribution')
axes[1,1].grid(True)

plt.tight_layout()
plt.show()

# Geographic distribution of prices
plt.figure(figsize=(12, 8))
scatter = plt.scatter(df['Longitude'], df['Lattitude'], 
                     c=df['Price'], cmap='viridis', 
                     s=50, alpha=0.6)
plt.colorbar(scatter, label='Price')
plt.title('Geographic Distribution of House Prices')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True)
plt.show()

# Pairplot of main features
main_features = ['Price', 'living area', 'number of bedrooms', 
                'number of bathrooms', 'grade of the house']
sns.pairplot(df[main_features], diag_kind='kde')
plt.suptitle('Pairplot of Main Features', y=1.02)
plt.show()

## 4. Data Cleaning
Fill missing values for numeric columns with the median, categorical columns with the mode, and round numeric columns to the nearest integer.

In [6]:
# Data Cleaning
# Define discrete and continuous columns
discrete_cols = [
    'number of bedrooms', 'number of bathrooms', 'number of floors',
    'number of views', 'condition of the house', 'grade of the house',
    'Number of schools nearby'
]

continuous_cols = [
    'living area', 'lot area', 'Area of the house(excluding basement)', 'Area of the basement',
    'Built Year', 'Renovation Year', 'Postal Code', 'Lattitude', 'Longitude',
    'living_area_renov', 'lot_area_renov', 'Distance from the airport'
]

# Clean data using helper function
df = clean_data(df, discrete_cols, continuous_cols)

## 5. Data Preprocessing
Convert categorical columns to numeric using one-hot encoding.

In [7]:
# Data PreProcessing
# Only 'waterfront present' is categorical in this dataset
categorical_cols = ['waterfront present']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df_encoded.head()

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,number of views,condition of the house,grade of the house,...,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price,waterfront present_1
0,6762810635,42491,4,2,2920,4000,2,0,5,8,...,0,122004,52.8878,-114.47,2470,4000,2,51,1400000,False
1,6762810998,42491,5,3,2910,9480,2,0,3,8,...,0,122004,52.8852,-114.468,2940,6600,1,53,1200000,False
2,6762812605,42491,4,2,3310,42998,2,0,3,9,...,0,122005,52.9532,-114.321,3350,42847,3,76,838000,False
3,6762812919,42491,3,2,2710,4500,2,0,4,8,...,0,122006,52.9047,-114.485,2060,4500,1,51,805000,False
4,6762813105,42491,3,2,2600,4750,1,0,4,9,...,0,122007,52.9133,-114.59,2380,4750,1,67,790000,False


## 6. Feature Engineering
Select relevant features for the model.

In [8]:
# Feature Engineering
# Create features using helper function
df_encoded = create_features(df_encoded)

# Select features including engineered ones
features = [
    'number of bedrooms', 'number of bathrooms', 'living area', 'lot area', 'number of floors',
    'number of views', 'condition of the house', 'grade of the house',
    'Area of the house(excluding basement)', 'Area of the basement',
    'Lattitude', 'Longitude', 'Number of schools nearby', 'Distance from the airport',
    'waterfront present_1', 'house_age', 'years_since_renovation',
    'total_area', 'total_rooms', 'area_per_room'
]

# Prepare features and target
X = df_encoded[features]
y = df_encoded['Price']

# Create preprocessing pipeline
preprocessor = StandardScaler()

## 7. Train/Test Split
Split the data into training and testing sets.

In [10]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 8. Random Forest Model
Train a Random Forest Regressor and evaluate its performance.

In [11]:
# Random Forest Model
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
try:
    rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
except TypeError:
    rf_rmse = mean_squared_error(y_test, rf_pred) ** 0.5
rf_r2 = r2_score(y_test, rf_pred)
print(f'Random Forest RMSE: {rf_rmse:.2f}, R2: {rf_r2:.2f}')

Random Forest RMSE: 135392.76, R2: 0.88


## 9. Save Models
Save both trained models and feature columns for later use.

In [12]:
# Save both models and feature columns
joblib.dump({'rf_model': rf_model, 'features': features}, 'model.pkl')
print('Models trained and saved as model.pkl')

Models trained and saved as model.pkl


In [None]:
# Create Pipeline with Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

# Define parameter grid for Random Forest
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [10, 20, 30, None],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # Use all CPU cores
    scoring='neg_mean_squared_error',
    verbose=1
)

print("Starting model training...")
# Time the model fitting
from time import time
start_time = time()

# Fit the model
grid_search.fit(X_train, y_train)

# Calculate training time
train_time = time() - start_time
print(f'\nTraining completed in {train_time:.2f} seconds')

# Get best model and parameters
best_model = grid_search.best_estimator_
print('\nBest parameters:')
for param, value in grid_search.best_params_.items():
    print(f'{param}: {value}')

# Make predictions and evaluate
print('\nModel Performance:')
metrics = evaluate_model(best_model, X_test, y_test)

# Feature Importance Analysis
feature_importance = plot_feature_importance(best_model.named_steps['rf'], features)

# Save the complete pipeline and grid search results
save_dict = {
    'pipeline': best_model,
    'features': features,
    'grid_search_results': {
        'best_params': grid_search.best_params_,
        'cv_results': grid_search.cv_results_
    },
    'metrics': metrics,
    'training_time': train_time
}
joblib.dump(save_dict, 'model.pkl')
print('\nModel, features, and grid search results saved as model.pkl')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
