In [5]:
#pip install pandas numpy matplotlib seaborn scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [16]:
# STEP 1: Load the dataset
# Reads the CSV into `df` and prints a first overview plus basic quality checks.
print("=" * 60)
print("LOAD THE DATASET")
print("=" * 60)

# The path is relative to the notebook's working directory.
df = pd.read_csv('house_data.csv')
print("\nDataset Loaded Successfully!")
print(f"Total records: {len(df)}")
# Counts every column of the frame, i.e. the target column is included.
print(f"Total features: {len(df.columns)}")

# Overview: first rows, column dtypes / non-null counts, summary statistics.
print("=" * 60)
print("DATASET OVERVIEW")
print("=" * 60)

print("\nFirst 5 rows:")
print(df.head())

print("\nDataset info:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

print("\nStatistical Summary:")
print(df.describe())

# Data quality: missing values per column and fully duplicated rows.
print("\n" + "=" * 60)
print("DATA QUALITY CHECK")
print("=" * 60)

print("\nMissing values per column:")
print(df.isnull().sum())

# Check for duplicates (rows identical across all columns).
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


LOAD THE DATASET

Dataset Loaded Successfully!
Total records: 10000
Total features: 15
DATASET OVERVIEW

First 5 rows:
   square_feet  bedrooms  bathrooms  age_years  garage_spaces  lot_size_sqft  \
0         1360         5          4         28              2          15659   
1         4272         2          4         48              2           9809   
2         3592         3          1         74              0          16535   
3          966         1          4         50              2          17298   
4         4926         7          4         49              1          18244   

   floors  crime_rate  school_rating  distance_to_city_miles  has_pool  \
0       2    5.015651              8               28.443143         1   
1       2    9.479502              6               23.830587         1   
2       3    3.455443              9               40.558587         1   
3       2    0.592153              4               16.086668         0   
4       1    6.126729         

In [19]:
# Step 2: EDA
# Summarises the target column and ranks every column by its correlation
# with the target.
print("\n" + "=" * 60, "EXPLORATORY DATA ANALYSIS", "=" * 60, sep="\n")

# Distribution of the target variable (price), shown as dollar amounts.
price_series = df['price']
print("\nPrice Statistics:")
for stat_label, stat_value in (
    ("Mean price", price_series.mean()),
    ("Median price", price_series.median()),
    ("Min price", price_series.min()),
    ("Max price", price_series.max()),
    ("Std Dev", price_series.std()),
):
    print(f"{stat_label}: ${stat_value:,.2f}")

# Correlation analysis: take the 'price' column of the full correlation
# matrix (DataFrame.corr() with its default settings), strongest positive first.
print("\n" + "=" * 60, "CORRELATION WITH PRICE", "=" * 60, sep="\n")

price_correlations = df.corr()['price']
correlations = price_correlations.sort_values(ascending=False)
print("\nFeatures correlation with price:")
print(correlations)

# Identify highly correlated features: keep |r| > 0.5 and drop the target's
# trivial self-correlation of 1.0.
is_strong = correlations.abs() > 0.5
high_corr = correlations[is_strong].drop('price')
print("\nHighly correlated features (|correlation| > 0.5):")
print(high_corr)


EXPLORATORY DATA ANALYSIS

Price Statistics:
Mean price: $622,410.73
Median price: $621,881.82
Min price: $116,848.38
Max price: $1,146,149.01
Std Dev: $175,581.76

CORRELATION WITH PRICE

Features correlation with price:
price                     1.000000
square_feet               0.885530
bedrooms                  0.183812
bathrooms                 0.167741
lot_size_sqft             0.163816
neighborhood_quality      0.141185
garage_spaces             0.073030
school_rating             0.068939
has_renovated             0.062737
floors                    0.044935
has_pool                  0.039736
has_fireplace             0.020941
crime_rate               -0.036558
distance_to_city_miles   -0.050041
age_years                -0.086806
Name: price, dtype: float64

Highly correlated features (|correlation| > 0.5):
square_feet    0.88553
Name: price, dtype: float64


In [24]:
# STEP 4: Data Preprocessing
# Splits the frame into features/target, holds out a test set, and
# standardises the features.
print("\n" + "=" * 60, "STEP 4: DATA PREPROCESSING", "=" * 60, sep="\n")

# 4.1 Separate the target ('price') from the features (every other column).
y = df['price']
X = df.drop(columns='price')

print(f"\nFeatures (X) shape {X.shape}:")
print(f"\nTarget (y) shape {y.shape}:")

# 4.2 Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): this seed is 43 while the models below use 42 - confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

total_rows = len(X)
train_share = len(X_train) / total_rows * 100
test_share = len(X_test) / total_rows * 100
print(f"\nTrain set size: {len(X_train)} samples ({train_share:.1f}%)")
print(f"\nTest set size: {len(X_test)} samples ({test_share:.1f}%)")

# 4.3 Feature scaling
print("\n" + "=" * 60, "FEATURE SCALING", "=" * 60, sep="\n")

scaler = StandardScaler()

# The scaler is fitted on the training rows only; the test rows are
# transformed with those same fitted statistics.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the first training row before and after scaling as a sanity check.
first_row_raw = X_train.iloc[0].values
first_row_scaled = X_train_scaled[0]
print("Features scaled using StandardScaler")
print("Formula: z = (x - mean)/std_deviation")
print(f"\nSample before scaling: \n{first_row_raw}")
print(f"\nSample after scaling: \n{first_row_scaled}")


STEP 4: DATA PREPROCESSING

Features (X) shape (10000, 14):

Target (y) shape (10000,):

Train set size: 8000 samples (80.0%)

Test set size: 2000 samples (20.0%)

FEATURE SCALING
Features scaled using StandardScaler
Formula: z = (x - mean)/std_deviation

Sample before scaling: 
[5.81000000e+02 3.00000000e+00 2.00000000e+00 5.80000000e+01
 2.00000000e+00 2.22600000e+03 3.00000000e+00 5.15434067e+00
 5.00000000e+00 2.74451021e+01 0.00000000e+00 1.00000000e+00
 0.00000000e+00 2.00000000e+00]

Sample after scaling: 
[-1.68333866 -0.50662453 -0.72096613  0.28737159  0.43612513 -1.50482593
  1.24288511  0.01309466 -0.18236817  0.15996557 -0.655628    1.23468849
 -0.73057671 -0.70390816]


In [26]:
# STEP 5: Model Training
# Fits three regressors on the scaled training matrix and registers each one
# in `models` (display name -> fitted estimator). The evaluation cell iterates
# over this dict and predicts on the scaled matrices for every model.
print("\n" + "=" * 60)
print("STEP 5: MODEL TRAINING")
print("=" * 60)

models = {}

# MODEL 1: Linear Regression (default settings)
print("\n[1/3] Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
models['Linear Regression'] = lr_model
print("Linear Regression Trained")

# MODEL 2: Random Forest Regressor
print("\n[2/3] Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,   # fixed seed so the fitted forest is reproducible
    n_jobs=-1          # use all available CPU cores
)
rf_model.fit(X_train_scaled, y_train)
models['Random Forest'] = rf_model
print("Random Forest Trained")

# MODEL 3: Gradient Boosting Regressor
# Progress message now matches the "[i/3] Training <model>..." pattern used
# above (it previously read "Gradient Boosting Forest...").
print("\n[3/3] Training Gradient Boosting...")
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42    # fixed seed so the fitted ensemble is reproducible
)
gb_model.fit(X_train_scaled, y_train)
models['Gradient Boosting'] = gb_model
print("Gradient Boosting Trained")

print("\nAll models trained successfully!")


STEP 5: MODEL TRAINING

[1/3] Training Linear Regression...
Linear Regression Trained

[2/3] Training Random Forest...
Random Forest Trained

[3/3] Gradient Boosting Forest...
Gradient Boosting Trained

All models trained successfully!


In [31]:
# STEP 6: Model evaluation
# Scores every model in `models` on the train and test splits, stores the
# metrics in `results` (model name -> metric dict), prints a comparison table
# and picks `best_model_name` / `best_model` by highest test R2.
print("\n" + "=" * 60)
print("STEP 6: MODEL EVALUATION")
print("=" * 60)

results = {}

for name, model in models.items():
    print("\n" + "=" * 60)
    print(f"Evaluation: {name}")
    print("=" * 60)

    # Predictions on the same scaled matrices the models were trained with.
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    # Training metrics
    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)

    # Testing metrics
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)

    results[name] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae
    }

    print("\nTraining Performance:")
    print(f"R2 Score: {train_r2:.4f}")
    print(f"RMSE: {train_rmse:.2f}")
    print(f"MAE: {train_mae:.2f}")

    print("\nTest Performance:")
    print(f"R2 Score: {test_r2:.4f}")
    print(f"RMSE: {test_rmse:.2f}")
    print(f"MAE: {test_mae:.2f}")

    # Train/test R2 gap as a rough overfitting signal; 0.1 is the cut-off
    # used by this notebook, not a general rule.
    overfit_diff = train_r2 - test_r2
    if overfit_diff > 0.1:
        print(f"Warning: Possible overfitting (difference: {overfit_diff:.4f})")
    else:
        print(f"Good generalization (difference: {overfit_diff:.4f})")

# Summary comparison
print("\n" + "=" * 60)
print("MODEL COMPARISON SUMMARY")
print("=" * 60)

print(f"\n{'Model':<20}{'Test R2':<12}{'Test RMSE':<15}{'Test MAE':<15}")
print("-" * 60)

for name, metrics in results.items():
    # Build the "$1,234.56" text first and pad it afterwards, so the "$" is
    # counted inside the column width and each row lines up with the header.
    rmse_text = f"${metrics['test_rmse']:,.2f}"
    mae_text = f"${metrics['test_mae']:,.2f}"
    print(f"{name:<20}{metrics['test_r2']:<12.4f}{rmse_text:<15}{mae_text:<15}")

# Select best model: highest R2 on the test split.
# NOTE(review): the winner is chosen on the same test split whose score is
# then reported, so that score is an optimistic estimate. Consider selecting
# on a validation split or with cross-validation on the training data.
best_model_name = max(results, key=lambda x: results[x]['test_r2'])
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Test R2 Score: {results[best_model_name]['test_r2']:.4f}")



STEP 6: MODEL EVALUATION

Evaluation: Linear Regression

Training Performance:
R2 Score: 0.9195
RMSE: 49848.47
MAE: 39768.17

Test Performance:
R2 Score: 0.9191
RMSE: 49728.11
MAE: 39647.97
Good generalization (difference: 0.0004)

Evaluation: Random Forest

Training Performance:
R2 Score: 0.9846
RMSE: 21799.24
MAE: 17207.09

Test Performance:
R2 Score: 0.8931
RMSE: 57183.50
MAE: 45299.78
Good generalization (difference: 0.0915)

Evaluation: Gradient Boosting

Training Performance:
R2 Score: 0.9456
RMSE: 41000.68
MAE: 32717.91

Test Performance:
R2 Score: 0.9096
RMSE: 52565.29
MAE: 42056.00
Good generalization (difference: 0.0359)

MODEL COMPARISON SUMMARY

Model               Test R2     Test RMSE      Test MAE       
------------------------------------------------------------
Linear Regression   0.9191       $49,728.11      $39,647.97      
Random Forest       0.8931       $57,183.50      $45,299.78      
Gradient Boosting   0.9096       $52,565.29      $42,056.00      
Best Model:

In [33]:
# STEP 7: Save Model (Serialization)
# Pickles the selected model, the fitted scaler, the feature-name list and a
# small metadata dict into the current working directory.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artefacts from a trusted location.
print("\n" + "=" * 60)
print("STEP 7: MODEL SERIALIZATION")
print("=" * 60)

# Output file names, defined once so the save calls and the messages agree.
model_filename = 'house_price_model.pkl'
scaler_filename = 'scaler.pkl'
feature_names_filename = 'feature_names.pkl'
metadata_filename = 'model_metadata.pkl'

# Save the best model
print(f"\nSaving the best model: {best_model_name}")
with open(model_filename, "wb") as file:
    pickle.dump(best_model, file)
print(f"Model saved as '{model_filename}'")

# The scaler must be stored too: the model was fit on scaled features.
print("\nSaving scaler...")
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)
print(f"Scaler saved as '{scaler_filename}'")

# Save feature names (column order of X, which is the order the scaler and
# model were fit with).
feature_names = X.columns.tolist()
with open(feature_names_filename, 'wb') as file:
    pickle.dump(feature_names, file)
# Report the file name; the previous message interpolated the list itself.
print(f"Feature names saved as '{feature_names_filename}'")

# Save model metadata
metadata = {
    'model_name': best_model_name,
    # R2 is the metric the model was selected on, so it is recorded as well.
    'test_r2': results[best_model_name]['test_r2'],
    'test_rmse': results[best_model_name]['test_rmse'],
    'test_mae': results[best_model_name]['test_mae'],
    'features': feature_names,
    'training_samples': len(X_train),
    'testing_samples': len(X_test)
}

with open(metadata_filename, 'wb') as file:
    pickle.dump(metadata, file)
print(f"Metadata saved as '{metadata_filename}'")

print("\n" + "=" * 60)
print("MODEL TRAINING COMPLETE!")
print("=" * 60)

print("\nFiles created:")
print(f" 1. {model_filename} - Trained model")
print(f" 2. {scaler_filename} - Feature scaler")
print(f" 3. {feature_names_filename} - Feature names")
print(f" 4. {metadata_filename} - Model information")




STEP 7: MODEL SERIALIZATION

Saving the best model: Linear Regression
Model saved as 'house_price_model.pkl'

Saving scaler...
Scaler saved as 'scaler.pkl'
Feature names saved as '['square_feet', 'bedrooms', 'bathrooms', 'age_years', 'garage_spaces', 'lot_size_sqft', 'floors', 'crime_rate', 'school_rating', 'distance_to_city_miles', 'has_pool', 'has_fireplace', 'has_renovated', 'neighborhood_quality']'
Metadata saved as 'model_metadata.pkl'

MODEL TRAINING COMPLETE!

Files created:
 1. house_price_model.pkl - Trained model
 2. scaler.pkl - Feature scaler
 3. feature_names.pkl - Feature names
 4. model_metadata.pkl - Model information
