# House Price Prediction with FastAPI
## Complete ML Model Inference Project

This notebook covers:
1. Data Exploration and Cleaning
2. Feature Engineering
3. Model Training and Evaluation
4. Model Saving
5. FastAPI Application Creation
6. Testing and Deployment


## 1. Import Libraries and Load Data


In [71]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")


Libraries imported successfully!


In [72]:
# Load the dataset
df = pd.read_csv('Housing.csv')

print(f"Dataset shape: {df.shape}")
print(f"\nFirst 5 rows:")
df.head()


Dataset shape: (545, 13)

First 5 rows:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


## 2. Data Exploration and Analysis


In [73]:
# Basic information about the dataset
print("Dataset Info:")
print(df.info())
print("\n" + "="*50)
print("Missing Values:")
print(df.isnull().sum())
print("\n" + "="*50)
print("Statistical Summary:")
df.describe()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB
None

Missing Values:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [74]:
# Explore categorical variables
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                   'airconditioning', 'prefarea', 'furnishingstatus']

print("Unique values in categorical columns:")
for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")


Unique values in categorical columns:
mainroad: ['yes' 'no']
guestroom: ['no' 'yes']
basement: ['no' 'yes']
hotwaterheating: ['no' 'yes']
airconditioning: ['yes' 'no']
prefarea: ['yes' 'no']
furnishingstatus: ['furnished' 'semi-furnished' 'unfurnished']


## 3. Data Preprocessing and Model Training


In [75]:
# Create a copy for preprocessing
df_processed = df.copy()

# Encode categorical variables
label_encoders = {}
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                   'airconditioning', 'prefarea', 'furnishingstatus']

print("Encoding categorical variables:")
for col in categorical_cols:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])
    label_encoders[col] = le
    print(f"{col}: {le.classes_} -> {range(len(le.classes_))}")

print("\nProcessed dataset:")
df_processed.head()


Encoding categorical variables:
mainroad: ['no' 'yes'] -> range(0, 2)
guestroom: ['no' 'yes'] -> range(0, 2)
basement: ['no' 'yes'] -> range(0, 2)
hotwaterheating: ['no' 'yes'] -> range(0, 2)
airconditioning: ['no' 'yes'] -> range(0, 2)
prefarea: ['no' 'yes'] -> range(0, 2)
furnishingstatus: ['furnished' 'semi-furnished' 'unfurnished'] -> range(0, 3)

Processed dataset:


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [76]:
# Prepare features and target
X = df_processed.drop('price', axis=1)
y = df_processed['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Features: {list(X.columns)}")

# Train Random Forest Regressor
print("\nTraining Random Forest Regressor...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Make predictions
rf_pred_test = rf_model.predict(X_test)

# Calculate metrics
rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf_pred_test))
rf_test_r2 = r2_score(y_test, rf_pred_test)
rf_test_mae = mean_absolute_error(y_test, rf_pred_test)

print(f"\nRandom Forest Results:")
print(f"  Test RMSE:  ₹{rf_test_rmse:,.2f}")
print(f"  Test R²:    {rf_test_r2:.4f}")
print(f"  Test MAE:   ₹{rf_test_mae:,.2f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 5 Important Features:")
print(feature_importance.head())


Training set size: 436 samples
Test set size: 109 samples
Features: ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']

Training Random Forest Regressor...

Random Forest Results:
  Test RMSE:  ₹1,401,263.08
  Test R²:    0.6115
  Test MAE:   ₹1,025,289.68

Top 5 Important Features:
           feature  importance
0             area    0.470417
2        bathrooms    0.152678
8  airconditioning    0.062343
9          parking    0.056608
3          stories    0.054772


In [77]:
# Train Linear Regression
print("\nTraining Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
lr_pred_test = lr_model.predict(X_test)

# Calculate metrics
lr_test_rmse = np.sqrt(mean_squared_error(y_test, lr_pred_test))
lr_test_r2 = r2_score(y_test, lr_pred_test)
lr_test_mae = mean_absolute_error(y_test, lr_pred_test)

print(f"\nLinear Regression Results:")
print(f"  Test RMSE:  ₹{lr_test_rmse:,.2f}")
print(f"  Test R²:    {lr_test_r2:.4f}")
print(f"  Test MAE:   ₹{lr_test_mae:,.2f}")



Training Linear Regression...

Linear Regression Results:
  Test RMSE:  ₹1,331,071.42
  Test R²:    0.6495
  Test MAE:   ₹979,679.69


In [78]:
# Train Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

print("\nTraining Decision Tree Regressor...")
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
dt_pred_test = dt_model.predict(X_test)

# Calculate metrics
dt_test_rmse = np.sqrt(mean_squared_error(y_test, dt_pred_test))
dt_test_r2 = r2_score(y_test, dt_pred_test)
dt_test_mae = mean_absolute_error(y_test, dt_pred_test)

print(f"\nDecision Tree Results:")
print(f"  Test RMSE:  ₹{dt_test_rmse:,.2f}")
print(f"  Test R²:    {dt_test_r2:.4f}")
print(f"  Test MAE:   ₹{dt_test_mae:,.2f}")



Training Decision Tree Regressor...

Decision Tree Results:
  Test RMSE:  ₹1,639,566.30
  Test R²:    0.4682
  Test MAE:   ₹1,222,399.08


In [79]:
# Compare all models
models_comparison = pd.DataFrame({
    'Model': ['Random Forest', 'Linear Regression', 'Decision Tree'],
    'Test R²': [rf_test_r2, lr_test_r2, dt_test_r2],
    'Test RMSE': [rf_test_rmse, lr_test_rmse, dt_test_rmse],
    'Test MAE': [rf_test_mae, lr_test_mae, dt_test_mae]
})

print("\nModel Comparison:")
print(models_comparison.round(4))

# Choose the best model
best_model_index = models_comparison['Test R²'].idxmax()
best_model_name = models_comparison.loc[best_model_index, 'Model']
best_model_r2 = models_comparison.loc[best_model_index, 'Test R²']

print(f"\n🏆 Best Model: {best_model_name} (R² = {best_model_r2:.4f})")



Model Comparison:
               Model  Test R²     Test RMSE      Test MAE
0      Random Forest   0.6115  1.401263e+06  1.025290e+06
1  Linear Regression   0.6495  1.331071e+06  9.796797e+05
2      Decision Tree   0.4682  1.639566e+06  1.222399e+06

🏆 Best Model: Linear Regression (R² = 0.6495)


In [80]:
# Train Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

print("\nTraining Decision Tree Regressor...")
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
dt_pred_test = dt_model.predict(X_test)

# Calculate metrics
dt_test_rmse = np.sqrt(mean_squared_error(y_test, dt_pred_test))
dt_test_r2 = r2_score(y_test, dt_pred_test)
dt_test_mae = mean_absolute_error(y_test, dt_pred_test)

print(f"\nDecision Tree Results:")
print(f"  Test RMSE:  ₹{dt_test_rmse:,.2f}")
print(f"  Test R²:    {dt_test_r2:.4f}")
print(f"  Test MAE:   ₹{dt_test_mae:,.2f}")



Training Decision Tree Regressor...

Decision Tree Results:
  Test RMSE:  ₹1,639,566.30
  Test R²:    0.4682
  Test MAE:   ₹1,222,399.08


In [81]:
# Train and evaluate models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    results[name] = {'RMSE': rmse, 'R²': r2, 'MAE': mae}

# Determine the best model
best_model_name = max(results, key=lambda k: results[k]['R²'])
best_model = models[best_model_name]
best_results = results[best_model_name]

print(f"\n🏆 Best Model: {best_model_name}")
print(f"  Test RMSE:  ₹{best_results['RMSE']:,.2f}")
print(f"  Test R²:    {best_results['R²']:.4f}")
print(f"  Test MAE:   ₹{best_results['MAE']:,.2f}")



Training Random Forest...

Training Linear Regression...

Training Decision Tree...

🏆 Best Model: Linear Regression
  Test RMSE:  ₹1,331,071.42
  Test R²:    0.6495
  Test MAE:   ₹979,679.69


In [86]:
# Save the best model and preprocessing components
print("Saving the best model and preprocessing components...")

# Save the best model
joblib.dump(best_model, 'house_price_model.pkl')

# Test prediction with a sample
sample_house = X_test.iloc[0]
sample_prediction = best_model.predict([sample_house])[0]
actual_price = y_test.iloc[0]

print(f"\nSample Prediction Test:")
print(f"Predicted: ₹{sample_prediction:,.2f}")
print(f"Actual: ₹{actual_price:,.2f}")
print(f"Difference: ₹{abs(sample_prediction - actual_price):,.2f}")

print("\nAll components saved successfully!")


Saving the best model and preprocessing components...

Sample Prediction Test:
Predicted: ₹5,203,691.71
Actual: ₹4,060,000.00
Difference: ₹1,143,691.71

All components saved successfully!


In [87]:
# Train Linear Regression
print("\nTraining Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
lr_pred_test = lr_model.predict(X_test)

# Calculate metrics
lr_test_rmse = np.sqrt(mean_squared_error(y_test, lr_pred_test))
lr_test_r2 = r2_score(y_test, lr_pred_test)
lr_test_mae = mean_absolute_error(y_test, lr_pred_test)

print(f"\nLinear Regression Results:")
print(f"  Test RMSE:  ₹{lr_test_rmse:,.2f}")
print(f"  Test R²:    {lr_test_r2:.4f}")
print(f"  Test MAE:   ₹{lr_test_mae:,.2f}")



Training Linear Regression...

Linear Regression Results:
  Test RMSE:  ₹1,331,071.42
  Test R²:    0.6495
  Test MAE:   ₹979,679.69


## 4. Save Model and Create FastAPI Application


In [90]:


# Save the Random Forest model
joblib.dump(rf_model, 'house_price_model.pkl')
print(" Model saved as 'house_price_model.pkl'")

# Test prediction with a sample
sample_house = X_test.iloc[0]
sample_prediction = rf_model.predict([sample_house])[0]
actual_price = y_test.iloc[0]

print(f"\nSample Prediction Test:")
print(f"Predicted: ₹{sample_prediction:,.2f}")
print(f"Actual: ₹{actual_price:,.2f}")
print(f"Difference: ₹{abs(sample_prediction - actual_price):,.2f}")

print("\nAll components saved successfully!")


 Model saved as 'house_price_model.pkl'

Sample Prediction Test:
Predicted: ₹5,211,325.00
Actual: ₹4,060,000.00
Difference: ₹1,151,325.00

All components saved successfully!
