In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [15]:
plt.style.use('seaborn-v0_8')

In [16]:
script_dir = os.path.dirname(os.path.abspath(r"C:\Users\Aagaaz Kapoor\Desktop\CT\week5_house_price\Housing.csv"))


In [17]:
# Load the dataset
# Construct path to Housing.csv in the same directory
csv_path = os.path.join(script_dir, 'Housing.csv')
df = pd.read_csv(csv_path)

In [18]:
# --- Data Cleaning ---
# Check for missing values
print("Missing Values:\n", df.isnull().sum())


Missing Values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [19]:
# No missing values found in the dataset, but let's confirm data types
print("\nData Types:\n", df.dtypes)


Data Types:
 price                int64
area                 int64
bedrooms             int64
bathrooms            int64
stories              int64
mainroad            object
guestroom           object
basement            object
hotwaterheating     object
airconditioning     object
parking              int64
prefarea            object
furnishingstatus    object
dtype: object


In [20]:
# Convert categorical columns to appropriate type
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                    'airconditioning', 'prefarea', 'furnishingstatus']
for col in categorical_cols:
    df[col] = df[col].astype('category')

In [21]:
# --- Data Cleaning ---
# Check for missing values
print("Missing Values:\n", df.isnull().sum())


Missing Values:
 price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [22]:
# No missing values found in the dataset, but let's confirm data types
print("\nData Types:\n", df.dtypes)



Data Types:
 price                  int64
area                   int64
bedrooms               int64
bathrooms              int64
stories                int64
mainroad            category
guestroom           category
basement            category
hotwaterheating     category
airconditioning     category
parking                int64
prefarea            category
furnishingstatus    category
dtype: object


In [23]:
# Convert categorical columns to appropriate type
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 
                    'airconditioning', 'prefarea', 'furnishingstatus']
for col in categorical_cols:
    df[col] = df[col].astype('category')

In [24]:
# Check for outliers in numerical columns using IQR method
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']  # Excluded 'price'
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col]
    if not outliers.empty:
        print(f"\nOutliers in {col}:\n", outliers)
        # Cap outliers to reduce their impact
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)


Outliers in area:
 7      16200
10     13200
56     11440
64     11175
66     13200
69     12090
125    15600
129    11460
186    11410
191    10700
211    12900
403    12944
Name: area, dtype: int64

Outliers in bedrooms:
 7      5
28     5
34     5
89     5
112    6
143    5
152    5
271    5
340    5
356    5
395    6
536    5
Name: bedrooms, dtype: int64

Outliers in bathrooms:
 1    4
Name: bathrooms, dtype: int64

Outliers in stories:
 1      4
6      4
9      4
17     4
26     4
30     4
31     4
35     4
37     4
38     4
39     4
41     4
42     4
43     4
44     4
46     4
47     4
50     4
51     4
52     4
53     4
57     4
58     4
59     4
71     4
72     4
73     4
83     4
92     4
94     4
102    4
105    4
124    4
131    4
135    4
140    4
145    4
160    4
220    4
226    4
247    4
Name: stories, dtype: int64

Outliers in parking:
 1      3
3      3
47     3
93     3
225    3
247    3
299    3
304    3
323    3
331    3
401    3
472    3
Name: parking, dtype: int

In [26]:

# --- Exploratory Data Analysis (EDA) ---
# Summary statistics
print("\nSummary Statistics:\n", df.describe())


Summary Statistics:
               price          area    bedrooms   bathrooms     stories  \
count  5.450000e+02    545.000000  545.000000  545.000000  545.000000   
mean   4.766729e+06   5102.249541    2.950459    1.285321    1.767890   
std    1.870440e+06   2005.804353    0.697504    0.497942    0.777543   
min    1.750000e+06   1650.000000    1.000000    1.000000    1.000000   
25%    3.430000e+06   3600.000000    2.000000    1.000000    1.000000   
50%    4.340000e+06   4600.000000    3.000000    1.000000    2.000000   
75%    5.740000e+06   6360.000000    3.000000    2.000000    2.000000   
max    1.330000e+07  10500.000000    4.500000    3.500000    3.500000   

          parking  
count  545.000000  
mean     0.682569  
std      0.834773  
min      0.000000  
25%      0.000000  
50%      0.000000  
75%      1.000000  
max      2.500000  


In [27]:
# Visualize price distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], kde=True, color='blue')
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.savefig(os.path.join(script_dir, 'price_distribution.png'))
plt.close()


In [28]:
# Correlation matrix for numerical features including price
corr_cols = numerical_cols + ['price']  # Include price for correlation
plt.figure(figsize=(10, 8))
sns.heatmap(df[corr_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix of Numerical Features')
plt.savefig(os.path.join(script_dir, 'correlation_matrix.png'))
plt.close()

In [29]:
# Boxplot of price vs. furnishing status
plt.figure(figsize=(10, 6))
sns.boxplot(x='furnishingstatus', y='price', data=df)
plt.title('Price vs. Furnishing Status')
plt.savefig(os.path.join(script_dir, 'price_vs_furnishing.png'))
plt.close()

In [30]:
# Scatter plot of area vs. price
plt.figure(figsize=(10, 6))
sns.scatterplot(x='area', y='price', hue='bedrooms', size='bathrooms', data=df)
plt.title('Area vs. Price (Colored by Bedrooms, Sized by Bathrooms)')
plt.savefig(os.path.join(script_dir, 'area_vs_price.png'))
plt.close()

In [31]:
# --- Feature Engineering ---
# Encode categorical variables
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [32]:
# Create new feature: total rooms (bedrooms + bathrooms)
df['total_rooms'] = df['bedrooms'] + df['bathrooms']

In [33]:
# Create interaction term: area per room
df['area_per_room'] = df['area'] / df['total_rooms']

In [34]:
# --- Data Preprocessing ---
# Define features (X) and target (y)
X = df.drop('price', axis=1)
y = df['price']

In [35]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Scale numerical features
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [37]:
# --- Model Training and Evaluation ---
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42)
}


In [38]:
# Dictionary to store results
results = {}

In [39]:
# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    

In [40]:
# Make predictions
y_pred = model.predict(X_test)

In [41]:
 # Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [42]:
   # Store results
results[name] = {'RMSE': rmse, 'R2': r2, 'MAE': mae}

In [43]:
 # Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"\n{name} Results:")
print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"Cross-Validation R2 Scores: {cv_scores}")
print(f"Average CV R2 Score: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")


XGBoost Results:
RMSE: 1457169.09
R2 Score: 0.58
MAE: 1089245.38
Cross-Validation R2 Scores: [ -2.85137582  -8.65804005 -25.29599762 -23.78619194  -8.98022842]
Average CV R2 Score: -13.91 ± 8.96


In [44]:
# --- Hyperparameter Tuning for Random Forest ---
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}


In [45]:
# Initialize GridSearchCV
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5, scoring='r2', n_jobs=-1)

In [46]:
# Fit GridSearchCV
grid_search.fit(X_train, y_train)


0,1,2
,estimator,RandomForestR...ndom_state=42)
,param_grid,"{'max_depth': [None, 10, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
# Best parameters and score
print("\nRandom Forest Hyperparameter Tuning Results:")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best R2 Score: {grid_search.best_score_:.2f}")


Random Forest Hyperparameter Tuning Results:
Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
Best R2 Score: 0.62


In [48]:
# Train final model with best parameters
final_model = grid_search.best_estimator_
final_model.fit(X_train, y_train)
final_predictions = final_model.predict(X_test)

In [49]:
# Final evaluation
final_rmse = np.sqrt(mean_squared_error(y_test, final_predictions))
final_r2 = r2_score(y_test, final_predictions)
final_mae = mean_absolute_error(y_test, final_predictions)


In [50]:
print("\nFinal Random Forest Model Results:")
print(f"RMSE: {final_rmse:.2f}")
print(f"R2 Score: {final_r2:.2f}")
print(f"MAE: {final_mae:.2f}")


Final Random Forest Model Results:
RMSE: 1447734.92
R2 Score: 0.59
MAE: 1052187.96


In [51]:
# --- Feature Importance ---
# Plot feature importance for Random Forest
feature_importance = pd.Series(final_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False).plot(kind='bar', figsize=(12, 6))
plt.title('Feature Importance (Random Forest)')
plt.ylabel('Importance')
plt.savefig(os.path.join(script_dir, 'feature_importance.png'))
plt.close()

In [52]:
# --- Save Results ---
# Save model performance to a CSV file
results_df = pd.DataFrame(results).T
results_df.to_csv(os.path.join(script_dir, 'model_performance.csv'))

print("\nEDA plots and model performance have been saved to the House_Price_Prediction folder.")
print("You can view the plots: price_distribution.png, correlation_matrix.png, price_vs_furnishing.png, area_vs_price.png, feature_importance.png")
print("Model performance metrics are saved in model_performance.csv")


EDA plots and model performance have been saved to the House_Price_Prediction folder.
You can view the plots: price_distribution.png, correlation_matrix.png, price_vs_furnishing.png, area_vs_price.png, feature_importance.png
Model performance metrics are saved in model_performance.csv
