In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [None]:
# Read the housing data; the relative path is resolved against the
# kernel's current working directory.
data_path = "Housing.csv"
df = pd.read_csv(data_path)

# Show the first rows under a heading, then the summary printed by DataFrame.info()
print("Dataset Head:", df.head(), sep="\n")
print("\nDataset Info:")
df.info()


Dataset Head:
      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 54

In [None]:

# Report how many missing values each column contains
print("\nMissing Values:", df.isnull().sum(), sep="\n")

# Integer-encode every object-dtype column of `df` in place, keeping one
# fitted LabelEncoder per column (keyed by column name) in `label_encoders`.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Separate the target ('price') from the remaining predictor columns
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)




Missing Values:
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [None]:
# Candidate regressors, keyed by the display name used in the report below.
# Every model gets the same seed so repeated runs are comparable.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
}

# Fit each model on the training split and report its error on the held-out split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # MSE is computed once and reused for RMSE
    mse = mean_squared_error(y_test, y_pred)
    report = (
        ("MAE:", mean_absolute_error(y_test, y_pred)),
        ("MSE:", mse),
        ("RMSE:", np.sqrt(mse)),
        ("R2 Score:", r2_score(y_test, y_pred)),
    )

    print(f"\n{name} Performance:")
    for label, value in report:
        print(label, value)



# Random Forest:
# Performs well on structured (tabular) data but can sometimes overfit. The feature importances from this model can help identify the key predictors.

# Gradient Boosting:
# Generally achieves better performance because its trees are fitted sequentially, each one correcting the errors of the previous ones, but it can be slower to train than Random Forest. Adjusting the learning rate and tree depth can improve results.

# XGBoost:
# Often provides the best balance between accuracy and efficiency, although with the settings above it scores lowest of the three models in the recorded run below. Hyperparameter tuning could further enhance results.



Random Forest Performance:
MAE: 1026699.6876146789
MSE: 1962144940711.022
RMSE: 1400765.8407853262
R2 Score: 0.6118077610803518

Gradient Boosting Performance:
MAE: 964058.8730464154
MSE: 1694870370248.4102
RMSE: 1301871.871671099
R2 Score: 0.6646855642239725

XGBoost Performance:
MAE: 1055461.25
MSE: 2093550796800.0
RMSE: 1446910.777069547
R2 Score: 0.5858103036880493
