# Bengaluru House Price Prediction Model
This notebook loads and cleans the Bengaluru housing dataset, then trains an XGBoost regression model to predict house prices and evaluates it on a held-out test split.

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Read the raw listings from the CSV file.
df = pd.read_csv("Bengaluru_House_Data.csv")

# Header names can carry stray padding; normalise them before any column lookup.
df.columns = df.columns.str.strip()
print("Available columns:", df.columns.tolist())

# Remove the columns that are not kept as features (names absent from the file
# are skipped), then discard every row that still has a missing value.
cols_to_drop = ['area_type', 'availability', 'society', 'balcony']
df = df.drop(columns=cols_to_drop, errors='ignore').dropna()


def _leading_int(size):
    """Return the integer before the first space of a 'size' string, or None for non-strings."""
    if not isinstance(size, str):
        return None
    return int(size.split(' ')[0])


# 'bhk' holds the leading number of each 'size' entry.
df['bhk'] = df['size'].apply(_leading_int)

# Convert 'total_sqft' to number
def convert_sqft_to_num(x):
    """Parse a raw ``total_sqft`` entry into a single float.

    Accepts a plain number ("1200", 1200) or a two-value range written with
    a hyphen ("2100 - 2850"), which is replaced by the midpoint of its two
    bounds.

    Parameters
    ----------
    x : object
        Raw cell value; it is converted with ``str`` and stripped of
        surrounding whitespace before parsing.

    Returns
    -------
    float or None
        The parsed area, or ``None`` when the text is neither a number nor
        a two-value range (for example "34.46Sq. Meter" or "1 - 2 - 3").
    """
    try:
        text = str(x).strip()
        if '-' in text:
            bounds = text.split('-')
            # Only "low - high" counts as a range. Anything with more (or
            # fewer) parts is treated as unparseable rather than silently
            # averaging the first two tokens.
            if len(bounds) != 2:
                return None
            low, high = bounds
            return (float(low) + float(high)) / 2
        return float(text)
    except ValueError:
        # Narrow catch: only a failed numeric parse maps to None, so
        # unrelated errors and KeyboardInterrupt are no longer swallowed.
        return None

# Parse every area entry to a float and keep only the rows that parsed.
df = df.assign(total_sqft=df['total_sqft'].apply(convert_sqft_to_num))
df = df.dropna(subset=['total_sqft'])

# Price per square foot: 'price' scaled by 100000, divided by the area
# (presumably 'price' is quoted in lakhs -- confirm against the data source).
# This column is removed again at the end of this block.
df['price_per_sqft'] = (df['price'] * 100000) / df['total_sqft']

# Tidy the location labels and count how many listings each one has.
df['location'] = df['location'].str.strip()
loc_stats = df['location'].value_counts()

# Locations with 10 or fewer listings are folded into a single 'other' label.
loc_less_than_10 = loc_stats[loc_stats <= 10]
is_rare_location = df['location'].isin(loc_less_than_10.index)
df['location'] = df['location'].mask(is_rare_location, 'other')

# One-hot encode the location (first category dropped) and append the
# indicator columns after the remaining columns.
dummies = pd.get_dummies(df['location'], drop_first=True)
df = pd.concat(
    [df.drop(columns=['location', 'size', 'price_per_sqft']), dummies],
    axis=1,
)

In [None]:
# Separate the target column ('price') from the predictor columns
y = df['price']
X = df.drop(columns=['price'])

# Report the dimensions of both before splitting
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-test split (skipped when fewer than two rows remain in df)
if len(df) < 2:
    print("Not enough data to split. Check after preprocessing.")
else:
    # Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )
    print("Train/Test split done.")

    # Diagnostics on the training partition: shapes, a preview, NaN checks, dtypes
    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("First few rows of X_train:", X_train.head(), sep="\n")
    print("Are there any NaNs in X_train?", X_train.isnull().values.any())
    print("Are there any NaNs in y_train?", y_train.isnull().values.any())
    print("X_train dtypes:", X_train.dtypes, sep="\n")

    # Fit an XGBRegressor with its default settings
    model = XGBRegressor()
    model.fit(X_train, y_train)

    # Score on the data the model was fitted on
    training_preds = model.predict(X_train)
    print("Training R²:", metrics.r2_score(y_train, training_preds))
    print("Training MAE:", metrics.mean_absolute_error(y_train, training_preds))

    # Score on the held-out rows
    test_preds = model.predict(X_test)
    print("Test R²:", metrics.r2_score(y_test, test_preds))
    print("Test MAE:", metrics.mean_absolute_error(y_test, test_preds))

    # Scatter of held-out actual prices against the model's predictions
    plt.scatter(y_test, test_preds)
    plt.title("Actual vs Predicted Price")
    plt.xlabel("Actual Price")
    plt.ylabel("Predicted Price")
    plt.show()