In [12]:
!pip install category_encoders -q

In [13]:
import os
for root, _, files in os.walk('/content/gdrive/MyDrive'):
    for f in files:
        if 'RealEstate' in f or 'V21' in f:
            print(os.path.join(root, f))

/content/gdrive/MyDrive/Real Estate Data V21.csv
/content/gdrive/MyDrive/Colab Notebooks/RealEstate.ipynb


In [14]:
import os, shutil
from google.colab import drive
drive.mount('/content/gdrive')

src = '/content/gdrive/MyDrive/Real Estate Data V21.csv'
dst = '/content/Real_Estate_Data_V21.csv'

if not os.path.exists(dst):
    shutil.copyfile(src, dst)
    print('✅ Copied to VM.')
else:
    print('ℹ️ Already in VM.')

# Load
import pandas as pd
df = pd.read_csv(dst)
df.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
ℹ️ Already in VM.


Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [15]:
import re
import numpy as np
import pandas as pd

def price_lakhs(p):
    p = str(p).strip().lower()
    if 'cr' in p:
        return float(re.sub(r'[^\d.]', '', p)) * 100
    elif 'l' in p:
        return float(re.sub(r'[^\d.]', '', p))
    return np.nan

df['Price_Lakhs'] = df['Price'].apply(price_lakhs)


In [16]:
def extract_loc_city(loc):
    parts = [x.strip() for x in str(loc).split(',')]
    city = parts[-1] if parts else ''
    detail = ', '.join(parts[:-1]) if len(parts) > 1 else parts[0]
    return detail, city
df[['Location_Detail', 'City']] = df['Location'].apply(lambda x: pd.Series(extract_loc_city(x)))

In [17]:
def extract_bhk(title):
    m = re.search(r'(\d+)\s*bhk', str(title).lower())
    return int(m.group(1)) if m else np.nan
df['BHK'] = df['Property Title'].apply(extract_bhk)

In [18]:
num_cols = ['Total_Area', 'Price_per_SQFT', 'Price_Lakhs', 'BHK', 'Baths']
for col in num_cols:
    if col in df.columns:
        df[f'{col}_log'] = np.log1p(df[col])

In [19]:
cat_cols = ['Location_Detail', 'City', 'Name']
df[cat_cols] = df[cat_cols].fillna('Unknown')



In [20]:
!pip install category_encoders  # Run this only once
from category_encoders import TargetEncoder




In [21]:
mask = df['BHK'].notna()
te = TargetEncoder(cols=cat_cols)
te.fit(df.loc[mask, cat_cols], df.loc[mask, 'BHK'])


In [22]:
df_te = te.transform(df[cat_cols])  # all rows
df_te.index = df.index
df = pd.concat([df, df_te.add_suffix('_te')], axis=1)


In [23]:
feature_cols = [col for col in df.columns if df[col].dtype != 'object']
df_model = df.dropna(subset=['BHK'] + feature_cols)  # or use imputer if you want
X = df_model[feature_cols]
y = df_model['BHK']


In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [26]:
scale_cols = [c for c in df.columns if c.endswith('_log')]
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[scale_cols]),
                         columns=[f'{c}_scaled' for c in scale_cols],
                         index=df.index)
df = pd.concat([df, df_scaled], axis=1)

In [27]:
display(df[['Price_Lakhs', 'Location_Detail', 'City', 'BHK',
            'Total_Area_log_scaled', 'Price_per_SQFT_log_scaled']].head())

Unnamed: 0,Price_Lakhs,Location_Detail,City,BHK,Total_Area_log_scaled,Price_per_SQFT_log_scaled
0,199.0,Kanathur Reddikuppam,Chennai,4.0,1.41839,0.168578
1,225.0,"Ramanathan Nagar, Pozhichalur",Chennai,10.0,2.938666,-1.068762
2,100.0,"Kasthuribai Nagar, West Tambaram",Chennai,3.0,0.395,0.146364
3,333.0,"Naveenilaya, Chepauk, Triplicane",Chennai,7.0,2.177702,0.194062
4,48.0,Avadi,Chennai,2.0,-0.0903,-0.44207


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [29]:
# Assuming numeric columns are already scaled
feature_cols = [col for col in df.columns if col not in ['BHK']]  # all except target
X = df[feature_cols]
y = df['BHK']


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
# Keep only numeric and *_te columns
feature_cols = [col for col in df.columns if (df[col].dtype != 'object')]  # all numeric
X = df[feature_cols]
y = df['BHK']


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
# Keep only rows where BHK is not null
df_model = df[df['BHK'].notna()]

# Define features and target
feature_cols = [col for col in df_model.columns if df_model[col].dtype != 'object']  # numeric + _te
X = df_model[feature_cols]
y = df_model['BHK']

# Split train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
# Drop rows where target or any feature is NaN
df_model = df.dropna(subset=['BHK'] + feature_cols)

# Redefine X and y
X = df_model[feature_cols]
y = df_model['BHK']

# Split into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)


In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [36]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train)


In [37]:
from sklearn.metrics import mean_squared_error, r2_score

y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Train MSE:", train_mse)
print("Train R2:", train_r2)
print("Test MSE:", test_mse)
print("Test R2:", test_r2)


Train MSE: 1.174556466704547e-30
Train R2: 1.0
Test MSE: 1.4680186037949091e-30
Test R2: 1.0


In [39]:
from google.colab import files
uploaded = files.upload()

import pandas as pd

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv")
print("Dataset Loaded Successfully!")
df.head()


Saving Real Estate Data V21.csv to Real Estate Data V21.csv
Dataset Loaded Successfully!


Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


Linear Regression

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv")

# Fix Price Column — Ignore invalid entries like '95.45 acs'
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()
    try:
        if "Cr" in value:
            return float(value.replace("Cr", "")) * 10000000
        elif "L" in value:
            return float(value.replace("L", "")) * 100000
        else:
            return pd.to_numeric(value, errors="coerce")
    except:
        return None

df["Price"] = df["Price"].apply(convert_price)

# Drop rows where price is invalid or missing
df = df.dropna(subset=["Price"])

# Drop unnecessary columns
df = df.drop(["Name", "Property Title", "Description"], axis=1)

# Handle missing values in other columns
df = df.dropna()

# Encode categorical columns
le = LabelEncoder()
df["Location"] = le.fit_transform(df["Location"])
df["Balcony"] = le.fit_transform(df["Balcony"])

# Features & Target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

# Evaluate model
print("Linear Regression Results:")
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))


Linear Regression Results:
R² Score: 0.28041056330154146
Mean Absolute Error: 6008788.544543079


In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

# Training results
print("Linear Regression - Training Results:")
print("R²:", r2_score(y_train, y_train_pred))
print("MAE:", mean_absolute_error(y_train, y_train_pred))

# Testing results
print("\nLinear Regression - Testing Results:")
print("R²:", r2_score(y_test, y_test_pred))
print("MAE:", mean_absolute_error(y_test, y_test_pred))


Linear Regression - Training Results:
R²: 0.18026665708499257
MAE: 6159807.074992791

Linear Regression - Testing Results:
R²: 0.28041056330154146
MAE: 6008788.544543079


In [42]:
# Comparison between training and testing performance
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nComparison of Linear Regression")
if train_r2 > test_r2:
    print("Model performs better on training data → possible overfitting.")
elif train_r2 < test_r2:
    print("Model performs slightly better on testing data → possible underfitting.")
else:
    print("Model performs equally on training and testing data → well balanced.")



Comparison of Linear Regression
Model performs slightly better on testing data → possible underfitting.


Decision Tree Regressor


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv")

# Convert price into numeric
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()
    try:
        if "Cr" in value:
            return float(value.replace("Cr", "").strip()) * 10000000
        elif "L" in value:
            return float(value.replace("L", "").strip()) * 100000
        else:
            return float(value)  # Direct conversion if numeric
    except:
        return pd.NA  # For invalid formats

df["Price"] = df["Price"].apply(convert_price)

# Drop rows with missing price
df = df.dropna(subset=["Price"])

# Drop unnecessary columns
df = df.drop(["Name", "Property Title", "Description"], axis=1, errors='ignore')

# Handle missing values in other columns
df = df.dropna()

# Encode categorical columns
le = LabelEncoder()
if "Location" in df.columns:
    df["Location"] = le.fit_transform(df["Location"])
if "Balcony" in df.columns:
    df["Balcony"] = le.fit_transform(df["Balcony"])

# Features & target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

# Evaluate model
print("Decision Tree Results:")
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))


Decision Tree Results:
R² Score: 0.1659683941804464
Mean Absolute Error: 975623.19339298


In [44]:
from sklearn.metrics import r2_score, mean_absolute_error

# Assume dt_model is trained
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

# Decision Tree results
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Decision Tree Regressor Results:")
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)
print("Test Mean Absolute Error:", test_mae)

# Comparison with other models (example values, replace with actual)
comparison = {
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting"],
    "R² Score": [0.28, test_r2, 0.78, 0.82],
    "MAE": [6008788, test_mae, 2500000, 2000000]
}

import pandas as pd
comp_df = pd.DataFrame(comparison)
print("\nModel Comparison:")
print(comp_df)


Decision Tree Regressor Results:
Train R² Score: 1.0
Test R² Score: 0.1659683941804464
Test Mean Absolute Error: 975623.19339298

Model Comparison:
               Model  R² Score           MAE
0  Linear Regression  0.280000  6.008788e+06
1      Decision Tree  0.165968  9.756232e+05
2      Random Forest  0.780000  2.500000e+06
3  Gradient Boosting  0.820000  2.000000e+06


Random Forest Regressor

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv")

# Convert price into numeric safely
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()
    try:
        if "Cr" in value:
            return float(value.replace("Cr", "").strip()) * 10000000
        elif "L" in value:
            return float(value.replace("L", "").strip()) * 100000
        else:
            return float(value)  # Try direct conversion
    except:
        return pd.NA  # Invalid entries become missing

df["Price"] = df["Price"].apply(convert_price)

# Drop rows with missing price
df = df.dropna(subset=["Price"])

# Drop unnecessary columns
df = df.drop(["Name", "Property Title", "Description"], axis=1, errors='ignore')

# Drop any remaining rows with missing values
df = df.dropna()

# Encode categorical columns
le = LabelEncoder()
if "Location" in df.columns:
    df["Location"] = le.fit_transform(df["Location"])
if "Balcony" in df.columns:
    df["Balcony"] = le.fit_transform(df["Balcony"])

# Features & target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluate model
print("Random Forest Results:")
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))


Random Forest Results:
R² Score: 0.8207345564410997
Mean Absolute Error: 607696.0702718514


In [46]:
from sklearn.metrics import r2_score, mean_absolute_error

# Assume rf_model is already trained
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Random Forest results
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Random Forest Regressor Results:")
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)
print("Test Mean Absolute Error:", test_mae)

# Comparison with other models (example values, replace with actual)
comparison = {
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting"],
    "R² Score": [0.28, 0.65, test_r2, 0.82],
    "MAE": [6008788, 3500000, test_mae, 2000000]
}

import pandas as pd
comp_df = pd.DataFrame(comparison)
print("\nModel Comparison:")
print(comp_df)


Random Forest Regressor Results:
Train R² Score: 0.9412993565339786
Test R² Score: 0.8207345564410997
Test Mean Absolute Error: 607696.0702718514

Model Comparison:
               Model  R² Score           MAE
0  Linear Regression  0.280000  6.008788e+06
1      Decision Tree  0.650000  3.500000e+06
2      Random Forest  0.820735  6.076961e+05
3  Gradient Boosting  0.820000  2.000000e+06


Gradient Boosting Regressor

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error

# Load dataset
df = pd.read_csv("Real Estate Data V21.csv")

# Convert price into numeric safely
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()
    try:
        if "Cr" in value:
            return float(value.replace("Cr", "").strip()) * 10000000
        elif "L" in value:
            return float(value.replace("L", "").strip()) * 100000
        else:
            return float(value)  # Try direct conversion
    except:
        return pd.NA  # Mark invalid entries as missing

df["Price"] = df["Price"].apply(convert_price)

# Drop rows with missing price
df = df.dropna(subset=["Price"])

# Drop unnecessary columns
df = df.drop(["Name", "Property Title", "Description"], axis=1, errors='ignore')

# Drop any remaining rows with missing values
df = df.dropna()

# Encode categorical columns
le = LabelEncoder()
if "Location" in df.columns:
    df["Location"] = le.fit_transform(df["Location"])
if "Balcony" in df.columns:
    df["Balcony"] = le.fit_transform(df["Balcony"])

# Features & target
X = df.drop(columns=["Price"])
y = df["Price"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Gradient Boosting model
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# Evaluate model
print("Gradient Boosting Results:")
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))


Gradient Boosting Results:
R² Score: 0.8414748366430221
Mean Absolute Error: 1252103.4015784988


In [48]:
from sklearn.metrics import r2_score, mean_absolute_error

# Assume gb_model is already trained
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

# Gradient Boosting results
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Gradient Boosting Regressor Results:")
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)
print("Test Mean Absolute Error:", test_mae)

# Comparison with other models (example values, replace with actual)
comparison = {
    "Model": ["Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting"],
    "R² Score": [0.28, 0.65, 0.78, test_r2],
    "MAE": [6008788, 3500000, 2500000, test_mae]
}

import pandas as pd
comp_df = pd.DataFrame(comparison)
print("\nModel Comparison:")
print(comp_df)


Gradient Boosting Regressor Results:
Train R² Score: 0.7889107149508492
Test R² Score: 0.8414748366430221
Test Mean Absolute Error: 1252103.4015784988

Model Comparison:
               Model  R² Score           MAE
0  Linear Regression  0.280000  6.008788e+06
1      Decision Tree  0.650000  3.500000e+06
2      Random Forest  0.780000  2.500000e+06
3  Gradient Boosting  0.841475  1.252103e+06


In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

#  Load dataset
df = pd.read_csv("Real Estate Data V21.csv")

#  Convert price into numeric
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()
    try:
        if "Cr" in value:
            return float(value.replace("Cr", "")) * 10000000
        elif "L" in value:
            return float(value.replace("L", "")) * 100000
        else:
            return pd.to_numeric(value, errors="coerce")
    except:
        return None

df["Price"] = df["Price"].apply(convert_price)

# Drop rows with invalid price
df = df.dropna(subset=["Price"])

#  Drop unnecessary columns
df = df.drop(["Name", "Property Title", "Description"], axis=1)

#  Handle missing values

#  Encode categorical columns
le = LabelEncoder()
df["Location"] = le.fit_transform(df["Location"])
df["Balcony"] = le.fit_transform(df["Balcony"])

#  Features & Target
X = df.drop(columns=["Price"])
y = df["Price"]

#  Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

results = []

#  Train, Predict & Evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    mae = mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    overfit_gap = abs(train_r2 - test_r2)

    results.append([name, train_r2, test_r2, mae, rmse, overfit_gap])

#  Create Comparison Table
results_df = pd.DataFrame(results, columns=["Model", "Train R²", "Test R²", "MAE", "RMSE", "Overfitting Gap"])
print("\n Model Performance Comparison ")
print(results_df)

#  Select Best Model
best_model = results_df[
    (results_df["Test R²"].between(0.80, 0.90)) &
    (results_df["RMSE"] < 1000000) &
    (results_df["MAE"] < 500000) &
    (results_df["Overfitting Gap"] < 0.10)
]

print("\n Best Model Based on Criteria ")
if not best_model.empty:
    print(best_model)
else:
    print(" No model meets all selection criteria. Consider tuning hyperparameters.")



 Model Performance Comparison 
               Model  Train R²   Test R²           MAE          RMSE  \
0  Linear Regression  0.180267  0.280411  6.008789e+06  1.432350e+07   
1      Decision Tree  1.000000  0.165968  9.756232e+05  1.542049e+07   
2      Random Forest  0.941299  0.820735  6.076961e+05  7.149162e+06   
3  Gradient Boosting  0.788911  0.841475  1.252103e+06  6.722890e+06   

   Overfitting Gap  
0         0.100144  
1         0.834032  
2         0.120565  
3         0.052564  

 Best Model Based on Criteria 
 No model meets all selection criteria. Consider tuning hyperparameters.


In [50]:
from sklearn.ensemble import RandomForestRegressor


In [51]:
# Install encoder if not already installed
!pip install category_encoders

# Import required libraries
import pandas as pd
import numpy as np
import re
import category_encoders as ce
from sklearn.model_selection import train_test_split

# Load dataset (make sure file name matches exactly)
df = pd.read_csv("/content/Real Estate Data V21.csv")

# --- Cleaning steps ---

# Clean Price column (robust version)
def convert_price(value):
    value = str(value).replace("₹", "").replace(",", "").strip()

    try:
        if "Cr" in value:
            return float(value.replace("Cr", "").strip()) * 100   # 1 Cr = 100 Lakhs
        elif "Lac" in value or "L" in value:
            return float(value.replace("Lac", "").replace("L", "").strip())
        else:
            return float(value)   # if it's just a number
    except:
        return np.nan   # if conversion fails, return NaN

df["Price"] = df["Price"].apply(convert_price)

# Drop rows where Price could not be converted
df = df.dropna(subset=["Price"])


# 2. Extract BHK
def extract_bhk(title):
    match = re.search(r'(\d+)\s*BHK', str(title))
    if match:
        return int(match.group(1))
    return np.nan

df["BHK"] = df["Property Title"].apply(extract_bhk)

# 3. Handle missing values
df = df.dropna(subset=["Price", "BHK"])
df.fillna("Unknown", inplace=True)

# 4. Split location into City and Area (if column exists)
if "Location" in df.columns:
    df[["City", "Area"]] = df["Location"].str.split(",", n=1, expand=True)

# 5. Drop useless columns
drop_cols = ["Name", "Property Title", "Description"]
df = df.drop([col for col in drop_cols if col in df.columns], axis=1)

# 6. Encode categorical columns
categorical_cols = df.select_dtypes(include=["object"]).columns
encoder = ce.TargetEncoder(cols=categorical_cols)
df[categorical_cols] = encoder.fit_transform(df[categorical_cols], df["Price"])

# Train-test split
X = df.drop("Price", axis=1)
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("✅ Dataset ready")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


✅ Dataset ready
Train shape: (11020, 8)
Test shape: (2755, 8)
