In [2]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [4]:
# 1. Load the Ahmedabad Cleaned Dataset
# Read the listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='ahmedabad.csv')


In [None]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# 1. Load the dataset
# Read the listings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='ahmedabad.csv')

# 2. Robust Cleaning Functions
def clean_price(price_str):
    """Parse a raw price value into a number expressed in lakhs.

    Commas are stripped and the first number found in the text is used.
    If the text contains a crore marker ("cr" in any letter case, which also
    covers "Crore"), the number is multiplied by 100 to convert it to lakhs.
    Otherwise the number is returned as-is; the unit is not checked, so it is
    assumed to already be in lakhs.

    Args:
        price_str: Raw price; may be a string, a number, or a missing value.

    Returns:
        The price as a float, or None when the value is missing or contains
        no number.
    """
    if pd.isna(price_str):
        return None
    text = str(price_str).replace(',', '')
    number = re.search(r"(\d+\.?\d*)", text)
    if number is None:
        return None
    value = float(number.group(1))
    # Compare case-insensitively (as clean_area does for its unit) so that
    # "cr", "CR" and "Crore" are not silently treated as lakhs.
    if 'cr' in text.lower():
        return value * 100  # Convert Cr to Lakhs
    return value

def clean_area(area_str):
    """Parse a raw area value into a number of square feet.

    Commas are removed and the first number in the text is taken. When the
    text mentions "sqyrd" (any letter case) the number is multiplied by 9 to
    convert square yards to square feet; otherwise it is returned unchanged.

    Args:
        area_str: Raw area; may be a string, a number, or a missing value.

    Returns:
        The area as a float, or None when the value is missing or contains
        no number.
    """
    if pd.isna(area_str):
        return None

    text = str(area_str).replace(',', '')
    found = re.search(r"(\d+\.?\d*)", text)
    if found is None:
        return None

    magnitude = float(found.group(1))
    in_square_yards = 'sqyrd' in text.lower()
    # 1 sq yd = 9 sq ft
    return magnitude * 9 if in_square_yards else magnitude

def extract_bhk(title):
    """Return the bedroom count from the first "<digits> BHK" in a title.

    The match is case-sensitive ("BHK" only) and allows optional whitespace
    between the digits and "BHK". The title is converted with str() first,
    so non-string input simply fails to match.

    Args:
        title: Listing title (any value).

    Returns:
        The count as an int, or None when no "<digits> BHK" is present.
    """
    hit = re.search(r'(\d+)\s*BHK', str(title))
    if hit is None:
        return None
    return int(hit.group(1))

def extract_location(title):
    """Extract the locality name from a listing title.

    The text before the first " Ahmedabad" is taken; within it, the part
    after the last " in " is split on commas and the last non-empty piece
    (whitespace-stripped) is returned.

    Falls back to "Other" when:
      * the title is not a string (e.g. NaN / None),
      * there is no " in " before " Ahmedabad", or
      * no non-empty piece follows " in " (previously this produced an
        empty-string location).

    Args:
        title: Listing title (any value).

    Returns:
        The locality as a str, or "Other".
    """
    # Explicit type check instead of a bare `except:` so that unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(title, str):
        return "Other"

    main_part = title.split(' Ahmedabad')[0]
    if ' in ' not in main_part:
        return "Other"

    details = main_part.split(' in ')[-1]
    pieces = [piece.strip() for piece in details.split(',')]
    named = [piece for piece in pieces if piece]
    # A trailing comma leaves an empty last piece; use the last real name.
    return named[-1] if named else "Other"

# 3. Apply cleaning
# Derive the four modelling columns from the raw text columns in one step:
# price and area come from their own columns, bhk and location from the title.
df = df.assign(
    price_lakhs=df['price'].apply(clean_price),
    area_sqft=df['value_area'].apply(clean_area),
    bhk=df['Title'].apply(extract_bhk),
    location=df['Title'].apply(extract_location),
)

# 4. Remove Outliers & Filter
# Keep only the modelling columns and drop rows where any of them is missing.
df = df[['bhk', 'area_sqft', 'location', 'price_lakhs']].dropna()
price_ok = df['price_lakhs'].between(5, 2000)   # 5L to 20Cr (bounds inclusive)
area_ok = df['area_sqft'].between(300, 10000)   # bounds inclusive
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
df = df[price_ok & area_ok].copy()

# 5. Handle "Messy" Locations (group those with < 5 entries)
loc_counts = df['location'].value_counts()
valid_locs = loc_counts[loc_counts >= 5].index
# Keep locations with at least 5 rows; relabel everything else as 'Other'.
df['location'] = df['location'].where(df['location'].isin(valid_locs), 'Other')

# 6. Encode and Train
# One-hot encode the location column (first category dropped), then separate
# the target from the feature matrix.
df_encoded = pd.get_dummies(df, columns=['location'], drop_first=True)
y = df_encoded['price_lakhs']
X = df_encoded.drop(columns=['price_lakhs'])

# Hold out 20% of the rows for evaluation; fixed seeds make the split and the
# forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 7. Save Artifacts
# Persist the fitted model, the exact feature-column order it was trained on,
# and the sorted list of location labels present after grouping.
joblib.dump(model, 'house_model.pkl')
joblib.dump(X.columns.tolist(), 'model_columns.pkl')
joblib.dump(sorted(df['location'].unique().tolist()), 'locations.pkl')

# Score on the held-out split. The leading emoji was previously stored as
# mis-decoded bytes and printed as garbage characters.
r2 = model.score(X_test, y_test)
print(f"✅ Training Complete! Model Accuracy (R2): {r2:.2f}")