In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle
import os

# Load dataset
df = pd.read_csv("Bengaluru_House_Data.csv")  # Ensure file is in correct location

# Normalize column names
df.columns = df.columns.str.strip().str.lower()

# Clean and preprocess data
df['location'] = df['location'].apply(lambda x: str(x).strip().lower())
df = df.dropna()

# Ensure required columns exist
required_cols = ['total_sqft', 'bath', 'size', 'location', 'price']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in data: {missing_cols}")

# Convert key columns to numeric
for col in ['total_sqft', 'size', 'bath', 'price']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna(subset=['total_sqft', 'size', 'bath', 'price'])

# Simplify location categories
location_stats = df['location'].value_counts()
locations_to_keep = location_stats[location_stats > 10].index
df['location'] = df['location'].apply(lambda x: x if x in locations_to_keep else 'other')

# One-hot encoding for location
dummies = pd.get_dummies(df['location'], prefix='loc')
df_model = pd.concat([df[['total_sqft', 'bath', 'size']], dummies], axis=1)

# Features and target
X = df_model
y = df['price']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Ensure output directory exists
os.makedirs("estimator/ml", exist_ok=True)

# Save model
with open("estimator/ml/model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save location column names for UI/feature matching
location_columns = list(dummies.columns)
with open("estimator/ml/columns.pkl", "wb") as f:
    pickle.dump(location_columns, f)

print("✅ Model and location columns saved successfully.")


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.