In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")



In [4]:
# === Load datasets ===
train_df = pd.read_csv("train_(2)_(1)_(1).csv")
test_df = pd.read_csv("test_(2)_(1)_(1).csv")
avg_rent_df = pd.read_csv("avg_rent_(1)_(1)_(1).csv")
distance_df = pd.read_csv("dist_from_city_centre_(1)_(1)_(1).csv")

# === Helper Functions ===
def extract_bhk(size):
    try:
        return float(str(size).split(' ')[0])
    except:
        return np.nan

def convert_sqft(sqft):
    try:
        if '-' in str(sqft):
            return np.mean(list(map(float, sqft.split('-'))))
        return float(sqft)
    except:
        return np.nan



In [5]:
# === Preprocessing ===
for df in [train_df, test_df]:
    df['availability_cleaned'] = df['availability'].apply(lambda x: 1 if str(x).strip().lower() == 'ready to move' else 0)
    df['bath'] = df['bath'].fillna(df['bath'].median())
    df['balcony'] = df['balcony'].fillna(df['balcony'].median())
    df['bhk'] = df['size'].apply(extract_bhk)
    df['total_sqft_cleaned'] = df['total_sqft'].apply(convert_sqft)

train_df.dropna(subset=['location', 'bhk', 'total_sqft_cleaned'], inplace=True)

# Standardize location formatting
for df in [train_df, test_df, avg_rent_df, distance_df]:
    df['location'] = df['location'].astype(str).str.strip().str.lower()

# Merge external features
train_df = train_df.merge(avg_rent_df, on='location', how='left')
test_df = test_df.merge(avg_rent_df, on='location', how='left')
train_df = train_df.merge(distance_df, on='location', how='left')
test_df = test_df.merge(distance_df, on='location', how='left')

# Fill missing values
for col in ['avg_2bhk_rent', 'dist_from_city']:
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(train_df[col].median())

# Outlier removal
train_df = train_df[train_df['total_sqft_cleaned'] > 300]
train_df = train_df[train_df['price'] < 1e8]

# Extra feature
train_df['price_per_sqft'] = train_df['price'] / train_df['total_sqft_cleaned']

# Top locations
top_locations = train_df['location'].value_counts().nlargest(20).index
train_df['location'] = train_df['location'].apply(lambda x: x if x in top_locations else 'other')
test_df['location'] = test_df['location'].apply(lambda x: x if x in top_locations else 'other')

# === Define features ===
features = ['area_type', 'availability_cleaned', 'location', 'bhk', 'bath', 
            'balcony', 'total_sqft_cleaned', 'avg_2bhk_rent', 'dist_from_city']

target = 'price'

X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]

# Log transform target
y_train_log = np.log1p(y_train)

# === Preprocessing Pipelines ===
numeric_features = ['bhk', 'bath', 'balcony', 'total_sqft_cleaned', 'avg_2bhk_rent', 'dist_from_city']
categorical_features = ['area_type', 'location']

numeric_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])



In [6]:
# === Models ===
ridge_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))  # Light Ridge regularization
])

lgbm_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(
        n_estimators=1200,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

# === Stacking Regressor ===
stacked_model = StackingRegressor(
    estimators=[('ridge', ridge_model), ('lgbm', lgbm_model)],
    final_estimator=Ridge(alpha=0.5),
    n_jobs=-1
)

# === Training ===
stacked_model.fit(X_train, y_train_log)




In [7]:
# === Cross-validation RMSE ===
cv_rmse = -cross_val_score(
    stacked_model, X_train, y_train_log,
    scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1
).mean()

print(f"✅ Cross-validated RMSE (log-transformed): {cv_rmse:.2f}")

# === Predictions ===
predictions_log = stacked_model.predict(X_test)
predictions = np.expm1(predictions_log)  # Inverse log transform



✅ Cross-validated RMSE (log-transformed): 0.30


In [8]:
# === Save Submission ===
submission = test_df[['ID']].copy()
submission['price'] = predictions
submission.to_csv("submission.csv", index=False)
print("📦 Predictions saved to 'submission.csv'")

📦 Predictions saved to 'submission.csv'
