In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from scipy import stats

try:
    import lightgbm as lgb
    LGBM_INSTALLED = True
except ImportError:
    LGBM_INSTALLED = False



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the input directory, then print
# them one per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/sample_submission.csv
/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/train.csv
/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/test.csv


In [2]:
# Read the competition CSVs: training rows, test rows and the submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/train.csv',
        '/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/test.csv',
        '/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/sample_submission.csv',
    )
)


In [3]:
# Feature engineering: derive a numeric room count from `size` and an
# area-per-bathroom ratio. The same steps are applied to train and test.
def _add_engineered_features(df):
    """Add `num_bedrooms` and `total_sqft_per_bath` to `df` in place.

    `size` is normalised to text, but missing entries are kept as NaN.
    Calling `astype(str)` on the whole column would turn NaN into the literal
    string 'nan', which hides those rows from any later `fillna`-based
    imputation and makes 'nan' a category of its own.
    """
    size_present = df['size'].notna()
    # Stringify, then restore NaN wherever the original value was missing.
    df['size'] = df['size'].astype(str).where(size_present)
    # First run of digits found in the size text; rows without digits
    # (including missing sizes) become NaN.
    df['num_bedrooms'] = df['size'].str.extract(r'(\d+)', expand=False).astype(float)
    # The +1 keeps the denominator non-zero when bath == 0.
    df['total_sqft_per_bath'] = df['total_sqft'] / (df['bath'] + 1)


for _frame in (train, test):
    _add_engineered_features(_frame)



In [4]:
# Fill missing values. Numeric columns use the training median and
# categorical columns the training mode; the value derived from train is
# reused for test.
numeric_cols = ['total_sqft', 'bath', 'balcony', 'num_bedrooms', 'total_sqft_per_bath']
cat_cols = ['area_type', 'availability', 'location', 'size']

for name in numeric_cols:
    fill_value = train[name].median()
    train[name] = train[name].fillna(fill_value)
    test[name] = test[name].fillna(fill_value)

for name in cat_cols:
    fill_value = train[name].mode()[0]
    train[name] = train[name].fillna(fill_value)
    test[name] = test[name].fillna(fill_value)



In [5]:
# Group rare locations: any location with 10 or fewer training rows is
# collapsed into a single 'Other' bucket.
loc_counts = train['location'].value_counts()
rare_locs = loc_counts[loc_counts <= 10].index
frequent_locs = loc_counts[loc_counts > 10].index


def _collapse_rare_locations(locations):
    """Return `locations` with every value outside `frequent_locs` set to 'Other'.

    Unlike replacing only the known-rare values, this also catches locations
    that never occur in the training data (possible in the test set), so the
    model is not handed categories it has never seen. Missing values are
    left untouched.
    """
    keep = locations.isin(frequent_locs) | locations.isna()
    return locations.where(keep, 'Other')


train['location'] = _collapse_rare_locations(train['location'])
test['location'] = _collapse_rare_locations(test['location'])



In [6]:
# Put the target on a log(1 + price) scale; stored as a new column.
log_price = np.log1p(train['price'])
train['log_price'] = log_price


In [7]:
# Remove price outliers on the log scale: drop rows whose log-price lies
# 3 or more standard deviations from the mean (use z < 2.5 for a stricter cut).
z = np.abs(stats.zscore(train['log_price']))
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells write to a real frame rather than a slice
# (avoids SettingWithCopyWarning and ambiguous writes).
train = train[z < 3].copy()


In [8]:
# Integer-encode each categorical column. The encoder's vocabulary is built
# from train and test together, so every value in either frame has a code.
for name in cat_cols:
    vocabulary = pd.concat([train[name], test[name]], axis=0).astype(str)
    encoder = LabelEncoder().fit(vocabulary)
    train[name] = encoder.transform(train[name].astype(str))
    test[name] = encoder.transform(test[name].astype(str))


In [9]:
# Standardise the continuous columns. The scaler is fitted on train only and
# then applied to both frames.
scale_cols = ['total_sqft', 'num_bedrooms', 'total_sqft_per_bath']
scaler = StandardScaler().fit(train[scale_cols])
train[scale_cols] = scaler.transform(train[scale_cols])
test[scale_cols] = scaler.transform(test[scale_cols])

# Model inputs (encoded categoricals followed by numerics) and the log-scale target.
features = [
    'area_type', 'availability', 'location', 'size',
    'total_sqft', 'bath', 'balcony', 'num_bedrooms', 'total_sqft_per_bath',
]
X, y = train[features], train['log_price']



In [10]:

# XGBoost: exhaustive grid search over a small hyperparameter grid, scored by
# R^2 with 3-fold cross-validation. The best refitted model is kept.
xgb_params = dict(
    learning_rate=[0.05, 0.08],
    max_depth=[6, 8],
    n_estimators=[200, 300],
    subsample=[0.9, 1.0],
)
xgb_grid = GridSearchCV(
    estimator=xgb.XGBRegressor(random_state=0),
    param_grid=xgb_params,
    scoring='r2',
    cv=3,
).fit(X, y)
best_xgb = xgb_grid.best_estimator_



In [11]:
# Random forest baseline: 300 trees, each limited to depth 16, fixed seed.
rf_settings = dict(n_estimators=300, max_depth=16, random_state=0)
rf = RandomForestRegressor(**rf_settings)
rf.fit(X, y)


In [12]:
# Score the test set with every fitted model (predictions are on the log scale).
test_X = test[features]
rf_preds = rf.predict(test_X)
xgb_preds = best_xgb.predict(test_X)

# LightGBM is optional: fit and predict only when the import succeeded.
if LGBM_INSTALLED:
    lgbm = lgb.LGBMRegressor(
        n_estimators=300,
        max_depth=8,
        learning_rate=0.08,
        random_state=0,
    )
    lgbm.fit(X, y)
    lgbm_preds = lgbm.predict(test_X)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002570 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 826
[LightGBM] [Info] Number of data points in the train set: 9911, number of used features: 9
[LightGBM] [Info] Start training from score 4.391864


In [13]:
# Equal-weight average of the log-scale predictions from whichever models
# are available.
blend = rf_preds + xgb_preds
n_models = 2
if LGBM_INSTALLED:
    blend = blend + lgbm_preds
    n_models += 1
blend = blend / n_models

# Invert the log1p transform to return to the price scale, then write the file.
submission['price'] = np.expm1(blend)
submission.to_csv('submission.csv', index=False)
