# Machine Learning — Price Prediction

**Goal:** Predict `TARGET(PRICE_IN_LACS)` using OLS, Ridge, and Lasso. This notebook includes detailed preprocessing, visualizations, model tuning, and evaluation.

In [None]:
import numpy as np, pandas as pd, os, matplotlib.pyplot as plt, seaborn as sns
from IPython.display import display
sns.set(style='whitegrid')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Show what is available in the data directory before trying to read from it.
data_dir_files = os.listdir('/mnt/data')
print('Files in /mnt/data:', data_dir_files)

## Load dataset
We will load `train.csv` and `test.csv` from `/mnt/data` (you uploaded them).

In [None]:
# Locations of the two uploaded CSV files.
train_path = '/mnt/data/train.csv'
test_path = '/mnt/data/test.csv'

# Read both files in one pass; `train` and `test` are used by the later cells.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the size of each frame, then preview the first training rows.
for split_label, split_frame in (('Train', train), ('Test', test)):
    print(f'{split_label} shape:', split_frame.shape)
display(train.head())

## Exploratory Data Analysis (EDA)
Quick overview of missing values and basic distributions.

In [None]:
# Per-column count of missing values, largest first; show only columns that have any.
missing = train.isna().sum().sort_values(ascending=False)
display(missing.loc[missing > 0].head(30))

# Summary statistics of the numeric columns, one row per column.
numeric_summary = train.describe().T
display(numeric_summary)

# Distribution of the prediction target (histogram with a KDE overlay).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train['TARGET(PRICE_IN_LACS)'], bins=50, kde=True, ax=ax)
ax.set_title('Target Distribution: PRICE_IN_LACS')
plt.show()

## Preprocessing plan

**Steps:**
1. Handle missing values: median for numeric, mode for categorical.
2. Encode categorical variables: one-hot for nominal; binary columns kept as-is.
3. Scale numeric features using StandardScaler.
4. Optionally remove extreme outliers: rows whose `SQUARE_FT` exceeds the 99th percentile are dropped (the values are not clipped) — a conservative approach that discards about 1% of the training rows.

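Scaling and one-hot encoding are wrapped into a `ColumnTransformer` + `Pipeline`, so they are refit inside every cross-validation fold and are reproducible. Imputation and outlier removal are done beforehand in pandas on the full training set.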

In [None]:
# Columns (adjust if your CSV differs)
numeric_cols = ['BHK_NO.', 'SQUARE_FT', 'LONGITUDE', 'LATITUDE']
categorical_cols = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'READY_TO_MOVE', 'RESALE', 'BHK_OR_RK']

# Fail fast with a readable message if the CSV schema differs from what the
# rest of the notebook expects.
for c in numeric_cols + categorical_cols:
    if c not in train.columns:
        raise ValueError(f'Expected column {c} not found in train.csv. Columns available: {list(train.columns)}')

# Impute on a copy so the raw `train` frame stays untouched and this cell can be re-run.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on pandas 2.x and does not
# modify the parent frame under copy-on-write.
train_proc = train.copy()
for col in numeric_cols:
    # Numeric columns: replace missing values with the column median.
    train_proc[col] = train_proc[col].fillna(train_proc[col].median())
for col in categorical_cols:
    # Categorical columns: replace missing values with the most frequent value.
    train_proc[col] = train_proc[col].fillna(train_proc[col].mode()[0])

# Outlier handling: rows whose SQUARE_FT exceeds the 99th percentile are dropped
# (the values are not clipped), and the index is rebuilt afterwards.
sq_up = train_proc['SQUARE_FT'].quantile(0.99)
train_proc = train_proc[train_proc['SQUARE_FT'] <= sq_up].reset_index(drop=True)
print('After capping, train shape:', train_proc.shape)

In [None]:
# Pairwise correlations between the numeric features and the target.
corr_matrix = train_proc[numeric_cols + ['TARGET(PRICE_IN_LACS)']].corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Numeric Feature Correlations')
plt.show()

In [None]:
# Numeric branch: standardize each numeric column.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical branch: dense one-hot encoding; categories unseen during fit are ignored.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and later removed
# the old name, so try the new keyword first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each column group to its branch; columns not listed here are dropped
# (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

# Fit transform sample: a quick sanity check of the transformed width.
# Note that this fits `preprocessor` in place on the whole of train_proc.
X_sample = train_proc.drop(columns=['TARGET(PRICE_IN_LACS)'])
y_sample = train_proc['TARGET(PRICE_IN_LACS)']
X_trans = preprocessor.fit_transform(X_sample)
print('Preprocessed shape (sample):', X_trans.shape)

In [None]:
# Separate the target from the feature columns.
target_col = 'TARGET(PRICE_IN_LACS)'
y = train_proc[target_col]
X = train_proc.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('X_train:', X_train.shape, 'X_test:', X_test.shape)

### Model 1: OLS (Linear Regression)

In [None]:
# Baseline: preprocessing followed by ordinary least squares, fit on the training split.
ols_pipeline = Pipeline([('preprocessor', preprocessor), ('model', LinearRegression())])
ols_pipeline.fit(X_train, y_train)
y_pred_ols = ols_pipeline.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated and absent from newer scikit-learn releases, while this form works
# on every version.
rmse_ols = float(np.sqrt(mean_squared_error(y_test, y_pred_ols)))
r2_ols = r2_score(y_test, y_pred_ols)
print(f'OLS — RMSE: {rmse_ols:.4f} | R2: {r2_ols:.4f}')

### Model 2: Ridge Regression (GridSearchCV)

In [None]:
# Ridge regression with the regularization strength chosen by 5-fold cross-validation
# on the training split (scored by negative RMSE, so higher is better).
ridge_pipeline = Pipeline([('preprocessor', preprocessor), ('model', Ridge())])
ridge_params = {'model__alpha': [0.01, 0.1, 1.0, 10.0, 50.0, 100.0]}
ridge_grid = GridSearchCV(ridge_pipeline, ridge_params, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
ridge_grid.fit(X_train, y_train)
print('Best Ridge params:', ridge_grid.best_params_)

# Evaluate the refit best estimator on the held-out split.
y_pred_ridge = ridge_grid.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated and absent from newer scikit-learn releases.
rmse_ridge = float(np.sqrt(mean_squared_error(y_test, y_pred_ridge)))
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f'Ridge — RMSE: {rmse_ridge:.4f} | R2: {r2_ridge:.4f}')

### Model 3: Lasso Regression (GridSearchCV)

In [None]:
# Lasso regression with the regularization strength chosen by 5-fold cross-validation
# on the training split; max_iter is raised to give the solver room to converge.
lasso_pipeline = Pipeline([('preprocessor', preprocessor), ('model', Lasso(max_iter=10000))])
lasso_params = {'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]}
lasso_grid = GridSearchCV(lasso_pipeline, lasso_params, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
lasso_grid.fit(X_train, y_train)
print('Best Lasso params:', lasso_grid.best_params_)

# Evaluate the refit best estimator on the held-out split.
y_pred_lasso = lasso_grid.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is deprecated and absent from newer scikit-learn releases.
rmse_lasso = float(np.sqrt(mean_squared_error(y_test, y_pred_lasso)))
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f'Lasso — RMSE: {rmse_lasso:.4f} | R2: {r2_lasso:.4f}')

## Model comparison and selection
We report RMSE and R² for each model and pick the best based on RMSE (lower is better).

In [None]:
# Held-out metrics per model, keyed by display name.
scores_by_model = {
    'OLS': (rmse_ols, r2_ols),
    'Ridge': (rmse_ridge, r2_ridge),
    'Lasso': (rmse_lasso, r2_lasso),
}
results = [
    {'model': name, 'rmse': rmse, 'r2': r2}
    for name, (rmse, r2) in scores_by_model.items()
]

# Rank by RMSE (lower is better); the first row is the winner.
results_df = pd.DataFrame(results).sort_values('rmse')
display(results_df)
best_model_name = results_df['model'].iloc[0]
print('Best model by RMSE:', best_model_name)

# Map the winning name back to its fitted pipeline.
fitted_pipelines = {
    'OLS': ols_pipeline,
    'Ridge': ridge_grid.best_estimator_,
    'Lasso': lasso_grid.best_estimator_,
}
best_pipeline = fitted_pipelines[best_model_name]


### Coefficients / Interpretability
Inspect linear coefficients for the final selected linear model.

In [None]:
# Take the feature names from the preprocessor that sits inside the selected pipeline.
# For Ridge/Lasso that pipeline holds a clone created by GridSearchCV, so the global
# `preprocessor` object is not the one that produced the columns the model was fit on.
fitted_preprocessor = best_pipeline.named_steps['preprocessor']
onehot = fitted_preprocessor.named_transformers_['cat']['onehot']
cat_names = onehot.get_feature_names_out(categorical_cols)
# The ColumnTransformer lists the 'num' branch before the 'cat' branch, so the
# numeric names come first.
feature_names = numeric_cols + list(cat_names)

if best_model_name in ['OLS','Ridge','Lasso']:
    coef = best_pipeline.named_steps['model'].coef_
    coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coef})
    # Order rows by absolute coefficient size, largest first.
    coef_df = coef_df.reindex(coef_df.coefficient.abs().sort_values(ascending=False).index)
    display(coef_df.head(20))
else:
    print('Best model not linear; skipping coefficients.')

## Predict on provided test set and save submission
Attempt to run preprocessing + prediction on the uploaded `test.csv` and save `/mnt/data/submission_from_notebook.csv`. Adjust if columns differ.

In [None]:
# Work on a copy so the raw `test` frame stays untouched and this cell can be re-run.
test_proc = test.copy()
# Fill missing values with statistics taken from the processed training frame.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on pandas 2.x and does not
# modify the parent frame under copy-on-write.
for col in numeric_cols:
    if col in test_proc.columns:
        test_proc[col] = test_proc[col].fillna(train_proc[col].median())
for col in categorical_cols:
    if col in test_proc.columns:
        test_proc[col] = test_proc[col].fillna(train_proc[col].mode()[0])

# Best-effort prediction: a schema mismatch or write failure is reported rather
# than aborting the notebook run.
try:
    X_test_final = test_proc[numeric_cols + categorical_cols]
    preds_test = best_pipeline.predict(X_test_final)
    out = test_proc.copy()
    out['TARGET(PRICE_IN_LACS)'] = preds_test
    # The submission file contains only the predicted target column.
    out[['TARGET(PRICE_IN_LACS)']].to_csv('/mnt/data/submission_from_notebook.csv', index=False)
    print('Saved submission to /mnt/data/submission_from_notebook.csv')
except Exception as e:
    print('Could not predict on test.csv automatically:', e)


## Conclusion
- Preprocessing: median/mode imputation, one-hot encoding, StandardScaler, and conservative outlier removal (rows above the 99th percentile of `SQUARE_FT` are dropped).
- Models: OLS baseline, Ridge (best generalization in many cases), Lasso (sparse solution).

Download the notebook and run it locally or in Colab. Adjust hyperparameters and preprocessing for experimentation.