<a href="https://colab.research.google.com/github/Andy7204/CelebalTechAssignments/blob/main/House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ======================================
# 1. IMPORT LIBRARIES
# ======================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

# ======================================
# 2. LOAD DATA
# ======================================
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_ids = test_df['Id']

# ======================================
# 3. DROP HEAVILY MISSING FEATURES
# ======================================
threshold = 0.3
missing_cols = train_df.columns[train_df.isnull().mean() > threshold]
train_df.drop(columns=missing_cols, inplace=True)
test_df.drop(columns=missing_cols, inplace=True)

# ======================================
# 4. TARGET + CONCATENATE FOR PREPROCESSING
# ======================================
y = np.log1p(train_df['SalePrice'])  # Log-transform target for regression
train_df.drop(columns=['SalePrice'], inplace=True)

train_df['source'] = 'train'
test_df['source'] = 'test'
combined = pd.concat([train_df, test_df], axis=0)
combined.drop(columns=['Id'], inplace=True)

# ======================================
# 5. IMPUTE MISSING VALUES
# ======================================
numeric_cols = combined.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = combined.select_dtypes(include='object').columns

combined[numeric_cols] = SimpleImputer(strategy='median').fit_transform(combined[numeric_cols])
combined[categorical_cols] = SimpleImputer(strategy='most_frequent').fit_transform(combined[categorical_cols])

# ======================================
# 6. ONE-HOT ENCODING
# ======================================
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
], remainder='passthrough')

combined_encoded = preprocessor.fit_transform(combined)
train_data = combined_encoded[combined['source'] == 'train']
test_data = combined_encoded[combined['source'] == 'test']

# ======================================
# 7. OUTLIER REMOVAL (based on GrLivArea)
# ======================================
# Optional: remove extreme outliers in training data
train_data_df = pd.DataFrame(train_data)
outliers = train_data_df.iloc[:, list(combined.columns).index('GrLivArea')] > 4000
train_data = train_data[~outliers]
y = y[~outliers]

# ======================================
# 8. SCALING AND PCA
# ======================================
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data)
test_scaled = scaler.transform(test_data)

pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(train_scaled)
X_test_pca = pca.transform(test_scaled)

# ======================================
# 9. TRAIN MODELS
# ======================================

# Lasso
lasso = make_pipeline(RobustScaler(), LassoCV(cv=5, random_state=42))
lasso.fit(train_scaled, y)
lasso_preds = lasso.predict(test_scaled)

# XGBoost
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=3,
                   subsample=0.7, colsample_bytree=0.7, random_state=42)
xgb.fit(train_scaled, y)
xgb_preds = xgb.predict(test_scaled)

# ======================================
# 10. ENSEMBLE + EXPORT SUBMISSION
# ======================================
final_preds = 0.6 * np.expm1(xgb_preds) + 0.4 * np.expm1(lasso_preds)

submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': final_preds
})
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv saved!")

# ======================================
# 11. (OPTIONAL) EVALUATE RMSE VIA CV
# ======================================
scores = cross_val_score(xgb, train_scaled, y, scoring='neg_root_mean_squared_error', cv=5)
print(f"📊 XGBoost CV RMSE: {-np.mean(scores):.4f}")


✅ submission.csv saved!
📊 XGBoost CV RMSE: 0.1179
