In [24]:
# 📘 House Prices - Linear Regression with Preprocessing & Submission

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder



In [25]:
# Load train and test datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [26]:
# Save SalePrice and Id separately
y_train = train_df['SalePrice']
train_ID = train_df['Id']
test_ID = test_df['Id']


In [27]:
# Drop target from train for merging
train_features = train_df.drop(['SalePrice'], axis=1)

In [28]:
# Combine train and test data for preprocessing
total_data = pd.concat([train_features, test_df], axis=0)

In [29]:
# Fill categorical NA with 'None'
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
            'BsmtFinType2', 'MasVnrType']:
    total_data[col] = total_data[col].fillna("None")


In [30]:
# Fill numerical NA with 0
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']:
    total_data[col] = total_data[col].fillna(0)

In [31]:
# Fill with mode
for col in ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
            'Exterior2nd', 'SaleType']:
    total_data[col] = total_data[col].fillna(total_data[col].mode()[0])


In [32]:
# Fill LotFrontage by median of neighborhood
total_data['LotFrontage'] = total_data.groupby("Neighborhood")['LotFrontage'].transform(lambda x: x.fillna(x.median()))


In [33]:
# Feature Engineering
total_data['HouseAge'] = total_data['YrSold'] - total_data['YearBuilt']
total_data['RemodAge'] = total_data['YrSold'] - total_data['YearRemodAdd']
total_data['GarageAge'] = total_data['YrSold'] - total_data['GarageYrBlt']
total_data['TotalSF'] = total_data['TotalBsmtSF'] + total_data['1stFlrSF'] + total_data['2ndFlrSF']
total_data['TotalBath'] = (total_data['BsmtFullBath'] + 0.5 * total_data['BsmtHalfBath'] + total_data['FullBath'] + 0.5 * total_data['HalfBath'])
total_data['TotalPorchSF'] = (total_data['OpenPorchSF'] + total_data['EnclosedPorch'] + total_data['3SsnPorch'] + total_data['ScreenPorch'])
total_data['HasPool'] = total_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
total_data['HasGarage'] = total_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
total_data['HasFireplace'] = total_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
total_data['HasPorch'] = total_data['TotalPorchSF'].apply(lambda x: 1 if x > 0 else 0)


In [34]:
# Label Encoding for ordinal features
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
for col in ordinal_cols:
    lbl = LabelEncoder()
    total_data[col] = lbl.fit_transform(total_data[col])

In [35]:
# One-Hot Encoding
total_data = pd.get_dummies(total_data)


In [36]:
# Split back to train/test
X_train = total_data[:len(train_df)]
X_test = total_data[len(train_df):]

In [17]:
# Train/Validation split
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [37]:
# Linear Regression
model = LinearRegression()
model.fit(X_train_split, y_train_split)
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 64511.15205144974


In [38]:
# Train on full train data and predict test
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)

# Submission File
submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': test_predictions
})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved ✅")


submission.csv saved ✅
