# House Prices

## Import Libraries

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

## Load the Dataset

In [14]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## Basic Preprocessing

In [6]:
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

## Missing Values Analysis

In [15]:
missing = train_df.isnull().sum()
missing[missing > 0].sort_values(ascending=False)

Unnamed: 0,0
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
MasVnrType,872
FireplaceQu,690
LotFrontage,259
GarageType,81
GarageYrBlt,81
GarageFinish,81


## Data Cleaning & Feature Alignment

In [21]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.columns = train_df.columns.str.encode('ascii', 'ignore').str.decode('ascii').str.strip()
test_df.columns = test_df.columns.str.encode('ascii', 'ignore').str.decode('ascii').str.strip()

print("Train Columns:", train_df.columns)

y = np.log1p(train_df['SalePrice'])
train_df.drop(['SalePrice'], axis=1, inplace=True)

numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].mean())
test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].mean())

train_df, test_df = train_df.align(test_df, join='inner', axis=1)

Train Columns: Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCon

## Model Training & Evaluation

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

combined = pd.concat([train_df, test_df], keys=["train", "test"])

combined_encoded = pd.get_dummies(combined)

train_df_encoded = combined_encoded.xs("train")
test_df_encoded = combined_encoded.xs("test")

X_train, X_valid, y_train, y_valid = train_test_split(train_df_encoded, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(X_train, y_train)

preds = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print("RMSE:", rmse)

RMSE: 0.14345413073937008


In [25]:
test_preds_log = model.predict(test_df_encoded)

test_preds = np.expm1(test_preds_log)

submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_preds
})
submission.to_csv("submission.csv", index=False)