In [1]:
import pandas as pd
import numpy as np

# Read the training and test tables from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Report the (rows, columns) of each table
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


In [2]:
# Save target variable and remove it from the training features.
# `pop` deletes the column from `train` in place. The membership guard makes the
# cell safe to re-run: once 'SalePrice' has been split off, the `y` saved by the
# earlier run is reused instead of raising KeyError.
if 'SalePrice' in train.columns:
    y = train.pop('SalePrice')
elif 'y' not in globals():
    # Fresh kernel and no target column: fail here rather than in a later cell.
    raise KeyError("'SalePrice' column not found in train")

# Both frames must carry the same feature columns; otherwise concat would
# silently pad the mismatched ones with NaN.
assert set(train.columns) == set(test.columns), "train/test feature columns differ"

# Combine train and test for preprocessing (train rows first, then test rows)
combined = pd.concat([train, test], axis=0, ignore_index=True)
print("Combined shape:", combined.shape)

Combined shape: (2919, 80)


In [3]:
# Handle missing values in the stacked train+test frame.
# Imputation statistics are learned from the training rows only, so that no
# information from the test rows leaks into the training features.
MISSING_THRESHOLD = 0.4  # drop a column when more than this share of its rows is missing
n_train = len(y)         # `combined` holds the training rows first, then the test rows

# Drop columns with more than 40% missing values
missing_frac = combined.isna().mean()
drop_cols = missing_frac[missing_frac > MISSING_THRESHOLD].index
combined = combined.drop(columns=drop_cols)

# Training slice taken after the drop so it has the same columns as `combined`
train_rows = combined.iloc[:n_train]

# Fill numeric columns with the training median
num_cols = combined.select_dtypes(include='number').columns
combined[num_cols] = combined[num_cols].fillna(train_rows[num_cols].median())

# Fill the remaining (non-numeric) columns with the training mode.
# Skipped when there are none, because `.mode().iloc[0]` raises IndexError on
# an empty frame.
cat_cols = combined.select_dtypes(exclude='number').columns
if len(cat_cols) > 0:
    combined[cat_cols] = combined[cat_cols].fillna(train_rows[cat_cols].mode().iloc[0])

# A NaN can only survive if a column has no observed value in the training rows.
assert not combined.isna().any().any(), "missing values remain after imputation"

In [4]:
# Derive the area and age features, then drop the raw year columns they replace.
REFERENCE_YEAR = 2024  # fixed year that every age below is measured against

combined = combined.assign(
    # Total square footage: basement + first floor + second floor
    TotalSF=combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF'],
    # Ages in years relative to REFERENCE_YEAR
    HouseAge=REFERENCE_YEAR - combined['YearBuilt'],
    RemodAge=REFERENCE_YEAR - combined['YearRemodAdd'],
    # A missing garage year falls back to the house's build year; a 0 sentinel
    # would turn into a garage "age" of REFERENCE_YEAR years.
    GarageAge=REFERENCE_YEAR - combined['GarageYrBlt'].fillna(combined['YearBuilt']),
).drop(columns=['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'])  # drop the original year columns

In [5]:
# One-hot encode the frame; drop_first omits the first level of each encoded
# column.
encoded = pd.get_dummies(data=combined, drop_first=True)
combined = encoded
print("After one-hot encoding:", encoded.shape)

After one-hot encoding: (2919, 231)


In [6]:
# The first len(y) rows of `combined` go to X_train; everything after goes to X_test.
split_at = len(y)
X_train = combined.iloc[:split_at]
X_test = combined.iloc[split_at:]

for label, part in (("X_train shape:", X_train), ("X_test shape:", X_test)):
    print(label, part.shape)

X_train shape: (1460, 231)
X_test shape: (1459, 231)
