Importing the necessary libraries for the project.

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer


Loading and reading the dataset.

In [30]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [32]:

# Labels of the float64/int64 columns in each split. The target column
# 'SalePrice' is removed from the training list so that it is never imputed.
num_cols_train = (
    train_df
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('SalePrice')
)
num_cols_test = (
    test_df
    .select_dtypes(include=['float64', 'int64'])
    .columns
)

In [34]:
# Fill missing values using statistics learned from the training split only.
#
# Both imputers are fitted on a training column list and the test split is
# transformed with that SAME list. Selecting the test columns independently by
# dtype can yield a different set or order of columns (dtype inference runs
# separately for each CSV), which would either make `transform` fail or apply
# one column's fill value to another column.

# Numeric columns: per-column training median.
imputer_num = SimpleImputer(strategy='median')
train_df[num_cols_train] = imputer_num.fit_transform(train_df[num_cols_train])
test_df[num_cols_train] = imputer_num.transform(test_df[num_cols_train])

# Categorical (object dtype) columns: per-column most frequent training value.
cat_cols_train = train_df.select_dtypes(include=['object']).columns
# Still computed so the name remains available to other cells; it is no longer
# used to select the columns that are transformed.
cat_cols_test = test_df.select_dtypes(include=['object']).columns

imputer_cat = SimpleImputer(strategy='most_frequent')
train_df[cat_cols_train] = imputer_cat.fit_transform(train_df[cat_cols_train])
test_df[cat_cols_train] = imputer_cat.transform(test_df[cat_cols_train])

In [36]:

# New column 'TotalArea': element-wise sum of 'GrLivArea' and 'TotalBsmtSF',
# added to both splits.
for frame in (train_df, test_df):
    frame['TotalArea'] = frame['GrLivArea'] + frame['TotalBsmtSF']

In [38]:

# New column 'TotalPorchSF': sum of the four porch columns, added to both splits.
for frame in (train_df, test_df):
    frame['TotalPorchSF'] = (
        frame['OpenPorchSF']
        + frame['EnclosedPorch']
        + frame['3SsnPorch']
        + frame['ScreenPorch']
    )

In [40]:

# Quadratic term of 'GrLivArea', stored as 'GrLivArea^2' in both splits.
for frame in (train_df, test_df):
    frame['GrLivArea^2'] = frame['GrLivArea'] ** 2


In [42]:
# Cubic term of 'GrLivArea', stored as 'GrLivArea^3' in both splits.
for frame in (train_df, test_df):
    frame['GrLivArea^3'] = frame['GrLivArea'] ** 3


In [44]:

# Replace the target and 'GrLivArea' with log(1 + x) of their current values.
# The columns are overwritten in place, so running this cell a second time
# applies the transform twice.
log_targets = (
    (train_df, 'SalePrice'),
    (train_df, 'GrLivArea'),
    (test_df, 'GrLivArea'),
)
for frame, column in log_targets:
    frame[column] = np.log1p(frame[column])

In [46]:
# Overwrite 'TotalArea' with log(1 + x) of its current values in both splits.
# Not idempotent: re-running the cell applies the transform again.
for frame in (train_df, test_df):
    frame['TotalArea'] = np.log1p(frame['TotalArea'])

In [48]:

# Bucket 'YearBuilt' into four labelled construction-era groups.
#
# pd.cut uses right-closed intervals by default, i.e. (edge_i, edge_i+1].
# With finite outer edges (1870 and 2020) any year <= 1870 or > 2020 would fall
# outside every bin and silently become NaN. Open-ended outer edges keep such
# rows in the first / last group, while years inside the former range are
# binned as before:
#   Old: <= 1940 | MidOld: 1941-1970 | Modern: 1971-2000 | New: > 2000
age_bins = [-np.inf, 1940, 1970, 2000, np.inf]
age_labels = ['Old', 'MidOld', 'Modern', 'New']

# The same edges and labels are applied to both splits so the categories match.
for frame in (train_df, test_df):
    frame['AgeGroup'] = pd.cut(frame['YearBuilt'], bins=age_bins, labels=age_labels)

In [50]:
# Bucket 'LotArea' into five labelled size groups using identical edges and
# labels for both splits. Intervals are right-closed (pd.cut default), and the
# last bin is open-ended.
lot_area_edges = [0, 5000, 10000, 15000, 20000, np.inf]
lot_area_names = ['Very Small', 'Small', 'Medium', 'Large', 'Very Large']

for frame in (train_df, test_df):
    frame['LotAreaGroup'] = pd.cut(
        frame['LotArea'],
        bins=lot_area_edges,
        labels=lot_area_names,
    )



In [52]:
# Target-encode 'Neighborhood': each neighbourhood is replaced by the mean of
# the training 'SalePrice' column for that neighbourhood. 'SalePrice' is used
# as it stands when this cell runs (log1p-transformed if the earlier transform
# cell has been executed).
neighborhood_means = train_df.groupby('Neighborhood')['SalePrice'].mean()

# Fallback for neighbourhoods that occur in the test split but not in the
# training split: without it, `map` yields NaN for those rows.
overall_mean = train_df['SalePrice'].mean()

train_df['NeighborhoodEncoded'] = train_df['Neighborhood'].map(neighborhood_means)
test_df['NeighborhoodEncoded'] = (
    test_df['Neighborhood'].map(neighborhood_means).fillna(overall_mean)
)
# NOTE(review): the training rows are encoded with means that include their own
# target value; consider out-of-fold encoding if this feature overfits.

In [54]:
# New column 'SpacePerRoom': 'TotalArea' divided by ('TotRmsAbvGrd' + 1).
# The +1 keeps the denominator non-zero when the room count is 0.
# NOTE(review): if the cells run in order, 'TotalArea' has already been
# replaced by log1p(TotalArea) at this point, so this is a ratio of a
# log-area to a room count rather than a raw area per room -- confirm intent.
for frame in (train_df, test_df):
    room_count_plus_one = frame['TotRmsAbvGrd'] + 1
    frame['SpacePerRoom'] = frame['TotalArea'] / room_count_plus_one

In [56]:
# New column 'TotalBath': full baths count as 1 and half baths as 0.5, summed
# over the 'FullBath', 'HalfBath', 'BsmtFullBath' and 'BsmtHalfBath' columns.
for frame in (train_df, test_df):
    frame['TotalBath'] = (
        frame['FullBath']
        + 0.5 * frame['HalfBath']
        + frame['BsmtFullBath']
        + 0.5 * frame['BsmtHalfBath']
    )

In [58]:

# One-hot encode the categorical columns of both splits and give the test
# split exactly the training split's columns.
#
# Only the training split uses drop_first=True. Dropping the first level
# independently on the test split is unsafe: if the training baseline level is
# absent from the test data, a DIFFERENT level is dropped there, and after
# alignment its column is zero-filled -- rows with that level would then be
# indistinguishable from the training baseline. Encoding the test split with
# all levels and letting the left-join alignment discard the columns the
# training split does not have keeps the encodings consistent.
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df)

# join='left' keeps the training columns (and their order); columns missing
# from the test split are created and filled with 0, extra test columns are
# dropped. A test level never seen in training therefore encodes as all zeros.
# Because 'SalePrice' is a training column, the test split also receives a
# 'SalePrice' column filled with 0 -- a placeholder, not a real target.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

In [60]:

# Min-max scale every float64/int64 column except the target. The scaler is
# fitted on the training split and then applied unchanged to the test split.
scaler = MinMaxScaler()

# Recomputed here because feature engineering and encoding have added columns
# since the list was first built.
num_cols_train = (
    train_df
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('SalePrice')
)

train_scaled = scaler.fit_transform(train_df[num_cols_train])
test_scaled = scaler.transform(test_df[num_cols_train])

train_df[num_cols_train] = train_scaled
test_df[num_cols_train] = test_scaled

In [62]:

# Separate the target from the features, then hold out 20% of the training
# rows for validation. random_state is fixed so the split is reproducible.
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Write both fully preprocessed frames to CSV (without the index column) and
# print a confirmation message.
csv_outputs = (
    (train_df, 'preprocessed_train_complete_feature_engineering.csv'),
    (test_df, 'preprocessed_test_complete_feature_engineering.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

print("Preprocessed datasets with complete feature engineering saved.")

Preprocessed datasets with complete feature engineering saved.
