In [254]:
# packages
import pandas as pd
from datetime import datetime

In [255]:
# Load the raw train and test splits from the local data/ directory
train_path, test_path = "data/train.csv", "data/test.csv"
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

In [256]:
# Dimensions (rows, columns) of the training frame as loaded from CSV
data_train.shape

(1460, 81)

In [257]:
# Missing Values

# Count the NaN entries in every column of the training frame
missing_values_count = data_train.isnull().sum()

# Keep only the columns that have at least one missing entry.
# Despite its name, this Series holds the per-column counts, not booleans.
col_missing_mask = missing_values_count.loc[lambda counts: counts > 0]
col_missing_mask

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [258]:
# Columns where more than half of the rows are missing are dropped outright
mostly_missing_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType']
data_train = data_train.drop(columns=mostly_missing_cols)

# Electrical is missing in a single row, so that row is removed instead
data_train = data_train.dropna(subset=['Electrical'])

# Categorical columns whose NaN entries are replaced with the literal string 'NA'
categorical_nas = [
    'FireplaceQu',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
]
data_train[categorical_nas] = data_train[categorical_nas].fillna('NA')

# Continuous columns: fill the gaps with each column's own median
for continuous_col in ('LotFrontage', 'MasVnrArea'):
    col_median = data_train[continuous_col].median()
    data_train[continuous_col] = data_train[continuous_col].fillna(col_median)

In [259]:
# Feature Engineering

# Convert anything to do with Years to an Age instead
# Year-based columns to target: GarageYrBlt, YrSold, YearBuilt,
# YearRemodAdd
# Reference year for the age calculation. NOTE: this is the year in which the
# notebook is executed, so the derived ages shift when it is re-run in a later year.
current_year = datetime.now().year

def convert_to_age(df, date_columns, current_year = current_year):
    """Add a ``<col>Age`` column for every year column listed in ``date_columns``.

    Each new column holds ``current_year - df[col]``. Missing years stay
    missing (NaN) in the corresponding age column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the year columns. It is modified in place.
    date_columns : iterable of str
        Names of the columns holding years.
    current_year : int, optional
        Year the ages are measured from. Defaults to the module-level
        ``current_year`` captured when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``<col>Age`` columns added. The
        original year columns are left untouched.
    """
    for col in date_columns:
        # Vectorized subtraction: NaN years propagate to NaN ages without a
        # per-element Python callback, and the result keeps a numeric dtype.
        df[col + 'Age'] = current_year - df[col]
    return df

date_columns = ['GarageYrBlt', 'YrSold', 'YearBuilt', 'YearRemodAdd']

# Add the <col>Age columns (convert_to_age mutates and returns the same frame),
# then discard the raw year columns they were derived from
data_train = convert_to_age(data_train, date_columns).drop(columns=date_columns)

# Rows whose GarageYrBlt was missing still have a NaN GarageYrBltAge;
# fall back to the row's YearBuiltAge for those
garage_age_filled = data_train['GarageYrBltAge'].fillna(data_train['YearBuiltAge'])
data_train['GarageYrBltAge'] = garage_age_filled

In [260]:
# Check results

# Fail loudly if any NaN survived the cleaning steps above. A bare expression
# in the middle of a cell is evaluated but never displayed, so the check is
# written as an assertion instead.
remaining_missing = data_train.isna().sum()
assert not remaining_missing.any(), (
    f"Columns still containing missing values:\n{remaining_missing[remaining_missing > 0]}"
)

# Shape of the cleaned training frame
data_train.shape

(1459, 76)

In [None]:
# Encoding

# CentralAir is a two-level variable: encode 'N' as 0 and 'Y' as 1.
# NOTE: Series.map turns any value that is not a key of the mapping into NaN,
# so running this cell a second time on the already-encoded column yields NaN.
centralAir_map = dict(N=0, Y=1)
central_air_encoded = data_train['CentralAir'].map(centralAir_map)
data_train['CentralAir'] = central_air_encoded

array([1, 0])