In [1]:
import sys
import os

# Get the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add it to sys.path
sys.path.insert(0, parent_dir)


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_consistency_check import check_data_consistency
from feature_engineering import engineer_features
from utils.load_data import load_data
from data_cleaning import clean_data
from utils.calculate_row_entropy import calculate_row_entropy

In [3]:
train_df = load_data("../data/train.csv")
train_df = engineer_features(train_df)
test_df = load_data("../data/test.csv")
test_df = engineer_features(test_df)
train_df_raw = pd.read_csv("../data/train.csv")
test_df_raw = pd.read_csv("../data/test.csv")

In [4]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", None)
pd.set_option('display.max_colwidth', None)

In [5]:
na_info = train_df.isna().sum()[train_df.isna().sum() > 0]
print("train df NA counts")
print(na_info)

train df NA counts
LotFrontage    259
Electrical       1
dtype: int64


In [6]:
na_info = test_df.isna().sum()[test_df.isna().sum() > 0]
print("test df NA counts")
print(na_info)

test df NA counts
MSZoning         4
LotFrontage    227
Utilities        2
Exterior1st      1
Exterior2nd      1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64


In [7]:
print(test_df[test_df["Exterior1st"].isna()].transpose())

                          691
Id                       2152
MSSubClass                 30
MSZoning                   RL
LotFrontage              85.0
LotArea                 19550
Street                   Pave
Alley                      NA
LotShape                  Reg
LandContour               Lvl
Utilities              AllPub
LotConfig              Inside
LandSlope                 Gtl
Neighborhood          Edwards
Condition1               Norm
Condition2               Norm
BldgType                 1Fam
HouseStyle             1Story
OverallQual                 5
OverallCond                 7
YearBuilt                1940
YearRemodAdd             2007
RoofStyle                Flat
RoofMatl              Tar&Grv
Exterior1st               NaN
Exterior2nd               NaN
MasVnrType               None
MasVnrArea                0.0
ExterQual                  TA
ExterCond                  TA
Foundation              PConc
BsmtQual                   TA
BsmtCond                   TA
BsmtExposu

In [8]:
train_df["Exterior1st"].value_counts()

Exterior1st
VinylSd    515
HdBoard    222
MetalSd    220
Wd Sdng    206
Plywood    108
CemntBd     61
BrkFace     50
WdShing     26
Stucco      25
AsbShng     20
BrkComm      2
Stone        2
AsphShn      1
CBlock       1
ImStucc      1
Name: count, dtype: int64

In [9]:
train_df["Exterior2nd"].value_counts()

Exterior2nd
VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
CBlock       1
Other        1
Name: count, dtype: int64

In [10]:
def get_exterior_group(material):
    """
    Groups exterior materials into broader categories.
    
    Categories:
    - Wood_Based: Natural wood and wood-derived materials
    - Masonry: Stone, brick, and cement-based materials
    - Synthetic: Modern manufactured materials
    - Metal: Metal-based materials
    - Other: Miscellaneous or uncommon materials
    """
    wood_based = ['Wd Sdng', 'WdShing', 'Plywood', 'HdBoard']
    masonry = ['BrkComm', 'BrkFace', 'Stone', 'CBlock', 'CemntBd', 'Stucco', 'PreCast']
    synthetic = ['VinylSd', 'ImStucc', 'AsbShng', 'AsphShn']
    metal = ['MetalSd']
    
    if material in wood_based:
        return 'Wood_Based'
    elif material in masonry:
        return 'Masonry'
    elif material in synthetic:
        return 'Synthetic'
    elif material in metal:
        return 'Metal'
    else:
        return 'Other'

# Dictionary explaining material categories
material_categories = {
    'Wood_Based': {
        'Wd Sdng': 'Wood Siding',
        'WdShing': 'Wood Shingles',
        'Plywood': 'Plywood',
        'HdBoard': 'Hard Board'
    },
    'Masonry': {
        'BrkComm': 'Brick Common',
        'BrkFace': 'Brick Face',
        'Stone': 'Stone',
        'CBlock': 'Cinder Block',
        'CemntBd': 'Cement Board',
        'Stucco': 'Stucco',
        'PreCast': 'PreCast'
    },
    'Synthetic': {
        'VinylSd': 'Vinyl Siding',
        'ImStucc': 'Imitation Stucco',
        'AsbShng': 'Asbestos Shingles',
        'AsphShn': 'Asphalt Shingles'
    },
    'Metal': {
        'MetalSd': 'Metal Siding'
    },
    'Other': {
        'Other': 'Other'
    }
}

def apply_exterior_grouping(df):
    """
    Applies the exterior grouping to both Exterior1st and Exterior2nd columns.
    Creates new columns with the grouped categories.
    """
    df['Exterior1st_Group'] = df['Exterior1st'].apply(get_exterior_group)
    df['Exterior2nd_Group'] = df['Exterior2nd'].apply(get_exterior_group)
    return df

# Example usage:
# df = apply_exterior_grouping(df)

In [11]:
train_df = apply_exterior_grouping(train_df)

In [12]:
train_df['Exterior1st_Group'].value_counts()

Exterior1st_Group
Wood_Based    562
Synthetic     537
Metal         220
Masonry       141
Name: count, dtype: int64

In [13]:
train_df['Exterior2nd_Group'].value_counts()

Exterior2nd_Group
Wood_Based    546
Synthetic     537
Metal         214
Other         106
Masonry        57
Name: count, dtype: int64

In [14]:
pd.crosstab(train_df["ExterQual"], train_df["Exterior1st_Group"])

Exterior1st_Group,Masonry,Metal,Synthetic,Wood_Based
ExterQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fa,1,2,5,6
TA,85,176,181,464
Gd,38,34,327,89
Ex,17,8,24,3


In [15]:
pd.crosstab(train_df["ExterQual"], train_df["Exterior2nd_Group"])

Exterior2nd_Group,Masonry,Metal,Other,Synthetic,Wood_Based
ExterQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fa,1,2,1,5,5
TA,43,170,56,181,456
Gd,12,34,32,327,83
Ex,1,8,17,24,2


In [16]:
pd.crosstab(train_df["MasVnrType"], train_df["Exterior1st_Group"])

Exterior1st_Group,Masonry,Metal,Synthetic,Wood_Based
MasVnrType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BrkCmn,1,2,2,10
BrkFace,15,58,178,194
,106,155,265,346
Stone,19,5,92,12


In [17]:
pd.crosstab(train_df["ExterCond"], train_df["Exterior1st_Group"])

Exterior1st_Group,Masonry,Metal,Synthetic,Wood_Based
ExterCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Po,0,0,0,1
Fa,7,4,6,11
TA,121,175,495,491
Gd,13,40,35,58
Ex,0,1,1,1


In [18]:
pd.crosstab(train_df["ExterCond"], train_df["Exterior2nd_Group"])

Exterior2nd_Group,Masonry,Metal,Other,Synthetic,Wood_Based
ExterCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Po,0,0,0,0,1
Fa,4,4,3,7,10
TA,48,170,96,493,475
Gd,5,39,7,36,59
Ex,0,1,0,1,1


In [19]:
# I want to find all houses built OR remodeled within 5 years of the house,
# from the same neighborhood, and with same exterior quality and condition,
# and take their modes for Exterior1st and Exterior2nd to impute missing values

In [20]:
within_5_years_of_year_built = (train_df["YearBuilt"] >= 1935) & (train_df["YearBuilt"] <= 1945)
within_5_years_of_year_remodeled = (train_df["YearRemodAdd"] >= 2002) & (train_df["YearRemodAdd"] <= 2012)
is_from_clear_cr = train_df["Neighborhood"].astype(str) == "Edwards"
has_exter_qual_ta = train_df["ExterQual"].astype(str) == "TA"
has_exter_cond_ta = train_df["ExterCond"].astype(str) == "TA"

In [21]:
similar_houses = train_df[(within_5_years_of_year_built  | \
                            within_5_years_of_year_remodeled) & \
                            is_from_clear_cr & \
                            has_exter_qual_ta & \
                            has_exter_cond_ta]

In [22]:
similar_houses.shape

(29, 86)

In [23]:
counts = pd.DataFrame({
    'Exterior1st': similar_houses["Exterior1st"].value_counts(),
    'Exterior2nd': similar_houses["Exterior2nd"].value_counts()
}).fillna(0)
counts["Exterior1st"] = counts["Exterior1st"].astype("int32")
counts["Exterior2nd"] = counts["Exterior2nd"].astype("int32")
counts = counts.sort_values("Exterior1st", ascending=False)

In [24]:
print(counts)

         Exterior1st  Exterior2nd
VinylSd           12           12
MetalSd            7            7
Wd Sdng            4            4
WdShing            3            0
Stucco             1            1
Plywood            1            1
HdBoard            1            1
Wd Shng            0            3
Stone              0            0
Other              0            0
AsbShng            0            0
AsphShn            0            0
CmentBd            0            0
CemntBd            0            0
CBlock             0            0
BrkFace            0            0
BrkComm            0            0
Brk Cmn            0            0
ImStucc            0            0


In [25]:
similar_houses["Exterior1st"].mode()[0]

'VinylSd'

In [26]:
similar_houses["Exterior2nd"].value_counts()

Exterior2nd
VinylSd    12
MetalSd     7
Wd Sdng     4
Wd Shng     3
HdBoard     1
Plywood     1
Stucco      1
AsbShng     0
AsphShn     0
Brk Cmn     0
BrkFace     0
CBlock      0
CmentBd     0
ImStucc     0
Other       0
Stone       0
Name: count, dtype: int64

In [27]:
similar_houses["Exterior2nd"].mode()[0]

'VinylSd'