In [1]:
import sys
import os

# Resolve the parent of the current working directory so that modules located
# there can be imported by the cells below.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Prepend it to sys.path only if it is not there yet: re-running this cell
# must not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from data_consistency_check import check_data_consistency
from feature_engineering import engineer_features
from utils.load_data import load_data

In [4]:
# Locations of the two input CSV files, relative to the notebook directory.
TRAIN_CSV = "../data/train.csv"
TEST_CSV = "../data/test.csv"

# Frames read through the project loader and then passed through feature
# engineering; these are the frames the analysis below works on.
train_df = engineer_features(load_data(TRAIN_CSV))
test_df = engineer_features(load_data(TEST_CSV))

# The same files read directly with pandas, kept for reference lookups.
train_df_raw = pd.read_csv(TRAIN_CSV)
test_df_raw = pd.read_csv(TEST_CSV)

In [5]:
# Lift pandas' display limits (columns, rows, line width) so that wide,
# transposed rows are rendered without truncation.
for display_option in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(display_option, None)

In [6]:
# Checking data consistency after initial cleaning: run the checker on the
# train frame first, then on the test frame. The returned frames are where
# the cells below read the consistency flags from.
train_df_check, test_df_check = (
    check_data_consistency(frame) for frame in (train_df, test_df)
)

In [7]:
# Checking basement data consistency on the training frame: for each flag,
# print how many rows have it set to False.
# Recorded results, in order: 1, 1, 0.
#
# The single row with basement_features_consistent == False:
# Id                                       949
# BsmtQual                                  Gd
# BsmtCond                                  TA
# BsmtExposure                              NA
# BsmtFinType1                             Unf
# BsmtFinSF1                                 0
# BsmtFinType2                             Unf
# BsmtFinSF2                                 0
# BsmtUnfSF                                936
# TotalBsmtSF                              936
for check_name in (
    "basement_features_consistent",
    "has_consistent_second_finished_area",
    "basement_areas_match",
):
    print(sum(~train_df_check[check_name]))

1
1
0


In [8]:
# Same counts for the test frame: number of rows where each flag is False.
# Recorded output of this cell, in order: 7, 0, 0.
# NOTE(review): an earlier inline note expected 2 for
# has_consistent_second_finished_area, but the recorded run shows 0;
# re-verify after any change to the cleaning pipeline.
for check_name in (
    "basement_features_consistent",
    "has_consistent_second_finished_area",
    "basement_areas_match",
):
    print(sum(~test_df_check[check_name]))

7
0
0


In [9]:
# Dealing with basement second finished area: keep only the training rows
# whose has_consistent_second_finished_area flag is False.
second_area_ok = train_df_check["has_consistent_second_finished_area"]
train_df_bfa2 = train_df_check.loc[~second_area_ok]

# BsmtFinType2 breakdown of those rows.
# Recorded result: NA -> 1; GLQ, ALQ, BLQ, Rec, LwQ and Unf -> 0.
bsmt_fin_type2_counts = train_df_bfa2["BsmtFinType2"].value_counts()
print(bsmt_fin_type2_counts)

BsmtFinType2
NA     1
GLQ    0
ALQ    0
BLQ    0
Rec    0
LwQ    0
Unf    0
Name: count, dtype: int64


In [10]:
# Show the flagged row(s) whose BsmtFinType2 reads "NA", one field per line.
# Basement fields of the recorded row:
# Id                                       333
# BsmtQual                                  Gd
# BsmtCond                                  TA
# BsmtExposure                              No
# BsmtFinType1                             GLQ
# BsmtFinSF1                              1124
# BsmtFinType2                              NA <- have to do something about it
# BsmtFinSF2                               479
# BsmtUnfSF                               1603
# TotalBsmtSF                             3206
fin_type2_is_na = train_df_bfa2["BsmtFinType2"].astype(str) == "NA"
train_df_bfa2.loc[fin_type2_is_na].T

Unnamed: 0,332
Id,333
MSSubClass,20
MSZoning,RL
LotFrontage,85.0
LotArea,10655
Street,Pave
Alley,
LotShape,IR1
LandContour,Lvl
Utilities,AllPub


In [11]:
# Among the flagged rows, count those with BsmtFinType2 "Unf" and a zero
# second finished area.
fin_type2_is_unf = train_df_bfa2["BsmtFinType2"].astype(str) == "Unf"
second_area_is_zero = train_df_bfa2["BsmtFinSF2"] == 0
sum(fin_type2_is_unf & second_area_is_zero)

0

In [12]:
# Dealing with basement feature inconsistencies

In [13]:
# Rows with basement_features_consistent == False: train first, then test.
for checked_frame in (train_df_check, test_df_check):
    print(sum(~checked_frame["basement_features_consistent"]))

1
7


In [14]:
# Test df first: list the rows with basement_features_consistent == False,
# restricted to the overall and basement columns, one field per line.
basement_overview_cols = [
    "OverallQual",
    "OverallCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinSF1",
    "BsmtFinType2",
    "BsmtFinSF2",
    "TotalBsmtSF",
]
features_inconsistent = ~test_df_check["basement_features_consistent"]
test_df_check.loc[features_inconsistent, basement_overview_cols].T

Unnamed: 0,27,580,725,757,758,888,1064
OverallQual,8,8,6,4,4,5,5
OverallCond,5,9,6,7,7,5,7
BsmtQual,Gd,Gd,TA,,,Gd,TA
BsmtCond,TA,,,Fa,TA,TA,
BsmtExposure,,Mn,No,No,No,,Av
BsmtFinType1,Unf,GLQ,BLQ,Unf,Unf,Unf,ALQ
BsmtFinSF1,0.0,1044.0,1033.0,0.0,0.0,0.0,755.0
BsmtFinType2,Unf,Rec,Unf,Unf,Unf,Unf,Unf
BsmtFinSF2,0.0,382.0,0.0,0.0,0.0,0.0,0.0
TotalBsmtSF,1595.0,1426.0,1127.0,173.0,356.0,725.0,995.0


In [None]:
# Dealing with BsmtExposure: frequency of each exposure level in the
# training frame.
bsmt_exposure_counts = train_df["BsmtExposure"].value_counts()
bsmt_exposure_counts

In [None]:
# BsmtExposure counts within each BsmtQual level of the training frame.
# The table is built once and then printed.
qual_vs_exposure = pd.crosstab(train_df["BsmtQual"], train_df["BsmtExposure"])
print(qual_vs_exposure)

In [None]:
# BsmtExposure counts within each BsmtFinType1 level of the training frame.
# The table is built once and then printed.
fin_type1_vs_exposure = pd.crosstab(train_df["BsmtFinType1"], train_df["BsmtExposure"])
print(fin_type1_vs_exposure)

In [None]:
# Based on the output above, a missing BsmtExposure is most likely No

In [None]:
# Dealing with BsmtCond: its counts within each BsmtQual level.
qual_vs_cond = pd.crosstab(index=train_df["BsmtQual"], columns=train_df["BsmtCond"])
print(qual_vs_cond)

In [None]:
# BsmtCond counts within each BsmtFinType1 level.
fin_type1_vs_cond = pd.crosstab(index=train_df["BsmtFinType1"], columns=train_df["BsmtCond"])
print(fin_type1_vs_cond)

In [None]:
# BsmtCond counts within each OverallCond score.
overall_cond_vs_bsmt_cond = pd.crosstab(index=train_df["OverallCond"], columns=train_df["BsmtCond"])
print(overall_cond_vs_bsmt_cond)

In [None]:
# Basement condition is likely to be average

In [None]:
# Dealing with BsmtQual: its counts within each BsmtFinType1 level.
fin_type1_vs_qual = pd.crosstab(index=train_df["BsmtFinType1"], columns=train_df["BsmtQual"])
print(fin_type1_vs_qual)

In [None]:
# Table above suggests that if BsmtFinType1 is GLQ then BsmtQual is likely to be Gd, otherwise it's TA

In [None]:
# BsmtQual counts within each OverallQual score.
overall_qual_vs_bsmt_qual = pd.crosstab(index=train_df["OverallQual"], columns=train_df["BsmtQual"])
print(overall_qual_vs_bsmt_qual)

In [None]:
# Table above suggests that if OverallQual < 7, then BsmtQual is likely to be TA, if 7 <= OverallQual < 9,
# then BsmtQual is likely to be Gd. For OverallQual 9 and 10 it's likely to be Ex  

In [None]:
# Dealing with second finished area: number of rows whose
# has_consistent_second_finished_area flag is False, train first, then test.
for checked_frame in (train_df_check, test_df_check):
    print(sum(~checked_frame["has_consistent_second_finished_area"]))

In [None]:
# Training rows whose has_consistent_second_finished_area flag (read from
# train_df_check) is False, restricted to the Id, overall and basement
# columns and shown one field per line.
second_area_cols = [
    "Id",
    "OverallQual",
    "OverallCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinSF1",
    "BsmtFinType2",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
]
second_area_inconsistent = ~train_df_check["has_consistent_second_finished_area"]
train_df.loc[second_area_inconsistent, second_area_cols].T

In [None]:
# Frequency of each BsmtFinType2 level in the training frame.
fin_type2_counts = train_df["BsmtFinType2"].value_counts()
fin_type2_counts

In [None]:
# BsmtFinType1 counts within each BsmtFinType2 level.
fin_type2_vs_fin_type1 = pd.crosstab(index=train_df["BsmtFinType2"], columns=train_df["BsmtFinType1"])
fin_type2_vs_fin_type1

In [None]:
# BsmtQual counts within each BsmtFinType2 level.
fin_type2_vs_qual = pd.crosstab(index=train_df["BsmtFinType2"], columns=train_df["BsmtQual"])
fin_type2_vs_qual

In [None]:
# This observation most likely has BsmtFinType2 = Unf

In [None]:
# Test rows whose has_consistent_second_finished_area flag is False,
# restricted to the Id, overall and basement columns, one field per line.
# The flag is read from test_df_check (the frame returned by
# check_data_consistency), exactly as the training-set cell reads it from
# train_df_check; test_df itself is not shown to carry the flag columns.
second_area_cols = [
    "Id",
    "OverallQual",
    "OverallCond",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinSF1",
    "BsmtFinType2",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
]
second_area_inconsistent = ~test_df_check["has_consistent_second_finished_area"]
print(test_df.loc[second_area_inconsistent, second_area_cols].transpose())

In [None]:
# Count training rows with BsmtFinType2 "BLQ" whose BsmtFinSF2 is below 1.
fin_type2_is_blq = train_df["BsmtFinType2"].astype(str) == "BLQ"
second_area_below_one = train_df["BsmtFinSF2"] < 1
sum(fin_type2_is_blq & second_area_below_one)

In [None]:
# Inspect the raw test record with Id 1471, one field per line.
# The mask is built from test_df_raw itself, so the lookup does not depend on
# test_df and test_df_raw sharing the same row index.
test_df_raw[test_df_raw["Id"] == 1471].transpose()