In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_consistency_check import check_data_consistency
from feature_engineering import engineer_features
from utils.load_data import load_data
from data_cleaning import clean_data

In [2]:
# Plain pandas reads of both CSV files, kept under the *_raw names.
train_df_raw = pd.read_csv("data/train.csv")
test_df_raw = pd.read_csv("data/test.csv")

# Working frames: each CSV goes through the project loader first, and the
# loader's result is handed straight to engineer_features.
train_df = engineer_features(load_data("data/train.csv"))
test_df = engineer_features(load_data("data/test.csv"))

In [3]:
# (rows, columns) of the working train and test frames, one per line.
# NOTE(review): the earlier note here read "(1460, 81)" — presumably the shape of
# the raw train.csv before load_data/engineer_features; confirm via train_df_raw.shape.
print(train_df.shape, test_df.shape, sep="\n")

(1460, 84)
(1459, 83)


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line after the summary.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 84 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Id                    1460 non-null   int32   
 1   MSSubClass            1460 non-null   category
 2   MSZoning              1460 non-null   category
 3   LotFrontage           1201 non-null   float32 
 4   LotArea               1460 non-null   int32   
 5   Street                1460 non-null   category
 6   Alley                 1460 non-null   category
 7   LotShape              1460 non-null   category
 8   LandContour           1460 non-null   category
 9   Utilities             1460 non-null   category
 10  LotConfig             1460 non-null   category
 11  LandSlope             1460 non-null   category
 12  Neighborhood          1460 non-null   category
 13  Condition1            1460 non-null   category
 14  Condition2            1460 non-null   category
 15  Bldg

In [5]:
# Lift the pandas display limits: None removes the cap on shown columns and rows,
# and a width of None lets pandas pick the output width on its own.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None

In [6]:
# Missing-value count per column of train_df, restricted to the columns
# that contain at least one NA.
na_info = train_df.isna().sum().loc[lambda counts: counts > 0]
print("train df NA counts", na_info, sep="\n")

train df NA counts
LotFrontage    259
Electrical       1
dtype: int64


In [7]:
# Missing-value count per column of test_df, restricted to the columns
# that contain at least one NA.
na_info = test_df.isna().sum().loc[lambda counts: counts > 0]
print("test df NA counts", na_info, sep="\n")

test df NA counts
MSZoning         4
LotFrontage    227
Utilities        2
Exterior1st      1
Exterior2nd      1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64


In [8]:
# Consistency report for both frames as they stand after loading and feature
# engineering (what the original note calls "initial cleaning"); clean_data is
# only applied in a later cell. Train is checked first, then test.
train_df_check, test_df_check = map(check_data_consistency, (train_df, test_df))

In [9]:
# Decision: set MasVnrType to None for houses whose MasVnrArea is too small (< 10 sf),
# and impute MasVnrType as BrkFace for houses with a None MasVnrType but a normal MasVnrArea.
# NOTE(review): presumably clean_data implements these rules — its body is not visible
# from this notebook; confirm against data_cleaning.
# Both frames are rebound to the cleaned result, so re-running this cell passes the
# already-cleaned frames to clean_data again.
train_df = clean_data(train_df)
test_df = clean_data(test_df)
# Recompute the consistency reports on the cleaned frames (replaces the earlier reports).
train_df_check = check_data_consistency(train_df)
test_df_check = check_data_consistency(test_df)

In [10]:
# Count the entries whose "garage_type_consistent" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["garage_type_consistent"]).sum())
print((~test_df_check["garage_type_consistent"]).sum())

0
0


In [11]:
# Count the entries whose "garage_features_consistent" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["garage_features_consistent"]).sum())
print((~test_df_check["garage_features_consistent"]).sum())

0
0


In [12]:
# Count the entries whose "garage_area_reasonable" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["garage_area_reasonable"]).sum())
print((~test_df_check["garage_area_reasonable"]).sum())
# That's fine: the non-zero counts for this check are tolerated as they are.

55
51


In [13]:
# Count the entries whose "basement_features_consistent" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["basement_features_consistent"]).sum())
print((~test_df_check["basement_features_consistent"]).sum())

0
0


In [14]:
# Count the entries whose "has_consistent_second_finished_area" flag is False
# (train, then test). The inverted column is reduced with its vectorized .sum()
# instead of Python's built-in sum(), which walks the column one element at a time.
print((~train_df_check["has_consistent_second_finished_area"]).sum())
print((~test_df_check["has_consistent_second_finished_area"]).sum())

0
0


In [15]:
# Count the entries whose "basement_areas_match" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["basement_areas_match"]).sum())
print((~test_df_check["basement_areas_match"]).sum())

0
0


In [16]:
# Count the entries whose "mas_vnr_consistent" flag is False (train, then test).
# The inverted column is reduced with its vectorized .sum() instead of Python's
# built-in sum(), which walks the column one element at a time.
print((~train_df_check["mas_vnr_consistent"]).sum())
print((~test_df_check["mas_vnr_consistent"]).sum())

0
0


In [17]:
# Missing-value count per column of the current train_df, restricted to the
# columns that still contain at least one NA.
na_info = train_df.isna().sum().loc[lambda counts: counts > 0]
print("train df NA counts", na_info, sep="\n")

train df NA counts
LotFrontage    259
dtype: int64


In [18]:
# Missing-value count per column of the current test_df, restricted to the
# columns that still contain at least one NA.
na_info = test_df.isna().sum().loc[lambda counts: counts > 0]
print("test df NA counts", na_info, sep="\n")

test df NA counts
LotFrontage    227
KitchenQual      1
SaleType         1
dtype: int64
