In [52]:
import os
import sys
from pathlib import Path

# Locate the project root so the local packages imported below (data_cleaning, utils, ...)
# can be resolved. Walk from the working directory upwards and take the first directory
# that contains the `data_cleaning` package; this covers both a notebook stored in the
# project root and one stored in a subdirectory. If no such directory is found, fall
# back to the parent of the working directory.
cwd = Path.cwd().resolve()
project_root = str(
    next(
        (candidate for candidate in (cwd, *cwd.parents) if (candidate / "data_cleaning").is_dir()),
        cwd.parent,
    )
)

# Add project root to path (only once, so re-running the cell does not duplicate the entry)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from data_cleaning.data_consistency_check import check_data_consistency
from feature_engineering import engineer_features
from utils.load_data import load_data
from data_cleaning.data_cleaning import clean_data

In [54]:
# Raw CSV contents, read directly with pandas (train first, then test).
train_df_raw, test_df_raw = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

# Frames used for the analysis: loaded through the project loader, then passed
# through feature engineering.
train_df = engineer_features(load_data("data/train.csv"))
test_df = engineer_features(load_data("data/test.csv"))

In [55]:
# Lift pandas' display limits (columns, rows, width) by setting each option to None.
for display_option in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(display_option, None)

In [56]:
# Print the (rows, columns) shape of the train frame, then of the test frame.
# NOTE(review): the earlier note here read "(1460, 81)" — presumably the train shape
# before feature engineering; confirm against the raw CSV.
for frame in (train_df, test_df):
    print(frame.shape)

(1460, 84)
(1459, 83)


In [57]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 84 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Id                    1460 non-null   int32   
 1   MSSubClass            1460 non-null   category
 2   MSZoning              1460 non-null   category
 3   LotFrontage           1201 non-null   float32 
 4   LotArea               1460 non-null   int32   
 5   Street                1460 non-null   category
 6   Alley                 1460 non-null   category
 7   LotShape              1460 non-null   category
 8   LandContour           1460 non-null   category
 9   Utilities             1460 non-null   category
 10  LotConfig             1460 non-null   category
 11  LandSlope             1460 non-null   category
 12  Neighborhood          1460 non-null   category
 13  Condition1            1460 non-null   category
 14  Condition2            1460 non-null   category
 15  Bldg

In [58]:
# Same display settings as cell In [55], applied again: no limit on shown
# columns, rows, or output width.
unlimited_display = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.width": None,
}
for option_name, option_value in unlimited_display.items():
    pd.set_option(option_name, option_value)

In [59]:
# Missing values per column in the train frame, keeping only columns with at least one.
train_na_counts = train_df.isna().sum()
na_info = train_na_counts[train_na_counts > 0]
print("train df NA counts", na_info, sep="\n")

train df NA counts
LotFrontage    259
Electrical       1
dtype: int64


In [60]:
# Missing values per column in the test frame, keeping only columns with at least one.
na_info = test_df.isna().sum().loc[lambda counts: counts > 0]
print("test df NA counts")
print(na_info)

test df NA counts
MSZoning         4
LotFrontage    227
Utilities        2
Exterior1st      1
Exterior2nd      1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64


In [61]:
# Run the consistency checks on both frames after the initial cleaning
# (train first, then test).
train_df_check, test_df_check = (
    check_data_consistency(frame) for frame in (train_df, test_df)
)

In [62]:
# Cleaning decisions recorded by the author for this step:
#   * houses with a too small MasVnrArea (< 10 sf) get MasVnrType set to None;
#   * houses with MasVnrType None but a normal MasVnrArea get MasVnrType imputed as BrkFace.
# NOTE(review): presumably implemented inside clean_data — confirm there.
train_df = clean_data(train_df)
test_df = clean_data(test_df)

# Re-run the consistency checks on the cleaned frames.
train_df_check, test_df_check = (
    check_data_consistency(frame) for frame in (train_df, test_df)
)

In [63]:
# Number of rows failing the "garage_type_consistent" check (train, then test).
for check_frame in (train_df_check, test_df_check):
    print(sum(~check_frame["garage_type_consistent"]))

0
0


In [64]:
# Number of rows failing the "garage_features_consistent" check (train, then test).
train_failures = ~train_df_check["garage_features_consistent"]
test_failures = ~test_df_check["garage_features_consistent"]
print(sum(train_failures))
print(sum(test_failures))

0
0


In [65]:
# Number of rows failing the "garage_area_reasonable" check (train, then test).
# Author's note on the non-zero counts: that's fine.
for check_frame in (train_df_check, test_df_check):
    print(sum(~check_frame["garage_area_reasonable"]))

55
51


In [66]:
# Number of rows failing the "basement_features_consistent" check (train, then test).
train_failures = ~train_df_check["basement_features_consistent"]
test_failures = ~test_df_check["basement_features_consistent"]
print(sum(train_failures))
print(sum(test_failures))

0
0


In [67]:
# Number of rows failing the "has_consistent_second_finished_area" check (train, then test).
for check_frame in (train_df_check, test_df_check):
    print(sum(~check_frame["has_consistent_second_finished_area"]))

0
0


In [68]:
# Number of rows failing the "basement_areas_match" check (train, then test).
train_failures = ~train_df_check["basement_areas_match"]
test_failures = ~test_df_check["basement_areas_match"]
print(sum(train_failures))
print(sum(test_failures))

0
0


In [69]:
# Number of rows failing the "mas_vnr_consistent" check (train, then test).
for check_frame in (train_df_check, test_df_check):
    print(sum(~check_frame["mas_vnr_consistent"]))

0
0


In [70]:
# Re-count missing values per column in the train frame after clean_data,
# keeping only columns that still contain at least one.
missing_per_column = train_df.isna().sum()
has_missing = missing_per_column > 0
na_info = missing_per_column[has_missing]
print("train df NA counts")
print(na_info)

train df NA counts
Series([], dtype: int64)


In [71]:
# Re-count missing values per column in the test frame after clean_data,
# keeping only columns that still contain at least one.
na_info = test_df.isna().sum().pipe(lambda counts: counts[counts > 0])
print("test df NA counts", na_info, sep="\n")

test df NA counts
Series([], dtype: int64)


In [72]:
# Shape (rows, columns) of the test frame after clean_data has been applied.
test_df.shape

(1459, 90)

In [73]:
# Numeric predictors only: keep the int32/float32 columns, then drop the
# has_garage / has_basement columns, the target (SalePrice) and the row identifier (Id).
numeric_df = (
    train_df
    .select_dtypes(include=["int32", "float32"])
    .drop(columns=["has_garage", "has_basement", "SalePrice", "Id"])
)

In [74]:
# Column-by-column summary (non-null counts and dtypes) of the numeric predictors.
numeric_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotFrontage    1460 non-null   float32
 1   LotArea        1460 non-null   int32  
 2   YearBuilt      1460 non-null   int32  
 3   YearRemodAdd   1460 non-null   int32  
 4   MasVnrArea     1460 non-null   float32
 5   BsmtFinSF1     1460 non-null   int32  
 6   BsmtFinSF2     1460 non-null   int32  
 7   BsmtUnfSF      1460 non-null   int32  
 8   TotalBsmtSF    1460 non-null   int32  
 9   1stFlrSF       1460 non-null   int32  
 10  2ndFlrSF       1460 non-null   int32  
 11  LowQualFinSF   1460 non-null   int32  
 12  GrLivArea      1460 non-null   int32  
 13  BsmtFullBath   1460 non-null   int32  
 14  BsmtHalfBath   1460 non-null   int32  
 15  FullBath       1460 non-null   int32  
 16  HalfBath       1460 non-null   int32  
 17  BedroomAbvGr   1460 non-null   int32  
 18  KitchenA

In [75]:
# Correlation of every numeric predictor with SalePrice (corrwith's default method),
# listed from the largest coefficient down to the smallest.
sale_price = train_df["SalePrice"]
numeric_df.corrwith(sale_price).sort_values(ascending=False)

GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
GarageYrBlt      0.508043
YearRemodAdd     0.507101
MasVnrArea       0.472615
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.340564
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
3SsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
LowQualFinSF    -0.025606
YrSold          -0.028923
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
dtype: float64

In [76]:
# Kruskal-Wallis H-test of SalePrice across the levels of every categorical column,
# with epsilon-squared (H / (n - 1)) as the effect size. The outcome is `kw_results`:
# one row per categorical column with columns 'epsilon_sqrd' and 'p', sorted by
# effect size in descending order.
kw_stats = []
kw_p_values = []
cat_vars = train_df.select_dtypes(include=["category"]).columns

for var in cat_vars:
    # SalePrice values per category level; observed=True skips levels that never
    # occur in the data, so no empty group is passed to the test.
    groups = [prices for _, prices in train_df.groupby(var, observed=True)["SalePrice"]]

    if len(groups) < 2:
        # stats.kruskal requires at least two groups and raises otherwise; record NaN
        # for this column instead of aborting the whole loop.
        kw_stats.append(float("nan"))
        kw_p_values.append(float("nan"))
        continue

    # Perform Kruskal-Wallis H-test
    stat, p = stats.kruskal(*groups)

    # Calculate epsilon-squared. n is the number of observations that actually entered
    # the test; it equals the number of rows in train_df when the column has no
    # missing values.
    n = sum(len(prices) for prices in groups)
    epsilon_sqrd = stat / (n - 1)

    kw_stats.append(epsilon_sqrd)
    kw_p_values.append(p)

kw_results = pd.DataFrame({
    'epsilon_sqrd': kw_stats,
    'p': kw_p_values
}, index=cat_vars)

# Sort by effect size (columns recorded as NaN end up last)
kw_results = kw_results.sort_values('epsilon_sqrd', ascending=False)

In [77]:
# Display the Kruskal-Wallis results table (already sorted by epsilon_sqrd, descending).
kw_results

Unnamed: 0,epsilon_sqrd,p
OverallQual,0.662168,3.506973e-202
Neighborhood,0.595293,6.696799e-168
ExterQual,0.46843,8.180797999999999e-148
BsmtQual,0.462242,1.209826e-144
KitchenQual,0.45338,4.720112999999999e-143
GarageFinish,0.417963,7.53425e-132
GarageType,0.36511,7.581344e-112
MSSubClass,0.364315,1.89686e-104
Foundation,0.334971,2.169096e-103
FireplaceQu,0.316586,1.331155e-97
