In [1]:
import sys
import os

# Resolve the parent of the current working directory so that modules located
# there can be imported from this notebook.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard the insert: without it, every re-run of this cell would push another
# duplicate entry onto sys.path.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_consistency_check import check_data_consistency
from feature_engineering import engineer_features
from utils.load_data import load_data
from data_cleaning import clean_data
from utils.calculate_row_entropy import calculate_row_entropy

In [3]:
# Each split is read twice: once through load_data followed by
# engineer_features, and once as a plain pd.read_csv of the same file.
train_csv_path = "../data/train.csv"
test_csv_path = "../data/test.csv"

train_df = engineer_features(load_data(train_csv_path))
test_df = engineer_features(load_data(test_csv_path))

train_df_raw = pd.read_csv(train_csv_path)
test_df_raw = pd.read_csv(test_csv_path)

In [4]:
# Remove pandas' display limits (columns, rows, line width) so frames shown in
# this notebook are not truncated.
for display_option in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(display_option, None)

In [5]:
# Count missing values per column of the training frame and keep only the
# columns that contain at least one.
train_na_counts = train_df.isna().sum()
na_info = train_na_counts[train_na_counts > 0]
print("train df NA counts")
print(na_info)

train df NA counts
LotFrontage    259
Electrical       1
dtype: int64


In [6]:
# Count missing values per column of the test frame and keep only the columns
# that contain at least one.
test_na_counts = test_df.isna().sum()
na_info = test_na_counts[test_na_counts > 0]
print("test df NA counts")
print(na_info)

test df NA counts
MSZoning         4
LotFrontage    227
Utilities        2
Exterior1st      1
Exterior2nd      1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64


In [7]:
# Run the consistency checks on both frames in their current state, i.e. as
# produced by load_data and engineer_features (including whatever initial
# cleaning those perform).
train_df_check, test_df_check = (
    check_data_consistency(frame) for frame in (train_df, test_df)
)

In [8]:
# clean_data is currently disabled (both calls below stay commented out), so
# this repeats the consistency checks on the same, uncleaned frames.
# NOTE(review): later cells select test rows where "Functional" is missing;
# presumably clean_data would alter those rows — confirm before re-enabling.
# train_df = clean_data(train_df)
# test_df = clean_data(test_df)
train_df_check, test_df_check = (
    check_data_consistency(train_df),
    check_data_consistency(test_df),
)

In [9]:
# Test rows whose "Functional" value is missing.
functional_missing = test_df["Functional"].isna()
func_nas = test_df[functional_missing]

In [10]:
# Frequency of each "Functional" level in the training data.
functional_train = train_df["Functional"]
functional_train.value_counts()

Functional
Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Sal        0
Name: count, dtype: int64

In [11]:
# Show the rows with a missing "Functional" transposed, so each row becomes a
# column and its features can be read top to bottom. Leaving the frame as the
# cell's last expression lets the notebook render it as a table instead of the
# plain-text dump produced by print().
func_nas.transpose()

                                        756      1013
Id                                      2217     2474
MSSubClass                                20       50
MSZoning                                 NaN       RM
LotFrontage                             80.0     60.0
LotArea                                14584    10320
Street                                  Pave     Pave
Alley                                     NA     Grvl
LotShape                                 Reg      Reg
LandContour                              Low      Lvl
Utilities                             AllPub   AllPub
LotConfig                             Inside   Corner
LandSlope                                Mod      Gtl
Neighborhood                          IDOTRR   IDOTRR
Condition1                              Norm   Artery
Condition2                              Norm     Norm
BldgType                                1Fam     1Fam
HouseStyle                            1Story   1.5Fin
OverallQual                 

In [12]:
# Same "Functional" frequency table as shown a few cells earlier, repeated
# here next to the cross-tabulations that follow.
train_df.loc[:, "Functional"].value_counts()

Functional
Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Sal        0
Name: count, dtype: int64

In [13]:
# Cross-tabulate OverallCond (rows) against Functional (columns) on the
# training data, then append a "row_entropy" column obtained by applying
# calculate_row_entropy to each row of counts.
tab_functional_vs_overall_cond = pd.crosstab(
    train_df["OverallCond"], train_df["Functional"]
).assign(row_entropy=lambda counts: counts.apply(calculate_row_entropy, axis=1))
tab_functional_vs_overall_cond

Functional,Typ,Min1,Min2,Mod,Maj1,Maj2,Sev,row_entropy
OverallCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,0,0,0,1,0,0,-0.0
2,2,0,1,1,0,1,0,1.921928
3,17,1,1,3,2,1,0,1.594186
4,44,3,6,1,2,1,0,1.227984
5,785,13,10,4,7,1,1,0.353629
6,234,8,6,2,1,1,0,0.504363
7,190,4,7,4,0,0,0,0.489607
8,66,2,3,0,1,0,0,0.535413
9,22,0,0,0,0,0,0,-0.0


In [14]:
# Cross-tabulate OverallQual (rows) against Functional (columns) on the
# training data, then append a "row_entropy" column obtained by applying
# calculate_row_entropy to each row of counts.
tab_functional_vs_overall_qual = pd.crosstab(
    train_df["OverallQual"], train_df["Functional"]
).assign(row_entropy=lambda counts: counts.apply(calculate_row_entropy, axis=1))
tab_functional_vs_overall_qual

Functional,Typ,Min1,Min2,Mod,Maj1,Maj2,Sev,row_entropy
OverallQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,0,0,0,1,0,0,1.0
2,2,0,0,0,1,0,0,0.918296
3,16,1,1,1,0,1,0,1.121928
4,104,2,8,2,0,0,0,0.609311
5,347,19,19,6,4,2,0,0.786164
6,355,6,4,3,3,2,1,0.411963
7,309,3,1,2,4,0,0,0.25899
8,165,0,1,1,1,0,0,0.157537
9,43,0,0,0,0,0,0,-0.0
10,18,0,0,0,0,0,0,-0.0


In [15]:
# Cross-tabulate SaleCondition (rows) against Functional (columns) on the
# training data, then append a "row_entropy" column obtained by applying
# calculate_row_entropy to each row of counts.
tab_functional_vs_sale_condition = pd.crosstab(
    train_df["SaleCondition"], train_df["Functional"]
).assign(row_entropy=lambda counts: counts.apply(calculate_row_entropy, axis=1))
tab_functional_vs_sale_condition

Functional,Typ,Min1,Min2,Mod,Maj1,Maj2,Sev,row_entropy
SaleCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abnorml,93,3,2,1,0,1,1,0.570126
AdjLand,4,0,0,0,0,0,0,-0.0
Alloca,11,0,0,0,1,0,0,0.413817
Family,19,1,0,0,0,0,0,0.286397
Normal,1108,27,32,14,13,4,0,0.540421
Partial,125,0,0,0,0,0,0,-0.0


In [16]:
# Features whose values on the rows with a missing "Functional" are used to
# look up the distribution of Functional in the training data.
features = ["OverallCond", "OverallQual", "SaleCondition"]

# One entry per (feature, value) pair occurring on the rows with a missing
# Functional. Each entry holds the normalized Functional frequencies among the
# training rows that share that feature value. `.loc` with a boolean mask
# replaces the chained `df[mask]["col"]` lookup.
functional_shares = {
    f"{feature}_{v}": train_df.loc[train_df[feature] == v, "Functional"].value_counts(
        normalize=True, sort=False
    )
    for feature in features
    for v in func_nas[feature].unique()
}

# Build the frame in a single step: the constructor aligns the Series on the
# union of their labels. Assigning the columns one by one to an initially empty
# frame pins the index to the first column's labels, so any label that only
# appears in a later column would be silently dropped.
summary = pd.DataFrame(functional_shares)

In [17]:
# Write the plain-text rendering to disk first, then leave `summary` as the
# final expression so the table is actually displayed. A bare name followed by
# another statement shows nothing, and to_string() returns None when given a
# target path.
summary.to_string("summary.txt")
summary

In [18]:
# Start from an empty list of results.
results = list()

In [19]:
# For every row with a missing "Functional", look up — per feature — the most
# frequent Functional level among the training rows sharing that feature value
# (the idxmax of the matching `summary` column) together with its share.
#
# The records are gathered in a list that is local to this cell. Rebinding
# `results` from a list to a DataFrame inside the same cell meant the cell
# could not be executed a second time without first re-running the cell that
# resets `results`.
prediction_records = []
for _, row in func_nas.iterrows():
    record = {"Id": row["Id"]}
    for feature in features:
        shares = summary[f"{feature}_{row[feature]}"]
        record[feature] = row[feature]
        record[f"{feature}_pred"] = shares.idxmax()
        record[f"{feature}_prob"] = shares.max()
    prediction_records.append(record)

# Fix the column layout up front so that an empty func_nas still yields a frame
# with the expected columns (pd.concat on an empty list raises instead).
result_columns = ["Id"] + [
    column
    for feature in features
    for column in (feature, f"{feature}_pred", f"{feature}_prob")
]

# Construct the frame once from the records rather than concatenating one
# single-row DataFrame per input row.
results = pd.DataFrame(prediction_records, columns=result_columns)

In [20]:
# Index the results by "Id". The guard keeps the cell re-runnable: once "Id"
# has become the index it is no longer a column, and an unconditional
# set_index("Id") would raise KeyError.
if "Id" in results.columns:
    results = results.set_index("Id")

In [21]:
# Display the per-Id lookup table: for each feature, the row's value, the most
# frequent Functional level for that value, and that level's share.
results

Unnamed: 0_level_0,OverallCond,OverallCond_pred,OverallCond_prob,OverallQual,OverallQual_pred,OverallQual_prob,SaleCondition,SaleCondition_pred,SaleCondition_prob
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2217,5,Typ,0.956151,1,Typ,0.5,Abnorml,Typ,0.920792
2474,1,Maj1,1.0,4,Typ,0.896552,Abnorml,Typ,0.920792


In [22]:
# So, my best guess is Maj1 for 2474: with OverallCond 1 it is unlikely to have typical functionality
# (note that this rests on the single training row with OverallCond 1).
# And Typ for 2217, since OverallCond is 5 out of 10, and the secondary features suggest Typ as well.