In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kruskal

from feature_engineering import engineer_features
from utils.load_data import load_data
from data_cleaning import clean_data
from utils.calculate_row_entropy import calculate_row_entropy

In [2]:
# Untouched copies of the CSV files, kept alongside the processed frames.
train_df_raw = pd.read_csv("data/train.csv")
test_df_raw = pd.read_csv("data/test.csv")

# Processed frames: project loader first, then feature engineering on its result.
train_df = engineer_features(load_data("data/train.csv"))
test_df = engineer_features(load_data("data/test.csv"))

In [3]:
# Lift every pandas display limit: no width restriction, no truncation of
# column content, and all columns / all rows shown.
for _display_option in (
    "display.width",
    "display.max_colwidth",
    "display.max_columns",
    "display.max_rows",
):
    pd.set_option(_display_option, None)

In [4]:
# train_df = clean_data(train_df)
# test_df = clean_data(test_df)

In [5]:
# Per-column missing-value counts for the test frame, keeping only the
# columns that actually contain missing values.
na_info = test_df.isna().sum().loc[lambda counts: counts > 0]
print("test df NA counts")
print(na_info)

test df NA counts
MSZoning         4
LotFrontage    227
Utilities        2
Exterior1st      1
Exterior2nd      1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64


In [6]:
# MSZoning

In [7]:
def categorize_lot_area(df, reference=None):
    """Bucket ``df["LotArea"]`` into five size categories.

    The bucket boundaries are the 20/40/60/80 % quantiles of the reference
    frame's ``LotArea`` column, so every frame passed in is cut at the same
    thresholds. A value equal to a boundary falls into the lower bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``LotArea`` column to categorize.
    reference : pandas.DataFrame, optional
        Frame whose ``LotArea`` values define the quintile boundaries.
        Defaults to the notebook-level ``train_df``.

    Returns
    -------
    pandas.Series
        Object-dtype Series aligned to ``df.index`` holding one of
        "Very Small", "Small", "Medium", "Large", "Very Large"; rows whose
        ``LotArea`` is missing are NaN.

    Raises
    ------
    ValueError
        If the reference frame has no non-missing ``LotArea`` values.
    """
    if reference is None:
        reference = train_df

    # Calculate quintile boundaries from the reference (training) data only.
    reference_area = reference["LotArea"].dropna()
    if reference_area.empty:
        raise ValueError("reference frame has no non-missing LotArea values")
    quintiles = np.quantile(reference_area, [0.2, 0.4, 0.6, 0.8])

    labels = np.array(
        ["Very Small", "Small", "Medium", "Large", "Very Large"], dtype=object
    )

    lot_area = df["LotArea"]
    # side="left" yields i with quintiles[i-1] < value <= quintiles[i], i.e. the
    # same "<= boundary goes to the lower bucket" rule as a chain of comparisons.
    bucket = np.searchsorted(quintiles, lot_area.astype(float).to_numpy(), side="left")

    categories = pd.Series(labels[bucket], index=df.index, dtype=object)
    # NaN sorts past the last boundary in searchsorted, so blank those rows out
    # explicitly instead of letting them pose as "Very Large". This also avoids
    # np.select's implicit integer default (0) being mixed with string labels.
    return categories.mask(lot_area.isna().to_numpy())

# Add the lot-size category column to the train frame first, then the test frame.
for _frame in (train_df, test_df):
    _frame["LotArea_Cat"] = categorize_lot_area(_frame)

In [8]:
# Test rows whose MSZoning is missing, shown with one column per house.
msz_nas = test_df.loc[test_df["MSZoning"].isna()]
msz_nas.T

Unnamed: 0,455,756,790,1444
Id,1916,2217,2251,2905
MSSubClass,30,20,70,20
MSZoning,,,,
LotFrontage,109.0,80.0,,125.0
LotArea,21780,14584,56600,31250
Street,Grvl,Pave,Pave,Pave
Alley,,,,
LotShape,Reg,Reg,IR1,Reg
LandContour,Lvl,Low,Low,Lvl
Utilities,,AllPub,AllPub,AllPub


In [9]:
# Does zoning depend on the neighbourhood? Count training houses per
# (Neighborhood, MSZoning) pair.
tab_nbhood_vs_mszoning = pd.crosstab(
    index=train_df["Neighborhood"],
    columns=train_df["MSZoning"],
)

In [10]:
# Score each neighbourhood's zoning counts with calculate_row_entropy.
# The score is computed from the count columns only: if this cell is re-run,
# the previously stored "row_entropy" column must not be fed back into the
# calculation, so it is dropped first (errors="ignore" covers the first run).
zoning_counts = tab_nbhood_vs_mszoning.drop(columns="row_entropy", errors="ignore")
tab_nbhood_vs_mszoning["row_entropy"] = zoning_counts.apply(calculate_row_entropy, axis=1)
print(tab_nbhood_vs_mszoning)

MSZoning      C (all)  FV  RH   RL  RM  row_entropy
Neighborhood                                       
Blmngtn             0   0   0   16   1     0.322757
Blueste             0   0   0    0   2    -0.000000
BrDale              0   0   0    0  16    -0.000000
BrkSide             0   0   0   28  30     0.999142
ClearCr             0   0   0   28   0    -0.000000
CollgCr             0   0   0  140  10     0.353359
Crawfor             0   0   2   46   3     0.557940
Edwards             0   0   2   90   8     0.541188
Gilbert             0   0   0   79   0    -0.000000
IDOTRR              9   0   0    0  28     0.800392
MeadowV             0   0   0    0  17    -0.000000
Mitchel             0   0   0   44   5     0.475432
NAmes               0   0   2  223   0     0.073334
NPkVill             0   0   0    9   0    -0.000000
NWAmes              0   0   0   73   0    -0.000000
NoRidge             0   0   0   41   0    -0.000000
NridgHt             0   0   0   76   1     0.100001
OldTown     

In [11]:
# Are some house types (MSSubClass) more common in specific zones?
pd.crosstab(
    index=train_df["MSSubClass"],
    columns=train_df["MSZoning"],
)

MSZoning,C (all),FV,RH,RL,RM
MSSubClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,2,13,3,508,10
30,2,0,1,33,33
40,0,0,0,2,2
45,0,0,1,4,7
50,4,0,1,88,51
60,0,25,0,273,1
70,1,0,3,30,26
75,0,0,0,6,10
80,0,0,0,58,0
85,0,0,0,20,0


In [12]:
# Training-house counts per (BldgType, MSZoning) pair.
pd.crosstab(
    index=train_df["BldgType"],
    columns=train_df["MSZoning"],
)

MSZoning,C (all),FV,RH,RL,RM
BldgType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1Fam,9,38,9,1025,139
2fmCon,1,0,2,16,12
Duplex,0,0,3,43,6
Twnhs,0,9,0,10,24
TwnhsE,0,18,2,57,37


In [13]:
# Checking whether MSZoning depends on various conditions

In [14]:
# Training-house counts per (Condition1, MSZoning) pair.
pd.crosstab(
    index=train_df["Condition1"],
    columns=train_df["MSZoning"],
)

MSZoning,C (all),FV,RH,RL,RM
Condition1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Artery,0,0,1,26,21
Feedr,2,0,3,65,11
Norm,8,64,12,997,179
PosA,0,0,0,8,0
PosN,0,0,0,19,0
RRAe,0,0,0,8,3
RRAn,0,1,0,23,2
RRNe,0,0,0,2,0
RRNn,0,0,0,3,2


In [15]:
# BldgType x MSZoning counts, with calculate_row_entropy applied to each row
# of counts and stored in an extra "row_entropy" column.
tab_bldgtype_vs_mszoning = pd.crosstab(
    index=train_df["BldgType"],
    columns=train_df["MSZoning"],
).assign(row_entropy=lambda counts: counts.apply(calculate_row_entropy, axis=1))
tab_bldgtype_vs_mszoning

MSZoning,C (all),FV,RH,RL,RM,row_entropy
BldgType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1Fam,9,38,9,1025,139,0.828521
2fmCon,1,0,2,16,12,1.437437
Duplex,0,0,3,43,6,0.823631
Twnhs,0,9,0,10,24,1.431202
TwnhsE,0,18,2,57,37,1.549704


In [16]:
features = ["Neighborhood", "BldgType", "LotArea_Cat", "MSSubClass", "Condition1"]

# For every feature value that occurs among the rows with missing MSZoning,
# take the training houses sharing that value and record the share of each
# MSZoning class among them. One column per "<feature>_<value>".
zoning_shares = {}
for feature in features:
    for v in msz_nas[feature].unique():
        matching_zoning = train_df.loc[train_df[feature] == v, "MSZoning"]
        zoning_shares[f"{feature}_{v}"] = matching_zoning.value_counts(
            normalize=True, sort=False
        )

# Build the frame in one go so its index is the union of all zoning classes.
# Inserting columns one by one into an empty frame would pin the index to the
# first column's classes and silently drop every other class from later columns.
summary = pd.DataFrame(zoning_shares)

In [17]:
# Show the table of MSZoning proportions: one column per "<feature>_<value>".
summary

Unnamed: 0_level_0,Neighborhood_IDOTRR,Neighborhood_Mitchel,BldgType_1Fam,LotArea_Cat_Very Large,MSSubClass_30,MSSubClass_20,MSSubClass_70,Condition1_Norm,Condition1_Artery
MSZoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C (all),0.243243,0.0,0.007377,0.003425,0.028986,0.003731,0.016667,0.006349,0.0
FV,0.0,0.0,0.031148,0.003425,0.0,0.024254,0.0,0.050794,0.0
RH,0.0,0.0,0.007377,0.0,0.014493,0.005597,0.05,0.009524,0.020833
RL,0.0,0.897959,0.840164,0.962329,0.478261,0.947761,0.5,0.79127,0.541667
RM,0.756757,0.102041,0.113934,0.030822,0.478261,0.018657,0.433333,0.142063,0.4375


In [18]:
# Start with an empty list of per-house results.
results = list()

In [19]:
# Column layout: the house Id, then for each feature its value, the most
# frequent MSZoning class among training houses with that value, and that
# class's share.
result_columns = ["Id"]
for feature in features:
    result_columns += [feature, f"{feature}_pred", f"{feature}_prob"]

# Collect plain records in a cell-local list and build the frame once. The cell
# no longer needs `results` to be an empty list beforehand, so it can be re-run.
prediction_records = []
for _, row in msz_nas.iterrows():
    record = {"Id": row["Id"]}
    for feature in features:
        zoning_shares_for_value = summary[f"{feature}_{row[feature]}"]
        record[feature] = row[feature]
        record[f"{feature}_pred"] = zoning_shares_for_value.idxmax()
        record[f"{feature}_prob"] = zoning_shares_for_value.max()
    prediction_records.append(record)

# Passing the columns explicitly keeps the layout even when there are no rows.
results = pd.DataFrame(prediction_records, columns=result_columns)

In [20]:
# Index the predictions by house Id. Guarded so that re-running this cell,
# after "Id" has already become the index, does not raise a KeyError.
if "Id" in results.columns:
    results = results.set_index("Id")

In [21]:
# Show, per house with missing MSZoning, each feature's value, the predicted
# zoning class derived from it, and that class's proportion.
results

Unnamed: 0_level_0,Neighborhood,Neighborhood_pred,Neighborhood_prob,BldgType,BldgType_pred,BldgType_prob,LotArea_Cat,LotArea_Cat_pred,LotArea_Cat_prob,MSSubClass,MSSubClass_pred,MSSubClass_prob,Condition1,Condition1_pred,Condition1_prob
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1916,IDOTRR,RM,0.756757,1Fam,RL,0.840164,Very Large,RL,0.962329,30,RL,0.478261,Norm,RL,0.79127
2217,IDOTRR,RM,0.756757,1Fam,RL,0.840164,Very Large,RL,0.962329,20,RL,0.947761,Norm,RL,0.79127
2251,IDOTRR,RM,0.756757,1Fam,RL,0.840164,Very Large,RL,0.962329,70,RL,0.5,Norm,RL,0.79127
2905,Mitchel,RL,0.897959,1Fam,RL,0.840164,Very Large,RL,0.962329,20,RL,0.947761,Artery,RL,0.541667


In [22]:
# I was unable to fit CatBoost, RF, or any other model because MSZoning is severely class-imbalanced.
# Instead, I calculated the conditional probabilities of MSZoning given each of the "Neighborhood", "BldgType", "LotArea_Cat", "MSSubClass" and "Condition1" features.
# I believe Neighborhood is the main predictor of zone type, so I will use the Neighborhood-based predictions for the missing data, except for Id=2217, where the MSSubClass-based probability of RL is ~95%.

In [23]:
# summary.to_string("summary.txt")