## Cleaning columns

In [1]:
import os, sys
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from dython.nominal import associations

from config_local import local_config

In [2]:
# Read both splits from the configured CSV paths, using the "Id" column as the index.
train, test = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in (local_config.TRAIN_CSV, local_config.TEST_CSV)
)

# Quick sanity check: report the shapes and preview the first training rows.
print(f"Train shape: {train.shape}  |  Test shape: {test.shape}")
display(train.head(3))

Train shape: (1460, 80)  |  Test shape: (1459, 79)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500


### Missing values

In [3]:
def fill_missing_with_none_or_zero(df):
    """Return a copy of ``df`` with every missing value filled in.

    Numeric columns have their missing entries replaced by ``0``. Every other
    column first has the literal strings ``"NA"`` and ``""`` converted to
    missing, and then all missing entries are replaced by ``"<None>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        The imputed copy, with the same columns and index as ``df``.
    """
    filled = df.copy()
    for name in filled.columns:
        values = filled[name]
        if not pd.api.types.is_numeric_dtype(values):
            # Treat textual missing codes as real missing values before filling.
            filled[name] = values.replace(["NA", ""], pd.NA).fillna("<None>")
            continue
        filled[name] = values.fillna(0)
    return filled


# Per-column count of missing entries in the training frame, restricted to
# columns that have at least one, ordered from most to fewest missing.
missing = (
    train.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
missing

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64

In [4]:
# Apply the same missing-value fill to both splits.
train_filled, test_filled = (
    fill_missing_with_none_or_zero(split) for split in (train, test)
)

In [7]:
def summarize_columns(df, max_unique=15):
    """Build a one-row-per-column overview of ``df``.

    Each column is classified and its distinct non-missing values are either
    listed or counted:

    * numeric dtype with at most ``max_unique`` distinct values ->
      ``"numeric (discrete)"`` and the sorted list of values;
    * numeric dtype with more distinct values -> ``"numeric (continuous)"``
      and the string ``"<n> unique values"``;
    * any other dtype -> ``"categorical"`` and either the list of values (in
      order of first appearance) or the string ``"<n> unique values"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarized.
    max_unique : int, default 15
        Largest number of distinct values that is still listed explicitly.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns ``"type"`` and
        ``"unique_values"``. Both columns are present even when ``df`` has no
        columns at all.
    """
    records = []
    for col in df.columns:
        uniques = df[col].dropna().unique()
        n_unique = len(uniques)
        is_numeric = pd.api.types.is_numeric_dtype(df[col])

        if not is_numeric:
            kind = "categorical"
        elif n_unique <= max_unique:
            kind = "numeric (discrete)"
        else:
            kind = "numeric (continuous)"

        if n_unique > max_unique:
            values = f"{n_unique} unique values"
        elif is_numeric:
            # .tolist() yields plain Python numbers, so the stored list prints
            # as [20, 30, ...] rather than as NumPy scalar reprs.
            values = sorted(uniques.tolist())
        else:
            values = uniques.tolist()

        records.append({"type": kind, "unique_values": values})

    # Passing the column labels explicitly keeps the schema stable for an
    # empty input frame.
    return pd.DataFrame(
        records,
        index=list(df.columns),
        columns=["type", "unique_values"],
    )

# Summarize every column of the imputed training frame and preview the first rows.
feature_summary = summarize_columns(train_filled)
display(feature_summary.head(20))

# Save the full summary in the directory that holds the raw training CSV.
# The index carries the column names, so it is written out as well.
feature_summary_path = Path(local_config.TRAIN_CSV).resolve().parent / "feature_summary.csv"
feature_summary.to_csv(feature_summary_path, index=True)

Unnamed: 0,type,unique_values
MSSubClass,numeric (discrete),"[20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 1..."
MSZoning,categorical,"[RL, RM, C (all), FV, RH]"
LotFrontage,numeric (continuous),111 unique values
LotArea,numeric (continuous),1073 unique values
Street,categorical,"[Pave, Grvl]"
Alley,categorical,"[<None>, Grvl, Pave]"
LotShape,categorical,"[Reg, IR1, IR2, IR3]"
LandContour,categorical,"[Lvl, Bnk, Low, HLS]"
Utilities,categorical,"[AllPub, NoSeWa]"
LotConfig,categorical,"[Inside, FR2, Corner, CulDSac, FR3]"


In [6]:

# Write the imputed splits into the directories of the configured output paths.
# NOTE(review): only the parent directory of each configured path is used; the
# file names below are hard-coded — confirm they match the configured names.
train_out_dir = Path(local_config.TRAIN_PROCESS1_CSV).resolve().parent
test_out_dir = Path(local_config.TEST_PROCESS1_CSV).resolve().parent

# NOTE(review): index=False drops the "Id" index that was set when the CSVs were
# read, so the saved files carry no Id column — confirm downstream readers do
# not need it before changing this.
train_filled.to_csv(train_out_dir / "train_process1.csv", index=False)
test_filled.to_csv(test_out_dir / "test_process1.csv", index=False)