In [171]:
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import sklearn
import os, pathlib, shutil
import kaggle, kagglehub


Download the competition


In [172]:
competition_name = "home-data-for-ml-course"  # Change this to the competition you want to download

# Download the competition; the returned location is wrapped in a Path so it
# can be joined with file names below.
competition_path = pathlib.Path(kagglehub.competition.competition_download(competition_name))

# Names of the entries found in the downloaded competition directory.
competition_files: list = os.listdir(competition_path)
print(competition_files)

# Copy the files this notebook works with into the current directory,
# keeping their original names.
copied_destinations = [
    shutil.copyfile(competition_path / file_name, file_name)
    for file_name in (
        "sample_submission.csv",
        "test.csv",
        "train.csv",
        "data_description.txt",
    )
]
# Trailing expression: the destination of the last copy is the cell's displayed value.
copied_destinations[-1]

['data_description.txt', 'sample_submission.csv', 'sample_submission.csv.gz', 'test.csv', 'test.csv.gz', 'train.csv', 'train.csv.gz']


'data_description.txt'

In [173]:
# Show up to 20 rows when a polars table is printed.
pl.Config.set_tbl_rows(20)

# Read the raw competition files with polars' inferred schema.
# ignore_errors=True keeps reading when a value cannot be parsed as the
# inferred dtype instead of raising.
train_df = pl.read_csv(competition_path / "train.csv", ignore_errors=True)
test_df = pl.read_csv(competition_path / "test.csv", ignore_errors=True)
sample_submission_df = pl.read_csv(competition_path / "sample_submission.csv")

# The target is the one column that exists in train but not in test.
# Fail loudly if that assumption does not hold, rather than letting
# set.pop() pick an arbitrary column.
target_candidates = set(train_df.columns) - set(test_df.columns)
if len(target_candidates) != 1:
    raise ValueError(
        f"Expected exactly one target column in train that is missing from test, "
        f"found: {sorted(target_candidates)}"
    )
target_feature_name: str = target_candidates.pop()
target_col = train_df.get_column(target_feature_name)

# Sanity check: apart from the target, train and test must share one schema.
print(train_df.drop(target_feature_name).schema == test_df.schema)

# Stack train and test. Test rows have no known target, so they get a typed
# null (not a numeric sentinel such as -1, which would look like a real value
# in any aggregate computed over `df`).
df = pl.concat(
    [
        train_df,
        test_df.with_columns(
            pl.lit(None, dtype=train_df.schema[target_feature_name]).alias(target_feature_name)
        ),
    ]
)

True


### EDA


### Extract feature types according to the `data_description.txt` file

In [174]:
# Feature groups used to decide how each column is typed and encoded.
# Every name must match a column name of the CSV files exactly (the same names
# are used as keys of the polars schema defined for the typed load).
# NOTE(review): "YearBuilt" and "YearRemodAdd" appear in none of these lists —
# confirm whether they are left out on purpose.

# Nominal Features (Categorical, Unordered)
nominal_features = [
    "MSSubClass",  # Coded type of dwelling (treated as categorical)
    "MSZoning",
    "Street",
    "Alley",
    "LotShape",
    "LandContour",
    "Utilities",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Heating",
    "Electrical",
    "GarageType",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
    "PavedDrive",  # Although it has 3 values (Y, P, N), we treat it as nominal here.
]

# Ordinal Features (Categorical, with inherent order)
ordinal_features = [
    "OverallQual",  # Quality ratings 10 to 1
    "OverallCond",  # Condition ratings 10 to 1
    "ExterQual",  # Exterior quality: Ex > Gd > TA > Fa > Po
    "ExterCond",  # Exterior condition: Ex > Gd > TA > Fa > Po
    "BsmtQual",  # Basement quality: Ex > Gd > TA > Fa > Po > NA
    "BsmtCond",  # Basement condition: Ex > Gd > TA > Fa > Po > NA
    "BsmtExposure",  # Basement exposure: Gd > Av > Mn > No > NA
    "BsmtFinType1",  # Rating of basement finished area
    "BsmtFinType2",  # Rating of basement finished area (if multiple)
    "HeatingQC",  # Heating quality and condition: Ex > Gd > TA > Fa > Po
    "KitchenQual",  # Kitchen quality: Ex > Gd > TA > Fa > Po
    "FireplaceQu",  # Fireplace quality: Ex > Gd > TA > Fa > Po > NA
    "GarageFinish",  # Interior finish of the garage: Fin > RFn > Unf > NA
    "GarageQual",  # Garage quality: Ex > Gd > TA > Fa > Po > NA
    "GarageCond",  # Garage condition: Ex > Gd > TA > Fa > Po > NA
    "PoolQC",  # Pool quality: Ex > Gd > TA > Fa > NA
    "Fence",  # Fence quality: GdPrv > MnPrv / GdWo > MnWw (quality ranking implied)
    "Functional",  # Home functionality: Typ, Min1, Min2, Mod, Maj1, Maj2, Sev, Sal (ordered from best to worst)
]

# Binary Features (Exactly two distinct values)
binary_features = [
    "CentralAir"  # 'Y' or 'N'
]

# Numerical Features (Continuous or discrete numerical measurements)
numerical_features = [
    "LotFrontage",  # Linear feet of street connected to property
    "LotArea",  # Lot size in square feet
    "MasVnrArea",  # Masonry veneer area in square feet
    "BsmtFinSF1",  # Type 1 finished basement square feet
    "BsmtFinSF2",  # Type 2 finished basement square feet
    "BsmtUnfSF",  # Unfinished basement square feet
    "TotalBsmtSF",  # Total basement square feet
    "1stFlrSF",  # First Floor square feet
    "2ndFlrSF",  # Second Floor square feet
    "LowQualFinSF",  # Low quality finished square feet
    "GrLivArea",  # Above grade living area square feet
    "BsmtFullBath",  # Number of basement full bathrooms
    "BsmtHalfBath",  # Number of basement half bathrooms
    "FullBath",  # Number of full bathrooms above grade
    "HalfBath",  # Number of half baths above grade
    # The two names below are the column names used by the schema; the short
    # forms "Bedroom" / "Kitchen" do not match any schema key.
    "BedroomAbvGr",  # Number of bedrooms above grade
    "KitchenAbvGr",  # Number of kitchens above grade
    "TotRmsAbvGrd",  # Total rooms above grade
    "Fireplaces",  # Number of fireplaces
    "GarageYrBlt",  # Year garage was built
    "GarageCars",  # Garage capacity (number of cars)
    "GarageArea",  # Garage area in square feet
    "WoodDeckSF",  # Wood deck area in square feet
    "OpenPorchSF",  # Open porch area in square feet
    "EnclosedPorch",  # Enclosed porch area in square feet
    "3SsnPorch",  # Three season porch area in square feet
    "ScreenPorch",  # Screen porch area in square feet
    "PoolArea",  # Pool area in square feet
    "MiscVal",  # Miscellaneous feature value
    "MoSold",  # Month Sold
    "YrSold",  # Year Sold
]


In [175]:
# Explicit polars dtypes for every column, so that train and test are parsed
# identically instead of relying on per-file schema inference.
schema = {
    "Id": pl.Int64,
    # Numerical features
    "LotFrontage": pl.Float64,  # Missing values → float
    "LotArea": pl.Int64,
    "MasVnrArea": pl.Float64,  # Missing values → float
    "BsmtFinSF1": pl.Int64,
    "BsmtFinSF2": pl.Int64,
    "BsmtUnfSF": pl.Int64,
    "TotalBsmtSF": pl.Int64,
    "1stFlrSF": pl.Int64,
    "2ndFlrSF": pl.Int64,
    "LowQualFinSF": pl.Int64,
    "GrLivArea": pl.Int64,
    "BsmtFullBath": pl.Int64,
    "BsmtHalfBath": pl.Int64,
    "FullBath": pl.Int64,
    "HalfBath": pl.Int64,
    "BedroomAbvGr": pl.Int64,
    "KitchenAbvGr": pl.Int64,
    "TotRmsAbvGrd": pl.Int64,
    "Fireplaces": pl.Int64,
    "GarageYrBlt": pl.Float64,  # Missing values → float
    "GarageCars": pl.Int64,
    "GarageArea": pl.Int64,
    "WoodDeckSF": pl.Int64,
    "OpenPorchSF": pl.Int64,
    "EnclosedPorch": pl.Int64,
    "3SsnPorch": pl.Int64,
    "ScreenPorch": pl.Int64,
    "PoolArea": pl.Int64,
    "MiscVal": pl.Int64,
    "MoSold": pl.Int64,
    "YrSold": pl.Int64,
    # Target (only present in train.csv)
    "SalePrice": pl.Int64,
    # Ordinal Categorical Features
    "OverallQual": pl.Int64,
    "OverallCond": pl.Int64,
    "ExterQual": pl.Categorical,
    "ExterCond": pl.Categorical,
    "BsmtQual": pl.Categorical,
    "BsmtCond": pl.Categorical,
    "BsmtExposure": pl.Categorical,
    "BsmtFinType1": pl.Categorical,
    "BsmtFinType2": pl.Categorical,
    "HeatingQC": pl.Categorical,
    "KitchenQual": pl.Categorical,
    "FireplaceQu": pl.Categorical,
    "GarageFinish": pl.Categorical,
    "GarageQual": pl.Categorical,
    "GarageCond": pl.Categorical,
    "PoolQC": pl.Categorical,
    "Fence": pl.Categorical,
    "Functional": pl.Categorical,
    # Binary Features
    "CentralAir": pl.Categorical,
    # Nominal Categorical Features
    "MSSubClass": pl.Categorical,
    "MSZoning": pl.Categorical,
    "Street": pl.Categorical,
    "Alley": pl.Categorical,
    "LotShape": pl.Categorical,
    "LandContour": pl.Categorical,
    "Utilities": pl.Categorical,
    "LotConfig": pl.Categorical,
    "LandSlope": pl.Categorical,
    "Neighborhood": pl.Categorical,
    "Condition1": pl.Categorical,
    "Condition2": pl.Categorical,
    "BldgType": pl.Categorical,
    "HouseStyle": pl.Categorical,
    "RoofStyle": pl.Categorical,
    "RoofMatl": pl.Categorical,
    "Exterior1st": pl.Categorical,
    "Exterior2nd": pl.Categorical,
    "MasVnrType": pl.Categorical,
    "Foundation": pl.Categorical,
    "Heating": pl.Categorical,
    "Electrical": pl.Categorical,
    "GarageType": pl.Categorical,
    "PavedDrive": pl.Categorical,
    "MiscFeature": pl.Categorical,
    "SaleType": pl.Categorical,
    "SaleCondition": pl.Categorical,
}

# Load both CSVs with the explicit schema. Reading them inside one StringCache
# block gives the Categorical columns of train and test a shared category
# mapping, which is what allows the two frames to be concatenated afterwards.
# NOTE(review): ignore_errors=True makes values that cannot be parsed as the
# declared dtype load as null instead of raising — presumably this is how
# missing-value markers in the numeric columns end up as nulls; confirm.
with pl.StringCache():
    train_df = pl.read_csv("train.csv", schema_overrides=schema, infer_schema_length=None, ignore_errors=True)
    test_df = pl.read_csv("test.csv", schema_overrides=schema, infer_schema_length=None, ignore_errors=True)

# Stack train and test. The test rows have no target, so they get a null
# "SalePrice" typed like the train column; a typed literal keeps the two
# schemas identical for the vertical concat instead of relying on an untyped
# all-null Series being accepted.
df = pl.concat(
    [
        train_df,
        test_df.with_columns(pl.lit(None, dtype=schema["SalePrice"]).alias("SalePrice")),
    ]
)

In [176]:
# Persist the merged train+test frame as a parquet backup, with full column statistics.
merged_backup_path = "train_test_merged_df_bak.parquet"
df.write_parquet(merged_backup_path, statistics="full")

### Compute Pearson correlations

In [177]:
# Absolute Pearson correlation of every numeric feature with the sale price,
# computed on the training rows only.
numeric_cols_df = train_df.drop("Id", "SalePrice").select(pl.selectors.numeric())

# Take the target from the same frame the features come from, so rows are
# guaranteed to line up (no dependency on a Series built from another read of
# the file in an earlier cell).
sale_price = train_df.get_column("SalePrice")

pearson_correlations = (
    numeric_cols_df.select(pl.corr(pl.col("*"), sale_price).abs())
    # One row per feature: the former column names go into "Features" and the
    # single value column is named explicitly rather than via the default name.
    .transpose(include_header=True, header_name="Features", column_names=["Correlation"])
    .sort("Correlation", descending=True)
)
print(pearson_correlations.head())

shape: (5, 2)
┌─────────────┬─────────────┐
│ Features    ┆ Correlation │
│ ---         ┆ ---         │
│ str         ┆ f64         │
╞═════════════╪═════════════╡
│ OverallQual ┆ 0.790982    │
│ GrLivArea   ┆ 0.708624    │
│ GarageCars  ┆ 0.640409    │
│ GarageArea  ┆ 0.623431    │
│ TotalBsmtSF ┆ 0.613581    │
└─────────────┴─────────────┘
