In [56]:
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import sklearn
import os, pathlib, shutil
import kaggle, kagglehub


### Download the competition data


In [57]:
# Kaggle competition slug; change this to the competition you want to download.
competition_name = "home-data-for-ml-course"

# Download the competition and keep the returned location as a Path.
competition_path = pathlib.Path(
    kagglehub.competition.competition_download(competition_name)
)

# Names of the files included in the competition download.
competition_files: list = os.listdir(competition_path)
print(competition_files)

# Copy the files this notebook reads into the current directory. A single loop
# replaces four copy-pasted calls and avoids echoing the last call's return
# value as the cell output.
for file_name in (
    "sample_submission.csv",
    "test.csv",
    "train.csv",
    "data_description.txt",
):
    shutil.copyfile(competition_path / file_name, file_name)

['data_description.txt', 'sample_submission.csv', 'sample_submission.csv.gz', 'test.csv', 'test.csv.gz', 'train.csv', 'train.csv.gz']


'data_description.txt'

### Extract feature types according to the `data_description.txt` file

In [58]:
# Dtype overrides for the Polars DataFrame, grouped by feature kind.
# Columns that are not listed here are left to Polars' type inference.
schema = {
    "Id": pl.Int64,
    "LotFrontage": pl.Float64,  # Missing values → float
    "LotArea": pl.Int64,
    "MasVnrArea": pl.Float64,  # Missing values → float
    "BsmtFinSF1": pl.Int64,
    "BsmtFinSF2": pl.Int64,
    "BsmtUnfSF": pl.Int64,
    "TotalBsmtSF": pl.Int64,
    "1stFlrSF": pl.Int64,
    "2ndFlrSF": pl.Int64,
    "LowQualFinSF": pl.Int64,
    "GrLivArea": pl.Int64,
    "BsmtFullBath": pl.Int64,
    "BsmtHalfBath": pl.Int64,
    "FullBath": pl.Int64,
    "HalfBath": pl.Int64,
    "BedroomAbvGr": pl.Int64,
    "KitchenAbvGr": pl.Int64,
    "TotRmsAbvGrd": pl.Int64,
    "Fireplaces": pl.Int64,
    "GarageYrBlt": pl.Float64,  # Missing values → float
    "GarageCars": pl.Int64,
    "GarageArea": pl.Int64,
    "WoodDeckSF": pl.Int64,
    "OpenPorchSF": pl.Int64,
    "EnclosedPorch": pl.Int64,
    "3SsnPorch": pl.Int64,
    "ScreenPorch": pl.Int64,
    "PoolArea": pl.Int64,
    "MiscVal": pl.Int64,
    "MoSold": pl.Int64,
    "YrSold": pl.Int64,
    "SalePrice": pl.Int64,
    # Ordinal Categorical Features
    "OverallQual": pl.Int64,
    "OverallCond": pl.Int64,
    "ExterQual": pl.Categorical,
    "ExterCond": pl.Categorical,
    "BsmtQual": pl.Categorical,
    "BsmtCond": pl.Categorical,
    "BsmtExposure": pl.Categorical,
    "BsmtFinType1": pl.Categorical,
    "BsmtFinType2": pl.Categorical,
    "HeatingQC": pl.Categorical,
    "KitchenQual": pl.Categorical,
    "FireplaceQu": pl.Categorical,
    "GarageFinish": pl.Categorical,
    "GarageQual": pl.Categorical,
    "GarageCond": pl.Categorical,
    "PoolQC": pl.Categorical,
    "Fence": pl.Categorical,
    "Functional": pl.Categorical,
    # Binary Features
    "CentralAir": pl.Categorical,
    # Nominal Categorical Features
    "MSSubClass": pl.Categorical,
    "MSZoning": pl.Categorical,
    "Street": pl.Categorical,
    "Alley": pl.Categorical,
    "LotShape": pl.Categorical,
    "LandContour": pl.Categorical,
    "Utilities": pl.Categorical,
    "LotConfig": pl.Categorical,
    "LandSlope": pl.Categorical,
    "Neighborhood": pl.Categorical,
    "Condition1": pl.Categorical,
    "Condition2": pl.Categorical,
    "BldgType": pl.Categorical,
    "HouseStyle": pl.Categorical,
    "RoofStyle": pl.Categorical,
    "RoofMatl": pl.Categorical,
    "Exterior1st": pl.Categorical,
    "Exterior2nd": pl.Categorical,
    "MasVnrType": pl.Categorical,
    "Foundation": pl.Categorical,
    "Heating": pl.Categorical,
    "Electrical": pl.Categorical,
    "GarageType": pl.Categorical,
    "PavedDrive": pl.Categorical,
    "MiscFeature": pl.Categorical,
    "SaleType": pl.Categorical,
    "SaleCondition": pl.Categorical,
}


def _read_competition_csv(csv_path: str) -> pl.DataFrame:
    """Read one competition CSV using the shared ``schema`` dtype overrides.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame; columns listed in ``schema`` get the dtype
        given there, every other column is inferred from the whole file
        (``infer_schema_length=None``).
    """
    # NOTE(review): ignore_errors=True is presumably what turns the textual
    # missing-value markers in the Int64/Float64 columns into nulls. It also
    # hides any other unparsable value, so consider replacing it with an
    # explicit `null_values` setting once the markers are confirmed.
    return pl.read_csv(
        csv_path,
        schema_overrides=schema,
        infer_schema_length=None,
        ignore_errors=True,
    )


# Read both files under one string cache so that their Categorical columns
# are built against the same cache before being concatenated.
with pl.StringCache():
    train_df = _read_competition_csv("train.csv")
    test_df = _read_competition_csv("test.csv")

# Number of training rows; the combined frame is split back at this offset.
split_num = train_df.height

# Stack train and test into one frame. The test rows get an all-null
# SalePrice column with the same dtype as the training target, so the two
# schemas match exactly instead of relying on an untyped null Series.
df = pl.concat(
    [
        train_df,
        test_df.with_columns(
            pl.lit(None, dtype=train_df.schema["SalePrice"]).alias("SalePrice")
        ),
    ]
)

# Summaries of the three Float64 columns that contain missing values.
print(
    df.get_column("LotFrontage").describe(),
    df.get_column("GarageYrBlt").describe(),
    df.get_column("MasVnrArea").describe(),
)


shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 2433.0    │
│ null_count ┆ 486.0     │
│ mean       ┆ 69.305795 │
│ std        ┆ 23.344905 │
│ min        ┆ 21.0      │
│ 25%        ┆ 59.0      │
│ 50%        ┆ 68.0      │
│ 75%        ┆ 80.0      │
│ max        ┆ 313.0     │
└────────────┴───────────┘ shape: (9, 2)
┌────────────┬─────────────┐
│ statistic  ┆ value       │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ count      ┆ 2760.0      │
│ null_count ┆ 159.0       │
│ mean       ┆ 1978.113406 │
│ std        ┆ 25.574285   │
│ min        ┆ 1895.0      │
│ 25%        ┆ 1960.0      │
│ 50%        ┆ 1979.0      │
│ 75%        ┆ 2002.0      │
│ max        ┆ 2207.0      │
└────────────┴─────────────┘ shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞═════

In [59]:
# Impute the three Float64 columns that contain nulls:
#   * MasVnrArea and LotFrontage -> 0
#   * GarageYrBlt -> -1, a sentinel outside the range of real years
# `fill_null` expresses this directly instead of when/then/otherwise chains.
# NOTE(review): the summary printed before imputation shows a GarageYrBlt
# maximum of 2207, which cannot be a real year. TODO: decide how to correct
# that value before it is used as a feature.
df = df.with_columns(
    pl.col("MasVnrArea").fill_null(0.0),
    pl.col("LotFrontage").fill_null(0.0),
    pl.col("GarageYrBlt").fill_null(-1.0),
)

# Re-print the summaries to confirm that no nulls remain in these columns.
print(
    df.get_column("LotFrontage").describe(),
    df.get_column("GarageYrBlt").describe(),
    df.get_column("MasVnrArea").describe(),
)

shape: (9, 2)
┌────────────┬───────────┐
│ statistic  ┆ value     │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ count      ┆ 2919.0    │
│ null_count ┆ 0.0       │
│ mean       ┆ 57.766701 │
│ std        ┆ 33.481636 │
│ min        ┆ 0.0       │
│ 25%        ┆ 43.0      │
│ 50%        ┆ 63.0      │
│ 75%        ┆ 78.0      │
│ max        ┆ 313.0     │
└────────────┴───────────┘ shape: (9, 2)
┌────────────┬─────────────┐
│ statistic  ┆ value       │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ count      ┆ 2919.0      │
│ null_count ┆ 0.0         │
│ mean       ┆ 1870.309695 │
│ std        ┆ 449.912957  │
│ min        ┆ -1.0        │
│ 25%        ┆ 1957.0      │
│ 50%        ┆ 1977.0      │
│ 75%        ┆ 2001.0      │
│ max        ┆ 2207.0      │
└────────────┴─────────────┘ shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ value      │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞═════

### Compute Pearson correlations between the numeric features and `SalePrice`

In [60]:
# Numeric feature columns, with the identifier and the target excluded.
numeric_cols_df = df.drop("Id", "SalePrice").select(pl.selectors.numeric())

# SalePrice is only populated for the first `split_num` rows of `df` (the
# training rows), so restrict the computation to those rows explicitly rather
# than correlating against a column that is null for every test row.
labelled_df = df.head(split_num)

# Absolute Pearson correlation of each numeric feature with SalePrice,
# reshaped to one row per feature and sorted from strongest to weakest.
pearson_correlations = (
    labelled_df.select(
        [
            pl.corr(pl.col(feature), pl.col("SalePrice")).abs().alias(feature)
            for feature in numeric_cols_df.columns
        ]
    )
    .transpose(include_header=True, header_name="Features")
    .rename({"column_0": "Correlation"})
    .sort("Correlation", descending=True)
)
print(pearson_correlations.head())

shape: (5, 2)
┌─────────────┬─────────────┐
│ Features    ┆ Correlation │
│ ---         ┆ ---         │
│ str         ┆ f64         │
╞═════════════╪═════════════╡
│ OverallQual ┆ 0.790982    │
│ GrLivArea   ┆ 0.708624    │
│ GarageCars  ┆ 0.640409    │
│ GarageArea  ┆ 0.623431    │
│ TotalBsmtSF ┆ 0.613581    │
└─────────────┴─────────────┘


In [61]:
# Distinct column dtypes present in the combined frame.
{column_dtype for column_dtype in df.dtypes}

{Categorical(ordering='physical'), Float64, Int64}

In [62]:
# Split the combined frame by dtype family: categorical columns on one side,
# numeric columns on the other.
cat_cols_df, num_cols_df = (
    df.select(pl.selectors.categorical()),
    df.select(pl.selectors.numeric()),
)

In [63]:
# Number of distinct values in each categorical column, as a NumPy array.
cat_cols_df_unique_vals = cat_cols_df.select(pl.all().n_unique()).to_numpy()

# Null count of every column in the combined frame, as {column name: count}.
df_null_count = df.select(pl.all().null_count()).to_dicts()[0]

# Keep only the columns that contain at least one null, as [name, count] pairs,
# and print one pair per line.
features_null_count = [
    [column, n_missing]
    for column, n_missing in df_null_count.items()
    if n_missing > 0
]
print("\n".join(str(pair) for pair in features_null_count))

# Frame holding "Id" plus every null-containing column except the target.
null_included_feature_names = [column for column, _ in features_null_count]
null_included_cols: pl.DataFrame = df.select("Id").hstack(
    df.select(null_included_feature_names).drop("SalePrice")
)

['BsmtFinSF1', 1]
['BsmtFinSF2', 1]
['BsmtUnfSF', 1]
['TotalBsmtSF', 1]
['BsmtFullBath', 2]
['BsmtHalfBath', 2]
['GarageCars', 1]
['GarageArea', 1]
['SalePrice', 1459]


In [64]:
# Split the combined frame back at the original train/test boundary:
# the first `split_num` rows are training rows, the remainder are test rows.
train_df, test_df = df.head(split_num), df.slice(split_num)

In [65]:
# Persist both processed splits as Parquet files with full column statistics.
for split_frame, parquet_name in (
    (train_df, "train_df.parquet"),
    (test_df, "test_df.parquet"),
):
    split_frame.write_parquet(parquet_name, statistics="full")