In [4]:
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import sklearn
import os, pathlib, shutil
import kaggle, kagglehub


Download the competition data


In [5]:
# Kaggle competition slug; change this to download a different competition.
competition_name = "home-data-for-ml-course"

# The value returned by kagglehub is used as the local directory that holds
# the downloaded competition files.
competition_path = pathlib.Path(
    kagglehub.competition.competition_download(competition_name)
)

# Names of the files included in the competition. Sorted so the printed
# listing does not depend on the arbitrary order os.listdir() returns.
competition_files: list = sorted(os.listdir(competition_path))
print(competition_files)

# Copy the CSV files into the current working directory.
# A loop replaces three copy-pasted calls and, unlike a trailing bare
# shutil.copyfile(...) expression, does not echo the destination path
# ('train.csv') as stray cell output.
for file_name in ("sample_submission.csv", "test.csv", "train.csv"):
    shutil.copyfile(competition_path / file_name, file_name)

['data_description.txt', 'sample_submission.csv', 'sample_submission.csv.gz', 'test.csv', 'test.csv.gz', 'train.csv', 'train.csv.gz']


'train.csv'

In [42]:
pl.Config.set_tbl_rows(20)  # row limit used when polars renders a table

# NOTE(review): ignore_errors=True keeps parsing when a value does not fit the
# inferred dtype instead of raising. The frames rendered in this notebook show
# "NA" surviving as a literal string in several columns, so missing values are
# not being read as nulls everywhere -- consider null_values="NA"; confirm intent.
train_df = pl.read_csv(competition_path / "train.csv", ignore_errors=True)
test_df = pl.read_csv(competition_path / "test.csv", ignore_errors=True)
sample_submission_df = pl.read_csv(competition_path / "sample_submission.csv")

# The target is the one column present in the training file but absent from
# the test file. Fail loudly if that assumption does not hold, rather than
# silently picking an arbitrary column (or raising a bare KeyError).
target_candidates = [name for name in train_df.columns if name not in test_df.columns]
if len(target_candidates) != 1:
    raise ValueError(
        "Expected exactly one column in train.csv that is missing from test.csv, "
        f"found: {target_candidates}"
    )
target_feature_name: str = target_candidates[0]
target_col = train_df.get_column(target_feature_name)

# Apart from the target, both frames need identical schemas for the vertical
# concatenation below; print the result of that comparison.
print(train_df.drop(target_feature_name).schema == test_df.schema)

# Stack train and test into one frame. Test rows get -1 as a placeholder
# target, cast to the training target's own dtype so the two schemas line up
# whatever that dtype is.
df = pl.concat(
    [
        train_df,
        test_df.with_columns(
            pl.lit(-1).cast(target_col.dtype).alias(target_feature_name)
        ),
    ]
)

True


### EDA


In [92]:
# Numeric predictors from the training rows only: the "Id" column and the
# target are dropped first. The target name comes from target_feature_name
# instead of a hard-coded "SalePrice", so this cell stays consistent with the
# target_col it is correlated against.
numeric_cols_df = train_df.drop("Id", target_feature_name).select(pl.selectors.numeric())

# Absolute correlation of every numeric feature with the target (the sign is
# discarded by .abs()). The single-row result is transposed to one row per
# feature, the value column is given a readable name, and rows are ranked
# strongest first.
pearson_correlations = (
    numeric_cols_df.select(pl.corr(pl.all(), target_col).abs())
    .transpose(include_header=True, header_name="Features")
    .rename({"column_0": "Correlation"})
    .sort("Correlation", descending=True)
)
pearson_correlations.head()

Features,Correlation
str,f64
"""OverallQual""",0.790982
"""GrLivArea""",0.708624
"""GarageCars""",0.640409
"""GarageArea""",0.623431
"""TotalBsmtSF""",0.613581


In [93]:
# Feature names in the order of the correlation ranking (strongest first).
numeric_col_names_sorted = pearson_correlations.get_column("Features").to_numpy()

# Order the combined train+test rows by the two features named here, largest
# values first (they are the top two entries of the correlation table).
# The intermediate gets its own name so sorted_df is bound only once.
rows_sorted = df.sort(by=["OverallQual", "GrLivArea"], descending=True)

# NOTE: this rebinds numeric_cols_df. It now holds "Id", the ranked numeric
# features and the target for the sorted train+test rows. The target column is
# referenced through target_feature_name rather than a hard-coded "SalePrice".
numeric_cols_df = rows_sorted.select("Id", *numeric_col_names_sorted, target_feature_name)

# String columns of the same rows, converted to the Categorical dtype.
str_cols_df = rows_sorted.select(pl.selectors.string()).cast(pl.Categorical)

# Final layout: numeric block first, categorical block appended to its right.
sorted_df = numeric_cols_df.hstack(str_cols_df)
sorted_df.head()


Id,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,MasVnrArea,Fireplaces,BsmtFinSF1,WoodDeckSF,2ndFlrSF,OpenPorchSF,HalfBath,LotArea,BsmtFullBath,BsmtUnfSF,BedroomAbvGr,KitchenAbvGr,EnclosedPorch,ScreenPorch,PoolArea,MSSubClass,OverallCond,MoSold,3SsnPorch,YrSold,LowQualFinSF,MiscVal,BsmtHalfBath,BsmtFinSF2,SalePrice,MSZoning,…,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,cat,…,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat
1299,10,5642,2,1418,6110,4692,2,12,2008,2008,796,3,5644,214,950,292,1,63887,2,466,3,1,0,0,480,60,5,1,0,2008,0,0,0,0,160000,"""RL""",…,"""Gtl""","""Edwards""","""Feedr""","""Norm""","""1Fam""","""2Story""","""Hip""","""ClyTile""","""Stucco""","""Stucco""","""Stone""","""Ex""","""TA""","""PConc""","""Ex""","""TA""","""Gd""","""GLQ""","""Unf""","""GasA""","""Ex""","""Y""","""SBrkr""","""Ex""","""Typ""","""Gd""","""Attchd""","""2008""","""Fin""","""TA""","""TA""","""Y""","""Gd""","""NA""","""NA""","""New""","""Partial"""
2550,10,5095,3,1154,5095,5095,2,15,2008,2009,1224,2,4010,546,0,484,1,39290,1,1085,2,1,0,0,0,20,5,10,0,2007,0,17000,1,0,-1,"""RL""",…,"""Gtl""","""Edwards""","""Norm""","""Norm""","""1Fam""","""1Story""","""Hip""","""CompShg""","""CemntBd""","""CmentBd""","""Stone""","""Ex""","""TA""","""PConc""","""Ex""","""TA""","""Gd""","""GLQ""","""Unf""","""GasA""","""Ex""","""Y""","""SBrkr""","""Ex""","""Typ""","""Gd""","""Attchd""","""2008""","""Fin""","""TA""","""TA""","""Y""","""NA""","""NA""","""NA""","""New""","""Partial"""
524,10,4676,3,884,3138,3138,3,11,2007,2008,762,1,2260,208,1538,406,1,40094,1,878,3,1,0,0,0,60,5,10,0,2007,0,0,0,0,184750,"""RL""",…,"""Gtl""","""Edwards""","""PosN""","""PosN""","""1Fam""","""2Story""","""Hip""","""CompShg""","""CemntBd""","""CmentBd""","""Stone""","""Ex""","""TA""","""PConc""","""Ex""","""TA""","""Gd""","""GLQ""","""Unf""","""GasA""","""Ex""","""Y""","""SBrkr""","""Ex""","""Typ""","""Gd""","""BuiltIn""","""2007""","""Fin""","""TA""","""TA""","""Y""","""NA""","""NA""","""NA""","""New""","""Partial"""
1183,10,4476,3,813,2396,2411,3,10,1996,1996,0,2,2096,171,2065,78,1,15623,1,300,4,1,0,0,555,60,5,7,0,2007,0,0,0,0,745000,"""RL""",…,"""Gtl""","""NoRidge""","""Norm""","""Norm""","""1Fam""","""2Story""","""Hip""","""CompShg""","""Wd Sdng""","""ImStucc""","""None""","""Gd""","""TA""","""PConc""","""Ex""","""TA""","""Av""","""GLQ""","""Unf""","""GasA""","""Ex""","""Y""","""SBrkr""","""Ex""","""Typ""","""TA""","""Attchd""","""1996""","""Fin""","""TA""","""TA""","""Y""","""Ex""","""MnPrv""","""NA""","""WD""","""Abnorml"""
692,10,4316,3,832,2444,2444,3,10,1994,1995,1170,2,1455,382,1872,50,1,21535,0,989,4,1,0,0,0,60,6,1,0,2007,0,0,1,0,755000,"""RL""",…,"""Gtl""","""NoRidge""","""Norm""","""Norm""","""1Fam""","""2Story""","""Gable""","""WdShngl""","""HdBoard""","""HdBoard""","""BrkFace""","""Ex""","""TA""","""PConc""","""Ex""","""TA""","""Gd""","""GLQ""","""Unf""","""GasA""","""Ex""","""Y""","""SBrkr""","""Ex""","""Typ""","""Ex""","""Attchd""","""1994""","""Fin""","""TA""","""TA""","""Y""","""NA""","""NA""","""NA""","""WD""","""Normal"""
