In [1]:
import pandas as pd
# Load the Ames housing table from the "no_missing" CSV.
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

target_name = "SalePrice"

# Every column except the sale price is kept as a candidate feature.
data = ames_housing.drop(columns=target_name)

# Turn the price into a binary classification target:
# 1 when the sale price is strictly above 200_000, 0 otherwise.
target = (ames_housing[target_name] > 200_000).astype(int)

In [2]:
# Print a per-column summary of the feature frame (non-null count and dtype).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [3]:
# Display the first rows of the feature frame to eyeball the raw values.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal


In [5]:
# List the distinct column dtypes present in the feature frame.
data.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [24]:
from sklearn.compose import make_column_selector as selector

# Column selectors keyed on dtype: `object` columns are treated as
# categorical, every other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

In [25]:
# Apply the selectors to the frame to obtain the selected columns.
categorical_features = categorical_columns_selector(data)

# NOTE(review): `ns` (every non-object column) is not referenced by any other
# cell visible here; the models use the hand-written `numerical_features`
# list instead. Confirm whether `ns` is still needed.
ns = numerical_columns_selector(data)

In [26]:
# Hand-picked columns fed to the scaler. The group comments below follow the
# column names and are for readability only; the order of the list is unchanged.
numerical_features = [
    # lot / exterior
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    # basement
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    # floors and living area
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    # rooms
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    # garage
    "GarageCars",
    "GarageArea",
    # decks, porches, pool, misc
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

In [27]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical columns are standardized.
numerical_preprocessor = StandardScaler()

# Categorical columns are one-hot encoded, with handle_unknown="ignore"
# set for categories not seen when fitting.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [17]:
from sklearn.compose import ColumnTransformer

# Preprocessing for the first model: only the hand-picked numerical columns
# are routed to the scaler; no other column is listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_features),
    ]
)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# First model: numerical-only preprocessing followed by a logistic regression.
# NOTE(review): max_iter is 100 here but 500 for model_2, so the two models
# differ in more than their input features.
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=100),
)

In [19]:
from sklearn import set_config
# Switch scikit-learn's estimator display to "diagram" mode, then show the
# pipeline as the cell output.
set_config(display="diagram")
model

In [20]:
from sklearn.model_selection import cross_validate

# Cross-validate the numerical-only pipeline with cv=10; the returned dict
# (fit times, score times, per-fold test scores) is shown as the cell output.
cv_results = cross_validate(model, X=data, y=target, cv=10)
cv_results

{'fit_time': array([0.04645061, 0.03013778, 0.02572727, 0.01704717, 0.01667857,
        0.01612496, 0.01764727, 0.0157454 , 0.0158987 , 0.01685429]),
 'score_time': array([0.00711012, 0.0066402 , 0.00379181, 0.0035553 , 0.00340748,
        0.0036068 , 0.00344849, 0.00342512, 0.00346971, 0.00357437]),
 'test_score': array([0.9109589 , 0.89041096, 0.9109589 , 0.88356164, 0.90410959,
        0.88356164, 0.88356164, 0.87671233, 0.89726027, 0.87671233])}

In [30]:
# Preprocessing for the second model: one-hot encode the object-dtype columns
# and standardize the same hand-picked numerical columns as the first model.
# NOTE(review): non-object columns missing from `numerical_features`
# (e.g. MSSubClass, OverallQual) are routed to neither transformer.
preprocessor_2 = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_features),
        ("standard_scaler", numerical_preprocessor, numerical_features),
    ]
)

# Same classifier family as the first model, with a larger iteration budget.
model_2 = make_pipeline(
    preprocessor_2,
    LogisticRegression(max_iter=500),
)

# Same cv=10 setting as the first model so the per-fold scores line up.
cv_results_2 = cross_validate(model_2, X=data, y=target, cv=10)
cv_results_2

{'fit_time': array([0.18070889, 0.1826973 , 0.14388514, 0.14058614, 0.14612079,
        0.166713  , 0.14350748, 0.12876987, 0.16800976, 0.17048883]),
 'score_time': array([0.01590872, 0.01338553, 0.01368284, 0.01320672, 0.0137198 ,
        0.01327658, 0.01317143, 0.01478004, 0.01542878, 0.01523566]),
 'test_score': array([0.95205479, 0.9109589 , 0.90410959, 0.92465753, 0.89041096,
        0.95890411, 0.87671233, 0.89041096, 0.94520548, 0.90410959])}

In [21]:
# Summarize the numerical-only model: mean and standard deviation of the
# per-fold test scores.
scores = cv_results["test_score"]
print(
    f"The mean cross-validation accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}"
)

The mean cross-validation accuracy is: 0.892 +/- 0.013


In [31]:
# Summarize the categorical+numerical model: mean and standard deviation of
# the per-fold test scores.
scores_2 = cv_results_2["test_score"]
print(
    f"The mean cross-validation accuracy is: {scores_2.mean():.3f} +/- {scores_2.std():.3f}"
)

The mean cross-validation accuracy is: 0.916 +/- 0.027


In [33]:
# Fold-by-fold comparison of the two cross-validation runs.
# A fold counts as a win for the second model only when its test score is
# strictly higher; ties are not counted.
all_features_wins = (cv_results["test_score"] < cv_results_2["test_score"]).sum()

# Take the fold count from the results rather than hard-coding it, so the
# message stays correct if the `cv` setting of the runs is changed.
n_folds = len(cv_results["test_score"])

print(
    "The model using all features is performing better "
    f"{all_features_wins} "
    f"times out of {n_folds} than the model using only numerical features."
)

The model using all features is performing better 7 times out of 10 than the model using only numerical features.
