In [62]:
import pandas as pd
target_name = "SalePrice"
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

# Features: every column except the sale price.
data = ames_housing.drop(columns=target_name)
# Binary target: 1 when the sale price is above 200,000, otherwise 0.
target = ames_housing[target_name].gt(200_000).astype(int)

In [4]:
# Preview the first five rows of the feature table.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Grvl,Reg,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Grvl,IR1,Lvl,AllPub,Inside,...,0,0,Gd,MnPrv,Shed,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Grvl,IR1,Lvl,AllPub,Corner,...,0,0,Gd,MnPrv,Shed,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Grvl,IR1,Lvl,AllPub,FR2,...,0,0,Gd,MnPrv,Shed,0,12,2008,WD,Normal


In [5]:
# Summarize every column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

## Question 5:

In [56]:
# Names of the columns used as numerical inputs (24 in total).
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
    "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

In [60]:
# Keep all rows, but only the columns listed in `numerical_features`.
numerical_columns = data.loc[:, numerical_features]

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
# Standardize the inputs, then fit a logistic regression on the scaled values.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500),
)

In [61]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the pipeline on the numerical columns only.
cv_results = cross_validate(
    estimator=model,
    X=numerical_columns,
    y=target,
    cv=10,
)
cv_results

{'fit_time': array([0.02154994, 0.02344155, 0.0124383 , 0.01322079, 0.01152015,
        0.01130748, 0.01173592, 0.01122165, 0.01088214, 0.01153684]),
 'score_time': array([0.0051806 , 0.00333738, 0.00257277, 0.00245023, 0.00245476,
        0.00240755, 0.00245833, 0.00237298, 0.00235701, 0.00236011]),
 'test_score': array([0.9109589 , 0.89041096, 0.9109589 , 0.88356164, 0.90410959,
        0.88356164, 0.88356164, 0.87671233, 0.89726027, 0.87671233])}

In [27]:
# Report the per-fold test scores as mean ± standard deviation.
scores = cv_results["test_score"]
mean_accuracy, std_accuracy = scores.mean(), scores.std()
print(
    "The mean cross-validation accuracy is: "
    f"{mean_accuracy:.3f} ± {std_accuracy:.3f}"
)

The mean cross-validation accuracy is: 0.892 ± 0.013


## Question 6:

In [76]:
# Despite its name, this is an Index of column labels (not a DataFrame): every
# column of `data` that is not listed in `numerical_features`. That includes
# integer-typed columns such as MSSubClass, OverallQual and OverallCond, which
# therefore end up in the categorical group.
categorical_data = data.columns.difference(numerical_features)

In [77]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical columns are standardized.
numerical_preprocessor = StandardScaler()
# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

In [82]:
from sklearn.compose import make_column_transformer


# Route each group of columns to its own preprocessor.
# make_column_transformer takes every (transformer, columns) pair as a separate
# positional argument, so the pairs must not be wrapped in a list.
# The numerical branch selects columns by name through `numerical_features`,
# the list of column names defined earlier in the notebook.
preprocessor = make_column_transformer(
    (categorical_preprocessor, categorical_data),
    (numerical_preprocessor, numerical_features),
)

In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Full pipeline: column-wise preprocessing followed by logistic regression.
model = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000),
)
model

In [85]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the full pipeline on all feature columns.
cv_results = cross_validate(
    estimator=model,
    X=data,
    y=target,
    cv=10,
)
cv_results

{'fit_time': array([0.20613575, 0.14147329, 0.13691926, 0.13101315, 0.14589071,
        0.13382792, 0.14660573, 0.13222599, 0.13373446, 0.13608241]),
 'score_time': array([0.01749825, 0.01647878, 0.01708078, 0.01642394, 0.01700854,
        0.01678181, 0.01665807, 0.01613283, 0.01682019, 0.01753259]),
 'test_score': array([0.95890411, 0.90410959, 0.89041096, 0.92465753, 0.9109589 ,
        0.93835616, 0.90410959, 0.91780822, 0.92465753, 0.89726027])}

In [86]:
# Report the per-fold test scores as mean ± standard deviation.
scores = cv_results["test_score"]
mean_accuracy, std_accuracy = scores.mean(), scores.std()
print(
    "The mean cross-validation accuracy is: "
    f"{mean_accuracy:.3f} ± {std_accuracy:.3f}"
)

The mean cross-validation accuracy is: 0.917 ± 0.019
