# **Classification:** predicting housing values as expensive (or not)

# Preliminary steps

## Import libraries

In [None]:
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, ColumnTransformer

## Read and split data

In [None]:
def drive_download_url(share_url):
    """Turn a Google Drive share link into a direct-download URL.

    The file id is taken as the second-to-last '/'-separated segment of
    ``share_url`` (e.g. ``.../file/d/<id>/view?usp=sharing``).

    Raises:
        ValueError: if ``share_url`` has no such segment, which is what
            happens when the placeholder below has not been replaced yet.
    """
    segments = share_url.split('/')
    if len(segments) < 2:
        raise ValueError(
            f"{share_url!r} is not a Google Drive share link; "
            "replace the placeholder with the actual link to the data"
        )
    return 'https://drive.google.com/uc?export=download&id=' + segments[-2]


# reading TRAIN data
train_url = "google_drive_link" #update with actual location of data in Google Drive
train_path = drive_download_url(train_url)
train_data = pd.read_csv(train_path)

# reading TEST data
test_url = "google_drive_link" #update with actual location of data in Google Drive
test_path = drive_download_url(test_url)
test_data = pd.read_csv(test_path)

# creating X and y from TRAIN data
# (work on a copy so train_data itself keeps the "Expensive" target column)
X_train = train_data.copy()
y_train = X_train.pop("Expensive")


## Explore data and engineer

In [None]:
# first look at the actual data: display the first rows of the training features
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [None]:
# seeing if there are any missing data: count of NaN values per column
X_train.isna().sum()

Unnamed: 0,0
LotArea,0
LotFrontage,259
TotalBsmtSF,0
BedroomAbvGr,0
Fireplaces,0
...,...
PoolQC,1453
Fence,1179
MiscFeature,1406
SaleType,0


In [None]:
# "Id" is a row identifier, not a feature, so it is removed before modelling.
# errors="ignore" keeps this cell safe to re-run: once "Id" is gone a second
# run is a no-op instead of raising KeyError. drop() already returns a new
# DataFrame, so no extra .copy() is needed.
X_train = X_train.drop(columns=['Id'], errors="ignore")

In [None]:
# reviewing columns: non-null count and dtype of every remaining feature
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   MSZoning       1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Heating        1460 non-null   object 
 12  Street         1460 non-null   object 
 13  CentralAir     1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  ExterQual      1460 non-null   object 
 16  ExterCond      1460 non-null   object 
 17  BsmtQual       1423 non-null   object 
 18  BsmtCond

## MODEL: Random Forest

### Build pipeline

In [None]:
# split the features by dtype; the column names of each frame are what the
# preprocessor uses to route columns to the numeric / categorical pipeline
X_number = X_train.select_dtypes(include="number").copy()
X_cat = X_train.select_dtypes(exclude="number").copy()

In [None]:
# preview of the non-numeric (categorical) features
X_cat.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Feedr,GasA,Pave,Y,CBlock,TA,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Norm,GasA,Pave,Y,BrkTil,TA,TA,TA,Gd,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [None]:
# preview of the numeric features
X_number.head(5)

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSSubClass,...,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold
0,8450,65.0,856,3,0,0,2,0,0,60,...,1,8,2003.0,548,61,0,0,0,2,2008
1,9600,80.0,1262,3,1,0,2,298,0,20,...,1,6,1976.0,460,0,0,0,0,5,2007
2,11250,68.0,920,3,1,0,2,0,0,60,...,1,6,2001.0,608,42,0,0,0,9,2008
3,9550,60.0,756,3,1,0,3,0,0,70,...,1,7,1998.0,642,35,272,0,0,2,2006
4,14260,84.0,1145,4,1,0,3,192,0,60,...,1,9,2000.0,836,84,0,0,0,12,2008


In [None]:
#pipelines

# Fixed seed for the random forest so that a fresh "Restart & Run All"
# reproduces the same fitted model and the same predictions.
RANDOM_STATE = 42

# aliases for the estimator classes instantiated below
imputer = SimpleImputer
encoder = OneHotEncoder
scaler = StandardScaler

# max_depth / min_samples_leaf are the best values reported by the grid search
classifier = RandomForestClassifier(
    max_depth=11,
    min_samples_leaf=3,
    random_state=RANDOM_STATE,
)

# categorical features: fill gaps with an explicit "unknown" category, then
# one-hot encode; categories unseen during fit are ignored at predict time
cat_pipe = make_pipeline(
    imputer(strategy="constant", fill_value="unknown"),
    encoder(handle_unknown="ignore"))

# numeric features: median imputation, then centring only (no unit-variance scaling)
num_pipe = make_pipeline(
    imputer(strategy="median"),
    scaler(with_mean=True, with_std=False))

#preprocessor
# columns are selected by name, using the column lists of the two dtype splits
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipe, X_number.columns),
        ("cat_pipe", cat_pipe, X_cat.columns)
    ]
)


#alternative preprocessor:
# (the selectors must be the column names, not the DataFrames themselves)
# preprocessor = make_column_transformer(
#     (num_pipe, X_number.columns),
#     (cat_pipe, X_cat.columns)
# )

In [None]:
# full pipeline: preprocessing followed by the classifier
full_pipeline = make_pipeline(preprocessor, classifier)

# fit on the training data; as the last expression of the cell, the fitted
# pipeline is also displayed
full_pipeline.fit(X_train, y_train)

In [None]:
# Hyper-parameter search, kept for reference only: it was already run once and
# the model/pipeline parameters were adapted to match the best parameters it
# reported. Left commented out so the notebook does not repeat the search on
# every run.
# param_grid = {
#     "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
#     "columntransformer__num_pipe__standardscaler__with_mean":[True, False],
#     "columntransformer__num_pipe__standardscaler__with_std":[True, False],
#     "randomforestclassifier__max_depth": range(2, 14),
#     "randomforestclassifier__min_samples_leaf": range(3, 12)
# }

# search = GridSearchCV(full_pipeline,
#                       param_grid,
#                       cv=5,
#                       verbose=1)

# search.fit(X_train, y_train)

# scores = {"classifier": search.best_score_}

# best_parameters = search.best_params_

# print(scores)
# best_parameters

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
{'classifier': 0.9527397260273972}


{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'columntransformer__num_pipe__standardscaler__with_mean': True,
 'columntransformer__num_pipe__standardscaler__with_std': False,
 'randomforestclassifier__max_depth': 11,
 'randomforestclassifier__min_samples_leaf': 3}

### Making predictions on test data

In [None]:
# make predictions on the test data with the fitted pipeline
# NOTE(review): test_data still carries the "Id" column (it is read again when
# the results are saved). The preprocessor was built from explicit column
# lists, so presumably "Id" is not used as a feature -- confirm.
pred_new_test = full_pipeline.predict(test_data)
pred_new_test

array([0, 0, 0, ..., 0, 0, 0])

### Saving results

In [None]:
# pair every test-set Id with its predicted label
test_predictions = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "Expensive": pred_new_test,
    }
)

# update MyDrive location with actual location of folder to save output
test_predictions.to_csv(
    '/content/drive/MyDrive...test_predictions.csv',
    index=False,
)

In [None]:
# use the following code to verify that csv has been successfully saved
# update MyDrive location with actual location of folder to save output
# (read_csv has no `index` argument -- that option belongs to to_csv -- so
# none is passed here)
csv_verify = pd.read_csv('/content/drive/MyDrive...test_predictions.csv')

# info() prints its summary itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output)
csv_verify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Id         1459 non-null   int64
 1   Expensive  1459 non-null   int64
dtypes: int64(2)
memory usage: 22.9 KB
None


### Accuracy of results (when applied to actual y_test figures)

0.9788