In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = r"C:\Users\danie\Desktop\WBS.DataScience_Bootcamp\Week_9_supervised_ML\housing_Iowa\iter-6\iter-6\housing-classification-iter6.csv"
housing = pd.read_csv(data)
y = housing.pop("Expensive")
X = housing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=815)

In [10]:
num_feats = list(X_train.select_dtypes(exclude=["object"]))
full_cat_feats = list(X_train.select_dtypes(include=["object"]))
ord_feats = ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "FireplaceQu", "BsmtFinType2",
             "HeatingQC", "Fence", "PoolQC", "PavedDrive", "GarageCond", "GarageQual", "GarageFinish"]
cat_feats = list(set(list(X_train.select_dtypes(include=["object"])))-set(ord_feats))

In [11]:
# directly pipelining for both categorical encodings

num_pipe = make_pipeline(
    SimpleImputer(
    strategy="mean")
    )


ord_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="NA"),
        OrdinalEncoder(
    categories=[["Po", "Fa", "TA", "Gd", "Ex"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "No", "Mn", "Av", "Gd"], ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"],
                ["NA", "Fa", "TA", "Gd", "Ex"], ["N", "P", "Y"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "RFn", "Fin"]
                ])
    )

cat_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="N_A"),
        OneHotEncoder(
    sparse_output=False, handle_unknown="ignore")
    )

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
    ("num_pipe", num_pipe, num_feats),
    ("ord_pipe", ord_pipe, ord_feats),
    ("cat_pipe", cat_pipe, cat_feats)
    ]
)

In [13]:
scaler = StandardScaler()
dtree = DecisionTreeClassifier(random_state=815)
full_pipe = make_pipeline(preprocessor, scaler, dtree).set_output(transform="pandas")

In [14]:
full_pipe.fit(X_train, y_train)

In [76]:
test_tree = full_pipe.predict(X_test)
accuracy_score(y_pred=test_tree, y_true=y_test)

0.910958904109589

## forest instead of tree

In [62]:
dforest = RandomForestClassifier()

In [63]:
forest_pipe = make_pipeline(preprocessor, scaler, dforest).set_output(transform="pandas")

In [64]:
forest_pipe.fit(X_train, y_train)

In [75]:
testing = forest_pipe.predict(X_test)
accuracy_score(y_pred=testing, y_true=y_test)

0.9417808219178082

In [8]:
ch_data = r"C:\Users\danie\Desktop\WBS.DataScience_Bootcamp\Week_9_supervised_ML\housing_Iowa\test-housing-classification.csv"
challenge_data = pd.read_csv(ch_data)

In [51]:
pred = full_pipe.predict(challenge_data)

In [52]:
draft = pd.DataFrame(pred)

In [53]:
draft["Id"] = range(len(draft))

In [54]:
draft["Expensive"] = draft.iloc[:, 0]

In [55]:
draft

Unnamed: 0,0,Id,Expensive
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0
...,...,...,...
1454,0,1454,0
1455,0,1455,0
1456,0,1456,0
1457,0,1457,0


In [56]:
solution = draft.iloc[:, 1:]

In [58]:
solution.set_index("Id")

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
1454,0
1455,0
1456,0
1457,0


In [60]:
solution.to_csv("firstsolution_Daniel.csv", index=False)

In [48]:
solution.Expensive.value_counts()

0    1245
1     214
Name: Expensive, dtype: int64

In [50]:
solution.to_csv("firstsolution_Daniel.csv", index=False)

In [65]:
pred_for = forest_pipe.predict(challenge_data)

In [66]:
draft_for = pd.DataFrame(pred_for)

In [67]:
draft_for

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
1454,0
1455,0
1456,0
1457,0


In [68]:
draft_for["Id"] = range(len(draft_for))

In [69]:
draft_for["Expensive"] = draft_for.iloc[:, 0]

In [70]:
solution_2 = draft_for.iloc[:, 1:]

In [71]:
solution_2.set_index("Id")

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
1454,0
1455,0
1456,0
1457,0


In [73]:
solution_2.Expensive.value_counts()

0    1274
1     185
Name: Expensive, dtype: int64

In [74]:
solution_2.to_csv("secondsolution_Daniel.csv", index=False)

## random and grid CV on forest pipeline

In [None]:
# directly pipelining for both categorical encodings

num_pipe = make_pipeline(
    SimpleImputer(
    strategy="mean")
    )


ord_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="NA"),
        OrdinalEncoder(
    categories=[["Po", "Fa", "TA", "Gd", "Ex"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "No", "Mn", "Av", "Gd"], ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"],
                ["NA", "Fa", "TA", "Gd", "Ex"], ["N", "P", "Y"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "RFn", "Fin"]
                ])
    )

cat_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="N_A"),
        OneHotEncoder(
    sparse_output=False, handle_unknown="ignore")
    )

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
    ("num_pipe", num_pipe, num_feats),
    ("ord_pipe", ord_pipe, ord_feats),
    ("cat_pipe", cat_pipe, cat_feats)
    ]
)

In [None]:
scaler = StandardScaler()
dforest = RandomForestClassifier()

In [91]:
params = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestclassifier__max_depth": range(2, 11, 2),
    "randomforestclassifier__min_samples_split": range(5, 60, 10),
    "randomforestclassifier__min_samples_leaf": range(5, 60, 10),
}

In [87]:
forest_pipe

In [92]:
rnd_CV = RandomizedSearchCV(forest_pipe, params, cv=5, n_iter=20, verbose=1)

In [94]:
rnd_CV.fit(X_test, y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [95]:
rnd_CV.best_params_

{'standardscaler__with_std': True,
 'standardscaler__with_mean': False,
 'randomforestclassifier__min_samples_split': 15,
 'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__max_depth': 8,
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean'}

In [96]:
rnd_CV.best_score_

0.9314436002337814

In [100]:
rnd_CV = RandomizedSearchCV(forest_pipe, params, cv=5, n_iter=100, verbose=1)
rnd_CV.fit(X_test, y_test)
print(rnd_CV.best_params_)
rnd_CV.best_score_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'standardscaler__with_std': False, 'standardscaler__with_mean': False, 'randomforestclassifier__min_samples_split': 15, 'randomforestclassifier__min_samples_leaf': 5, 'randomforestclassifier__max_depth': 10, 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}


0.9314436002337814

## trying to find some better values

In [105]:
params = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "standardscaler__with_mean": [True, False],
    "standardscaler__with_std": [True, False],
    "randomforestclassifier__max_depth": range(6, 11),
    "randomforestclassifier__min_samples_split": range(11, 21, 2),
    "randomforestclassifier__min_samples_leaf": range(1, 15, 2),
}

In [101]:
grid_CV = GridSearchCV(forest_pipe, params, cv=5, verbose=1)

In [102]:
rnd_CV.fit(X_test, y_test)

print(f"{rnd_CV.best_score_} best score")
rnd_CV.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0.9349503214494448 best score


{'standardscaler__with_std': False,
 'standardscaler__with_mean': True,
 'randomforestclassifier__min_samples_split': 15,
 'randomforestclassifier__min_samples_leaf': 5,
 'randomforestclassifier__max_depth': 6,
 'columntransformer__num_pipe__simpleimputer__strategy': 'median'}

In [106]:
grid_CV = GridSearchCV(forest_pipe, params, cv=5, verbose=1)

## too much compution to perform in time, giving a last try for the above params

In [108]:
# directly pipelining for both categorical encodings

num_pipe = make_pipeline(
    SimpleImputer(
    strategy="median")
    )


ord_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="NA"),
        OrdinalEncoder(
    categories=[["Po", "Fa", "TA", "Gd", "Ex"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "No", "Mn", "Av", "Gd"], ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
                ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"], ["Po", "Fa", "TA", "Gd", "Ex"], ["NA", "MnWw", "GdWo", "MnPrv", "GdPrv"],
                ["NA", "Fa", "TA", "Gd", "Ex"], ["N", "P", "Y"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"], ["NA", "Po", "Fa", "TA", "Gd", "Ex"],
                ["NA", "Unf", "RFn", "Fin"]
                ])
    )

cat_pipe = make_pipeline(
        SimpleImputer(
    strategy="constant", fill_value="N_A"),
        OneHotEncoder(
    sparse_output=False, handle_unknown="ignore")
    )

In [109]:
preprocessor = ColumnTransformer(
    transformers=[
    ("num_pipe", num_pipe, num_feats),
    ("ord_pipe", ord_pipe, ord_feats),
    ("cat_pipe", cat_pipe, cat_feats)
    ]
)

In [111]:
dforest = RandomForestClassifier(max_depth=7)

In [110]:
scaler = StandardScaler(with_std=False)

In [112]:
forest_final = make_pipeline(preprocessor, scaler, dforest).set_output(transform="pandas")

In [115]:
forest_final.fit(X_train, y_train)

In [116]:
final = forest_final.predict(challenge_data)

In [117]:
final_sol = pd.DataFrame(final)

In [118]:
final_sol["Id"] = range(len(final_sol))

In [119]:
final_sol["Expensive"] = final_sol.iloc[:, 0]

In [120]:
solution_3 = final_sol.iloc[:, 1:]

In [121]:
solution_3.set_index("Id")

Unnamed: 0_level_0,Expensive
Id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0
...,...
1454,0
1455,0
1456,0
1457,0


In [123]:
solution_3.Expensive.value_counts()

0    1291
1     168
Name: Expensive, dtype: int64

In [122]:
solution_3.to_csv("Final_try.csv", index=False)