1. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
#sklearn:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingClassifier

housing_classification = pd.read_csv(r'cleaned-housing-classification.csv')
test_housing_classification = pd.read_csv(r'cleaned-test-housing-classification.csv')

2. Split

In [2]:
X_train, X_test, y_train, y_test = train_test_split(housing_classification.drop('Expensive', axis= 1),
                                                    housing_classification['Expensive'],
                                                    test_size=0.2,
                                                    random_state=1337)

3. Pipeline

In [3]:
numeric_pipe = make_pipeline(
    KNNImputer(n_neighbors=5, missing_values=np.nan)
)
 
categoric_pipe1 = make_pipeline(
    SimpleImputer(strategy="constant", fill_value='0'),
    OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=6)
)

categoric_pipe2 = make_pipeline(
    SimpleImputer(strategy="constant", fill_value='0'),
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value= np.nan)
)

filtered_columns = X_train.columns[~X_train.columns.isin(['MSZoning', 'Neighborhood', 'Condition1', 'Condition2', 'CentralAir', 'MiscFeature'])]
X_train_filtered = X_train[filtered_columns]
X_test_filtered = X_test[filtered_columns]

preprocessor = make_column_transformer(
        (numeric_pipe, make_column_selector(dtype_include='number')),
        (categoric_pipe1, make_column_selector(pattern= "MSZoning|Neighborhood|Condition1|Condition2|CentralAir|MiscFeature")),
        (categoric_pipe2, make_column_selector(dtype_include=object))
)

scaler = QuantileTransformer(n_quantiles= 25)

full_pipeline = make_pipeline(preprocessor, 
                              scaler,
                              HistGradientBoostingClassifier(),
                              memory=None)

full_pipeline.fit(X_train_filtered, y_train)

In [4]:
# full_pipeline.named_steps['columntransformer'].transform(X_train)

In [5]:
full_pipeline.predict(X_train)

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [6]:
y_train_predict = full_pipeline.predict(X_train)
accuracy_score(y_train, y_train_predict)

1.0

In [7]:
# Test
y_test_predict = full_pipeline.predict(X_test)
accuracy_score(y_test, y_test_predict)

0.958904109589041

In [8]:
param_grid = {
            #'columntransformer__numeric__knnimputer__n_neighbors': [10, 25, 40],
            'quantiletransformer__n_quantiles': [2],
            'histgradientboostingclassifier__max_depth': [6],
            'histgradientboostingclassifier__learning_rate': [0.25, 0.3, 0.35],
            }

grid_search = GridSearchCV(
    full_pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

grid_search.fit(X_train_filtered, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters found:  {'histgradientboostingclassifier__learning_rate': 0.35, 'histgradientboostingclassifier__max_depth': 6, 'quantiletransformer__n_quantiles': 2}
Best cross-validation score:  0.9554858589193351


In [9]:
grid_search.best_estimator_.fit(X_train_filtered, y_train)

In [10]:
predictions = grid_search.best_estimator_.predict(test_housing_classification)

In [11]:
id_column = test_housing_classification.pop('Id')

In [13]:
results = pd.DataFrame({'Id':id_column,'Expensive':predictions})
results.to_csv('test2.csv',index=False)
