In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer


In [8]:
# Load the training data from a shared Google Drive file.
url = "https://drive.google.com/file/d/1J7R6UANev5xB4tNTcAIhB95EEShj_2zi/view?usp=sharing"
# The Drive file id is the second-to-last segment of the share link.
path = 'https://drive.google.com/uc?export=download&id=' + url.split('/')[-2]
data = pd.read_csv(path)

# Split the frame into the target column ("Expensive") and the feature columns.
y = data["Expensive"].copy()
X = data.drop(columns="Expensive")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)

In [9]:
# The classifier lives in sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier

# Split the feature names by dtype: every non-numeric column is treated as categorical.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Categorical columns that are encoded as ordered levels instead of one-hot
ordinal_cols = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'KitchenQual',
    'FireplaceQu'
]

# Level order for each ordinal column, lowest first; the i-th list belongs to
# the i-th entry of `ordinal_cols`. 'N/A' is the fill value used by the
# categorical imputer below. For columns whose list has no 'N/A', an imputed
# value is not a known category and is encoded as `unknown_value` (-1).
ordinal_categories = [
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterCond
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtCond
    ['N/A', 'No', 'Mn', 'Av', 'Gd'],               # BsmtExposure
    ['N/A', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], # BsmtFinType1
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # KitchenQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']          # FireplaceQu
]

# One-hot columns = all categorical columns that are not ordinal.
# A comprehension (rather than a set difference) keeps the original column
# order, so the encoded feature layout is the same on every run.
_ordinal_lookup = set(ordinal_cols)
onehot_cat = [col for col in X_cat_columns if col not in _ordinal_lookup]

# Encoders
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1
)
# Wrapped in a Pipeline so the step is addressable as
# "...__encoder__onehot__onehot__<param>" in set_params / parameter grids.
onehot_encoder = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combined encoder: ordinal columns and one-hot columns are encoded side by side
encoder = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_encoder, ordinal_cols),
        ('onehot', onehot_encoder, onehot_cat)
    ]
)

# Numeric branch: fill missing values with KNNImputer (default settings)
numeric_pipe = Pipeline([
    ('num_imputer', KNNImputer())
])

# Categorical branch: fill missing values with the constant 'N/A', then encode
categoric_pipe = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('encoder', encoder)
])

# Full preprocessor: route numeric and categorical columns to their branches
preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, X_num_columns),
    ('cat_pipe', categoric_pipe, X_cat_columns)
])

# Final pipeline: preprocessing followed by a random forest.
# The seed matches the one used for the train/test split so that repeated
# fits of the same configuration give the same model.
full_pipeline_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=12345))
])

In [10]:
# Hyperparameter grid explored by the grid search. Keys are pipeline parameter
# paths of the form "<step>__<param>".
rf_param_grid = {
    # Preprocessing options, currently excluded from the search:
    #"preprocessor__num_pipe__num_imputer__weights": ["uniform", "distance"],
    #"preprocessor__cat_pipe__cat_imputer__strategy": ["constant", "most_frequent"],
    #"preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown": ["ignore", "infrequent_if_exist"],
    "classifier__n_estimators": [100, 200, 500],  # Number of trees
    "classifier__max_depth": [None, 10, 20, 30],  # Tree depth
    "classifier__min_samples_split": [2, 5, 10],  # Min samples to split an internal node
    "classifier__min_samples_leaf": [1, 2, 4],  # Min samples at a leaf node
    "classifier__max_features": ['sqrt', 'log2', None],  # Number of features to consider when splitting
    "classifier__bootstrap": [True, False]  # Whether bootstrap samples are used
}


# Exhaustive 5-fold cross-validated search over every combination in the grid.
# n_jobs=-1 spreads the candidate fits over all available CPU cores; the grid
# is large (648 combinations x 5 folds), so running them serially is slow.
rf_search = GridSearchCV(
    full_pipeline_2,
    rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

rf_search.fit(X_train, y_train)

# Best-scoring parameter combination found by the search
best_param = rf_search.best_params_

best_param


Fitting 5 folds for each of 648 candidates, totalling 3240 fits


{'classifier__bootstrap': False,
 'classifier__max_depth': 20,
 'classifier__max_features': 'sqrt',
 'classifier__min_samples_leaf': 1,
 'classifier__min_samples_split': 10,
 'classifier__n_estimators': 200}

In [12]:
# Settings for the final model, keyed by their pipeline parameter path.
# NOTE(review): n_estimators=500 differs from the best_params_ output recorded
# in this notebook (200) — confirm this override is intentional.
final_params = {
    "preprocessor__cat_pipe__cat_imputer__strategy": 'constant',
    "preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown": 'infrequent_if_exist',
    "preprocessor__num_pipe__num_imputer__n_neighbors": 3,
    "preprocessor__num_pipe__num_imputer__weights": 'uniform',
    "classifier__n_estimators": 500,
    "classifier__max_depth": 20,
    "classifier__min_samples_split": 10,
    "classifier__min_samples_leaf": 1,
    "classifier__max_features": 'sqrt',
    "classifier__bootstrap": False,
}
full_pipeline_2.set_params(**final_params)

# Refit the pipeline on the training split with these settings
full_pipeline_2.fit(X_train, y_train)

In [19]:
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)

# Scoring helper shared by all models
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The fitted estimator or pipeline to evaluate.
    X_test : features passed unchanged to ``model.predict``.
    y_test : true labels the predictions are compared with.
        Specificity is computed as recall with ``pos_label=0``, so the
        negative class is presumably labelled 0 — confirm against the data.

    Returns
    -------
    dict
        Metric name -> value, with the keys "Accuracy", "Recall",
        "Precision", "Specificity", "F1 Score", "Balanced Accuracy" and
        "Cohen's Kappa".
    """
    predictions = model.predict(X_test)

    # (label, metric function, extra keyword arguments) in output order
    metric_table = (
        ("Accuracy", accuracy_score, {}),
        ("Recall", recall_score, {}),
        ("Precision", precision_score, {}),
        ("Specificity", recall_score, {"pos_label": 0}),
        ("F1 Score", f1_score, {}),
        ("Balanced Accuracy", balanced_accuracy_score, {}),
        ("Cohen's Kappa", cohen_kappa_score, {}),
    )

    return {
        label: metric(y_test, predictions, **extra)
        for label, metric, extra in metric_table
    }

# Results table: one row per evaluated model, one column per metric
model_scores_df = pd.DataFrame(
    columns=[
        "Model",
        "Accuracy",
        "Recall",
        "Precision",
        "Specificity",
        "F1 Score",
        "Balanced Accuracy",
        "Cohen's Kappa",
    ]
)

# Score the fitted random-forest pipeline on the held-out test split and tag
# the result with the model name
dt_scores = {
    **evaluate_model(full_pipeline_2, X_test, y_test),
    "Model": "RandomForest",
}

# Append the scores as the next row, ordered to match the table's columns
next_row = len(model_scores_df)
model_scores_df.loc[next_row] = pd.Series(dt_scores, index=model_scores_df.columns)

# Show the results table
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,RandomForest,0.952055,0.71875,0.821429,0.980769,0.766667,0.84976,0.740081


In [14]:
# Accuracy on the training split (the data the pipeline was fitted on)
pred_tr = full_pipeline_2.predict(X_train)
accuracy_score(y_true=y_train, y_pred=pred_tr)

0.9991438356164384

In [15]:
# Accuracy on the held-out test split
pred_tes = full_pipeline_2.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_tes)

0.952054794520548

# Predict on the leaderboard test data

In [16]:
# Leaderboard test set, downloaded from a shared Google Drive file
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=drive_link"
# The Drive file id is the second-to-last segment of the share link
drive_file_id = url.split('/')[-2]
path = 'https://drive.google.com/uc?export=download&id=' + drive_file_id
testing_data = pd.read_csv(path)

In [17]:
# Predict the 'Expensive' label for the leaderboard rows with full_pipeline_2.
#
# The feature frame is rebuilt first so that this cell can be re-run safely:
# after one run `testing_data` is indexed by 'Id' and already carries an
# 'Expensive' column, which would otherwise make `set_index('Id')` fail.
if testing_data.index.name == 'Id':
    test_features = testing_data.reset_index()
else:
    test_features = testing_data
test_features = test_features.drop(columns='Expensive', errors='ignore')

# Attach the predictions and index the result by 'Id'
testing_data = test_features.assign(
    Expensive=full_pipeline_2.predict(test_features)
).set_index('Id')

In [18]:
# Export the column 'Expensive' along with the index to create a submission file
testing_data['Expensive'].to_csv('./submission.csv')

# Colab only: trigger a browser download of the file. Outside Colab the
# google.colab package cannot be imported, so the download is skipped and the
# file stays on disk instead of the cell raising ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; submission saved to ./submission.csv")
else:
    files.download('./submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>