In [1]:
import pandas as pd
from sklearn import set_config
set_config(transform_output='pandas')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# Load the dataset: turn the Google Drive share link into a direct-download
# link (the file id is the second-to-last path segment of the share URL).
url = "https://drive.google.com/file/d/1okGWVCx3Zh4tWVVx1wuy0ZLLTqWJrZ4M/view?usp=sharing"
file_id = url.split('/')[-2]
path = f'https://drive.google.com/uc?export=download&id={file_id}'
data = pd.read_csv(path)

# Separate the target column "Expensive" from the features; `data` itself
# is left untouched.
y = data["Expensive"]
X = data.drop(columns="Expensive")

# Hold out 20% of the rows for testing; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV

In [7]:
# Partition the feature column names by dtype: everything pandas considers
# numeric goes to X_num_columns, every remaining column to X_cat_columns.
X_num_columns = X.select_dtypes(include="number").columns
X_cat_columns = X.select_dtypes(exclude="number").columns

In [38]:
# Ordinal features and their category order, kept together in one mapping so a
# column can never drift out of step with its scale.
# Each list is in the order OrdinalEncoder should rank it (position 0 is
# encoded as 0); presumably worst -> best -- confirm against the data dictionary.
# 'N/A' is the placeholder the categorical imputer writes for missing values.
ordinal_scales = {
    'ExterQual':    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'ExterCond':    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtQual':     ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtCond':     ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['N/A', 'No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['N/A', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'KitchenQual':  ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'FireplaceQu':  ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
}

# Parallel lists in the shape the encoder / ColumnTransformer expect
# (dicts preserve insertion order, so both lists line up index by index).
ordinal_cols = list(ordinal_scales)
ordinal_categories = list(ordinal_scales.values())


# Create the OrdinalEncoder with the custom category order.
#
# The categorical imputer replaces every missing value with 'N/A', yet the
# ExterQual, ExterCond and KitchenQual scales contain no 'N/A' level. With the
# default handle_unknown='error' a single missing (or otherwise unlisted) value
# in one of those columns would abort fit/predict. Encode such values as -1
# instead -- a code outside every scale's 0..n-1 range -- mirroring the
# handle_unknown='ignore' tolerance of the one-hot encoder.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

In [39]:
# Nominal categorical columns = every categorical column that is not ordinal.
# Built with an order-preserving comprehension rather than a set difference:
# set iteration order for strings changes between interpreter runs, which would
# shuffle the one-hot feature order and make the fitted model non-reproducible.
onehot_cat = [col for col in X_cat_columns if col not in ordinal_cols]

# Dense output (required for pandas output); categories unseen during fit are
# encoded as all-zero rows instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [40]:
# Route each categorical column to its encoder: ordinal columns get the
# rank-preserving OrdinalEncoder, the remaining ones are one-hot encoded.
encoder = ColumnTransformer([
    ('ordinal', ordinal_encoder, ordinal_cols),
    ('onehot', onehot_encoder, onehot_cat),
])

In [41]:
# Per-dtype preprocessing pipelines.

# Numeric features: impute missing values with SimpleImputer's default strategy.
numeric_pipe = Pipeline(steps=[
    ('num_imputer', SimpleImputer()),
])

# Categorical features: replace missing values with the literal 'N/A', then
# hand the completed columns to the ordinal / one-hot encoder.
categoric_pipe = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('encoder', encoder),
])

In [42]:
from sklearn.compose import make_column_selector
# Preprocessing: send numeric columns through numeric_pipe and categorical
# columns through categoric_pipe.
# The categorical branch is selected with X_cat_columns -- the same column set
# from which the inner encoder's ordinal / one-hot column lists were derived --
# so every column the encoder asks for is guaranteed to be routed to it.
# (A dtype-based selector restricted to `object` could omit non-numeric,
# non-object columns that the encoder still expects.)
preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, X_num_columns),
    ('cat_pipe', categoric_pipe, X_cat_columns),
])

In [43]:
# Final pipeline: preprocessing followed by a decision-tree classifier.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, ties between equally good
# splits can resolve differently and the scores change from run to run.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=123)),
])

In [44]:
# Bare expression: the notebook renders the pipeline's representation as the cell output.
full_pipeline

In [45]:
# Fit preprocessing and classifier together on the training split only,
# so nothing is learned from the held-out test rows.
full_pipeline.fit(X_train, y_train)

In [46]:
# Predict with the fitted pipeline on both splits: pred_tr for the training
# rows, pred_te for the held-out test rows.
pred_tr= full_pipeline.predict(X_train)
pred_te= full_pipeline.predict(X_test)

In [47]:
# Accuracy on the training split (the data the pipeline was fitted on).
accuracy_score(y_train, pred_tr)

1.0

In [48]:
# Accuracy on the held-out test split (the 20% of rows set aside by train_test_split).
accuracy_score(y_test, pred_te)

0.9143835616438356