In [1]:
import pandas as pd
from sklearn import set_config
set_config(transform_output='pandas')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# reading
url = "https://drive.google.com/file/d/1J7R6UANev5xB4tNTcAIhB95EEShj_2zi/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

# X and y creation
X = data.copy()
y = X.pop("Expensive")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

In [7]:
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

#1.&nbsp; Without Ordinal encoder


In [8]:
# Make numeric and categoric pipeline
numeric_pipe= Pipeline([('num_imputer', SimpleImputer())])
categoric_pipe= Pipeline([('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')), ('os_encoder', OneHotEncoder(sparse_output=False,
    handle_unknown='ignore'))])

In [9]:
# Preprocessing
preprocessor= ColumnTransformer([('num_pipe', numeric_pipe, X_num_columns), ('cat_pipe', categoric_pipe, X_cat_columns)])

In [10]:
# Final Pipeline
full_pipeline_1= Pipeline([('preprocessor', preprocessor), ('classifier', DecisionTreeClassifier())])

In [11]:
# parameter grid
param_grid = {
    "preprocessor__num_pipe__num_imputer__strategy":["mean", "median"],
    "preprocessor__cat_pipe__cat_imputer__strategy":["constant", "most_frequent"],
    "preprocessor__cat_pipe__os_encoder__handle_unknown":["infrequent_if_exist", "error", "ignore"],
    "classifier__max_depth": range(2, 14, 2),
    "classifier__min_samples_leaf": range(3, 12, 2),
    "classifier__criterion": ["gini","entropy"]
}

# define GridSearchCV
search = GridSearchCV(full_pipeline_1,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

best_param = search.best_params_

best_param

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/joblib/parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 1531, in _transform_one
    res = transformer.transform(X, **params.transform)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipe

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 12,
 'classifier__min_samples_leaf': 7,
 'preprocessor__cat_pipe__cat_imputer__strategy': 'most_frequent',
 'preprocessor__cat_pipe__os_encoder__handle_unknown': 'infrequent_if_exist',
 'preprocessor__num_pipe__num_imputer__strategy': 'mean'}

## Checking

In [15]:
full_pipeline_1.fit(X_train, y_train)

In [16]:
pred_train = full_pipeline_1.predict(X_train)
accuracy_score(y_train, pred_train)

1.0

In [17]:
pred_test = full_pipeline_1.predict(X_test)
accuracy_score(y_test, pred_test)

0.928082191780822

## Updating pipeline

In [28]:
# Update parameters of the pipeline using set_params
full_pipeline_1.set_params(preprocessor__num_pipe__num_imputer__strategy='mean',
                         preprocessor__cat_pipe__cat_imputer__strategy='most_frequent',
                         preprocessor__cat_pipe__os_encoder__handle_unknown= 'infrequent_if_exist',
                         classifier__max_depth=12,
                         classifier__min_samples_leaf=7,
                         classifier__criterion='entropy')

# Fit the pipeline with updated parameters
full_pipeline_1.fit(X_train, y_train)

In [29]:
#Accuracy on training set
pred_train= full_pipeline_1.predict(X_train)
accuracy_score(y_train, pred_train)

0.9717465753424658

In [30]:
#Accuracy on Test set
pred_test= full_pipeline_1.predict(X_test)
accuracy_score(y_test, pred_test)

0.9212328767123288

#2.&nbsp; With Ordinal encoder



In [12]:
from sklearn.preprocessing import OrdinalEncoder

# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Define your ordinal columns
ordinal_cols = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'KitchenQual',
    'FireplaceQu'
]

ordinal_categories = [
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterCond
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtCond
    ['N/A', 'No', 'Mn', 'Av', 'Gd'],               # BsmtExposure
    ['N/A', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], # BsmtFinType1
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # KitchenQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']          # FireplaceQu
]


# Create the OrdinalEncoder with the custom categories
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

# One-hot columns = all categorical - ordinal
onehot_cat = list(set(X_cat_columns) - set(ordinal_cols))

# Encoders
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
onehot_encoder = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combined encoder
encoder = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_encoder, ordinal_cols),
        ('onehot', onehot_encoder, onehot_cat)
    ]
)

# Pipelines
numeric_pipe = Pipeline([
    ('num_imputer', SimpleImputer())
])

categoric_pipe = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('encoder', encoder)
])

# Full preprocessor
preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, X_num_columns),
    ('cat_pipe', categoric_pipe, X_cat_columns)
])

# Final pipeline
full_pipeline_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

In [13]:
param_grid = {
    "preprocessor__num_pipe__num_imputer__strategy": ["mean", "median"],
    "preprocessor__cat_pipe__cat_imputer__strategy": ["constant", "most_frequent"],
    "preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown": ["ignore", "infrequent_if_exist"],
    "classifier__max_depth": range(2, 14),
    "classifier__min_samples_leaf": range(3, 12, 2),
    "classifier__criterion": ["gini", "entropy"]
}


# define GridSearchCV
search = GridSearchCV(full_pipeline_2,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

best_param = search.best_params_

best_param

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


{'classifier__criterion': 'entropy',
 'classifier__max_depth': 10,
 'classifier__min_samples_leaf': 7,
 'preprocessor__cat_pipe__cat_imputer__strategy': 'most_frequent',
 'preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown': 'infrequent_if_exist',
 'preprocessor__num_pipe__num_imputer__strategy': 'median'}

#Updating pipeline

In [31]:
# Update parameters of the pipeline using set_params
full_pipeline_2.set_params(preprocessor__num_pipe__num_imputer__strategy='median',
    preprocessor__cat_pipe__cat_imputer__strategy="most_frequent",
    preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown="infrequent_if_exist",
    classifier__max_depth= 10,
    classifier__min_samples_leaf=7,
    classifier__criterion="entropy")

# Fit the pipeline with updated parameters
full_pipeline_2.fit(X_train, y_train)

In [32]:
pred_tr= full_pipeline_2.predict(X_train)
accuracy_score(y_train, pred_tr)

0.9717465753424658

In [33]:
pred_tes= full_pipeline_2.predict(X_test)
accuracy_score(y_test, pred_tes)

0.9212328767123288

# Test on testing data from LeaderBoard



In [39]:
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=drive_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
testing_data = pd.read_csv(path)

In [40]:
# Now you can predict using the full_pipeline_2
testing_data['Expensive'] = full_pipeline_2.predict(testing_data)

In [41]:
testing_data = testing_data.set_index('Id')

In [42]:
# Export the column 'Expensive' along with the index to create a submission file
testing_data['Expensive'].to_csv('./submission.csv')

In [43]:
# Colab only
from google.colab import files
files.download('./submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

got 95% accuracy on Leaderboard.