In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn import set_config
set_config(transform_output='pandas')
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# reading
url = "https://drive.google.com/file/d/1J7R6UANev5xB4tNTcAIhB95EEShj_2zi/view?usp=sharing"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)

# X and y creation
X = data.copy()
y = X.pop("Expensive")

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

In [3]:
# Import from sklearn.linear_model:
from sklearn.linear_model import LogisticRegression

# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Define your ordinal columns
ordinal_cols = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'KitchenQual',
    'FireplaceQu'
]

ordinal_categories = [
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # ExterCond
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],         # BsmtCond
    ['N/A', 'No', 'Mn', 'Av', 'Gd'],               # BsmtExposure
    ['N/A', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], # BsmtFinType1
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],                # KitchenQual
    ['N/A', 'Po', 'Fa', 'TA', 'Gd', 'Ex']          # FireplaceQu
]


# Create the OrdinalEncoder with the custom categories
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)

# One-hot columns = all categorical - ordinal
onehot_cat = list(set(X_cat_columns) - set(ordinal_cols))

# Encoders
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories,
     handle_unknown='use_encoded_value',
     unknown_value=-1)
onehot_encoder = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combined encoder
encoder = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_encoder, ordinal_cols),
        ('onehot', onehot_encoder, onehot_cat)
    ]
)

# Pipelines
numeric_pipe = Pipeline([
    ('num_scalar', MinMaxScaler()),
    ('num_imputer', KNNImputer()) #n_neighbour= 3
])

categoric_pipe = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('encoder', encoder)
])

# Full preprocessor
preprocessor = ColumnTransformer([
    ('num_pipe', numeric_pipe, X_num_columns),
    ('cat_pipe', categoric_pipe, X_cat_columns)
])

# Final pipeline
full_pipeline_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

C = 1 / λ (λ is the regularization strength)

Smaller C → Stronger regularization (simpler model, may underfit)

Larger C → Weaker regularization (more flexible model, may overfit)

| C Value | Interpretation                         |
| ------- | -------------------------------------- |
| `0.01`  | Very strong regularization             |
| `0.1`   | Strong regularization                  |
| `1`     | Default setting, balanced              |
| `10`    | Weak regularization                    |
| `100`   | Very weak regularization (almost none) |
it helps find the right balance between bias and variance.



In [None]:
# Define the hyperparameter grid to be searched by the grid search
lr_param_grid = {
    "preprocessor__num_pipe__num_imputer__weights": ["uniform", "distance"],
    "preprocessor__cat_pipe__cat_imputer__strategy": ["constant", "most_frequent"],
    "preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown": ["ignore", "infrequent_if_exist"],
    "classifier__penalty": ['l1', 'l2', 'elasticnet', 'none'],
    "classifier__C": [0.01, 0.1, 1, 10, 100],
    "classifier__solver": ['saga'], #'saga' is the only solver that supports all penalties (l1, l2, elasticnet, none)
    "classifier__l1_ratio": [0, 0.5, 1], #Only relevant when using penalty='elasticnet'; 0 = pure L2, 1 = pure L1.
    "classifier__max_iter": [100, 200, 500] #To avoid convergence issues
}


# Run a grid search to find the optimal combination of hyperparameters
lr_search = GridSearchCV(
    full_pipeline_2,
    lr_param_grid,
    cv=5,
    verbose=1
)

lr_search.fit(X_train, y_train)

best_param = lr_search.best_params_

best_param

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1800 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1800 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/

{'classifier__C': 1,
 'classifier__l1_ratio': 0.5,
 'classifier__max_iter': 500,
 'classifier__penalty': 'elasticnet',
 'classifier__solver': 'saga',
 'preprocessor__cat_pipe__cat_imputer__strategy': 'constant',
 'preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown': 'ignore',
 'preprocessor__num_pipe__num_imputer__weights': 'uniform'}

In [4]:
# Update parameters of the pipeline using set_params
full_pipeline_2.set_params(preprocessor__cat_pipe__cat_imputer__strategy='constant',
                         preprocessor__cat_pipe__encoder__onehot__onehot__handle_unknown= 'ignore',
                          preprocessor__num_pipe__num_imputer__n_neighbors=3,
                         preprocessor__num_pipe__num_imputer__weights='uniform',
                         classifier__penalty= 'elasticnet',
                         classifier__C= 1,
                         classifier__solver= 'saga',
                         classifier__l1_ratio=0.5,
                         classifier__max_iter= 500 )

# Fit the pipeline with updated parameters
full_pipeline_2.fit(X_train, y_train)



In [5]:
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
    cohen_kappa_score
)

# Function to get the scores for our model(s)
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    scores = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Specificity": recall_score(y_test, y_pred, pos_label=0),
        "F1 Score": f1_score(y_test, y_pred),
        "Balanced Accuracy": balanced_accuracy_score(y_test, y_pred),
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred)
    }
    return scores

# Create an empty DataFrame to store model evaluation results
model_scores_df = pd.DataFrame(columns=[
    "Model", "Accuracy", "Recall", "Precision",
    "Specificity", "F1 Score", "Balanced Accuracy", "Cohen's Kappa"
])

# Evaluate the Decision Tree model
dt_scores = evaluate_model(full_pipeline_2, X_test, y_test)
dt_scores["Model"] = "Logistic_Regression"

# Convert the dictionary to a Series matching the DataFrame columns, then assign as a new row
model_scores_df.loc[len(model_scores_df)] = pd.Series(dt_scores, index=model_scores_df.columns)

# Display the DataFrame
model_scores_df

Unnamed: 0,Model,Accuracy,Recall,Precision,Specificity,F1 Score,Balanced Accuracy,Cohen's Kappa
0,Logistic_Regression,0.94863,0.71875,0.793103,0.976923,0.754098,0.847837,0.725495


In [40]:
pred_tr= full_pipeline_2.predict(X_train)
accuracy_score(y_train, pred_tr)

0.9743150684931506

In [45]:
pred_tes= full_pipeline_2.predict(X_test)
accuracy_score(y_test, pred_tes)

0.9691780821917808

# Test on testing data from LeaderBoard

In [41]:
url = "https://drive.google.com/file/d/15PfmTxmavQCT-f7iY9tgwWxm9t4GRees/view?usp=drive_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
testing_data = pd.read_csv(path)

In [42]:
# Now you can predict using the full_pipeline_2
testing_data['Expensive'] = full_pipeline_2.predict(testing_data)

testing_data = testing_data.set_index('Id')

In [43]:
# Export the column 'Expensive' along with the index to create a submission file
testing_data['Expensive'].to_csv('./submission.csv')

# Colab only
from google.colab import files
files.download('./submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

got 96.8% accuracy.
