# Comparison of the Different Algorithms

Author: Usman Tariq

> **About Dataset:**\
> We will use the `diamonds` dataset from the seaborn library.

> **Purpose:**
> + We will apply the following algorithms to the sample of above mentioned dataset:
>   01. Logistic Regression
>   02. Random Forest
>   03. Decision Tree
>   04. Gradient Boosting
>   05. Support Vector Machine
>   06. K-Nearest Neighbors
>   07. Extra Gradient Boosting
>   08. Ada Boost
>   09. Bagging
>   10. CatBoost
>   11. Light GBM
>   12. XGBoost
>   13. Naive Bayes
> + We will apply the `Hyperparameter Tuning` and select the best algorithm based on the following classification metrics:
>   + Accuracy Score
>   + Recall Score
>   + Precision Score
>   + F1 Score
> + We will save the best selected algorithm.
> + We will load the saved model and run it on a dummy input.

**Import Libraries**

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from time import time
import pickle

import warnings
warnings.filterwarnings("ignore")

# Set the maximum number of columns to display
pd.set_option('display.max_columns', None)

**Loading the dataset**

In [2]:
# loading the dataset
df_full = sns.load_dataset('diamonds')

**Taking the Sample**

In [3]:
# selecting the sample of 500 rows randomly.
df = df_full.sample(100, random_state=42)
df.sample(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
43219,0.4,Ideal,E,VVS1,62.6,56.0,1397,4.73,4.7,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,1304,4.92,4.89,2.98
43777,0.53,Good,H,IF,61.2,65.0,1440,5.16,5.27,3.19


**Data Overview**

**Applying the Different Models for Classification Problem**

In [4]:
%%time
# Assume 'cut' is the target variable
X = df.drop('cut', axis=1)
y = df['cut']

# Apply ordinal encoder to the target variable y.
y_order = [['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']]
ordinal_encoder = OrdinalEncoder(categories=y_order)
y_reshaped = y.values.reshape(-1, 1)
y_encoded = ordinal_encoder.fit_transform(y_reshaped)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define transformers for numerical and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['category', 'object']).columns

# Your manually defined orders for each categorical feature
custom_orders = {
    'color': ['D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
}

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder(categories=[custom_orders[feature] for feature in categorical_features]))
])

# Use ColumnTransformer to apply transformers to different feature types
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define models with hyperparameter grids for tuning
models = {
    'Logistic Regression': (LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200],
                                                 'max_depth': [None, 10, 20, 30],
                                                 'min_samples_split': [2, 5, 10],
                                                 'min_samples_leaf': [1, 2, 4]}),
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30],
                                                 'min_samples_split': [2, 5, 10],
                                                 'min_samples_leaf': [1, 2, 4]}),
    'Gradient Boosting': (GradientBoostingClassifier(), {'n_estimators': [50, 100, 200],
                                                         'learning_rate': [0.001, 0.01, 0.1, 1],
                                                         'max_depth': [3, 5, 7]}),
    'SVM': (SVC(), {'C': [0.001, 0.01, 0.1, 1, 10],
                                        'kernel': ['linear', 'rbf'],
                                        'gamma': ['scale', 'auto']}),
    'KNN': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7],
                                     'weights': ['uniform', 'distance'],
                                     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}),
    'Extra Gradient': (ExtraTreesClassifier(), {'n_estimators': [50, 100, 200],
                                                         'max_depth': [None, 10, 20, 30],
                                                         'min_samples_split': [2, 5, 10],
                                                         'min_samples_leaf': [1, 2, 4]}),
    'Ada Boost': (AdaBoostClassifier(), {'n_estimators': [50, 100, 200],
                                         'learning_rate': [0.001, 0.01, 0.1, 1]}),
    'Bagging': (BaggingClassifier(), {'n_estimators': [50, 100, 200],
                                       'max_samples': [1.0, 0.8, 0.6],
                                       'max_features': [1.0, 0.8, 0.6]}),
    'CatBoost': (CatBoostClassifier(verbose=False), {'n_estimators': [50, 100, 200],
                                         'learning_rate': [0.001, 0.01, 0.1, 1],
                                         'depth': [4, 6, 8, 10]}),
    'Light GBM': (LGBMClassifier(verbose=-1), {'n_estimators': [50, 100, 200],
                                     'learning_rate': [0.001, 0.01, 0.1, 1],
                                     'max_depth': [3, 5, 7]}),
    'XGBoost': (XGBClassifier(), {'n_estimators': [50, 100, 200],
                                   'learning_rate': [0.001, 0.01, 0.1, 1],
                                   'max_depth': [3, 5, 7],
                                   'gamma': [0, 0.1, 0.2, 0.3]}),
    # 'Naive Bayes': (GaussianNB(), {})
}

# Create a DataFrame to store results
results_table = pd.DataFrame(columns=['Model', 'Mean Accuracy', 'Mean Precision', 'Mean Recall', 'Mean F1'])

# Evaluate models and select the best based on multiple metrics
best_model = None
best_composite_score = 0
best_model_params = None

for model_name, (model, original_param_grid) in models.items():
    start_time = time()  # Record the start time
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', model)])

    # Include classifier name in the parameter grid for GridSearchCV
    param_grid = {f'classifier__{key}': value for key, value in original_param_grid.items()}

    # Define a composite scoring function (you can customize weights based on your preferences)
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision_weighted',
        'recall': 'recall_weighted',
        'f1': 'f1_weighted'
    }

    # Use cross-validation with the composite scoring function
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=scoring, refit='f1', return_train_score=False)
    grid_search.fit(X, y_encoded)

    end_time = time()
    runtime = end_time - start_time

    # Convert run_time to minutes and seconds
    minutes, seconds = divmod(runtime, 60)
    formatted_time = "{:0>2}:{:05.2f}".format(int(minutes), seconds)

    # Get the best model and its performance on multiple metrics
    best_model_name = model_name
    best_model_params = grid_search.best_params_
    results = grid_search.cv_results_

    # print("Well Performed Hyperparameters of",model_name,":\n",grid_search.best_params_)

    # Calculate mean scores for each fold
    mean_accuracy = results["mean_test_accuracy"].mean()
    mean_precision = results["mean_test_precision"].mean()
    mean_recall = results["mean_test_recall"].mean()
    mean_f1 = results["mean_test_f1"].mean()

    # Instead of results_table = results_table.append(...)
    results_table = pd.concat([results_table, pd.DataFrame([{'Model': model_name,
                                                          'Mean Accuracy': mean_accuracy,
                                                          'Mean Precision': mean_precision,
                                                          'Mean Recall': mean_recall,
                                                          'Mean F1': mean_f1,
                                                          'Runtime': formatted_time}])], ignore_index=True)

    # Update the best model if the current model has a better composite score
    if mean_f1 > best_composite_score:
        best_model = best_model_name
        best_composite_score = mean_f1
        best_model_params = grid_search.best_params_

# Display the results table
print('\nResults of Applied Models:')
print(results_table.to_string(index=False, line_width=1000))
# print(pd.DataFrame(results_table))

print('------------------------------------------')
print(f'\nBest Selected Model based on F1 Score: {best_model}')
print(f'Best Hyperparameters: {best_model_params}')



# Save the best model using pickle
best_model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', models[best_model][0])])

# print(models[best_model][0])

best_model_pipeline.fit(X, y_encoded)  # Fit on the entire dataset
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model_pipeline, model_file)


Results of Applied Models:
              Model  Mean Accuracy  Mean Precision  Mean Recall  Mean F1  Runtime
Logistic Regression       0.558328        0.463136     0.558328 0.482010 00:00.41
      Random Forest       0.644605        0.542626     0.644605 0.579116 00:49.66
      Decision Tree       0.646275        0.639274     0.646275 0.627872 00:01.68
  Gradient Boosting       0.637148        0.566944     0.637148 0.575885 01:15.35
                SVM       0.528981        0.389014     0.528981 0.424227 00:00.88
                KNN       0.526391        0.462862     0.526391 0.484489 00:01.37
     Extra Gradient       0.600091        0.485314     0.600091 0.524105 00:36.12
          Ada Boost       0.635769        0.492585     0.635769 0.546737 00:06.94
            Bagging       0.671277        0.588779     0.671277 0.619194 00:18.07
           CatBoost       0.657655        0.532860     0.657655 0.577128 01:49.80
          Light GBM       0.609205        0.492773     0.609205 0.5257

In [5]:
# Load the best model from the saved file (for testing purpose)
with open('best_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

def prediction(input_data):
    prediction = loaded_model.predict(input_data)
    if prediction == 0:
        print('Fair')
    elif prediction == 1:
        print('Good')
    elif prediction == 2:
        print('Very Good')
    elif prediction == 3:
        print('Premium')
    elif prediction == 4:
        print('Ideal')
    else:
        print('Unknown')

In [6]:
# Create a dummy dataframe for the input variables.
dummy_inputs_1 = pd.DataFrame({
    'carat': [0.8],
    'color': ['G'],
    'clarity': ['SI1'],
    'depth': [40.5],
    'table': [33.0],
    'price': [4401],
    'x': [4.88],
    'y': [4.66],
    'z': [2.9]
})
prediction(dummy_inputs_1)

Ideal


In [7]:
# Create another dummy dataframe for the input variables.
dummy_inputs_2 = pd.DataFrame({
    'carat': [1.2],
    'color': ['D'],
    'clarity': ['VS2'],
    'depth': [60.5],
    'table': [53.1],
    'price': [5401],
    'x': [3.88],
    'y': [3.66],
    'z': [2.9]
})
prediction(dummy_inputs_2)

Ideal
