In [None]:
# libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.simplefilter('ignore')
from tabulate import tabulate

# visualisation
import seaborn as sns

# sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

### Load Data

In [None]:
# Read the pistachio CSV from the Kaggle input directory and lower-case
# every column name in the same expression.
CSV = '/kaggle/input/pistachio-types-detection/pistachio.csv'
df = pd.read_csv(CSV).rename(columns=str.lower)

# Swap the textual 'class' column for the integer codes produced by a
# LabelEncoder; `le` stays in scope so the codes can be mapped back
# with `le.inverse_transform` if needed.
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# Preview the first rows of the prepared frame.
df.head()

### Data Split

In [None]:
# Split the frame into features and target. The target is taken to be the
# last column of `df`; every column before it is treated as a feature.
x = df.iloc[:, :-1]
# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect a 1-D `y` and otherwise emit a
# DataConversionWarning on every fit (hidden here by the global warning filter).
y = df.iloc[:, -1]

# training & validation split
size = 0.1
# A fixed seed keeps the stratified split (and therefore every score reported
# further down) reproducible across "Restart & Run All".
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=size, stratify=y, random_state=42
)

# view
print(f"Training Size: {x_train.shape[0]}\nValidation Size: {x_val.shape[0]}")

In [None]:
# utility function to get a dynamic pipeline
def create_pipeline(classifier, numeric_features=None):
    """Wrap `classifier` in a pipeline that standard-scales the features first.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator used as the final ``'classifier'`` step.
    numeric_features : sequence of column labels, optional
        Columns sent through the ``StandardScaler``. When omitted, every
        column of the notebook-level ``df`` except the last one is used,
        which is the behaviour this helper always had.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and
        ``'classifier'``.
    """
    # Fall back to the global frame only when the caller gives no columns,
    # so the helper can also be reused with other feature subsets.
    if numeric_features is None:
        numeric_features = df.columns[:-1]

    # pre processing pipeline
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    # Only the listed columns reach the classifier (ColumnTransformer's
    # documented default is remainder='drop').
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features)]
    )

    # classifier pipeline
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', classifier)])
    return clf


# utility function to calculate metrics
def get_metrics(clf_instance):
    """Print accuracy, precision, recall and F1 for the validation split.

    Parameters
    ----------
    clf_instance : fitted estimator or pipeline
        Object exposing ``predict``; it is scored against the notebook-level
        ``x_val`` / ``y_val``.

    Returns
    -------
    None
        The scores are printed as a table; nothing is returned.
    """
    predictions = clf_instance.predict(x_val)

    # One entry per table row: (label, display format, scoring function).
    # Accuracy is shown as a percentage, the other scores with two decimals.
    report_spec = (
        ("Accuracy Score", '{:.1%}', accuracy_score),
        ("Precision Score", '{:.2f}', precision_score),
        ("Recall Score", '{:.2f}', recall_score),
        ("f1 Score", '{:.2f}', f1_score),
    )
    rows = [
        [label, fmt.format(score_fn(y_val, predictions))]
        for label, fmt, score_fn in report_spec
    ]

    print(tabulate(rows, headers=["Metrics", "Score"], tablefmt='outline'))
    

### MLP Classifier

In [None]:
# Hyper-parameters for the multi-layer perceptron: two hidden layers of ten
# tanh units, trained with constant-rate SGD without Nesterov momentum.
params = dict(
    hidden_layer_sizes=[10, 10],
    activation='tanh',
    solver='sgd',
    alpha=0.001,
    batch_size=20,
    random_state=0,
    tol=0.0001,
    nesterovs_momentum=False,
    learning_rate='constant',
    learning_rate_init=0.01,
    max_iter=500,
    shuffle=True,
    n_iter_no_change=50,
    verbose=False,
)

# Build the scaled pipeline around the MLP and train it in one step
# (Pipeline.fit returns the fitted pipeline itself).
mlp = create_pipeline(MLPClassifier(**params)).fit(x_train, y_train)

# Report the validation metrics.
get_metrics(mlp)

### AdaBoost

In [None]:
# Weak learner: a depth-1 decision tree (a "stump") that considers only a
# 0.06 fraction of the features at each split.
base_estim = DecisionTreeClassifier(max_depth=1, max_features=0.06)

# build adaboost
# The weak learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot is the same in both.
ab = create_pipeline(AdaBoostClassifier(base_estim,
                        n_estimators=500,
                        learning_rate=0.5,
                        random_state=42))

# train the model
ab.fit(x_train, y_train)

# model evaluation
get_metrics(ab)

### Gradient Boost

In [None]:
# Stochastic gradient boosting wrapped in the scaling pipeline and trained in
# one step (Pipeline.fit returns the fitted pipeline itself).
gbm = create_pipeline(
    GradientBoostingClassifier(
        # up to 2000 stages, each on 67% of the rows and a 0.06 fraction of the features
        n_estimators=2000,
        subsample=0.67,
        max_features=0.06,
        # early stopping: hold out 5% of the training data and stop after
        # 15 stages without improvement
        validation_fraction=0.05,
        n_iter_no_change=15,
        random_state=42,
        verbose=0,
    )
).fit(x_train, y_train)

# Report the validation metrics.
get_metrics(gbm)

### XGBoost

In [None]:
# build XGB Classifier
# Two keyword names were corrected so that XGBoost actually applies them:
#   * `colsample_level` -> `colsample_bylevel` (column subsampling per tree level)
#   * `verbose`         -> `verbosity`         (0 = silent)
# The misspelled names are not XGBoost parameters and were ignored, so the
# model was trained without any column subsampling.
xgb = create_pipeline(XGBClassifier(n_estimators=2000,
                    tree_method='hist',
                    subsample=0.67,
                    colsample_bylevel=0.06,
                    verbosity=0,
                    n_jobs=6,
                    random_state=42))

# train the model
xgb.fit(x_train, y_train)

# model evaluation
get_metrics(xgb)

### CatBoost

In [None]:
# build CatBoost Classifier
# `max_leaves` is documented by CatBoost as usable only with the Lossguide
# growing policy, so the policy is set explicitly instead of relying on the
# default (SymmetricTree), under which the leaf limit cannot take effect.
# NOTE(review): confirm the installed CatBoost version accepts this combination.
cb = create_pipeline(CatBoostClassifier(n_estimators=2000,
                        colsample_bylevel=0.06,
                        grow_policy='Lossguide',
                        max_leaves=31,
                        subsample=0.67,
                        verbose=0,
                        thread_count=3,
                        random_state=42))

# train the model
cb.fit(x_train, y_train)

# model evaluation
get_metrics(cb)

### Histogram-based Gradient Boosting

In [None]:
# build HGBM Classifier
# `early_stopping=True` is required for `validation_fraction` and
# `n_iter_no_change` to have any effect on small data sets: the default
# ('auto') only enables early stopping above 10,000 training samples, and
# otherwise all `max_iter` boosting iterations are run.
hgbm = create_pipeline(HistGradientBoostingClassifier(max_iter=2000,
                                      early_stopping=True,
                                      validation_fraction=0.05,
                                      n_iter_no_change=15,
                                      verbose=0,
                                      random_state=42))

# train the model
hgbm.fit(x_train, y_train)

# model evaluation
get_metrics(hgbm)