# Summary of this file

This file presents a comparison of various machine learning algorithms fitted to different combinations of modalities for two main purposes:

1. To observe the performance differences between these models.
2. To examine how the performance varies within the same models when using different modalities.

NOTE:
For categorical features, most algorithms cannot handle them directly, so we use one-hot encoding.

Results:
1. CatBoost and LGBM consistently outperform other models; therefore, our focus should be on these two models.
2. Including more modalities definitely leads to improved performance.

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import classification_report, accuracy_score
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
from functools import partial
import scipy as sp
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import KFold
from joblib import dump
import re

import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
import catboost
import optuna
import joblib
import shap
import pickle

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import cohen_kappa_score

import interpretableai
from julia.api import Julia
jl = Julia(compiled_modules=False)
import os
os.environ['JULIA_NUM_THREADS'] = '50'
from interpretableai import iai

  from .autonotebook import tqdm as notebook_tqdm
0d4cb87c61af7fe404c7211bf0d7d87358e244e9407b26c78823e4e4d8c5b40d


In [2]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0
    def _kappa_loss(self, coef, X, y):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return -cohen_kappa_score(y, preds, weights='quadratic')
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
    def predict(self, X, coef):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds
    def coefficients(self):
        return self.coef_['x']

In [3]:
# df = pd.read_csv(f'petfinder-adoption-prediction/train/train.csv')
# X = df.drop(columns=['AdoptionSpeed'])
# y = df['AdoptionSpeed']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# df['iftrain'] = 0
# df.loc[df.index.isin(X_train.index), 'iftrain'] = 1

In [4]:
# df.to_csv(f'petfinder-adoption-prediction/train/train_split.csv')
df = pd.read_csv(f'petfinder-adoption-prediction/train/train_split.csv')
df_test = pd.read_csv(f'petfinder-adoption-prediction/test/test.csv')

In [5]:
def PreProcess(
    df_in, bert=True, beit=True, breed=True, txt=True, meta=True, senti=True, newcols=True,
    pca_bert=200, pca_breed=200
):
    # get normal files
    df = df_in.copy(deep=True) 
    name = 'train'
    # extra modals
    if beit:
        beit_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/beit_emb.csv')
        beit_emb = beit_emb.drop(columns=['Description', 'PhotoAmt'])
        df = pd.merge(df, beit_emb, on=['PetID'], how='left')
    if bert and pca_bert > 0: 
        bert_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/bert_pca200_ALL.csv') # 100 enough
        bert_emb = bert_pca200_ALL[['PetID'] + [f'bert_pc_{i}' for i in range(1, pca_bert + 1)]] # PetID
        df = pd.merge(df, bert_emb, on=['PetID'], how='left')
    if breed and pca_breed > 0: 
        breed_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/breed_pca200_ALL.csv') # 100 enough
        breed_emb = breed_pca200_ALL[['BreedID'] + [f'breed_pc_{i}' for i in range(1, pca_breed + 1)]] # BreedID
        df = pd.merge(df, breed_emb, left_on=['Breed1'], right_on=['BreedID'], how='left')
        df = df.drop(columns=['BreedID'])  
    if txt: 
        txt_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/txt_emb.csv').drop(columns=['Description', 'PhotoAmt'])
        df = pd.merge(df, txt_emb, on=['PetID'], how='left')
    if meta: 
        metadata_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/metadata_gr.csv')
        df = pd.merge(df, metadata_gr, on=['PetID'], how='left')
    if senti: 
        sentiment_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/sentiment_gr.csv')
        df = pd.merge(df, sentiment_gr, on=['PetID'], how='left')
    if newcols: 
        new_cols_ALL = pd.read_csv('petfinder-adoption-prediction/train/new_cols_ALL.csv') # PetID
        df = pd.merge(df, new_cols_ALL, on=['PetID'], how='left')
    # catagorical one-hot encoding
    non_numeric_columns = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'State']
    df_encoded = pd.get_dummies(df, columns=non_numeric_columns)
    # make sure feature names are valid
    df_encoded = df_encoded.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
    df_encoded = df_encoded.fillna(0)
    # split into train & test
    train_df, test_df = df_encoded[df_encoded['iftrain'] == 1], df_encoded[df_encoded['iftrain'] == 0]
    to_drop = ['AdoptionSpeed', 'Name', 'Description', 'PetID', 'RescuerID', 'iftrain']
    X_train = train_df.drop(columns=to_drop)
    Y_train = train_df['AdoptionSpeed']
    X_test = test_df.drop(columns=to_drop)
    Y_test = test_df['AdoptionSpeed']
    # return results
    return X_train, X_test, Y_train, Y_test

In [6]:
import pandas as pd

def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 50,
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

## basic

In [None]:
# basic
bert=False
beit=False
breed=False
txt=False
meta=False
senti=False
newcols=False
pca_bert=0
pca_breed=0
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [26]:
# basic + bert
bert=True
beit=False
breed=False
txt=False
meta=False
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.19982954140910492. time: 1643.6413788795471


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.19983


In [27]:
# basic + bert + breed
bert=True
beit=False
breed=True
txt=False
meta=False
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.19742180817340227. time: 2139.486210823059


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.197422


In [28]:
# basic + beit
bert=False
beit=True
breed=False
txt=False
meta=False
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20600579821938503. time: 4824.678575992584


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.206006


In [29]:
# basic + senti
bert=False
beit=False
breed=False
txt=False
meta=False
senti=True
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20415741648256702. time: 480.3777301311493


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.204157


In [30]:
# basic + meta
bert=False
beit=False
breed=False
txt=False
meta=True
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.2764025815755743. time: 548.5626170635223


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.276403


In [31]:
# basic + meta + senti
bert=False
beit=False
breed=False
txt=False
meta=True
senti=True
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20742332391595697. time: 551.2858290672302


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.207423


In [32]:
# basic + bert + breed + beit
bert=True
beit=True
breed=True
txt=False
meta=False
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.17177912031573017. time: 7162.766335964203


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.171779


In [33]:
# basic + txt
bert=False
beit=False
breed=False
txt=True
meta=False
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20342051122188232. time: 544.4982483386993


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.203421


In [None]:
# basic + meta + senti + txt
bert=False
beit=False
breed=False
txt=True
meta=True
senti=True
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [35]:
# basic + bert + breed + beit + meta + senti + txt
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.17616033261465025. time: 7577.685313940048


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.17616


In [36]:
# basic + meta + txt
bert=False
beit=False
breed=False
txt=True
meta=True
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20816440612540765. time: 623.7845799922943


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.208164


In [37]:
# basic + bert + breed + beit + meta + txt
bert=True
beit=True
breed=True
txt=True
meta=True
senti=False
newcols=False
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.1752955261135698. time: 13343.6002099514


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.175296


## advanced (always have newcols)

In [7]:
# advanced !!!!!!!!
bert=False
beit=False
breed=False
txt=False
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

└ 0d4cb87c61af7fe404c7211bf0d7d87358e244e9407b26c78823e4e4d8c5b40d


loss: 0.30570092321598985. time: 1720.2823100090027


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.305701


In [8]:
# advanced + bert ！！！！！！！！！
bert=True
beit=False
breed=False
txt=False
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.3175706275999882. time: 1635.547772884369


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.317571


In [40]:
# advanced + bert + breed
bert=True
beit=False
breed=True
txt=False
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.19350778897010423. time: 3690.416400909424


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.193508


In [9]:
# advanced + beit ！！！！！！！！！
bert=False
beit=True
breed=False
txt=False
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.18634916127076617. time: 2675.458956003189


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.186349


In [42]:
# advanced + senti
bert=False
beit=False
breed=False
txt=False
meta=False
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.2874469721905445. time: 1861.8705520629883


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.287447


In [43]:
# advanced + meta
bert=False
beit=False
breed=False
txt=False
meta=True
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20881856624490702. time: 1908.7015919685364


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.208819


In [44]:
# advanced + meta + senti
bert=False
beit=False
breed=False
txt=False
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20773012572891625. time: 2083.023488998413


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.20773


In [45]:
# advanced + bert + breed + beit
bert=True
beit=True
breed=True
txt=False
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.19023140664391314. time: 8504.269695997238


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.190231


In [46]:
# advanced + txt
bert=False
beit=False
breed=False
txt=True
meta=False
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20758817861471224. time: 563.1778111457825


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.207588


In [8]:
# advanced + meta + senti + txt !!!!!!!!
bert=False
beit=False
breed=False
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.28826791118926465. time: 1883.9095659255981


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.288268


In [None]:
def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 20,
                'regression_features': {'All'},
                'ls_num_tree_restarts': 20
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

# advanced + bert + breed + beit + meta + senti + txt (200) !!!!!!!!
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [None]:
def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 30,
                'regression_features': {'All'},
                'ls_num_tree_restarts': 20
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

# advanced + bert + breed + beit + meta + senti + txt (200) !!!!!!!!
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [None]:
def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 5,
                'hyperplane_config': {'sparsity': 'all'},
                'regression_features': {'All'},
                'ls_num_tree_restarts': 1
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

# advanced + bert + breed + beit + meta + senti + txt (200) !!!!!!!!
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [None]:
def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 10,
                'hyperplane_config': {'sparsity': 'all'},
                'regression_features': {'All'},
                'ls_num_tree_restarts': 1
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

# advanced + bert + breed + beit + meta + senti + txt (200) !!!!!!!!
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [None]:
def quick_check_regression(X_train, X_test, y_train, y_test):
    grid = iai.GridSearch(
        iai.OptimalTreeRegressor(
            **{
                'max_depth': 20,
                'hyperplane_config': {'sparsity': 'all'},
                'regression_features': {'All'},
                'ls_num_tree_restarts': 1
            }),
    )
    start_time = time.time()
    grid.fit(X_train, y_train)
    preds = grid.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(grid.predict(X_test), coefficients)
    kappa_score = cohen_kappa_score(y_test, pred_y, weights='quadratic')
    end_time = time.time()
    print(f'loss: {kappa_score}. time: {end_time - start_time}')
    # Initialize an empty DataFrame to store results
    results_df = pd.DataFrame(columns=["Model", "Cohen's Kappa Score"])
    # print(f"Testing Cohen's Kappa Score : {kappa_score}")
    result = pd.DataFrame([["ORT", kappa_score]], columns=["Model", "Cohen's Kappa Score"])
    results_df = pd.concat([results_df, result], ignore_index=True)

    return results_df

# advanced + bert + breed + beit + meta + senti + txt (200) !!!!!!!!
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

In [49]:
# advanced + meta + txt
bert=False
beit=False
breed=False
txt=True
meta=True
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.1941070765159696. time: 648.813099861145


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.194107


In [50]:
# advanced + bert + breed + beit + meta + txt
bert=True
beit=True
breed=True
txt=True
meta=True
senti=False
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.268976198894165. time: 13509.181233882904


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.268976


In [51]:
# advanced + bert + breed + beit + meta + senti + txt (100)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=100
pca_breed=100
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.24424646810958184. time: 5964.544634103775


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.244246


In [52]:
# advanced + bert + beit + meta + senti + txt (200)
bert=True
beit=True
breed=False
txt=True
meta=True
senti=True
newcols=True
pca_bert=200
pca_breed=200
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.2648328599161782. time: 33904.699218034744


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.264833


In [53]:
# advanced + bert + breed + beit + meta + senti + txt (150)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=150
pca_breed=150
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.18647585197472616. time: 6439.1790890693665


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.186476


In [54]:
# advanced + bert + breed + beit + meta + senti + txt (160)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=160
pca_breed=160
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.19474986272958816. time: 18711.66781115532


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.19475


In [55]:
# advanced + bert + breed + beit + meta + senti + txt (170)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=170
pca_breed=170
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.17702903333803843. time: 6405.00651216507


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.177029


In [56]:
# advanced + bert + breed + beit + meta + senti + txt (180)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=180
pca_breed=180
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.20487495582014825. time: 7319.160205125809


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.204875


In [57]:
# advanced + bert + breed + beit + meta + senti + txt (190)
bert=True
beit=True
breed=True
txt=True
meta=True
senti=True
newcols=True
pca_bert=190
pca_breed=190
X_train, X_test, y_train, y_test = PreProcess(
    df, bert, beit, breed, txt, meta, senti, newcols, pca_bert, pca_breed
)
result = quick_check_regression(X_train, X_test, y_train, y_test)
result

loss: 0.2610649118540852. time: 6973.7124700546265


Unnamed: 0,Model,Cohen's Kappa Score
0,ORT,0.261065
