In [1]:
import numpy as np 
import pandas as pd 
import warnings
import os
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMRegressor, log_evaluation, early_stopping
from sklearn.metrics import classification_report, accuracy_score
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
from functools import partial
import scipy as sp
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import KFold
from joblib import dump
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import interpretableai
from julia.api import Julia
jl = Julia(compiled_modules=False)
# interpretableai.install_julia()
# interpretableai.install_system_image()
import os
os.environ['JULIA_NUM_THREADS'] = '50'
from interpretableai import iai
# iai.add_julia_processes(20)



In [14]:
def feature_importance(model, train_data):
    feature_importances = model.booster_.feature_importance(importance_type='gain')
    feature_importance_df = pd.DataFrame({'Feature': train_data.columns, 'Importance': feature_importances})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    return feature_importance_df

def feature_importance_group(model, train_data):
    feature_importances = model.booster_.feature_importance(importance_type='gain')
    feature_names = train_data.columns
    # Create a DataFrame for feature importances
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    # Group by the feature name pattern and sum their importances
    # Strip the digits using a regular expression to group
    feature_importance_df['Feature_Group'] = feature_importance_df['Feature'].str.replace(r'\d+', '', regex=True)
    grouped_importance = feature_importance_df.groupby('Feature_Group', as_index=False)['Importance'].sum()
    # Sort by the summed importance
    grouped_importance = grouped_importance.sort_values(by='Importance', ascending=False)
    return grouped_importance

def plot_feature_importance(grouped_importance_df):
    plt.barh(grouped_importance_df['Feature_Group'], grouped_importance_df['Importance'], color='skyblue')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Group')
    plt.title('Feature Importance by Group')
    plt.gca().invert_yaxis()  # To display the highest importance at the top
    plt.show()

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0
    def _kappa_loss(self, coef, X, y):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return -cohen_kappa_score(y, preds, weights='quadratic')
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
    def predict(self, X, coef):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds
    def coefficients(self):
        return self.coef_['x']

def PreProcessTrain(
    df, bert_emb, breed_emb, beit_emb, new_cols_ALL,
    bert=True, beit=True, breed=True, txt=True, meta=True, senti=True, newcols=True
):
    name = 'train'
    # df = pd.read_csv(f'petfinder-adoption-prediction/{name}/{name}.csv')
    txt_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/txt_emb.csv').drop(columns=['Description', 'PhotoAmt'])
    metadata_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/metadata_gr.csv')
    sentiment_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/sentiment_gr.csv')
    
    if beit:
        beit_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/beit_emb.csv')
        beit_emb = beit_emb.drop(columns=['Description', 'PhotoAmt'])
    df = pd.merge(df, beit_emb, on=['PetID'], how='left')
    
    if bert: df = pd.merge(df, bert_emb, on=['PetID'], how='left')
    if breed: 
        df = pd.merge(df, breed_emb, left_on=['Breed1'], right_on=['BreedID'], how='left')
        df = df.drop(columns=['BreedID'])    
    if txt: df = pd.merge(df, txt_emb, on=['PetID'], how='left')
    if meta: df = pd.merge(df, metadata_gr, on=['PetID'], how='left')
    if senti: df = pd.merge(df, sentiment_gr, on=['PetID'], how='left')
    if newcols: df = pd.merge(df, new_cols_ALL, on=['PetID'], how='left')
    
    X = df.drop(columns=['AdoptionSpeed', 'Name', 'Description', 'PetID', 'RescuerID'])  # Features
    Y = df['AdoptionSpeed']
    
    # non_numeric_columns = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3']
    # for col in non_numeric_columns: X[col] = X[col].astype('category')
    
    return X, Y

def PreProcessTest(
    df, bert_emb, breed_emb, beit_emb, new_cols_ALL,
    bert=True, beit=True, breed=True, txt=True, meta=True, senti=True, newcols=True
):
    name = 'test'
    # df = pd.read_csv(f'petfinder-adoption-prediction/{name}/{name}.csv')
    txt_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/txt_emb.csv').drop(columns=['Description', 'PhotoAmt'])
    metadata_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/metadata_gr.csv')
    sentiment_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/sentiment_gr.csv')
    
    if beit:
        beit_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/beit_emb.csv')
        beit_emb = beit_emb.drop(columns=['Description', 'PhotoAmt'])
    df = pd.merge(df, beit_emb, on=['PetID'], how='left')
    
    if bert: df = pd.merge(df, bert_emb, on=['PetID'], how='left')
    if breed: 
        df = pd.merge(df, breed_emb, left_on=['Breed1'], right_on=['BreedID'], how='left')
        df = df.drop(columns=['BreedID'])
    if txt: df = pd.merge(df, txt_emb, on=['PetID'], how='left')
    if meta: df = pd.merge(df, metadata_gr, on=['PetID'], how='left')
    if senti: df = pd.merge(df, sentiment_gr, on=['PetID'], how='left')
    if newcols: df = pd.merge(df, new_cols_ALL, on=['PetID'], how='left')
    
    X = df.drop(columns=['Name', 'Description', 'PetID', 'RescuerID'])
    
    # non_numeric_columns = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3']
    # for col in non_numeric_columns: X[col] = X[col].astype('category')
    
    return X

def TrainLGBMReg(params, X_train, y_train):
    categorical_features = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3']
    X_train[categorical_features] = X_train[categorical_features].astype('category')
    params['metric'] = 'rmse'
    params['subsample'] = None
    params['subsample_freq'] = None
    model = LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
        categorical_feature=categorical_features
    )
    preds = model.predict(X_train)
    optR = OptimizedRounder()
    optR.fit(preds, y_train.values)
    coefficients = optR.coefficients()
    pred_y = optR.predict(preds, coefficients)
    print(cohen_kappa_score(y_train, pred_y, weights='quadratic'))
    return model, optR, coefficients

def TestResultDF_LGBMReg(model, optR, coef, X_test, name):
    preds = model.predict(X_test) 
    pred_y = optR.predict(preds, coef)
    test_result = pd.DataFrame({'PetID': test_data['PetID'], 'AdoptionSpeed': pred_y})
    print(test_result)
    test_result.to_csv(name, index=False)

def TRAINMODEL(
    bert=True, beit=True, breed=True, txt=True, meta=True, senti=True, newcols=True,
    pca_bert=200, pca_breed=200, pca_beit=500,
    params=None,
    save_address=None,
):
    if not params or not save_address: return 
    
    # read files 
    bert_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/bert_pca200_ALL.csv') # 100 enough
    breed_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/breed_pca200_ALL.csv') # 100 enough
    beit_pca768_ALL = pd.read_csv('petfinder-adoption-prediction/train/beit_pca768_ALL.csv') # 500 enough?
    bert_emb = bert_pca200_ALL[['PetID'] + [f'bert_pc_{i}' for i in range(1, pca_bert + 1)]] # PetID
    breed_emb = breed_pca200_ALL[['BreedID'] + [f'breed_pc_{i}' for i in range(1, pca_breed + 1)]] # BreedID
    beit_emb = beit_pca768_ALL[['PetID'] + [f'beit_pc_{i}' for i in range(1, pca_beit + 1)]] # PetID
    new_cols_ALL = pd.read_csv('petfinder-adoption-prediction/train/new_cols_ALL.csv') # PetID
    
    # parse inpput
    test_data = pd.read_csv(f'petfinder-adoption-prediction/test/test.csv')
    X_train, y_train = PreProcessTrain(
        bert_emb, breed_emb, beit_emb, new_cols_ALL,
        bert, beit, breed, txt, meta, senti, newcols
    )
    X_test = PreProcessTest(
        bert_emb, breed_emb, beit_emb, new_cols_ALL,
        bert, beit, breed, txt, meta, senti, newcols
    )
    
    # train model
    cat, optR, coefficients = TrainLGBMReg(params, X_train, y_train)
    
    # evaluate model
    imp = feature_importance_group(cat, X_train)
    plot_feature_importance(imp)
    
    # save result
    TestResultDF_LGBMReg(cat, optR, coefficients, X_test, f'Results/{save_address}.csv')
    
    return cat, optR, coefficients, save_address

def OptunaTune(X_train_in, Y_train_in, initial_params=None, nt=100, nj=4):
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    X_train, X_valid, y_train, y_valid = train_test_split(X_train_in, Y_train_in, test_size=0.2, random_state=42, stratify=Y_train_in)

    def objective(trial):
        if initial_params and trial.number == 0:
            param = initial_params
            print(f"using initial param for trial {trial.number}")
        else:            
            param = {
                'max_depth': trial.suggest_int('max_depth', 15, 80),
                'minbucket': trial.suggest_int('minbucket', 1, 20),
                'ls_num_tree_restarts': trial.suggest_int('ls_num_tree_restarts', 15, 100),
                'regression_lambda': trial.suggest_float('regression_lambda', 0.01, 0.1),
            }    

        grid = iai.GridSearch(
            iai.OptimalTreeRegressor(
                **param, 
            ),
        )
        start_time = time.time()
        grid.fit(X_train, y_train)
        preds = grid.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds, y_valid.values)
        coefficients = optR.coefficients()
        pred_valid = optR.predict(preds, coefficients)

        loss = cohen_kappa_score(y_valid, pred_valid, weights='quadratic')
        end_time = time.time()
        print(f'loss: {loss}. params: {param}. time: {end_time - start_time}')
        return loss

    # Create a study object and optimize the objective function.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=nt, n_jobs=nj, show_progress_bar=True)

    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
    return study.best_trial.params

categorical_features = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3']

In [11]:
bert=True
beit=False 
breed=True
txt=True
meta=False
senti=False
newcols=True
pca_bert=5
pca_breed=5
pca_beit=5

# read files 
bert_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/bert_pca200_ALL.csv') # 100 enough
breed_pca200_ALL = pd.read_csv('petfinder-adoption-prediction/train/breed_pca200_ALL.csv') # 100 enough
beit_pca768_ALL = pd.read_csv('petfinder-adoption-prediction/train/beit_pca768_ALL.csv') # 500 enough?
bert_emb = bert_pca200_ALL[['PetID'] + [f'bert_pc_{i}' for i in range(1, pca_bert + 1)]] # PetID
breed_emb = breed_pca200_ALL[['BreedID'] + [f'breed_pc_{i}' for i in range(1, pca_breed + 1)]] # BreedID
beit_emb = beit_pca768_ALL[['PetID'] + [f'beit_pc_{i}' for i in range(1, pca_beit + 1)]] # PetID
new_cols_ALL = pd.read_csv('petfinder-adoption-prediction/train/new_cols_ALL.csv') # PetID

# train
name = 'train'
df_train = pd.read_csv(f'petfinder-adoption-prediction/{name}/{name}.csv')
txt_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/txt_emb.csv').drop(columns=['Description', 'PhotoAmt'])
metadata_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/metadata_gr.csv')
sentiment_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/sentiment_gr.csv')
if beit:
    beit_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/beit_emb.csv')
    beit_emb = beit_emb.drop(columns=['Description', 'PhotoAmt'])
df_train = pd.merge(df_train, beit_emb, on=['PetID'], how='left')
if bert: df_train = pd.merge(df_train, bert_emb, on=['PetID'], how='left')
if breed: 
    df_train = pd.merge(df_train, breed_emb, left_on=['Breed1'], right_on=['BreedID'], how='left')
    df_train = df_train.drop(columns=['BreedID'])    
if txt: df_train = pd.merge(df_train, txt_emb, on=['PetID'], how='left')
if meta: df_train = pd.merge(df_train, metadata_gr, on=['PetID'], how='left')
if senti: df_train = pd.merge(df_train, sentiment_gr, on=['PetID'], how='left')
if newcols: df_train = pd.merge(df_train, new_cols_ALL, on=['PetID'], how='left')
    
# test
name = 'test'
df_test = pd.read_csv(f'petfinder-adoption-prediction/{name}/{name}.csv')
txt_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/txt_emb.csv').drop(columns=['Description', 'PhotoAmt'])
metadata_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/metadata_gr.csv')
sentiment_gr = pd.read_csv(f'petfinder-adoption-prediction/{name}/sentiment_gr.csv')
if beit:
    beit_emb = pd.read_csv(f'petfinder-adoption-prediction/{name}/beit_emb.csv')
    beit_emb = beit_emb.drop(columns=['Description', 'PhotoAmt'])
df_test = pd.merge(df_test, beit_emb, on=['PetID'], how='left')
if bert: df_test = pd.merge(df_test, bert_emb, on=['PetID'], how='left')
if breed: 
    df_test = pd.merge(df_test, breed_emb, left_on=['Breed1'], right_on=['BreedID'], how='left')
    df_test = df_test.drop(columns=['BreedID'])    
if txt: df_test = pd.merge(df_test, txt_emb, on=['PetID'], how='left')
if meta: df_test = pd.merge(df_test, metadata_gr, on=['PetID'], how='left')
if senti: df_test = pd.merge(df_test, sentiment_gr, on=['PetID'], how='left')
if newcols: df_test = pd.merge(df_test, new_cols_ALL, on=['PetID'], how='left')

# get dummies
# categorical_features = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3']
categorical_features = ['Type', 'Gender', 'Color1', 'Color2', 'Color3']

df_test['AdoptionSpeed'] = -1
df_main = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_main = pd.get_dummies(df_main, columns=categorical_features)
df_train = df_main[df_main['AdoptionSpeed'] > -1]
df_test = df_main[df_main['AdoptionSpeed'] == -1]

# get dataset
X_train = df_train.drop(columns=['AdoptionSpeed', 'Name', 'Description', 'PetID', 'RescuerID', 'Breed1', 'Breed2'])  # Features
y_train = df_train['AdoptionSpeed']
X_test = df_test.drop(columns=['AdoptionSpeed', 'Name', 'Description', 'PetID', 'RescuerID', 'Breed1', 'Breed2'])  # Features
Y_test = df_test['AdoptionSpeed']

# # parse inpput
test_data = pd.read_csv(f'petfinder-adoption-prediction/test/test.csv')
# X_train, y_train = PreProcessTrain(
#     df_train, bert_emb, breed_emb, beit_emb, new_cols_ALL,
#     bert, beit, breed, txt, meta, senti, newcols
# )
# X_test = PreProcessTest(
#     df_test, bert_emb, breed_emb, beit_emb, new_cols_ALL,
#     bert, beit, breed, txt, meta, senti, newcols
# )

In [12]:
X_train = X_train.fillna(0)
X_train

Unnamed: 0,Age,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,...,Color2_4,Color2_5,Color2_6,Color2_7,Color3_0,Color3_3,Color3_4,Color3_5,Color3_6,Color3_7
0,3,1,1,2,2,2,1,1,100,41326,...,False,False,False,True,True,False,False,False,False,False
1,1,2,2,3,3,3,1,1,0,41401,...,False,False,False,False,True,False,False,False,False,False
2,1,2,2,1,1,2,1,1,0,41326,...,False,False,False,True,True,False,False,False,False,False
3,4,2,1,1,1,2,1,1,150,41401,...,False,False,False,False,True,False,False,False,False,False
4,1,2,1,2,2,2,1,1,0,41326,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14988,2,2,2,2,2,2,1,4,0,41326,...,False,False,False,False,True,False,False,False,False,False
14989,60,2,2,1,1,1,1,2,0,41326,...,True,False,False,False,False,False,False,False,False,True
14990,2,3,2,2,1,3,1,5,30,41326,...,False,False,True,False,False,False,False,False,False,True
14991,9,1,1,1,1,1,1,1,0,41336,...,False,False,False,True,True,False,False,False,False,False


In [16]:
# params = OptunaTune(X_train, y_train, nt=50, nj=1)

In [18]:
X_train_, X_valid_, y_train_, y_valid_ = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

for max_depth in [10, 20, 40, 80]:
    for minbucket in [1, 2, 4, 8]:
        for regression_lambda in [0.01, 0.05, 0.1]:
            param = {
                'max_depth': max_depth,
                'minbucket': minbucket,
                'regression_lambda': regression_lambda,
                'ls_num_tree_restarts': 50
            } 
            grid = iai.GridSearch(
                iai.OptimalTreeRegressor(
                    **param, 
                ),
            )
            start_time = time.time()
            grid.fit(X_train_, y_train_)
            preds = grid.predict(X_valid_)
            
            optR = OptimizedRounder()
            optR.fit(preds, y_valid_.values)
            coefficients = optR.coefficients()
            pred_valid = optR.predict(preds, coefficients)
            loss = cohen_kappa_score(y_valid_, pred_valid, weights='quadratic')
            end_time = time.time()
            print(f'loss: {loss}. params: {param}. time: {end_time - start_time}')

loss: 0.18049322719122451. params: {'max_depth': 10, 'minbucket': 1, 'regression_lambda': 0.01, 'ls_num_tree_restarts': 50}. time: 993.5479788780212
loss: 0.2658158741488822. params: {'max_depth': 10, 'minbucket': 1, 'regression_lambda': 0.05, 'ls_num_tree_restarts': 50}. time: 144.70159792900085
loss: 0.17839541580284923. params: {'max_depth': 10, 'minbucket': 1, 'regression_lambda': 0.1, 'ls_num_tree_restarts': 50}. time: 140.23096919059753
loss: 0.18217813824053397. params: {'max_depth': 10, 'minbucket': 2, 'regression_lambda': 0.01, 'ls_num_tree_restarts': 50}. time: 138.55543994903564
loss: 0.20404581667639743. params: {'max_depth': 10, 'minbucket': 2, 'regression_lambda': 0.05, 'ls_num_tree_restarts': 50}. time: 145.32408690452576
loss: 0.25547979825996414. params: {'max_depth': 10, 'minbucket': 2, 'regression_lambda': 0.1, 'ls_num_tree_restarts': 50}. time: 132.74078178405762
loss: 0.19902588061314352. params: {'max_depth': 10, 'minbucket': 4, 'regression_lambda': 0.01, 'ls_num_

In [9]:
param = {
    'max_depth': 10,
    'minbucket': 2,
    'regression_lambda': 0.01,
    'ls_num_tree_restarts': 10
}

grid = iai.GridSearch(
    iai.OptimalTreeRegressor(
        **param, 
    ),
)
start_time = time.time()
grid.fit(X_train, y_train)
end_time = time.time()
print(f"The code took {end_time - start_time} seconds to run.")

The code took 750.0688607692719 seconds to run.




In [11]:
preds = grid.predict(X_train)
optR = OptimizedRounder()
optR.fit(preds, y_train.values)
coefficients = optR.coefficients()
pred_y = optR.predict(preds, coefficients)
print(cohen_kappa_score(y_train, pred_y, weights='quadratic'))

0.22465400763783672


In [13]:
test_data = pd.read_csv(f'petfinder-adoption-prediction/test/test.csv')
TestResultDF_LGBMReg(grid, optR, coefficients, X_test, f'Results/octnotune.csv')

          PetID AdoptionSpeed
0     e2dfc2935             3
1     f153b465f             2
2     3c90f3f54             2
3     e02abc8a3             3
4     09f0df7d1             3
...         ...           ...
3967  ae57f8d52             3
3968  83432904d             3
3969  399013029             3
3970  fd80b8c80             3
3971  493ed84ae             3

[3972 rows x 2 columns]
