In [None]:
# Basics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots


# Tools
from copy import copy # Shallow copy
from itertools import product
from collections import defaultdict
from functools import partial
from IPython.display import display # Allows functions to simultaneously return values and show tables

# Styling
from colorama import Fore
from colorama import Style
from matplotlib.colors import Colormap


# Assessing Feature Importance
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import mutual_info_classif

# Pipeline
from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer




from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score



# t-SNE
from sklearn.manifold import TSNE


# Dendrogram
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform


# Kde Plots
from scipy.stats import gaussian_kde



# Probability plots
from scipy.stats import probplot

# The Tree Trio
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import scipy.stats as stats

# Good ol utils
from utils import *

# Sequential Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs


import optuna
from optuna.samplers import TPESampler, CmaEsSampler
import optuna.visualization as vis

In [None]:
import pandas as pd
from pathlib import Path

# Folder holding the competition CSVs; repoint this single constant when the
# notebook is run on another machine.
DATA_DIR = Path(r"C:\Users\Nebula PC\Desktop\Projects\Academic-Success-Prediction\data")

# Both files are indexed by "id"; column names are stripped of stray whitespace.
train = pd.read_csv(DATA_DIR / "train.csv", index_col="id").rename(columns=str.strip)
test = pd.read_csv(DATA_DIR / "test.csv", index_col="id").rename(columns=str.strip)

target = "Target"

# Integer code assigned to each class label.
value_mapping = {
    'Enrolled': 2,
    'Dropout': 0,
    'Graduate': 1
}

# Encode the target with Series.map rather than Series.replace: replace() only
# yields an integer column through an implicit downcast that pandas has
# deprecated, whereas map() builds the integer column directly.
encoded_target = train[target].map(value_mapping)
if encoded_target.isna().any():
    # map() turns unknown labels into NaN; fail loudly instead of training on them.
    unmapped = sorted(train.loc[encoded_target.isna(), target].astype(str).unique())
    raise ValueError(f"Unmapped {target} labels: {unmapped}")
train[target] = encoded_target.astype(int)

In [None]:
# Fifteen columns removed from both splits before any modelling.
# NOTE(review): presumably the lowest-ranked features from an earlier
# importance analysis - confirm against that analysis.
BOTTOM_15_FEATURES = [
    "International",
    "Educational special needs",
    "Nacionality",
    "Marital status",
    "Inflation rate",
    "Curricular units 1st sem (without evaluations)",
    "Father's qualification",
    "Mother's qualification",
    "Displaced",
    "Curricular units 2nd sem (credited)",
    "Curricular units 1st sem (credited)",
    "Application order",
    "Previous qualification",
    "Daytime/evening attendance",
    "Curricular units 2nd sem (without evaluations)",
]

# Not idempotent: re-running this cell without reloading the data raises a
# KeyError because the columns are already gone.
train = train.drop(BOTTOM_15_FEATURES, axis="columns")
test = test.drop(BOTTOM_15_FEATURES, axis="columns")

In [None]:
# NOTE(review): despite the name, this list holds 11 entries.
TOP_12_FEATURES = [
    "Curricular units 2nd sem (approved)",
    "Curricular units 1st sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Tuition fees up to date",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (enrolled)",
    "Scholarship holder",
    "Curricular units 1st sem (evaluations)",
    "Course",
    "Curricular units 1st sem (enrolled)",
]

# Categorical columns; they are excluded from the numeric column groups.
ONE_HOT_COLUMNS_SELECTED = [
    "Application mode",
    "Course",
    "Mother's occupation",
    "Father's occupation",
]

# Columns that additionally get a "> 0" indicator in feature_engineering.
BINARY_INDICATOR_FEATURES_SELECTED = [
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (evaluations)",
]



# A column counts as binary when it has exactly two distinct non-null values.
BINARY_COLUMNS = [name for name in train.columns if train[name].nunique() == 2]
binary_data = train[BINARY_COLUMNS + ['Target']]

# Everything that is neither categorical, binary, nor the target is numeric.
non_numeric_columns = ONE_HOT_COLUMNS_SELECTED + BINARY_COLUMNS + ['Target']
NUMERIC_COLUMNS = train.columns.drop(non_numeric_columns).to_list()
numeric_data = train[NUMERIC_COLUMNS + ['Target']]

numeric_frame = train[NUMERIC_COLUMNS]

FLOAT_COLUMNS = numeric_frame.select_dtypes('float').columns.to_list()
float_data = train[FLOAT_COLUMNS + ['Target']]

INTEGER_COLUMNS = numeric_frame.select_dtypes('int').columns.to_list()
integer_data = train[INTEGER_COLUMNS + ['Target']]

# Split the numeric columns by the sign of their minimum value.
numeric_summary = numeric_frame.describe().T
positive_features = list(numeric_summary.query("min > 0").index)
zero_features = list(numeric_summary.query("min == 0").index)
negative_features = list(numeric_summary.query("min < 0").index)

In [None]:
def feature_engineering(df):
    """Add semester-comparison and interaction features to ``df``.

    The frame is modified in place and the same object is returned. New
    columns are appended in a fixed order:

    * for each of evaluations / enrolled / grade / approved: the sum, min,
      max, absolute difference and signed change (2nd minus 1st semester);
    * approved-vs-grade differences and sums per semester, plus their total;
    * per-semester totals of the four curricular-unit columns, with their
      sum, absolute difference and signed change;
    * ``fees_plus_scholarship`` (scholarship flag + 2 * tuition flag);
    * a ``"<column> (binary)"`` indicator (value > 0) for every column named
      in the module-level ``BINARY_INDICATOR_FEATURES_SELECTED``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 1st/2nd-semester "Curricular units" columns for
        evaluations, enrolled, grade and approved, plus "Scholarship holder"
        and "Tuition fees up to date".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the engineered columns added.
    """
    def unit(semester, kind):
        # Name of the raw curricular-unit column for one semester and kind.
        return f'Curricular units {semester} sem ({kind})'

    semesters = (('s1', '1st'), ('s2', '2nd'))

    # Semester-vs-semester statistics, one family of five columns per kind.
    for kind in ('evaluations', 'enrolled', 'grade', 'approved'):
        second, first = unit('2nd', kind), unit('1st', kind)
        both = df[[second, first]]
        change = df[second] - df[first]
        df[f'sum_{kind}'] = df[second] + df[first]
        df[f'min_{kind}'] = both.min(axis=1)
        df[f'max_{kind}'] = both.max(axis=1)
        df[f'difference_{kind}'] = change.abs()
        df[f'change_{kind}'] = change

    # Interactions between the approved and grade columns.
    for tag, semester in semesters:
        approved, grade = df[unit(semester, 'approved')], df[unit(semester, 'grade')]
        df[f'approved_minus_grade_{tag}'] = approved - grade
        df[f'grade_minus_approved_{tag}'] = grade - approved

    for tag, semester in semesters:
        df[f'approved_add_grade_{tag}'] = df[unit(semester, 'approved')] + df[unit(semester, 'grade')]
    df['approved_add_grade_s1_s2'] = df['approved_add_grade_s1'] + df['approved_add_grade_s2']

    # Per-semester totals over all four curricular-unit columns.
    for tag, semester in semesters:
        df[f'curricular_units_sum_{tag}'] = (
            df[unit(semester, 'evaluations')]
            + df[unit(semester, 'enrolled')]
            + df[unit(semester, 'grade')]
            + df[unit(semester, 'approved')]
        )
    units_change = df['curricular_units_sum_s2'] - df['curricular_units_sum_s1']
    df['curricular_units_sum_s1_s2'] = df['curricular_units_sum_s1'] + df['curricular_units_sum_s2']
    df['curricular_units_difference_s1_s2'] = units_change.abs()
    df['curricular_units_change_s1_s2'] = units_change

    # Weighting tuition by 2 keeps the four flag combinations distinguishable.
    df['fees_plus_scholarship'] = df['Scholarship holder'] + 2 * df['Tuition fees up to date']

    for source in BINARY_INDICATOR_FEATURES_SELECTED:
        df[f'{source} (binary)'] = df[source].gt(0).astype(int)

    return df

In [None]:
# Engineer the identical feature set on both splits (each frame is modified in place).
train, test = feature_engineering(train), feature_engineering(test)

In [None]:
# Numeric columns after feature engineering. BINARY_COLUMNS was computed
# before the engineered columns existed, so the new "(binary)" indicators are
# counted as numeric here.
engineered_non_numeric = ONE_HOT_COLUMNS_SELECTED + BINARY_COLUMNS + ['Target']
NUMERIC_COLUMNS_ENGINEERED = train.columns.drop(engineered_non_numeric).to_list()
numeric_data = train[NUMERIC_COLUMNS_ENGINEERED + ['Target']]

# Re-split by the sign of each column's minimum; this decides which
# transforms are mathematically valid for the column.
engineered_summary = train[NUMERIC_COLUMNS_ENGINEERED].describe().T
positive_features = list(engineered_summary.query("min > 0").index)
zero_features = list(engineered_summary.query("min == 0").index)
negative_features = list(engineered_summary.query("min < 0").index)

def _probplot_r2(values):
    """Return the squared correlation of the normal probability plot of ``values``."""
    _, (*_, r_value) = probplot(values, rvalue=True)
    return r_value * r_value


# feature -> (Original, Log, Log1p, Sqrt, Square, Reciprocal, BoxCox, YeoJohnson)
r2_scores = defaultdict(tuple)

for feature in NUMERIC_COLUMNS_ENGINEERED:
    orig = numeric_data[feature].dropna()

    is_positive = feature in positive_features
    is_non_negative = is_positive or feature in zero_features
    if not (is_non_negative or feature in negative_features):
        # Previously such a feature silently reused the scores left over from
        # the preceding loop iteration (or hit a NameError on the first one).
        raise ValueError(
            f"{feature!r} is in none of positive_features / zero_features / "
            "negative_features; cannot decide which transforms are valid"
        )

    original_r2 = _probplot_r2(orig)

    # A transform that is undefined on the feature's value range falls back to
    # the untransformed score, so it can never beat "Original".
    r2_scores[feature] = (
        original_r2,
        _probplot_r2(np.log(orig)) if is_positive else original_r2,
        _probplot_r2(np.log1p(orig)) if is_non_negative else original_r2,
        _probplot_r2(np.sqrt(orig)) if is_non_negative else original_r2,
        _probplot_r2(np.square(orig)),
        # np.reciprocal performs integer arithmetic on integer input (1/x
        # truncates to 0 for x > 1), so cast to float first.
        _probplot_r2(np.reciprocal(orig.astype(float))) if is_positive else original_r2,
        _probplot_r2(stats.boxcox(orig)[0]) if is_positive else original_r2,
        _probplot_r2(stats.yeojohnson(orig)[0]),
    )

# One row per feature, one column per candidate transform.
transform_names = ["Original", "Log", "Log1p", "Sqrt", "Square", "Reciprocal", "BoxCox", "YeoJohnson"]
r2_scores = pd.DataFrame(r2_scores, index=transform_names).T

transform_scores = r2_scores[transform_names]
r2_scores["HighestScore"] = transform_scores.max(axis=1)
# Ties resolve to the left-most column, so "Original" wins when nothing beats it.
r2_scores["Winner"] = transform_scores.idxmax(axis=1)


def highlight_max(s):
    """Styler callback: CSS background (colour ``TEAL``) for every entry equal to the maximum of ``s``.

    Returns one CSS string per element of ``s``; non-maximal entries get ''.
    """
    peak = s.max()
    marked = f'background-color: {TEAL}'
    return [marked if value == peak else '' for value in s]

# Gain in probability-plot R^2 of the best transform over the raw feature.
r2_scores['Improvement'] = r2_scores['HighestScore'] - r2_scores['Original']

# Styled table: best transform highlighted per row, gradient on the gain.
(
    r2_scores.style
    .set_table_styles(DF_STYLE)
    .apply(
        highlight_max,
        subset=["Original", "Log", "Log1p", "Sqrt", "Square", "Reciprocal", "BoxCox", "YeoJohnson"],
        axis=1,
    )
    .background_gradient(cmap=DF_CMAP2, subset='Improvement')
    .format(precision=3)
)

In [None]:
# A transform is only adopted when it improves the R^2 by at least this much.
min_gain = 0.01

no_transform_cols = r2_scores[r2_scores["Improvement"] < min_gain].index

# Features with a worthwhile gain, grouped by the transform that won.
worthwhile = r2_scores[r2_scores["Improvement"] >= min_gain]
log_transform_cols = worthwhile[worthwhile["Winner"] == "Log"].index
log1p_transform_cols = worthwhile[worthwhile["Winner"] == "Log1p"].index
sqrt_transform_cols = worthwhile[worthwhile["Winner"] == "Sqrt"].index
square_transform_cols = worthwhile[worthwhile["Winner"] == "Square"].index
reciprocal_transform_cols = worthwhile[worthwhile["Winner"] == "Reciprocal"].index
boxcox_transform_cols = worthwhile[worthwhile["Winner"] == "BoxCox"].index
yeojohnson_transform_cols = worthwhile[worthwhile["Winner"] == "YeoJohnson"].index

In [None]:
def _then_scale(func):
    """Pipeline that applies ``func`` element-wise and then standardises the result."""
    return make_pipeline(
        FunctionTransformer(func=func, feature_names_out="one-to-one"),
        StandardScaler(),
    )


# Preprocessing: each numeric feature gets the transform that won the
# probability-plot comparison, followed by standardisation; all remaining
# columns pass through untouched.
# NOTE(review): reciprocal_transform_cols has no entry here, so any
# "Reciprocal" winners reach the models via the passthrough remainder,
# untransformed and unscaled - confirm this is intended.
column_transformers = make_pipeline(
    make_column_transformer(
        (StandardScaler(), no_transform_cols.to_list()),
        (_then_scale(np.log), log_transform_cols.to_list()),
        (_then_scale(np.log1p), log1p_transform_cols.to_list()),
        (_then_scale(np.sqrt), sqrt_transform_cols.to_list()),
        (_then_scale(np.square), square_transform_cols.to_list()),
        (PowerTransformer(method="box-cox", standardize=True), boxcox_transform_cols.to_list()),
        (PowerTransformer(method="yeo-johnson", standardize=True), yeojohnson_transform_cols.to_list()),
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
)

In [None]:
# Show the assembled preprocessing pipeline as the cell output.
column_transformers

In [None]:
# Feature matrix and encoded target taken from the engineered training frame.
y = train[target]
X = train.drop(columns=[target])

In [None]:
# Tuned hyper-parameters for the base models. The number above each dict is
# the score recorded by the author (presumably CV accuracy - confirm).

# 0.832...
xgb_minmax_params = {
    'learning_rate': 0.04808572634755236,
    'gamma': 9.560327584505747e-05,
    'max_depth': 6,
    'min_child_weight': 0.11743307971084631,
    'subsample': 0.7204079538816981,
    'colsample_bytree': 0.4523296155877883,
    'lambda': 0.5492675245323843,
    'alpha': 8.819969375852153e-08,
    'n_estimators': 435,
}

# 0.831098
xgb2_minmax_params = {
    'learning_rate': 0.15267345813242902,
    'gamma': 0.0002202210602923774,
    'max_depth': 3,
    'min_child_weight': 0.0032093132626038303,
    'subsample': 0.7789317921899368,
    'colsample_bytree': 0.4944876602452024,
    'lambda': 4.088956032379317e-05,
    'alpha': 1.264701759175353e-07,
    'n_estimators': 369,
}

# 0.831647
xgb3_minmax_params = {
    'learning_rate': 0.020498556354593255,
    'gamma': 4.0117054063305194e-06,
    'max_depth': 9,
    'min_child_weight': 9.097349005570713,
    'subsample': 0.814672362304106,
    'colsample_bytree': 0.5074813732573049,
    'lambda': 8.132240585431906e-06,
    'alpha': 3.493464282610354e-07,
    'n_estimators': 479,
}

# NOTE(review): the LightGBM dicts reuse XGBoost-style keys ('gamma', 'lambda',
# 'alpha', 'scale_pos_weight'); verify LightGBM interprets each as intended.
lgbm_minmax_params = {
    'learning_rate': 0.12548315204042748,
    'gamma': 4.578734755794731e-06,
    'max_depth': 11,
    'min_child_weight': 0.006074836332063258,
    'subsample': 0.542429661988086,
    'colsample_bytree': 0.8889283657514138,
    'lambda': 0.0774449841007967,
    'alpha': 0.006148549497097544,
    'scale_pos_weight': 6.7529237049752915,
    'n_estimators': 482,
    'boosting_type': 'dart',
}

# 0.83190
lgbm2_minmax_params = {
    'learning_rate': 0.07311929223191145,
    'gamma': 0.0001465824143858157,
    'max_depth': 5,
    'min_child_weight': 1.1101772255986377,
    'subsample': 0.7717416516622793,
    'colsample_bytree': 0.5521824473706827,
    'lambda': 1.204402096691422e-08,
    'alpha': 6.446090081481055e-07,
    'scale_pos_weight': 3.2424584929164473,
    'n_estimators': 494,
    'boosting_type': 'gbdt',
}

# 0.83172
lgbm3_minmax_params = {
    'learning_rate': 0.05305666675975752,
    'gamma': 3.737093538969974e-05,
    'max_depth': 10,
    'min_child_weight': 0.015065986984559972,
    'subsample': 0.4107383480784791,
    'colsample_bytree': 0.4754708058469771,
    'lambda': 1.1778913998726071e-09,
    'alpha': 0.00046716162072521424,
    'scale_pos_weight': 7.601656443319444,
    'n_estimators': 798,
    'boosting_type': 'gbdt',
}

cat_minmax_params = {
    'learning_rate': 0.0822257813369298,
    'depth': 6,
    'l2_leaf_reg': 1.5125953816160678,
    'iterations': 378,
    'random_strength': 0.08419072361237345,
    'grow_policy': 'Lossguide',
    'boosting_type': 'Plain',
    'bootstrap_type': 'Bernoulli',
}

In [None]:
def cross_validate_score(model, data, folds = 5, target='Target'):
    """Cross-validate ``model`` and return fold scores, OOF and test probabilities.

    For every stratified fold the module-level ``column_transformers``
    pipeline is fitted on the training part only and applied to the
    validation part, then ``model`` is fitted and scored by accuracy.
    Afterwards the preprocessing and the model are refitted on all of
    ``data`` and used to predict the module-level ``test`` frame.

    Side effects: ``model`` and the global ``column_transformers`` are left
    fitted on the full data; per-fold and summary scores are printed.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    data : pandas.DataFrame
        Training frame containing the features and the ``target`` column.
    folds : int, default 5
        Number of stratified folds (shuffled, ``random_state=42``).
    target : str, default 'Target'
        Name of the label column in ``data``.

    Returns
    -------
    val_scores : list of float
        Validation accuracy of each fold.
    val_predictions : numpy.ndarray, shape (len(data), n_classes)
        Out-of-fold class probabilities, in the row order of ``data``.
    test_predictions_proba : numpy.ndarray
        Class probabilities for ``test`` from the model refitted on all data.
    """
    X = data.drop(columns = target)
    y = data[target]

    skfold = StratifiedKFold(n_splits= folds, shuffle=True, random_state= 42)

    # One probability column per class actually present in the target
    # (this used to be hard-coded to 3).
    n_classes = y.nunique()
    val_predictions = np.zeros((len(X), n_classes))
    train_scores, val_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
        X_train = X.iloc[train_idx].reset_index(drop=True)
        y_train = y.iloc[train_idx].reset_index(drop=True)
        X_val = X.iloc[val_idx].reset_index(drop=True)
        y_val = y.iloc[val_idx].reset_index(drop=True)

        # Fit the preprocessing on the training part only to avoid leakage.
        X_train = column_transformers.fit_transform(X_train)
        X_val = column_transformers.transform(X_val)

        model.fit(X_train, y_train)

        # Some libraries return an (n, 1) column from predict(); flatten it.
        train_preds = np.ravel(model.predict(X_train))
        val_preds_proba = model.predict_proba(X_val)
        val_predictions[val_idx] = val_preds_proba

        # Map the arg-max column back to its class label; fall back to the
        # column position when the model does not expose classes_.
        best_column = np.argmax(val_preds_proba, axis=1)
        classes = getattr(model, 'classes_', None)
        val_preds = best_column if classes is None else np.asarray(classes)[best_column]

        train_score = accuracy_score(y_train, train_preds)
        val_score = accuracy_score(y_val, val_preds)

        print(f'Fold {fold}: {val_score:.5f}')

        train_scores.append(train_score)
        val_scores.append(val_score)

    # Refit preprocessing and model on the entire training set for the test predictions.
    X_full = column_transformers.fit_transform(X)
    y_full = y
    model.fit(X_full, y_full)

    test_data = column_transformers.transform(test)
    test_predictions_proba = model.predict_proba(test_data)

    print(f'Val Score: {np.mean(val_scores):.7f} ± {np.std(val_scores):.7f} | Train Score: {np.mean(train_scores):.7f} ± {np.std(train_scores):.7f} | {target}')

    return val_scores, val_predictions, test_predictions_proba

In [None]:
# Accumulators filled by the base-model runs:
cv_summary = pd.DataFrame()                  # per-fold validation scores, one column per model
oof_predictions_df = pd.DataFrame()          # out-of-fold class probabilities
submission_predictions_df = pd.DataFrame()   # test-set class probabilities

In [None]:
# Base learners, each built from its tuned parameter set with a fixed seed.
xgb_tuned = XGBClassifier(random_state=42, **xgb_minmax_params)
xgb2_tuned = XGBClassifier(random_state=42, **xgb2_minmax_params)
xgb3_tuned = XGBClassifier(random_state=42, **xgb3_minmax_params)

lgbm_tuned = LGBMClassifier(random_state=42, n_jobs=-1, **lgbm_minmax_params)
lgbm2_tuned = LGBMClassifier(random_state=42, n_jobs=-1, **lgbm2_minmax_params)
lgbm3_tuned = LGBMClassifier(random_state=42, n_jobs=-1, **lgbm3_minmax_params)

cat_tuned = CatBoostClassifier(random_state=42, thread_count=-1, **cat_minmax_params)

# Column prefix -> estimator, in evaluation order.
base_models = {
    'xgb': xgb_tuned,
    'xgb2': xgb2_tuned,
    'xgb3': xgb3_tuned,
    'lgbm': lgbm_tuned,
    'lgbm2': lgbm2_tuned,
    'lgbm3': lgbm3_tuned,
    'cat': cat_tuned,
}

# Cross-validate every base model and store its fold scores, out-of-fold
# probabilities and test probabilities under "<prefix>_<class>" columns.
for prefix, estimator in base_models.items():
    proba_columns = [f'{prefix}_{label}' for label in range(3)]
    (
        cv_summary[prefix],
        oof_predictions_df[proba_columns],
        submission_predictions_df[proba_columns],
    ) = cross_validate_score(estimator, train, 5, 'Target')

In [None]:
# One row per model, one column per fold.
transposed_df = cv_summary.transpose()
fold_columns = [f'fold{number}' for number in range(1, transposed_df.shape[1] + 1)]
transposed_df.columns = fold_columns

# Compute both statistics from the fold columns only. Previously 'Std' was
# taken after 'Mean' had been appended, so the mean was counted as an extra
# fold and the spread was understated.
transposed_df['Mean'] = transposed_df[fold_columns].mean(axis=1)
transposed_df['Std'] = transposed_df[fold_columns].std(axis=1)
transposed_df.sort_values(by = 'Mean', ascending=False)

In [None]:
# 6-fold stratified splitter shared by the meta-model cells that follow
# (Optuna objective, RFECV and the learning curves).
skfold = StratifiedKFold(n_splits = 6, shuffle = True, random_state = 42)

In [None]:
def objective(trial):
    """Optuna objective: mean CV accuracy of an XGBoost meta-classifier.

    Samples one hyper-parameter set from ``trial``, then cross-validates an
    ``XGBClassifier`` on the out-of-fold base-model probabilities
    (``oof_predictions_df``) against ``train['Target']`` using the
    module-level ``skfold`` splitter.

    Returns the mean accuracy over the folds (to be maximised).
    """
    def log_float(name, low, high):
        # Log-uniform float suggestion.
        return trial.suggest_float(name, low, high, log=True)

    params = {
        'gamma': log_float('gamma', 1e-8, 1.0),                          # minimum loss reduction
        'n_estimators': trial.suggest_int('n_estimators', 10, 700),      # boosting rounds
        'max_depth': trial.suggest_int('max_depth', 2, 8),               # maximum tree depth
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),         # row subsample ratio
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),  # column subsample ratio
        'alpha': log_float('alpha', 1e-8, 1.0),                          # L1 regularisation
        'min_child_weight': log_float('min_child_weight', 1e-3, 10.0),   # minimum instance weight per child
        'learning_rate': log_float('learning_rate', 1e-4, 6e-1),         # shrinkage
        'lambda': log_float('lambda', 1e-8, 1.0),                        # L2 regularisation
    }

    meta_classifier = XGBClassifier(**params, random_state=42)
    fold_scores = cross_val_score(
        meta_classifier,
        oof_predictions_df,
        train['Target'],
        scoring='accuracy',
        cv=skfold,
    )
    return fold_scores.mean()

# The sampler is seeded so the search can be reproduced. That only holds when
# trials run sequentially: with parallel trials (n_jobs != 1) the order in
# which results reach the sampler is not deterministic. XGBoost already
# parallelises each fit internally, so sequential trials still use the cores.
study = optuna.create_study(direction="maximize", sampler = TPESampler(seed=42))
study.optimize(objective, n_trials=40, n_jobs=1)

print("Best hyperparameters: ", study.best_params)

In [None]:
# Optuna diagnostics: score per trial, then the parameter/score parallel plot.
for make_figure in (vis.plot_optimization_history, vis.plot_parallel_coordinate):
    make_figure(study).show()

In [None]:
# Hyper-parameter sets for the XGBoost meta-model (hard-coded, not read back
# from the study object).
meta_model_params = {
    'gamma': 3.3742991305902555e-05,
    'n_estimators': 229,
    'max_depth': 6,
    'subsample': 0.27046522899393777,
    'colsample_bytree': 0.3877794399997429,
    'alpha': 2.5005198691264295e-07,
    'min_child_weight': 0.01569095380228693,
    'learning_rate': 0.00014463340964593574,
    'lambda': 1.727921780705232e-05,
}

# 83.213 (score recorded by the author; presumably accuracy in % - confirm)
meta_model_params2 = {
    'gamma': 0.5567419996563112,
    'n_estimators': 198,
    'max_depth': 4,
    'subsample': 0.6302436248568573,
    'colsample_bytree': 0.8901839913364566,
    'alpha': 9.363345107468037e-06,
    'min_child_weight': 0.0013704803385012033,
    'learning_rate': 0.00014660365021402523,
    'lambda': 1.3664437673539847e-08,
}

In [None]:
# Meta-learner stacked on the base models' out-of-fold probabilities.
meta_model = XGBClassifier(
    objective='multi:softmax',
    random_state=42,
    **meta_model_params2,
)

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline


min_features_to_select = 1

# Recursive feature elimination with cross-validation over the stacked
# probability columns, using the meta-model as the ranking estimator.
pipeline = Pipeline([
    ('rfecv', RFECV(estimator=meta_model,
                    step=1,
                    cv= skfold,
                    scoring="accuracy",
                    min_features_to_select=min_features_to_select,
                    n_jobs=-1))
])

# Fit the pipeline on the training data
pipeline.fit(oof_predictions_df, train['Target'])
rfecv = pipeline.named_steps['rfecv']

# CV score of the selected subset. RFECV keeps the subset size with the best
# mean test score, so the maximum is the score of the selection. The previous
# lookup by position (len(selected) - 1) was only correct while
# min_features_to_select == 1 and step == 1.
print("Best CV score: ")
selected_features = np.array(oof_predictions_df.columns)[rfecv.support_]
print(np.max(rfecv.cv_results_["mean_test_score"]))

# Selected features after RFECV
print('Number of evaluated features:', len(oof_predictions_df.columns))
print('Number of selected features:', len(selected_features))
print("Selected Features:", selected_features)

In [None]:
# Train the meta-model on every out-of-fold probability column, then predict
# the test set from the base models' test probabilities.
meta_model.fit(oof_predictions_df, train['Target'])
preds_test = meta_model.predict(submission_predictions_df)

submission = pd.DataFrame(
    {
        'id': test.index,
        'Target': preds_test,
    }
)

# Sanity check: distribution of the predicted classes.
submission['Target'].value_counts()

In [None]:
# Inverse of the encoding applied to the training target.
target_dict = {0: 'Dropout', 1: 'Graduate', 2: 'Enrolled'}

# Decode the predicted class ids back to their labels and write the submission file.
decoded_target = submission['Target'].replace(target_dict)
submission['Target'] = decoded_target
submission.to_csv('academic-success-predictions_ensemble_minmax2.csv', index = False)

In [None]:
from sklearn.model_selection import learning_curve

# Learning curve of the meta-model at scikit-learn's default training sizes.
curve = learning_curve(
    estimator=meta_model,
    X=oof_predictions_df,
    y=train['Target'],
    cv=skfold,
    scoring='accuracy',
)

In [None]:
# Learning curve of the meta-model on small training fractions (0.1% - 10%).
train_sizes, train_scores, test_scores = learning_curve(
    meta_model,
    oof_predictions_df,
    train['Target'],
    cv=skfold,
    scoring='accuracy',
    train_sizes=np.linspace(0.001, 0.1, 20),
)

# Mean and spread across the CV folds for each training size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig, ax = plt.subplots()
# The estimator is the XGBoost meta-model; the title used to say "SVC".
ax.set_title("Learning Curve (XGBoost meta-model)")
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_ylim(0.0, 1.1)
ax.grid()

# Shaded bands: one standard deviation around each mean curve.
ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.1, color="r")
ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.1, color="g")

ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

ax.legend(loc="best")
plt.show()

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
)
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Out-of-fold diagnostics for the lgbm2 configuration.
X = train.drop(columns=target)
y = train[target]

# NOTE: this rebinds the notebook-level `skfold` to a 5-fold splitter.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold class probabilities, one column per class present in y.
val_predictions = np.zeros((len(X), y.nunique()))
train_scores, val_scores = [], []

# Uniform weights, i.e. no re-weighting. Raise a class's weight here to make
# the model pay more attention to it.
class_weights = {0: 1, 1: 1, 2: 1}

for fold, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
    X_train, y_train = X.iloc[train_idx].reset_index(drop=True), y.iloc[train_idx].reset_index(drop=True)
    X_val = X.iloc[val_idx].reset_index(drop=True)

    # Fit the preprocessing on the training part only.
    X_train = column_transformers.fit_transform(X_train)
    X_val = column_transformers.transform(X_val)

    # Same seed / threading as the lgbm2 model used in the ensemble, so the
    # diagnostics are reproducible and describe that configuration.
    lgbm2_tuned = LGBMClassifier(**lgbm2_minmax_params, class_weight=class_weights,
                                 random_state=42, n_jobs=-1)
    lgbm2_tuned.fit(X_train, y_train)

    # Only the validation probabilities are needed; the training-set
    # predictions computed here before were never used.
    val_predictions[val_idx] = lgbm2_tuned.predict_proba(X_val)

# Metrics over the full set of out-of-fold predictions.
val_preds = np.argmax(val_predictions, axis=1)
precision = precision_score(y, val_preds, average=None)
recall = recall_score(y, val_preds, average=None)
accuracy = accuracy_score(y, val_preds)
roc_auc = roc_auc_score(y, val_predictions, multi_class='ovr')
f1_weighted = f1_score(y, val_preds, average='weighted')
f1_micro = f1_score(y, val_preds, average='micro')

class_report = classification_report(y, val_preds)
conf_matrix = confusion_matrix(y, val_preds)

print(f"Precision for each class: {precision}")
print(f"Recall for each class: {recall}")
print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")
# The averaged F1 scores are single numbers, not per-class values.
print(f"Weighted F1 Score: {f1_weighted}")
print(f"Micro F1 Score: {f1_micro}")
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)

In [None]:
# Build the per-class report table in this cell so it always reflects the current
# `val_preds`; output_dict=True makes classification_report return a dict
# that pandas can turn into a frame.
report_df = pd.DataFrame(classification_report(y, val_preds, output_dict=True)).transpose()
report_df.T

In [None]:
# Annotated heatmap of the confusion matrix (rows: true label, columns: predicted label).
class_labels = ['Class 0', 'Class 1', 'Class 2']

fig = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=class_labels,
    y=class_labels,
    annotation_text=conf_matrix,
    colorscale=color_map,
    showscale=True,
    hoverinfo="z",
    colorbar=dict(title='Count'),
)

fig.update_layout(
    title=dict(text='Confusion Matrix', font=dict(size=20, color=DARK_TEAL)),
    # Axis title fonts live under `title.font`; the legacy `titlefont` shortcut is
    # deprecated and is rejected by recent Plotly releases.
    xaxis=dict(
        title=dict(text='Predicted Label', font=dict(size=15, color=DARK_TEAL)),
        tickangle=0,
    ),
    yaxis=dict(
        title=dict(text='True Label', font=dict(size=15, color=DARK_TEAL)),
    ),
    # Replace the factory-generated annotations with uniformly styled cell counts.
    annotations=[
        dict(
            x=col, y=row, text=str(conf_matrix[row][col]),
            showarrow=False,
            font=dict(size=12, color='black'),
            align="center"
        ) for row in range(conf_matrix.shape[0]) for col in range(conf_matrix.shape[1])
    ],
    margin=dict(l=60, r=60, b=100, t=100, pad=4),
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    width=1000,
    height=1000,
)
fig.show()

# ROC Curve for each class (one-vs-rest): class i is the positive label and
# column i of the out-of-fold probabilities is its score.
# NOTE: this rebinds `roc_auc` (a scalar in the metrics cell) to a per-class dict.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(y, val_predictions[:, i], pos_label=i)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve using Plotly
roc_fig = go.Figure()

for i, color in zip(range(3), (TEAL, ORANGE, DARK_TEAL)):
    roc_fig.add_trace(go.Scatter(
        x=fpr[i],
        y=tpr[i],
        mode='lines',
        name=f'Class {i} (AUC = {roc_auc[i]:.2f})',
        line=dict(color=color, width=2)
    ))

# Diagonal reference line (a classifier with no discriminative power).
roc_fig.add_trace(go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    line=dict(color=DARK_TEAL, dash='dash'),
    showlegend=False
))

roc_fig.update_layout(
    title=dict(text='Receiver Operating Characteristic (ROC) Curve', font=dict(size=20, color=DARK_TEAL)),
    # Axis title fonts live under `title.font`; the legacy `titlefont` shortcut is
    # deprecated and is rejected by recent Plotly releases.
    xaxis=dict(title=dict(text='False Positive Rate', font=dict(size=15, color=DARK_TEAL))),
    yaxis=dict(title=dict(text='True Positive Rate', font=dict(size=15, color=DARK_TEAL))),
    legend=dict(title=dict(text='Classes', font=dict(size=15, color=DARK_TEAL))),
    margin=dict(l=60, r=60, b=100, t=100, pad=4),
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    width=1000,
    height=1000
)
roc_fig.show()

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_score,
    recall_score,
    accuracy_score,
    roc_auc_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Sweep the class-0 weight and record out-of-fold metrics for each setting.

# Define your train and target data
X = train.drop(columns=target)
y = train[target]

# Stratified K-Fold cross-validation
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Kept for compatibility with the original cell; not filled in here.
train_scores, val_scores = [], []

# Number of iterations and class weight range
num_iterations = 10
class_weight_range = np.linspace(1, 10, num_iterations)

# The split (fixed random_state) and the preprocessing do not depend on the class
# weights, so each fold is transformed once here instead of once per weight.
cv_folds = []
for train_idx, val_idx in skfold.split(X, y):
    X_train = column_transformers.fit_transform(X.iloc[train_idx])
    X_val = column_transformers.transform(X.iloc[val_idx])
    y_train = y.iloc[train_idx]
    cv_folds.append((X_train, y_train, X_val, val_idx))

# Metrics storage
precision_scores = []
recall_scores = []
accuracy_scores = []
roc_auc_scores = []
f1_weighted_scores = []
f1_micro_scores = []
class_reports = []
conf_matrices = []

# Iterate over different class weights for class 0
for weight in class_weight_range:
    class_weights = {0: weight, 1: 1, 2: 1}  # Only class 0 is re-weighted

    # Out-of-fold class probabilities for this weight (one column per class)
    val_predictions = np.zeros((len(X), 3))

    for X_train, y_train, X_val, val_idx in cv_folds:
        # Initialize the LightGBM model with class weights
        lgbm2_tuned = LGBMClassifier(**lgbm2_minmax_params, class_weight=class_weights)

        # Train the model
        lgbm2_tuned.fit(X_train, y_train)

        # Store validation probabilities at the rows of this fold
        val_predictions[val_idx] = lgbm2_tuned.predict_proba(X_val)

    # Compute metrics for this iteration
    val_preds = np.argmax(val_predictions, axis=1)
    precision = precision_score(y, val_preds, average=None)
    recall = recall_score(y, val_preds, average=None)
    accuracy = accuracy_score(y, val_preds)
    roc_auc = roc_auc_score(y, val_predictions, multi_class='ovr')
    f1_weighted = f1_score(y, val_preds, average='weighted')
    f1_micro = f1_score(y, val_preds, average='micro')
    class_report = classification_report(y, val_preds)
    conf_matrix = confusion_matrix(y, val_preds)

    # Append scores to lists
    precision_scores.append(precision)
    recall_scores.append(recall)
    accuracy_scores.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_weighted_scores.append(f1_weighted)
    f1_micro_scores.append(f1_micro)
    class_reports.append(class_report)
    conf_matrices.append(conf_matrix)

# Convert lists to arrays (per-class metrics become 2-D: one row per weight)
precision_scores = np.array(precision_scores)
recall_scores = np.array(recall_scores)
accuracy_scores = np.array(accuracy_scores)
roc_auc_scores = np.array(roc_auc_scores)
f1_weighted_scores = np.array(f1_weighted_scores)
f1_micro_scores = np.array(f1_micro_scores)

In [None]:
# Plot every recorded metric against the class-0 weight used in the sweep.
fig = go.Figure()

# Per-class curves first: precision for classes 0-2, then recall for classes 0-2.
for metric_label, per_class_values in (("Precision", precision_scores), ("Recall", recall_scores)):
    for class_idx in range(3):
        fig.add_trace(
            go.Scatter(
                x=class_weight_range,
                y=per_class_values[:, class_idx],
                mode='lines+markers',
                name=f'{metric_label} Class {class_idx}',
            )
        )

# Then the aggregate metrics, one curve each.
aggregate_curves = (
    ('Accuracy', accuracy_scores),
    ('ROC AUC', roc_auc_scores),
    ('Weighted F1 Score', f1_weighted_scores),
    ('Micro F1 Score', f1_micro_scores),
)
for metric_label, metric_values in aggregate_curves:
    fig.add_trace(
        go.Scatter(x=class_weight_range, y=metric_values, mode='lines+markers', name=metric_label)
    )

fig.update_layout(
    title='Metrics vs. Class 0 Weight',
    xaxis=dict(title='Class 0 Weight'),
    yaxis=dict(title='Score'),
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
    width=1500,
    height=1500,
)

fig.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the data. The split is seeded so the scores below and every
# downstream plot built on `X_test` are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=True, random_state=42
)

# Fit the preprocessing on the training part only, then apply it to the test part.
X_train_transformed = column_transformers.fit_transform(X_train)
X_test_transformed = column_transformers.transform(X_test)

lgbm2_tuned = LGBMClassifier(**lgbm2_minmax_params)

# Train the model
lgbm2_tuned.fit(X_train_transformed, y_train)

# Make predictions
train_preds = lgbm2_tuned.predict(X_train_transformed)
test_preds_proba = lgbm2_tuned.predict_proba(X_test_transformed)  # Get class probabilities
test_preds = np.argmax(test_preds_proba, axis=1)  # Column index of the most probable class



In [None]:
# Sanity check: accuracy on the training part versus the held-out part.
train_score = accuracy_score(y_train, train_preds)
test_score = accuracy_score(y_test, test_preds)
print(f"Train score: {train_score}")
print(f"Test score: {test_score}")

# Predicted labels, indexed like the test rows.
pd.DataFrame(test_preds, index=X_test.index)

In [None]:
# Assemble one table per test row: features, true target, class probabilities, predicted label.
proba_columns = {0: 'predicted_prob_0', 1: 'predicted_prob_1', 2: 'predicted_prob_2'}
test_proba_frame = pd.DataFrame(test_preds_proba, index=X_test.index)
test_label_series = pd.Series(test_preds, index=X_test.index, name='predictions')

predictions_df_0 = pd.concat([X_test, y_test, test_proba_frame], axis=1).rename(columns=proba_columns)
predictions_df = pd.concat([predictions_df_0, test_label_series], axis=1)
predictions_df



In [None]:
# Flag rows where the predicted label equals the true target (1 = correct, 0 = wrong).
# A vectorised comparison replaces the row-wise apply, which is slow on large frames.
predictions_df['correct_prediction'] = (
    predictions_df['Target'] == predictions_df['predictions']
).astype(int)

In [None]:
# Preview the first five rows of the prediction table.
predictions_df.head(5)

In [None]:
# Feature columns used for the t-SNE projections below.
# NOTE(review): the name says 12 but the list holds 11 entries — confirm whether
# a feature is missing or the constant should be renamed.
TOP_12_FEATURES = [
    'Curricular units 2nd sem (approved)',
    'Curricular units 1st sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Tuition fees up to date',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (enrolled)',
    'Scholarship holder',
    'Curricular units 1st sem (evaluations)',
    'Course',
    'Curricular units 1st sem (enrolled)'
]

# Restrict the prediction table to the listed feature columns.
predictions_df_top_features = predictions_df[TOP_12_FEATURES]

In [None]:
# Project the selected test features to 2-D with t-SNE (seeded for repeatability).
tsne_2D = TSNE(n_components=2, n_jobs=-1, random_state=42, perplexity=250)
tsne_embedding = tsne_2D.fit_transform(predictions_df_top_features)

# Keep the original row index so the correctness flag can be joined on it;
# the flag is cast to str so it is treated as a category when plotted.
correct_flag = predictions_df['correct_prediction'].astype(str)
X_2D = pd.DataFrame(
    tsne_embedding,
    columns=["dim1", "dim2"],
    index=predictions_df_top_features.index,
).join(correct_flag)

In [None]:
def get_n_rows_axes(n_features, n_cols=5, n_rows=None):
    """Lay out ``n_features`` panels on a subplot grid.

    Parameters
    ----------
    n_features : int
        Number of panels to place.
    n_cols : int, default 5
        Number of grid columns.
    n_rows : int, optional
        Number of grid rows. When omitted, the smallest row count that fits
        ``n_features`` panels in ``n_cols`` columns is used.

    Returns
    -------
    tuple
        ``(n_rows, positions)`` where ``positions`` lists every 1-based
        ``(row, col)`` cell of the grid in row-major order.
    """
    # NOTE(review): this helper is defined verbatim in several cells of this
    # notebook; consider defining it once.
    # Honour an explicit row count; previously the argument was silently overwritten.
    if n_rows is None:
        n_rows = int(np.ceil(n_features / n_cols))
    return n_rows, list(product(range(1, n_rows + 1), range(1, n_cols + 1)))



# `X_2D` (the t-SNE embedding joined with `correct_prediction`) is produced by the
# cell above with exactly these settings, so it is reused here instead of paying
# for a second identical t-SNE fit.
fig = px.scatter(
    X_2D.reset_index(),
    x="dim1",
    y="dim2",
    symbol="correct_prediction",
    symbol_sequence=["diamond", "square"],
    color="correct_prediction",
    color_discrete_sequence=[TEAL, ORANGE],
    # The ordering must be keyed on the plotted column; a key that is not a column
    # of the frame is ignored and colours/symbols follow first appearance instead.
    category_orders={"correct_prediction": ["0", "1"]},
    # assumes the test index is named `id`, so reset_index() exposes an `id` column — TODO confirm
    hover_data=["id"],
    opacity=0.7,
    height=1600,
    width=1600,
    title="Test Dataset - 2D Projection with t-SNE",
)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y=1.05,
        x=1,
        title="Correct or Incorrect Prediction",
        itemsizing="constant",
    ),
)
fig.update_traces(marker_size=6)
fig.show()

In [None]:
def get_n_rows_axes(n_features, n_cols=5, n_rows=None):
    """Lay out ``n_features`` panels on a subplot grid.

    Parameters
    ----------
    n_features : int
        Number of panels to place.
    n_cols : int, default 5
        Number of grid columns.
    n_rows : int, optional
        Number of grid rows. When omitted, the smallest row count that fits
        ``n_features`` panels in ``n_cols`` columns is used.

    Returns
    -------
    tuple
        ``(n_rows, positions)`` where ``positions`` lists every 1-based
        ``(row, col)`` cell of the grid in row-major order.
    """
    # NOTE(review): this helper is defined verbatim in several cells of this
    # notebook; consider defining it once.
    # Honour an explicit row count; previously the argument was silently overwritten.
    if n_rows is None:
        n_rows = int(np.ceil(n_features / n_cols))
    return n_rows, list(product(range(1, n_rows + 1), range(1, n_cols + 1)))


import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Compare t-SNE embeddings of the selected test features across several perplexities.
# Relies on `predictions_df_top_features` and `predictions_df` being defined already.

# Perplexities to try: 10, 20, ..., 60
perplexities = range(10, 61, 10)

# Fixed 2 x 3 grid, one panel per perplexity
GRID_COLS = 3
panel_titles = [f"Perplexity: {perp}" for perp in perplexities]
fig = make_subplots(
    rows=2,
    cols=GRID_COLS,
    subplot_titles=panel_titles,
    shared_xaxes=False,
    shared_yaxes=False,
    horizontal_spacing=0.06,
    vertical_spacing=0.02,
)

# Fit one embedding per perplexity and drop it into the next free panel (row-major).
for panel_idx, perplexity in enumerate(perplexities):
    tsne_2D = TSNE(n_components=2, n_jobs=-1, random_state=42, perplexity=perplexity)
    embedding = tsne_2D.fit_transform(predictions_df_top_features)
    X_2D = pd.DataFrame(embedding, columns=["dim1", "dim2"])

    grid_row, grid_col = divmod(panel_idx, GRID_COLS)
    scatter = go.Scatter(
        x=X_2D["dim1"],
        y=X_2D["dim2"],
        mode='markers',
        # Points are coloured by whether the prediction was correct.
        marker=dict(color=predictions_df['correct_prediction'], colorscale='Viridis', size=8),
        name=f"Perplexity: {perplexity}",
        showlegend=(panel_idx == 0),  # Legend entry only for the first panel
    )
    # Plotly grid positions are 1-based.
    fig.add_trace(scatter, row=grid_row + 1, col=grid_col + 1)

# Figure-level styling
legend_style = dict(
    orientation="h",
    yanchor="bottom",
    xanchor="right",
    y=1.01,
    x=1,
)
fig.update_layout(
    title="t-SNE Projection with Different Perplexities",
    title_font_size=25,
    font_color="black",
    plot_bgcolor="white",
    paper_bgcolor="white",
    width=1200,
    height=800,
    legend=legend_style,
)

# Axis titles are set on the top-left panel only.
fig.update_xaxes(title_text="Dimension 1", row=1, col=1)
fig.update_yaxes(title_text="Dimension 2", row=1, col=1)

fig.show()


In [None]:

def get_n_rows_axes(n_features, n_cols=5, n_rows=None):
    """Lay out ``n_features`` panels on a subplot grid.

    Parameters
    ----------
    n_features : int
        Number of panels to place.
    n_cols : int, default 5
        Number of grid columns.
    n_rows : int, optional
        Number of grid rows. When omitted, the smallest row count that fits
        ``n_features`` panels in ``n_cols`` columns is used.

    Returns
    -------
    tuple
        ``(n_rows, positions)`` where ``positions`` lists every 1-based
        ``(row, col)`` cell of the grid in row-major order.
    """
    # NOTE(review): this helper is defined verbatim in several cells of this
    # notebook; consider defining it once.
    # Honour an explicit row count; previously the argument was silently overwritten.
    if n_rows is None:
        n_rows = int(np.ceil(n_features / n_cols))
    return n_rows, list(product(range(1, n_rows + 1), range(1, n_cols + 1)))

# Compare t-SNE embeddings across perplexities, one subplot per value.
perplexities = [10, 20, 30, 40, 50, 60]
N_COLS = 2  # Number of columns for subplots
n_rows, axes = get_n_rows_axes(len(perplexities), n_cols=N_COLS)

fig = make_subplots(rows=n_rows, cols=N_COLS, subplot_titles=[f"Perplexity: {perp}" for perp in perplexities])

for i, perplexity in enumerate(perplexities):
    tsne_2D = TSNE(n_components=2, n_jobs=-1, random_state=42, perplexity=perplexity)
    X_2D = pd.DataFrame(tsne_2D.fit_transform(predictions_df_top_features), columns=["dim1", "dim2"], index=predictions_df_top_features.index)

    # `axes` already holds 1-based (row, col) positions, which is what Plotly
    # expects; adding 1 again pushed traces past the last column of the grid.
    row_idx, col_idx = axes[i]
    fig.add_trace(
        go.Scatter(
            x=X_2D['dim1'],
            y=X_2D['dim2'],
            mode='markers',
            marker=dict(
                # Marker symbol and colour both encode the correctness flag.
                symbol=predictions_df['correct_prediction'].astype(str),
                color=predictions_df['correct_prediction'],
                opacity=0.7,
                size=6,
                line=dict(width=0.5, color='DarkSlateGrey')
            ),
            name=f"Perplexity: {perplexity}",
            showlegend=(i == 0),  # Legend entry only for the first panel
        ),
        row=row_idx, col=col_idx
    )

# Update layout and show figure
fig.update_layout(
    height=1600,
    width=1600,
    title="Test Dataset - 2D Projection with t-SNE",
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y=1.05,
        x=1,
        title="Correct or Incorrect Prediction",
        itemsizing="constant",
    ),
)
fig.update_xaxes(title_text="Dimension 1")
fig.update_yaxes(title_text="Dimension 2")
fig.show()
