In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, make_scorer, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from ucimlrepo import fetch_ucirepo 
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import precision_recall_fscore_support
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier
# fetch dataset 
# predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
# variable information 
# print(predict_students_dropout_and_academic_success.variables) 

In [15]:
path='Resources/data.csv'
delimiter = ';'
target_column='Target'
not_categorical = ['GDP', 'Inflation rate', 'Unemployment rate', 'Admission grade', 'Previous qualification (grade)','Target']
def load_data(path,target_column,not_categorical=None,delimiter=','):
    df=pd.read_csv(path,delimiter=delimiter)
    df[target_column] = pd.Categorical(df[target_column])
    df[target_column] = df[target_column].cat.codes
    print(df[target_column].value_counts())
    y=df[target_column]
    X=df.drop(columns=target_column)
    if not_categorical != None:
        categorical_df=df.drop(columns=['GDP', 'Inflation rate', 'Unemployment rate', 'Admission grade', 'Previous qualification (grade)','Target'])
        categorical_columns=list(categorical_df.columns)
        X=pd.get_dummies(X,columns=categorical_columns).astype(int) # I put astype int here because my enviorment has some quirk where get dummies gives boolean values
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.2, stratify=y)
    return X_train,X_test,y_train,y_test

In [16]:
def weight_optimizer(X_test,y_test,X_resampled,y_resampled,weight_combinations,best_params = {},best_weights = None,best_f1_score=0):
    for weights in weight_combinations:
        weight = class_weight.compute_sample_weight(weights, y=y_resampled)

        # Train the model
        best_params['eval_metric']= 'aucpr'
        best_params['early_stopping_rounds']=20
        best_params['objective']= 'binary:logistic'
        best_params['random_state']= 1
        model = XGBClassifier(**best_params)
        model.fit(X_resampled, y_resampled,sample_weight=weight,verbose=True,eval_set=[(X_test,y_test)])

        # Convert probabilities to class labels
        predict = model.predict(X_test)
        # Get the precision, recall, f1-score, and support
        precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predict)

        # Find the index of the smallest f1 score
        min_index = f1_score.argmin()

        if f1_score[min_index]>best_f1_score:
            best_weights=weights
            best_f1_score=f1_score[min_index]
            best_f1_score
            best_report=classification_report(y_test, predict)
    print(best_f1_score)
    print(best_weights)
    print(best_report)
    return best_weights

In [17]:
def hyperparameter_optimizer(X_resampled,y_resampled,estimator,param_grid,scoring,n_jobs=4,cv=3,verbose=2,sample_weight=None,n_iter=180):
    weights = class_weight.compute_sample_weight(sample_weight, y=y_resampled)
    rand_search=RandomizedSearchCV(estimator=estimator, param_distributions=param_grid, scoring=scoring, n_jobs=n_jobs, cv=cv, verbose=verbose,n_iter=n_iter)
    rand_search.fit(X_resampled,y_resampled,sample_weight=weights)
    best_params = rand_search.best_params_
    return best_params

In [18]:
def model_rating(X_test,y_test,X_resampled,y_resampled,params,ranking=None,weights={0:1,1:1,2:1}):    
    if ranking!=None:
        id=sorted(ranking, key=lambda x: x['id'], reverse=True)[0]['id']+1
    if ranking==None:
        ranking=[]
        id=1
    weight = class_weight.compute_sample_weight(weights, y=y_resampled)   
    params['eval_metric']= 'aucpr'
    params['early_stopping_rounds']=20
    params['objective']= 'binary:logistic'
    params['random_state']= 1
    model = XGBClassifier(**params)
    model.fit(X_resampled,y_resampled,sample_weight=weight,verbose=True, eval_set=[(X_test,y_test)])
    predict = model.predict(X_test)
    
    # Get the precision, recall, f1-score, and support
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predict)

    # Find the index of the smallest f1 score
    min_f1 = f1_score.argmin()
    min_recall = recall.argmin()
    min_precision = precision.argmin()
    full_report=classification_report(y_test, predict)
    entry= {'id':id,'weights':weights,'parameters':params,'f1':f1_score[min_f1],'recall':recall[min_recall],'precision':precision[min_precision],'full_report':full_report}
    ranking.append(entry)
    best_f1_score = sorted(ranking, key=lambda x: x['f1'], reverse=True)
    best_recall = sorted(ranking, key=lambda x: x['recall'], reverse=True)
    best_precision = sorted(ranking, key=lambda x: x['precision'], reverse=True)
    print('The best f1: ',best_f1_score[0])
    print('The best recall: ',best_recall[0])    
    print('The best precision: ',best_precision[0])  
    return ranking, best_f1_score[0]['f1'], best_recall[0]['recall'], best_precision[0]['precision']

In [19]:
from itertools import combinations
import numpy as np

def generate_class_pairs(y):
    """ Generate all unique pairs of class labels. """
    unique_classes = np.unique(y)
    # print('unique classes: ',unique_classes)
    # print('unique class combos: ', list(combinations(unique_classes, 2)))
    return list(combinations(unique_classes, 2))

def create_binary_dataset(X, y, class_pair):
    """ Create a subset of the dataset for a given pair of classes. """
    mask = np.isin(y, class_pair)
    # print(mask)
    X_binary = X[mask]
    y_binary = y[mask]
    # print(y_binary)
    # Relabel the classes as 0 and 1
    y_binary = np.where(y_binary == class_pair[0], 0, 1)
    #print(y_binary)
    return X_binary, y_binary

In [20]:
def opt(i):
    X_train,X_test,y_train,y_test = load_data(
        path='Resources/data.csv',delimiter = ';',
        target_column='Target',
        not_categorical = ['GDP', 'Inflation rate', 'Unemployment rate', 'Admission grade', 'Previous qualification (grade)','Target']
        )
    # Generate binary datasets for each class pair and store in a list of dictionaries
    class_pairs = generate_class_pairs(y_train)
    binary_datasets = []

    for pair in class_pairs:
        X_binary, y_binary = create_binary_dataset(X_train, y_train, pair)
        binary_datasets.append({
            'pair': pair,
            'X_binary': X_binary,
            'y_binary': y_binary
        })
    X=binary_datasets[i]['X_binary']
    y=binary_datasets[i]['y_binary']
    X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.2, stratify=y)
    # Assuming y_train contains your training labels
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_counts = dict(zip(unique_classes, counts))
    # Calculate the ratio
    # For a binary classification (class 0 and class 1 for example)
    if len(class_counts) == 2:
        ratio = max(counts) / min(counts)
        print("Class Imbalance Ratio:", ratio)
    else:
        print("Class counts:", class_counts)
    # Define the SMOTE method
    smote = SMOTE()
    # Create the resampled feature set
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    base_params= {'eval_metric': 'aucpr', 'objective': 'binary:logistic', 'random_state': 1, 'device': 'cpu'}
    # create model
    xgb_bin_0_1=XGBClassifier(**base_params)
    xgb_bin_0_1.fit(X_resampled,y_resampled,verbose=True,eval_set=[(X_test,y_test)])
    # Create weights to test
    weight_options = {
    'class_0': [.2,.3,.4,.5,.6,.7,.8,.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2],
    'class_1': [.2,.3,.4,.5,.6,.7,.8,.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2],
    }
    ratios=[]
    weight_combinations=[]
    for w0 in weight_options['class_0']:
        for w1 in weight_options['class_1']:
            if w0/w1 not in ratios: # exclude weights who have the same ratio as previous weights to save on computation time
                dic={}
                dic[0]=w0
                dic[1]=w1
                weight_combinations.append(dic)
                ratios.append(w0/w1)
    weight_combinations
    params = {'gamma': uniform(0,.5), 'learning_rate': uniform(.05,.25), 'max_depth': randint(3,7), 'reg_lambda': randint(1,10), 'subsample': uniform(.2,.7), 'scale_pos_weight':uniform(ratio-(ratio/2),ratio)}
    #
    best_params = hyperparameter_optimizer(X_resampled,y_resampled,xgb_bin_0_1,param_grid=params,scoring='f1_weighted')
    best_weights = weight_optimizer(X_test,y_test,X_resampled,y_resampled,weight_combinations,best_params=best_params)
    improvement_threshold = 0.01  # Minimum improvement to continue optimization
    previous_best_score = 0
    ranking, best_f1_score, best_recall, best_precision = model_rating(X_test,y_test,X_resampled,y_resampled,params=best_params,weights=best_weights)
    current_best_score = best_f1_score
    for i in range(0,1):
        while current_best_score - previous_best_score > improvement_threshold:
            previous_best_score = current_best_score
            # Refine hyperparameters with current best weights
            best_params = hyperparameter_optimizer(X_resampled,y_resampled,xgb_bin_0_1,param_grid=params,scoring='f1_weighted',sample_weight=best_weights)

            # Refine weights with current best hyperparameters
            best_weights = best_weights = weight_optimizer(X_test,y_test,X_resampled,y_resampled,weight_combinations,best_params=best_params)

            # Evaluate the model with the new hyperparameters and weights
            ranking, best_f1_score, best_recall, best_precision = model_rating(X_test,y_test,X_resampled,y_resampled,params=best_params,weights=best_weights,ranking=ranking)
            current_best_score = best_f1_score
        # Final evaluation with the test set
    best_f1_score = sorted(ranking, key=lambda x: x['f1'], reverse=True)
    best_recall = sorted(ranking, key=lambda x: x['recall'], reverse=True)
    best_precision = sorted(ranking, key=lambda x: x['precision'], reverse=True)
    print(best_precision[0]['full_report'])
    model_p=best_precision[0]
    print(best_recall[0]['full_report'])
    model_r=best_recall[0]
    print(best_f1_score[0]['full_report'])
    model_f=best_f1_score[0]
    return model_p, model_r, model_f

In [24]:
# model_p_01, model_r_01, model_f_01=opt(0)
# ranking = None
# model_p_02, model_r_02, model_f_02=opt(1)
# ranking = None
# model_p_12, model_r_12, model_f_12=opt(2)
models=[
{'id': 1, 'weights': {0: 1.2, 1: 1.5}, 'parameters': {'gamma': 0.08861231550920928, 'learning_rate': 0.1821225689068316, 'max_depth': 6, 'reg_lambda': 6, 'scale_pos_weight': 2.199993113533957, 'subsample': 0.7846868610097693, 'eval_metric': 'aucpr', 'early_stopping_rounds': 20, 'objective': 'binary:logistic', 'random_state': 1}, 'f1': 0.7482993197278912, 'recall': 0.75, 'precision': 0.6586826347305389, 'full_report': '              precision    recall  f1-score   support\n\n           0       0.91      0.75      0.82       228\n           1       0.66      0.87      0.75       127\n\n    accuracy                           0.79       355\n   macro avg       0.78      0.81      0.79       355\nweighted avg       0.82      0.79      0.80       355\n'},
{'id': 1, 'weights': {0: 1.9, 1: 1.5}, 'parameters': {'gamma': 0.3551562610166091, 'learning_rate': 0.26385003376731664, 'max_depth': 6, 'reg_lambda': 9, 'scale_pos_weight': 1.0084256361043618, 'subsample': 0.6986207661858479, 'eval_metric': 'aucpr', 'early_stopping_rounds': 20, 'objective': 'binary:logistic', 'random_state': 1}, 'f1': 0.8624708624708625, 'recall': 0.8149779735682819, 'precision': 0.8891820580474934, 'full_report': '              precision    recall  f1-score   support\n\n           0       0.92      0.81      0.86       227\n           1       0.89      0.95      0.92       354\n\n    accuracy                           0.90       581\n   macro avg       0.90      0.88      0.89       581\nweighted avg       0.90      0.90      0.90       581\n'},
{'id': 1, 'weights': {0: 1.5, 1: 0.7}, 'parameters': {'gamma': 0.12100925571721932, 'learning_rate': 0.110628940710754, 'max_depth': 5, 'reg_lambda': 1, 'scale_pos_weight': 2.548108314502983, 'subsample': 0.7254879519422643, 'eval_metric': 'aucpr', 'early_stopping_rounds': 20, 'objective': 'binary:logistic', 'random_state': 1}, 'f1': 0.6749999999999999, 'recall': 0.6377952755905512, 'precision': 0.7168141592920354, 'full_report': '              precision    recall  f1-score   support\n\n           0       0.72      0.64      0.67       127\n           1       0.88      0.91      0.89       354\n\n    accuracy                           0.84       481\n   macro avg       0.80      0.77      0.78       481\nweighted avg       0.83      0.84      0.83       481\n'}

]
model_f_01=models[0]
model_f_02=models[1]
model_f_12=models[2]

In [25]:
print(model_f_01['full_report'])
print(model_f_02['full_report'])
print(model_f_12['full_report'])


              precision    recall  f1-score   support

           0       0.91      0.75      0.82       228
           1       0.66      0.87      0.75       127

    accuracy                           0.79       355
   macro avg       0.78      0.81      0.79       355
weighted avg       0.82      0.79      0.80       355

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       227
           1       0.89      0.95      0.92       354

    accuracy                           0.90       581
   macro avg       0.90      0.88      0.89       581
weighted avg       0.90      0.90      0.90       581

              precision    recall  f1-score   support

           0       0.72      0.64      0.67       127
           1       0.88      0.91      0.89       354

    accuracy                           0.84       481
   macro avg       0.80      0.77      0.78       481
weighted avg       0.83      0.84      0.83       481



In [26]:
{0:{0: 909, 1: 508}, 1:{0: 910, 1: 1413}, 2:{0: 508, 1: 1413}}

{0: {0: 909, 1: 508}, 1: {0: 910, 1: 1413}, 2: {0: 508, 1: 1413}}

In [28]:
def ovo_voting(models, X):
    """
    Apply One-vs-One voting mechanism to an array of samples.

    :param models: List of trained binary classifiers.
    :param X: Array of samples to predict.
    :return: Final predicted classes.
    """
    votes = np.zeros((X.shape[0], len(models)))

    for i, model in enumerate(models):
        predictions = model.predict(X)
        for j, pred in enumerate(predictions):
            # Increment the vote for the predicted class
            votes[j, pred] += 1

    # The final class is the one with the most votes
    final_predictions = np.argmax(votes, axis=1)
    return final_predictions
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
class_pairs = generate_class_pairs(y_train)
binary_datasets = []

for pair in class_pairs:
    X_binary, y_binary = create_binary_dataset(X_train, y_train, pair)
    binary_datasets.append({
        'pair': pair,
        'X_binary': X_binary,
        'y_binary': y_binary
    })
#
X=binary_datasets[0]['X_binary']
y=binary_datasets[0]['y_binary']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.2, stratify=y)
smote=SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
xgb_01=XGBClassifier(**model_f_01['parameters'])
weight_01=class_weight.compute_sample_weight(model_f_01['weights'],y=y_resampled)
xgb_01.fit(X_resampled,y_resampled,sample_weight=weight_01,eval_set=[(X_test,y_test)])
#
X=binary_datasets[1]['X_binary']
y=binary_datasets[1]['y_binary']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.2, stratify=y)
smote=SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
xgb_02=XGBClassifier(**model_f_02['parameters'])
weight_02=class_weight.compute_sample_weight(model_f_02['weights'],y=y_resampled)
xgb_02.fit(X_resampled,y_resampled,sample_weight=weight_02,eval_set=[(X_test,y_test)])
#
X=binary_datasets[2]['X_binary']
y=binary_datasets[2]['y_binary']
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=1,test_size=0.2, stratify=y)
smote=SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
xgb_12=XGBClassifier(**model_f_12['parameters'])
weight_12=class_weight.compute_sample_weight(model_f_12['weights'],y=y_resampled)
xgb_12.fit(X_resampled,y_resampled,sample_weight=weight_12,eval_set=[(X_test,y_test)])
#
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = ovo_voting([xgb_01, xgb_02, xgb_12], X_test)
#xgb_12.predict(X_test)

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
[0]	validation_0-aucpr:0.60053
[1]	validation_0-aucpr:0.62484
[2]	validation_0-aucpr:0.61242
[3]	validation_0-aucpr:0.60098
[4]	validation_0-aucpr:0.61206
[5]	validation_0-aucpr:0.61460
[6]	validation_0-aucpr:0.62464
[7]	validation_0-aucpr:0.62356
[8]	validation_0-aucpr:0.65882
[9]	validation_0-aucpr:0.65433
[10]	validation_0-aucpr:0.64187
[11]	validation_0-aucpr:0.65036
[12]	validation_0-aucpr:0.65473
[13]	validation_0-aucpr:0.65877
[14]	validation_0-aucpr:0.66793
[15]	validation_0-aucpr:0.68575
[16]	validation_0-aucpr:0.69783
[17]	validation_0-aucpr:0.69668
[18]	validation_0-aucpr:0.70279
[19]	validation_0-aucpr:0.70216
[20]	validation_0-aucpr:0.70582
[21]	validation_0-aucpr:0.68817
[22]	validation_0-aucpr:0.69783
[23]	validation_0-aucpr:0.69881
[24]	validation_0-aucpr:0.69792
[25]	validation_0-aucpr:0.71155
[26]	validation_0-aucpr:0.73055
[27]	validation_0-aucpr:0.73129
[28]	validation_0-aucpr:0.73143
[29]	validation_0-a

In [29]:
def transform_predictions(binary_classifier, class_pair, X):
    """
    Transform binary predictions from a classifier to original class labels.

    :param binary_classifier: The binary classifier trained on class_pair.
    :param class_pair: Tuple containing the original class labels (e.g., (1, 2)).
    :param X: The input features to predict on.
    :return: Array of predictions with original class labels.
    """
    binary_predictions = binary_classifier.predict(X)
    # Map binary predictions (0, 1) back to original class labels
    original_predictions = np.where(binary_predictions == 0, class_pair[0], class_pair[1])
    return original_predictions

# Example usage for a classifier trained on class pair (1, 2)
# class_pair = (1, 2)
# predictions_12 = transform_predictions(xgb_12, class_pair, X_test)
def ovo_voting(classifiers, class_pairs, X):
    votes = np.zeros((X.shape[0], len(np.unique(np.concatenate(class_pairs)))))

    for classifier, pair in zip(classifiers, class_pairs):
        predictions = transform_predictions(classifier, pair, X)
        for i, pred in enumerate(predictions):
            votes[i, pred] += 1

    final_predictions = np.argmax(votes, axis=1)
    return final_predictions

# classifiers = [model_f_01, model_f_02, model_f_12]
# final_predictions = ovo_voting(classifiers, class_pairs, X_test)


In [30]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = ovo_voting([xgb_01, xgb_02, xgb_12],class_pairs, X_test)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.83      0.68      0.74       284
           1       0.48      0.49      0.49       159
           2       0.81      0.90      0.85       442

    accuracy                           0.75       885
   macro avg       0.71      0.69      0.69       885
weighted avg       0.76      0.75      0.75       885



In [31]:
#
probabilities = xgb_12.predict_proba(X_test)
new_col = np.zeros((probabilities.shape[0]))
probabilities_12 = np.insert(probabilities, 0, new_col, axis=1)
#
probabilities = xgb_02.predict_proba(X_test)
new_col = np.zeros((probabilities.shape[0]))
probabilities_01 = np.insert(probabilities, 1, new_col, axis=1)
#
probabilities = xgb_02.predict_proba(X_test)
new_col = np.zeros((probabilities.shape[0]))
probabilities_02 = np.insert(probabilities, 2, new_col, axis=1)


In [32]:
result = np.add(probabilities_01, probabilities_02, probabilities_12)
predictions = np.argmax(result, axis=1)

In [33]:
probabilities_01
# probabilty array for each classifier, add column of zeros to index corresponding to the missing class for earch binary, add all arrays together predict based on highest number
# same as above except square the probabilities, this is to increase the power of confident guesses and decrease middle of the road guesses.
# Process of elimination voting 
# which model has the most confident prediction -> What number does it predict -> look at the model that predicts the most confident models prefered number and the number not in the most confident model. 

array([[0.20983171, 0.        , 0.7901683 ],
       [0.0575068 , 0.        , 0.9424932 ],
       [0.47997916, 0.        , 0.52002084],
       ...,
       [0.9971354 , 0.        , 0.00286458],
       [0.9864331 , 0.        , 0.01356691],
       [0.0538035 , 0.        , 0.9461965 ]], dtype=float32)

In [34]:
def probability_summation(classifiers, class_pairs, X):
    num_classes = len(np.unique(np.concatenate(class_pairs)))
    total_probabilities = np.zeros((X.shape[0], num_classes))

    for classifier, pair in zip(classifiers, class_pairs):
        # Get probabilities for each class pair
        probs = classifier.predict_proba(X)
        probs = np.where(probs < 0,0,probs)
        # Create a placeholder for adjusted probabilities
        adjusted_probs = np.zeros((probs.shape[0], num_classes))
        
        # Fill in the known probabilities
        adjusted_probs[:, pair[0]] = probs[:, 0]
        adjusted_probs[:, pair[1]] = probs[:, 1]

        # Sum probabilities
        total_probabilities += adjusted_probs

    # Final prediction based on highest probability
    final_predictions = np.argmax(total_probabilities, axis=1)
    return final_predictions


In [35]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = probability_summation([xgb_01, xgb_02, xgb_12],class_pairs, X_test)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.88      0.67      0.76       284
           1       0.49      0.53      0.51       159
           2       0.81      0.90      0.85       442

    accuracy                           0.76       885
   macro avg       0.72      0.70      0.71       885
weighted avg       0.77      0.76      0.76       885



In [44]:
def squared_probability_summation(classifiers, class_pairs, X):
    num_classes = len(np.unique(np.concatenate(class_pairs)))
    total_squared_probabilities = np.zeros((X.shape[0], num_classes))
    i=1
    for classifier, pair in zip(classifiers, class_pairs):
        # Get and square probabilities
        probs = classifier.predict_proba(X) ** 2
        print(probs.mean())

        # Adjust for missing classes and sum squared probabilities
        adjusted_probs = np.zeros((probs.shape[0], num_classes))
        adjusted_probs[:, pair[0]] = probs[:, 0]
        adjusted_probs[:, pair[1]] = probs[:, 1]
        adjusted_probs = np.where(probs[i, 2] < .5,0,adjusted_probs)

        total_squared_probabilities += adjusted_probs
        i+=1

    final_predictions = np.argmax(total_squared_probabilities, axis=1)
    return final_predictions


In [45]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = squared_probability_summation([xgb_01, xgb_02, xgb_12],class_pairs, X_test)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
0.37405366


IndexError: index 2 is out of bounds for axis 1 with size 2

In [None]:
def most_confident_prediction_voting(classifiers, class_pairs, X):
    num_samples = X.shape[0]
    num_classes = len(np.unique(np.concatenate(class_pairs)))
    highest_confidence = np.zeros(num_samples)
    final_predictions = np.zeros(num_samples, dtype=int)

    for classifier, pair in zip(classifiers, class_pairs):
        # Get probability estimates
        probs = classifier.predict_proba(X)

        # Find the most confident predictions
        for i in range(num_samples):
            # Check if this classifier is more confident about its prediction
            class_index = np.argmax(probs[i])
            confidence = probs[i, class_index]

            if confidence > highest_confidence[i]:
                highest_confidence[i] = confidence
                # Map the binary prediction to the original class label
                final_predictions[i] = pair[class_index]

    return final_predictions


In [None]:
def process_of_elimination_voting(classifiers, class_pairs, X):
    num_classes = len(np.unique(np.concatenate(class_pairs)))
    votes = np.zeros((X.shape[0], num_classes))

    for classifier, pair in zip(classifiers, class_pairs):
        # Get probabilities and most confident predictions
        probs = classifier.predict_proba(X)
        confident_predictions = np.argmax(probs, axis=1)

        # Map binary predictions to original class labels
        original_predictions = np.where(confident_predictions == 0, pair[0], pair[1])

        # Increment votes for the most confident predictions
        for i, pred in enumerate(original_predictions):
            votes[i, pred] += probs[i, confident_predictions[i]]

    final_predictions = np.argmax(votes, axis=1)
    return final_predictions


In [None]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = process_of_elimination_voting([xgb_01, xgb_02, xgb_12],class_pairs, X_test)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.87      0.69      0.77       284
           1       0.48      0.48      0.48       159
           2       0.80      0.90      0.85       442

    accuracy                           0.76       885
   macro avg       0.71      0.69      0.70       885
weighted avg       0.76      0.76      0.76       885



In [None]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = most_confident_prediction_voting([xgb_01, xgb_02, xgb_12],class_pairs, X_test)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       284
           1       0.41      0.54      0.46       159
           2       0.84      0.76      0.80       442

    accuracy                           0.72       885
   macro avg       0.68      0.69      0.68       885
weighted avg       0.75      0.72      0.73       885



In [None]:
def consensus_with_fallback_voting(classifiers, class_pairs, X, fallback_method):
    num_samples = X.shape[0]
    final_predictions = np.zeros(num_samples, dtype=int)

    # Get predictions from model_f_01 and model_f_12
    predictions_01 = transform_predictions(classifiers[0], class_pairs[0], X) # model_f_01
    predictions_12 = transform_predictions(classifiers[2], class_pairs[2], X) # model_f_12
    
    for i in range(num_samples):
        if predictions_01[i] == 1 and predictions_12[i] == 1:
            # Both classifiers agree on class 1
            final_predictions[i] = 1
        else:
            # Use fallback method
            final_predictions[i] = fallback_method(classifiers, class_pairs, X.iloc[i].values.reshape(1, -1))

    return final_predictions


In [None]:
X_train,X_test,y_train,y_test=load_data('Resources/data.csv','Target',not_categorical=not_categorical,delimiter=';')
# Example usage
final_predictions = consensus_with_fallback_voting([xgb_01, xgb_02, xgb_12],class_pairs, X_test,process_of_elimination_voting)
print(classification_report(y_test,final_predictions))

Target
2    2209
0    1421
1     794
Name: count, dtype: int64


  final_predictions[i] = fallback_method(classifiers, class_pairs, X.iloc[i].values.reshape(1, -1))


              precision    recall  f1-score   support

           0       0.87      0.69      0.77       284
           1       0.48      0.48      0.48       159
           2       0.80      0.90      0.85       442

    accuracy                           0.76       885
   macro avg       0.71      0.69      0.70       885
weighted avg       0.76      0.76      0.76       885



In [None]:
def create_probability_features(classifiers, class_pairs, X):
    num_samples = X.shape[0]
    num_classes = len(np.unique(np.concatenate(class_pairs)))
    probability_features = np.zeros((num_samples, num_classes * 2))

    for i, (classifier, pair) in enumerate(zip(classifiers, class_pairs)):
        # Get probability estimates for each class pair
        probs = classifier.predict_proba(X)

        # Place these probabilities in the feature matrix
        probability_features[:, i*2:(i+1)*2] = probs

    return probability_features

# Create probability features for the entire dataset
X_train_prob_features = create_probability_features([xgb_01, xgb_02, xgb_12], class_pairs, X_train)
X_test_prob_features = create_probability_features([xgb_01, xgb_02, xgb_12], class_pairs, X_test)

# Train a new multiclass classifier on these features
meta_classifier = XGBClassifier()  # You can choose any classifier
meta_classifier.fit(X_train_prob_features, y_train)

# Make predictions with the new classifier
final_predictions = meta_classifier.predict(X_test_prob_features)


In [None]:
def meta_opt():
    X_train,X_test,y_train,y_test = load_data(
        path='Resources/data.csv',delimiter = ';',
        target_column='Target',
        not_categorical = ['GDP', 'Inflation rate', 'Unemployment rate', 'Admission grade', 'Previous qualification (grade)','Target']
        )
    X_train_prob_features = create_probability_features([xgb_01, xgb_02, xgb_12], class_pairs, X_train)
    X_test_prob_features = create_probability_features([xgb_01, xgb_02, xgb_12], class_pairs, X_test)

    # Define the SMOTE method
    base_params= {'eval_metric': 'aucpr', 'objective': 'multiprob:soft', 'random_state': 1, 'device': 'cpu'}
    # create model
    xgb_bin_0_1=XGBClassifier(**base_params)
    xgb_bin_0_1.fit(X_train_prob_features,y_train,verbose=True,eval_set=[(X_test_prob_features,y_test)])
    # Create weights to test
    weight_options = {
        'class_0': [.2,.3,.4,.5,.6,.7,.8,.9,1],
        'class_1': [1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2],
        'class_2': [.2,.3,.4,.5,.6,.7,.8,.9,1]
    }
    weight_combinations=[]
    for w0 in weight_options['class_0']:
        for w1 in weight_options['class_1']:
            for w2 in weight_options['class_2']:
                dic={}
                dic[0]=w0
                dic[1]=w1
                dic[2]=w2
                weight_combinations.append(dic)
    weight_combinations
    params = {'gamma': uniform(0,.5), 'learning_rate': uniform(.05,.25), 'max_depth': randint(3,7), 'reg_lambda': randint(1,10), 'subsample': uniform(.2,.7)}
    #
    best_params = hyperparameter_optimizer(X_train_prob_features,y_train,xgb_bin_0_1,param_grid=params,scoring='f1_weighted')
    best_weights = weight_optimizer(X_test_prob_features,y_test,X_train_prob_features,y_train,weight_combinations,best_params=best_params)
    improvement_threshold = 0.01  # Minimum improvement to continue optimization
    previous_best_score = 0
    ranking, best_f1_score, best_recall, best_precision = model_rating(X_test_prob_features,y_test,X_train_prob_features,y_train,params=best_params,weights=best_weights)
    current_best_score = best_f1_score
    for i in range(0,1):
        while current_best_score - previous_best_score > improvement_threshold:
            previous_best_score = current_best_score
            # Refine hyperparameters with current best weights
            best_params = hyperparameter_optimizer(X_train_prob_features,y_train,xgb_bin_0_1,param_grid=params,scoring='f1_weighted',sample_weight=best_weights)

            # Refine weights with current best hyperparameters
            best_weights = best_weights = weight_optimizer(X_test_prob_features,y_test,X_train_prob_features,y_train,weight_combinations,best_params=best_params)

            # Evaluate the model with the new hyperparameters and weights
            ranking, best_f1_score, best_recall, best_precision = model_rating(X_test_prob_features,y_test,X_train_prob_features,y_train,params=best_params,weights=best_weights,ranking=ranking)
            current_best_score = best_f1_score
        # Final evaluation with the test set
    best_f1_score = sorted(ranking, key=lambda x: x['f1'], reverse=True)
    best_recall = sorted(ranking, key=lambda x: x['recall'], reverse=True)
    best_precision = sorted(ranking, key=lambda x: x['precision'], reverse=True)
    print(best_precision[0]['full_report'])
    model_p=best_precision[0]
    print(best_recall[0]['full_report'])
    model_r=best_recall[0]
    print(best_f1_score[0]['full_report'])
    model_f=best_f1_score[0]
    return model_p, model_r, model_f

In [None]:
model_p, model_r, model_f = meta_opt()
print(model_f['full_report'])

Target
2    2209
0    1421
1     794
Name: count, dtype: int64
[0]	validation_0-aucpr:0.72196
[1]	validation_0-aucpr:0.72761
[2]	validation_0-aucpr:0.72870
[3]	validation_0-aucpr:0.73026
[4]	validation_0-aucpr:0.73129
[5]	validation_0-aucpr:0.72679
[6]	validation_0-aucpr:0.72736
[7]	validation_0-aucpr:0.73237
[8]	validation_0-aucpr:0.73185
[9]	validation_0-aucpr:0.73388
[10]	validation_0-aucpr:0.73253
[11]	validation_0-aucpr:0.73042
[12]	validation_0-aucpr:0.73041
[13]	validation_0-aucpr:0.72930
[14]	validation_0-aucpr:0.72957
[15]	validation_0-aucpr:0.73031
[16]	validation_0-aucpr:0.73009
[17]	validation_0-aucpr:0.73134
[18]	validation_0-aucpr:0.72918
[19]	validation_0-aucpr:0.72888
[20]	validation_0-aucpr:0.72807
[21]	validation_0-aucpr:0.72868
[22]	validation_0-aucpr:0.72868
[23]	validation_0-aucpr:0.72892
[24]	validation_0-aucpr:0.72822
[25]	validation_0-aucpr:0.72820
[26]	validation_0-aucpr:0.72794
[27]	validation_0-aucpr:0.72768
[28]	validation_0-aucpr:0.72774
[29]	validation_0-a