In [95]:
# Basics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots


# Tools
from copy import copy # Shallow copy
from itertools import product
from collections import defaultdict
from functools import partial
from IPython.display import display # Allows functions to simultaneously return values and show tables

# Styling
from colorama import Fore
from colorama import Style
from matplotlib.colors import Colormap


# Assessing Feature Importance
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import mutual_info_classif

# Pipeline
from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer




from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score



# t-SNE
from sklearn.manifold import TSNE


# Dendogram
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform


# Kde Plots
from scipy.stats import gaussian_kde



# Probability plots
from scipy.stats import probplot

# The Tree Trio
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import scipy.stats as stats

# Good ol utils
from utils import *

# Sequential Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

In [96]:
import pandas as pd
train = pd.read_csv(r"C:\Users\Nebula PC\Desktop\Projects\Academic-Success-Prediction\data\train.csv", index_col="id").rename(columns=str.strip)
test = pd.read_csv(r"C:\Users\Nebula PC\Desktop\Projects\Academic-Success-Prediction\data\test.csv", index_col="id").rename(columns=str.strip)

target = "Target"

value_mapping = {
    'Enrolled': 2,
    'Dropout': 0,
    'Graduate': 1
}

# Replace the values in the "Target" column
train['Target'] = train['Target'].replace(value_mapping)

In [97]:
TOP_12_FEATURES = [
    'Curricular units 2nd sem (approved)',
    'Curricular units 1st sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Tuition fees up to date',
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (enrolled)',
    'Scholarship holder',
    'Curricular units 1st sem (evaluations)',
    'Course',
    'Curricular units 1st sem (enrolled)'
]


BOTTOM_15_FEATURES = [
    'International',
    'Educational special needs',
    'Nacionality',
    'Marital status',
    'Inflation rate',
    'Curricular units 1st sem (without evaluations)',
    'Father\'s qualification',
    'Mother\'s qualification',
    'Displaced',
    'Curricular units 2nd sem (credited)',
    'Curricular units 1st sem (credited)',
    'Application order',
    'Previous qualification',
    'Daytime/evening attendance',
    'Curricular units 2nd sem (without evaluations)'
]




#SELECTED_FEATURES = [
#    'Curricular units 2nd sem (approved)',
#    'Tuition fees up to date',
#    'Curricular units 1st sem (grade)',
#    'Curricular units 2nd sem (evaluations)',
#    'Scholarship holder',
#    'Curricular units 1st sem (evaluations)',
#    'Course',
#    'Curricular units 1st sem (enrolled)'
#]



SQUARE_TRANSFORM_COLS = [
    'Curricular units 2nd sem (grade)',
    'Curricular units 1st sem (grade)'
]

YEOJOHNSON_TRANSFORM_COLS = ['Age at enrollment']

NO_TRANSFORM_COLS = [
        'Previous qualification (grade)', 'Admission grade',
        'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (evaluations)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (without evaluations)',
        'Curricular units 2nd sem (credited)',
        'Curricular units 2nd sem (enrolled)',
        'Curricular units 2nd sem (evaluations)',
        'Curricular units 2nd sem (approved)',
        'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',
        'Inflation rate', 'GDP'
]




BINARY_COLUMNS = []
for column in train.columns:
    if len(train[column].value_counts()) == 2:
        BINARY_COLUMNS.append(column)

binary_data = train[BINARY_COLUMNS+ ['Target']]



ONE_HOT_COLUMNS = ['Marital status', 
                   'Application order', 
                   'Application mode', 
                   'Course', 
                   'Previous qualification', 
                   'Nacionality', 
                   'Mother\'s qualification', 
                   'Father\'s qualification', 
                   'Mother\'s occupation', 
                   'Father\'s occupation']



one_hot_data = train[ONE_HOT_COLUMNS + ['Target']]




NUMERIC_COLUMNS = train.drop(columns = ONE_HOT_COLUMNS + BINARY_COLUMNS + ['Target']).columns.to_list()
numeric_data = train[NUMERIC_COLUMNS + ['Target']]



FLOAT_COLUMNS = train[NUMERIC_COLUMNS].select_dtypes('float').columns.to_list()
float_data = train[FLOAT_COLUMNS + ['Target']]



INTEGER_COLUMNS = train[NUMERIC_COLUMNS].select_dtypes('int').columns.to_list()
integer_data = train[INTEGER_COLUMNS + ['Target']]




BINARY_INDICATOR_FEATURES = [
                             'Curricular units 2nd sem (grade)',
                             'Curricular units 2nd sem (approved)',
                             'Curricular units 2nd sem (evaluations)',
                             'Curricular units 1st sem (grade)',
                             'Curricular units 1st sem (approved)',
                             'Curricular units 1st sem (evaluations)',
]



ONE_HOT_COLUMNS_SELECTED = [
                   'Application mode', 
                   'Course', 
                   'Mother\'s occupation', 
                   'Father\'s occupation']



In [98]:
#pearson_corr, lower_tri_corr = correlation_values(train[SELECTED_FEATURES + ['Target']], target, 15, 1000, 900)

In [99]:


def feature_engineering(df):
    df['sum_evaluations'] = df['Curricular units 2nd sem (evaluations)'] + df['Curricular units 1st sem (evaluations)']
    df['min_evaluations'] = df[['Curricular units 2nd sem (evaluations)', 'Curricular units 1st sem (evaluations)']].min(axis = 1)
    df['max_evaluations'] = df[['Curricular units 2nd sem (evaluations)', 'Curricular units 1st sem (evaluations)']].max(axis = 1)
    df['difference_evaluations'] = abs(df['Curricular units 2nd sem (evaluations)'] - df['Curricular units 1st sem (evaluations)'])
    df['change_evaluations'] = df['Curricular units 2nd sem (evaluations)'] - df['Curricular units 1st sem (evaluations)']

    df['sum_enrolled'] = df['Curricular units 2nd sem (enrolled)'] + df['Curricular units 1st sem (enrolled)']
    df['min_enrolled'] = df[['Curricular units 2nd sem (enrolled)', 'Curricular units 1st sem (enrolled)']].min(axis = 1)
    df['max_enrolled'] = df[['Curricular units 2nd sem (enrolled)', 'Curricular units 1st sem (enrolled)']].max(axis = 1)
    df['difference_enrolled'] = abs(df['Curricular units 2nd sem (enrolled)'] - df['Curricular units 1st sem (enrolled)'])
    df['change_enrolled'] = df['Curricular units 2nd sem (enrolled)'] - df['Curricular units 1st sem (enrolled)']


    #IMPORTANT
    df['sum_grade'] = df['Curricular units 2nd sem (grade)'] + df['Curricular units 1st sem (grade)']
    df['min_grade'] = df[['Curricular units 2nd sem (grade)', 'Curricular units 1st sem (grade)']].min(axis = 1)
    df['max_grade'] = df[['Curricular units 2nd sem (grade)', 'Curricular units 1st sem (grade)']].max(axis = 1)
    df['difference_grade'] = abs(df['Curricular units 2nd sem (grade)'] - df['Curricular units 1st sem (grade)'])
    df['change_grade'] = df['Curricular units 2nd sem (grade)'] - df['Curricular units 1st sem (grade)']



    #IMPORTANT
    df['sum_approved'] = df['Curricular units 2nd sem (approved)'] + df['Curricular units 1st sem (approved)']
    df['min_approved'] = df[['Curricular units 2nd sem (approved)', 'Curricular units 1st sem (approved)']].min(axis = 1)
    df['max_approved'] = df[['Curricular units 2nd sem (approved)', 'Curricular units 1st sem (approved)']].max(axis = 1)
    df['difference_approved'] = abs(df['Curricular units 2nd sem (approved)'] - df['Curricular units 1st sem (approved)'])
    df['change_approved'] = df['Curricular units 2nd sem (approved)'] - df['Curricular units 1st sem (approved)']


    #Interactions using important features
    df['approved_minus_grade_s1'] = df['Curricular units 1st sem (approved)'] - df['Curricular units 1st sem (grade)']
    df['grade_minus_approved_s1'] = df['Curricular units 1st sem (grade)'] - df['Curricular units 1st sem (approved)']
    df['approved_minus_grade_s2'] = df['Curricular units 2nd sem (approved)'] - df['Curricular units 2nd sem (grade)']
    df['grade_minus_approved_s2'] = df['Curricular units 2nd sem (grade)'] - df['Curricular units 2nd sem (approved)']

    df['approved_add_grade_s1'] = df['Curricular units 1st sem (approved)'] + df['Curricular units 1st sem (grade)']
    df['approved_add_grade_s2'] = df['Curricular units 2nd sem (approved)'] + df['Curricular units 2nd sem (grade)']

    df['approved_add_grade_s1_s2'] = df['approved_add_grade_s1'] + df['approved_add_grade_s2']

    #other interactions
    df['curricular_units_sum_s1'] = df['Curricular units 1st sem (evaluations)'] + df['Curricular units 1st sem (enrolled)'] + df['Curricular units 1st sem (grade)'] + df['Curricular units 1st sem (approved)']
    df['curricular_units_sum_s2'] = df['Curricular units 2nd sem (evaluations)'] + df['Curricular units 2nd sem (enrolled)'] + df['Curricular units 2nd sem (grade)'] + df['Curricular units 2nd sem (approved)']
    df['curricular_units_sum_s1_s2'] = df['curricular_units_sum_s1'] + df['curricular_units_sum_s2']
    df['curricular_units_difference_s1_s2'] = abs(df['curricular_units_sum_s2'] - df['curricular_units_sum_s1'])
    df['curricular_units_change_s1_s2'] = df['curricular_units_sum_s2'] - df['curricular_units_sum_s1']

    
    df['fees_plus_scholarship'] = df['Scholarship holder'] + 2*df['Tuition fees up to date']  # Use 2* so we can differentiate between scholarship and fees

    for col in BINARY_INDICATOR_FEATURES:
        df[f'{col} (binary)'] = (df[col] > 0).astype(int)




    #df = df.drop(columns = BOTTOM_15_FEATURES)

    return df


    


In [100]:
train = feature_engineering(train)
test = feature_engineering(test)




#train['Scholarship holder'].value_counts()

In [101]:
pearson_corr, lower_tri_corr = correlation_values(train, target, 15, 1000, 900)

In [102]:
NUMERIC_COLUMNS = train.drop(columns = ONE_HOT_COLUMNS + BINARY_COLUMNS + ['Target']).columns.to_list()
numeric_data = train[NUMERIC_COLUMNS + ['Target']]

In [103]:
positive_features = list(train[NUMERIC_COLUMNS].describe().T.query("min > 0").index)
zero_features = list(train[NUMERIC_COLUMNS].describe().T.query("min == 0").index)
negative_features = list(train[NUMERIC_COLUMNS].describe().T.query("min < 0").index)

In [104]:
r2_scores = defaultdict(tuple)

for feature in NUMERIC_COLUMNS:
    orig = numeric_data[feature].dropna()
    if feature in positive_features:
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log) = probplot(np.log(orig), rvalue=True)
        _, (*_, R_log1p) = probplot(np.log1p(orig), rvalue=True)
        #_, (*_, R_exp) = probplot(np.exp(orig), rvalue=True)
        _, (*_, R_sqrt) = probplot(np.sqrt(orig), rvalue=True)
        _, (*_, R_square) = probplot(np.square(orig), rvalue=True)
        _, (*_, R_reci) = probplot(np.reciprocal(orig), rvalue=True)
        _, (*_, R_boxcox) = probplot(stats.boxcox(orig)[0], rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)
    elif feature in zero_features:
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(np.log1p(orig), rvalue=True)
        #_, (*_, R_exp) = probplot(np.exp(orig), rvalue=True)
        _, (*_, R_sqrt) = probplot(np.sqrt(orig), rvalue=True)
        _, (*_, R_square) = probplot(np.square(orig), rvalue=True)
        _, (*_, R_reci) = probplot(orig, rvalue=True)
        _, (*_, R_boxcox) = probplot(orig, rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)

    elif feature in negative_features:
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(orig, rvalue=True)
        #_, (*_, R_exp) = probplot(np.exp(orig), rvalue=True)
        _, (*_, R_sqrt) = probplot(orig, rvalue=True)
        _, (*_, R_square) = probplot(np.square(orig), rvalue=True)
        _, (*_, R_reci) = probplot(orig, rvalue=True)
        _, (*_, R_boxcox) = probplot(orig, rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)



    r2_scores[feature] = (
        R_orig * R_orig,
        R_log * R_log,
        R_log1p * R_log1p,
        #R_exp * R_exp,
        R_sqrt * R_sqrt,
        R_square * R_square,
        R_reci * R_reci,
        R_boxcox * R_boxcox,
        R_yeojohn * R_yeojohn
    )

r2_scores = pd.DataFrame(
    r2_scores, index=("Original", "Log", "Log1p", "Sqrt", "Square", "Reciprocal", "BoxCox", "YeoJohnson")
).T

r2_scores["HighestScore"] = r2_scores[["Original", "Log", "Log1p", "Sqrt", "Square", "Reciprocal", "BoxCox", "YeoJohnson"]].max(axis = 1)
r2_scores["Winner"] = r2_scores.idxmax(axis=1)


def highlight_max(s):
    is_max = s == s.max()
    return [f'background-color: {TEAL}' if v else '' for v in is_max]

r2_scores['Improvement'] = r2_scores['HighestScore'] - r2_scores['Original']
r2_scores.style.set_table_styles(DF_STYLE).apply(highlight_max, subset= ["Original", "Log", "Log1p", "Sqrt", "Square", "Reciprocal", "BoxCox", "YeoJohnson"], axis=1).background_gradient(cmap = DF_CMAP2, subset = 'Improvement').format(precision = 3)

Unnamed: 0,Original,Log,Log1p,Sqrt,Square,Reciprocal,BoxCox,YeoJohnson,HighestScore,Winner,Improvement
Previous qualification (grade),0.979,0.979,0.979,0.98,0.966,0.968,0.98,0.98,0.98,YeoJohnson,0.002
Admission grade,0.984,0.991,0.991,0.989,0.964,0.985,0.991,0.991,0.991,Log1p,0.007
Age at enrollment,0.653,0.728,0.726,0.693,0.558,0.0,0.864,0.864,0.864,BoxCox,0.212
Curricular units 1st sem (credited),0.152,0.152,0.189,0.19,0.087,0.152,0.152,0.195,0.195,YeoJohnson,0.042
Curricular units 1st sem (enrolled),0.69,0.69,0.478,0.522,0.51,0.69,0.69,0.69,0.69,YeoJohnson,0.0
Curricular units 1st sem (evaluations),0.91,0.91,0.673,0.743,0.715,0.91,0.91,0.91,0.91,YeoJohnson,0.0
Curricular units 1st sem (approved),0.87,0.87,0.752,0.77,0.701,0.87,0.87,0.866,0.87,Original,0.0
Curricular units 1st sem (grade),0.69,0.69,0.571,0.599,0.835,0.69,0.69,0.759,0.835,Square,0.144
Curricular units 1st sem (without evaluations),0.126,0.126,0.162,0.165,0.045,0.126,0.126,0.166,0.166,YeoJohnson,0.041
Curricular units 2nd sem (credited),0.134,0.134,0.171,0.173,0.069,0.134,0.134,0.178,0.178,YeoJohnson,0.044


In [None]:
SELECTED_FEATURE = 'difference_grade'


plot_data = train[SELECTED_FEATURE].dropna().sample(20000)
(osm, osr), (slope, intercept, R) = probplot(plot_data, rvalue=True)
x_theory = np.array([osm[0], osm[-1]])
y_theory = intercept + slope * x_theory

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["Probability Plot against Normal Distribution", "Histogram"],
)

fig.add_scatter(x=osm, y=osr, mode="markers", row=1, col=1, name="AB")
fig.add_scatter(x=x_theory, y=y_theory, mode="lines", row=1, col=1)
fig.add_annotation(
    x=-1.25,
    y=osr[-1] * 0.4,
    text=f"R\u00b2 = {R * R:.3f}",
    showarrow=False,
    row=1,
    col=1,
)
fig.update_yaxes(title_text="Observed Values", row=1, col=1)
fig.update_xaxes(title_text="Theoretical Quantiles", row=1, col=1)
fig.update_traces(
    marker=dict(size=3, symbol="circle", line=dict(width=2, color=DARK_TEAL)),
    line_color= ORANGE,
)

fig.add_histogram(
    x=plot_data,
    marker_color= DARK_TEAL,
    opacity=0.75,
    name="Mothers Occupation",
    row=1,
    col=2,
)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.update_xaxes(title_text="Mothers Occupation", row=1, col=2)

fig.update_layout(
    font_color=FONT_COLOR,
    title="Mothers Occupation Feature - Original",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    showlegend=False,
    width=1600,
    height=900,
    bargap=0.2,
)

fig.update_annotations(font_size=14)
fig.show()



In [105]:

def transform_data(transformation, sample_fraction = 1.0):
    if transformation == 'log':
        transformed_plot_data = np.log(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))
    elif transformation == 'log1p':
        transformed_plot_data = np.log1p(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))
    elif transformation == 'log10':
        transformed_plot_data = np.log10(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction)) # extra
    elif transformation == 'sqrt':
        transformed_plot_data = np.sqrt(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))
    elif transformation == 'square':
        transformed_plot_data = np.square(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))
    elif transformation == 'reciprocal':
        transformed_plot_data = np.reciprocal(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))
    elif transformation == 'boxcox':
        transformed_plot_data = stats.boxcox(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))[0]
    elif transformation == 'yeojohnson':
        transformed_plot_data = stats.yeojohnson(train[SELECTED_FEATURE].dropna().sample(frac = sample_fraction))[0]

    return transformed_plot_data



TRANSFORMATION = 'yeojohnson'

transformed_plot_data = transform_data(TRANSFORMATION, 0.5)


(osm, osr), (slope, intercept, R) = probplot(transformed_plot_data, rvalue=True)
x_theory = np.array([osm[0], osm[-1]])
y_theory = intercept + slope * x_theory

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=["Probability Plot against Normal Distribution", "Histogram"],
)

fig.add_scatter(x=osm, y=osr, mode="markers", row=1, col=1, name="Yeo Johnson(Mothers Occupation)")
fig.add_scatter(x=x_theory, y=y_theory, mode="lines", row=1, col=1)
fig.add_annotation(
    x=-1.25,
    y=osr[-1] * 0.6,
    text=f"R\u00b2 = {R * R:.3f}",
    showarrow=False,
    row=1,
    col=1,
)
fig.update_yaxes(title_text="Observed Values", row=1, col=1)
fig.update_xaxes(title_text="Theoretical Quantiles", row=1, col=1)
fig.update_traces(
    marker=dict(size=3, symbol="circle", line=dict(width=2, color=DARK_TEAL)),
    line_color=ORANGE,
)

fig.add_histogram(
    x=transformed_plot_data,
    marker_color=DARK_TEAL,
    opacity=0.75,
    name="Mother\'s Occupation(Yeo Johnson)",
    row=1,
    col=2,
)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.update_xaxes(title_text=f"{SELECTED_FEATURE} ({TRANSFORMATION})", row=1, col=2)

fig.update_layout(
    font_color=FONT_COLOR,
    title=f"{SELECTED_FEATURE} Feature - {TRANSFORMATION} Transformation",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    showlegend=False,
    width=1600,
    height=900,
    bargap=0.2,
)

fig.update_annotations(font_size=14)
fig.show()

In [106]:
no_transform_cols = r2_scores.query("Improvement < 0.01").index
log_transform_cols = r2_scores.query("Winner == 'Log' & Improvement >= 0.01").index
log1p_transform_cols = r2_scores.query("Winner == 'Log1p' & Improvement >= 0.01").index
sqrt_transform_cols = r2_scores.query("Winner == 'Sqrt' & Improvement >= 0.01").index
square_transform_cols = r2_scores.query("Winner == 'Square' & Improvement >= 0.01").index
reciprocal_transform_cols = r2_scores.query("Winner == 'Reciprocal' & Improvement >= 0.01").index
boxcox_transform_cols = r2_scores.query("Winner == 'BoxCox' & Improvement >= 0.01").index
yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson' & Improvement >= 0.01").index






In [107]:
#log_transform_cols
#log1p_transform_cols
#sqrt_transform_cols
#square_transform_cols
#boxcox_transform_cols
#yeojohnson_transform_cols

from sklearn.preprocessing import MinMaxScaler

In [110]:
column_transformers = make_pipeline(
    make_column_transformer(
        (
            StandardScaler(),
            no_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.log1p, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log1p_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.sqrt, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            sqrt_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.square, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            square_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="box-cox", standardize=True),
            boxcox_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols.to_list(),
        ),
        
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
)

In [111]:
X = train.drop(target, axis=1)
y = train[target]


# We're not altering our X and y, we're just checking to make sure the pipeline works 
X_processed = column_transformers.fit_transform(X)
X_processed_frame = pd.DataFrame(
    X_processed,
    columns=column_transformers.get_feature_names_out(),
    index=X.index,
)
X_processed_frame.head().style.set_table_styles(DF_STYLE).format(precision=3)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

In [64]:
n_bags = 5
n_folds = 5
np.random.seed(42)
seeds = np.random.randint(0, 19937, size=n_bags)

classifiers = ['rf', 'cat', 'lgbm', 'xgb']
forest_info_average = np.zeros(X.shape[1]) #36 features
cat_info_average = np.zeros(X.shape[1])
lgbm_info_average = np.zeros(X.shape[1])
xgb_info_average = np.zeros(X.shape[1])

for selected_classifier in classifiers:
    for seed in seeds:
        skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

        for train_ids, valid_ids in skfold.split(X, y):
            X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
            X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

            X_train = column_transformers.fit_transform(X_train)
            X_valid = column_transformers.transform(X_valid)


        if selected_classifier == 'rf':
            classifier = RandomForestClassifier(class_weight="balanced", random_state = seed, n_jobs = -1, max_features = 1)
            classifier.fit(X_train, y_train)
            print(f"forest Train Accuracy: {accuracy_score(y_train, classifier.predict(X_train))}")
            print(f"forest Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            forest_info = classifier.feature_importances_
            forest_info = forest_info / forest_info.sum()
            forest_info_average += forest_info


        elif selected_classifier == 'cat':
            classifier = CatBoostClassifier(random_state = seed, auto_class_weights= "Balanced", classes_count= 3, thread_count= -1, rsm = 1, learning_rate = 0.02)
            classifier.fit(X_train, y_train)
            print(f"cat Train Accuracy: {accuracy_score(y_train, classifier.predict(X_train))}")
            print(f"cat Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            cat_info = classifier.feature_importances_
            cat_info = cat_info / cat_info.sum()
            cat_info_average += cat_info


        elif selected_classifier == 'lgbm':
            classifier = LGBMClassifier(random_state = seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', max_depth = 15, n_estimators= 200, verbose = 10, colsample_bytree= 0.9, reg_alpha = 1, reg_lambda = 1, learning_rate = 0.02)
            classifier.fit(X_train, y_train)
            print(f"lgbm Train Accuracy: {accuracy_score(y_train, classifier.predict(X_train))}")
            print(f"lgbm Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            lgbm_info = classifier.feature_importances_
            lgbm_info = lgbm_info / lgbm_info.sum()
            lgbm_info_average += lgbm_info


        elif selected_classifier == 'xgb':
            classifier = XGBClassifier(random_state=seed, n_jobs = -1, objective = 'multiclass', max_depth = 5, n_estimators= 100, reg_alpha = 2, colsample_bytree = 0.5, reg_lambda = 1, learning_rate = 0.02)
            classifier.fit(X_train, y_train)
            print(f"xgb Train Accuracy: {accuracy_score(y_train, classifier.predict(X_train))}")
            print(f"xgb Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            xgb_info = classifier.feature_importances_
            xgb_info = xgb_info / xgb_info.sum()
            xgb_info_average += xgb_info

    forest_info_average = forest_info_average/n_bags/n_folds
    cat_info_average = cat_info_average/n_bags/n_folds
    lgbm_info_average = lgbm_info_average/n_bags/n_folds
    xgb_info_average = xgb_info_average/n_bags/n_folds
            
importances = pd.DataFrame(
    [forest_info, cat_info, lgbm_info, xgb_info],
    columns=column_transformers.get_feature_names_out(),
    index=["FOREST", "CAT","LGBM", "XGB"],
).T

forest Train Accuracy: 0.9999836641346075
forest Accuracy: 0.8210154871593806
forest Train Accuracy: 0.9999673282692151
forest Accuracy: 0.8261778736195518
forest Train Accuracy: 0.9999673282692151
forest Accuracy: 0.8229758870809645
forest Train Accuracy: 0.9999509924038226
forest Accuracy: 0.8216689537999086
forest Train Accuracy: 0.9999509924038226
forest Accuracy: 0.8201659805266941
0:	learn: 1.0795055	total: 24.9ms	remaining: 24.9s
1:	learn: 1.0615014	total: 48.9ms	remaining: 24.4s
2:	learn: 1.0445040	total: 71.8ms	remaining: 23.8s
3:	learn: 1.0281187	total: 89.9ms	remaining: 22.4s
4:	learn: 1.0120461	total: 107ms	remaining: 21.3s
5:	learn: 0.9969817	total: 125ms	remaining: 20.6s
6:	learn: 0.9824918	total: 142ms	remaining: 20.1s
7:	learn: 0.9687516	total: 159ms	remaining: 19.7s
8:	learn: 0.9552281	total: 175ms	remaining: 19.2s
9:	learn: 0.9425365	total: 192ms	remaining: 19s
10:	learn: 0.9305298	total: 208ms	remaining: 18.7s
11:	learn: 0.9187472	total: 227ms	remaining: 18.6s
12:	le

In [65]:
importance_score_summary = importances.T.describe().T.sort_values(by = 'mean', ascending = False)
importance_score_summary['impor_score'] = range(1, len(importance_score_summary) + 1)
importance_score_summary[['impor_score', 'min', 'max', 'mean']].style.set_table_styles(DF_STYLE).background_gradient(
                                                                        cmap = DF_CMAP2, 
                                                                        subset = ['mean'], 
                                                                        vmin = importance_score_summary['mean'].min(),
                                                                        vmax = importance_score_summary['mean'].mean(),
                                                            ).background_gradient(
                                                                        cmap = DF_CMAP, 
                                                                        subset = 'min', 
                                                                        vmin = importance_score_summary['min'].min(),
                                                                        vmax = importance_score_summary['min'].mean(),
                                                            ).background_gradient(
                                                                        cmap = DF_CMAP, 
                                                                        subset = 'max', 
                                                                        vmin = importance_score_summary['max'].min(),
                                                                        vmax = importance_score_summary['max'].mean(),
                                                                        
                                                            ).format(precision = 3)

Unnamed: 0,impor_score,min,max,mean
Curricular units 2nd sem (approved),1,0.018,0.216,0.09
min_approved,2,0.019,0.179,0.074
sum_approved,3,0.026,0.104,0.053
approved_add_grade_s2,4,0.031,0.057,0.043
fees_plus_scholarship,5,0.012,0.086,0.043
Course,6,0.002,0.069,0.032
approved_add_grade_s1_s2,7,0.016,0.039,0.03
sum_evaluations,8,0.017,0.038,0.025
Admission grade,9,0.001,0.045,0.022
Age at enrollment,10,0.005,0.037,0.021


In [66]:
importances_melted_frame = (
    importances.melt(
        var_name="Method",
        value_name="Importance",
        ignore_index=False,
    )
    .reset_index()
    .rename(columns={"index": "Feature"})
    .round(4)
)

fig = px.bar(
    importances_melted_frame,
    x="Importance",
    y="Feature",
    color="Importance",
    facet_col="Method",
    facet_col_spacing=0.07,
    height=2000,
    width=1800,
    color_continuous_scale=color_map,
    title="Normalised Feature Importances (Three Different Default Methods)",
)
fig.update_annotations(font_size=20)
fig.update_yaxes(
    matches=None,
    showticklabels=True,
    categoryorder="total ascending",
    tickfont_size=15,
)
fig.update_xaxes(matches=None)
fig.update_traces(width=0.7)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=25,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    coloraxis_colorbar=dict(
        orientation="h",
        title_side="bottom",
        yanchor="bottom",
        xanchor="center",
        title=None,
        y=-0.05,
        x=0.5,
    ),
)
fig.show()

In [None]:
import optuna
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import numpy as np
from optuna.samplers import TPESampler, CmaEsSampler
import optuna.visualization as vis

In [None]:
import optuna
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import numpy as np
from optuna.samplers import TPESampler, CmaEsSampler
import optuna.visualization as vis


# Define the objective function for Optuna
def lgbm_objective(trial):
    params = {
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 6e-1, log = True),  # Learning rate
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log = True),  # Minimum loss reduction
        'max_depth': trial.suggest_int('max_depth', 2, 12),  # Maximum depth of a tree
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log = True),  # Minimum sum of instance weight
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),  # Subsample ratio of training instances
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),  # Subsample ratio of columns
        'lambda': trial.suggest_float('lambda', 1e-9, 1.0, log = True),  # L2 regularization term
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),  # L1 regularization term
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),  # Balance of positive and negative weights
        'n_estimators': trial.suggest_int('n_estimators', 50, 800),  # Number of boosting rounds
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart'])#, 'goss']) # goss performs bad
    }
    
    early_stopping_param = 50 if params['boosting_type'] != 'dart' else None

    # Cross-validation with StratifiedKFold
    np.random.seed(30)
    n_bags = 1
    seeds = np.random.randint(0, 19937, size=n_bags)
    cv_results = []

    for seed in seeds:
        skfold = StratifiedKFold(n_splits=6, shuffle=True, random_state = seed)
    
        for train_ids, valid_ids in skfold.split(X, y):
            
            X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
            X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

            X_train = column_transformers.fit_transform(X_train)
            X_valid = column_transformers.transform(X_valid)
        

            #model = instantiate_learner(trial)
            model = LGBMClassifier(**params, eval_metric='logloss', early_stopping = early_stopping_param, n_jobs = -1)
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        
            y_val_pred = model.predict(X_valid)
            accuracy = accuracy_score(y_valid, y_val_pred)
            cv_results.append(accuracy)
    
    #mean_accuracy = np.mean(cv_results)
    final_score = np.min([np.mean(cv_results), np.median(cv_results)])
    return final_score

In [None]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(lgbm_objective, n_trials = 50, n_jobs= -1)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

In [None]:
#lgbm_params = {'learning_rate': 0.06718684084319468, 
#               'gamma': 5.467996775023497e-07, 
#               'max_depth': 11, 
#               'min_child_weight': 1.6614006326681152, 
#               'subsample': 0.9422754714677312, 
#               'colsample_bytree': 0.5841014160265492, 
#               'lambda': 1.6018054020032332e-05, 
#               'alpha': 2.4887387067764657e-07, 
#               'scale_pos_weight': 2.8561403193820327, 
#               'n_estimators': 170
#}
#
#
#lgbm2_params = {'learning_rate': 0.19624105311205217, 
#                'gamma': 0.0010089640346807461, # Apparently an unknown parameter
#                'max_depth': 4, 
#                'min_child_weight': 0.2157143281759279, 
#                'subsample': 0.8179877607299206, 
#                'colsample_bytree': 0.5544076591945851, 
#                'lambda': 1.410271889281656e-07, 
#                'alpha': 0.09228982699940794, 
#                'scale_pos_weight': 1.1921114468810923, 
#                'n_estimators': 142}
#
#
#xgb_params = {'learning_rate': 0.09638393981620358, 
#               'gamma': 4.123268147046361e-06, 
#               'max_depth': 9, 
#               'min_child_weight': 0.013808413151732068, 
#               'subsample': 0.7592063908271939, 
#               'colsample_bytree': 0.9973844113500343, 
#               'lambda': 0.7635104056535492, 
#               'alpha': 0.003288646693435787, 
#               'n_estimators': 145
#}
#
#xgb2_params = {'learning_rate': 0.06422803243149475, 
#               'gamma': 5.697796981947303e-06, 
#               'max_depth': 6, 
#               'min_child_weight': 0.2799856230438902, 
#               'subsample': 0.8851659101267259, 
#               'colsample_bytree': 0.7950091586366488, 
#               'lambda': 3.297580150758358e-08, 
#               'alpha': 0.0006447700961887292, 
#               'n_estimators': 247}
#
#
#
#xgb3_params = {'learning_rate': 0.07341117582508434, 
#               'gamma': 1.0318702404206988e-08, 
#               'max_depth': 6, 
#               'min_child_weight': 0.35050528655022484, 
#               'subsample': 0.8294593215626396, 
#               'colsample_bytree': 0.7897258282829348, 
#               'lambda': 0.37538515044563997, 
#               'alpha': 7.289917167033083e-08, 
#               'n_estimators': 449
#}
#
#
#cat_params = {'learning_rate': 0.05815213279739086, 
#              'depth': 6, 
#              'rsm': 0.9389579011785203, 
#              'l2_leaf_reg': 3.0082696969903538e-05, 
#              'iterations': 438
#}
#
#cat2_params = {'learning_rate': 0.04408315945559109, 
#               'depth': 8, 'subsample': 0.9944334079423667, 
#               'l2_leaf_reg': 0.1206597145243022, 
#               'iterations': 494, 
#               'colsample_bylevel': 0.7253429260689228, 
#               'random_strength': 0.4576768401006938, 
#               'grow_policy': 'Lossguide', 
#               'boosting_type': 'Plain', 
#               'bootstrap_type': 'Bernoulli'
#}
#
#cat3_params = {'learning_rate': 0.060498740743563, 
#               'depth': 6, 
#               'l2_leaf_reg': 0.004612901318157922, 
#               'iterations': 367, 
#               'random_strength': 0.013030409815497926, 
#               'grow_policy': 'Lossguide', 
#               'boosting_type': 'Plain', 
#               'bootstrap_type': 'Bernoulli'}




#83.16
#xgb_params = {'learning_rate': 0.0631194343285199, 'gamma': 3.576671526661163e-08, 'max_depth': 8, 'min_child_weight': 5.508019147953081, 'subsample': 0.999660324296625, 'colsample_bytree': 0.40419310299792643, 'lambda': 4.543515537896746e-05, 'alpha': 0.0001932153352909221, 'n_estimators': 433}


# Using min max scaler
# 0.832...
xgb_minmax_params = {'learning_rate': 0.04808572634755236, 'gamma': 9.560327584505747e-05, 'max_depth': 6, 'min_child_weight': 0.11743307971084631, 'subsample': 0.7204079538816981, 'colsample_bytree': 0.4523296155877883, 'lambda': 0.5492675245323843, 'alpha': 8.819969375852153e-08, 'n_estimators': 435}


#0.831098
xgb2_minmax_params = {'learning_rate': 0.15267345813242902, 'gamma': 0.0002202210602923774, 'max_depth': 3, 'min_child_weight': 0.0032093132626038303, 'subsample': 0.7789317921899368, 'colsample_bytree': 0.4944876602452024, 'lambda': 4.088956032379317e-05, 'alpha': 1.264701759175353e-07, 'n_estimators': 369}

#0.831647
xgb3_minmax_params = {'learning_rate': 0.020498556354593255, 'gamma': 4.0117054063305194e-06, 'max_depth': 9, 'min_child_weight': 9.097349005570713, 'subsample': 0.814672362304106, 'colsample_bytree': 0.5074813732573049, 'lambda': 8.132240585431906e-06, 'alpha': 3.493464282610354e-07, 'n_estimators': 479}


lgbm_minmax_params = {'learning_rate': 0.12548315204042748, 'gamma': 4.578734755794731e-06, 'max_depth': 11, 'min_child_weight': 0.006074836332063258, 'subsample': 0.542429661988086, 'colsample_bytree': 0.8889283657514138, 'lambda': 0.0774449841007967, 'alpha': 0.006148549497097544, 'scale_pos_weight': 6.7529237049752915, 'n_estimators': 482, 'boosting_type': 'dart'}

#0.83190
lgbm2_minmax_params = {'learning_rate': 0.07311929223191145, 'gamma': 0.0001465824143858157, 'max_depth': 5, 'min_child_weight': 1.1101772255986377, 'subsample': 0.7717416516622793, 'colsample_bytree': 0.5521824473706827, 'lambda': 1.204402096691422e-08, 'alpha': 6.446090081481055e-07, 'scale_pos_weight': 3.2424584929164473, 'n_estimators': 494, 'boosting_type': 'gbdt'}

#0.83172
lgbm3_minmax_params = {'learning_rate': 0.05305666675975752, 'gamma': 3.737093538969974e-05, 'max_depth': 10, 'min_child_weight': 0.015065986984559972, 'subsample': 0.4107383480784791, 'colsample_bytree': 0.4754708058469771, 'lambda': 1.1778913998726071e-09, 'alpha': 0.00046716162072521424, 'scale_pos_weight': 7.601656443319444, 'n_estimators': 798, 'boosting_type': 'gbdt'}


cat_minmax_params = {'learning_rate': 0.0822257813369298, 'depth': 6, 'l2_leaf_reg': 1.5125953816160678, 'iterations': 378, 'random_strength': 0.08419072361237345, 'grow_policy': 'Lossguide', 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli'}

In [None]:
def xgb_objective(trial):
    params = {
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 6e-1, log = True),  # Learning rate
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log = True),  # Minimum loss reduction
        'max_depth': trial.suggest_int('max_depth', 2, 12),  # Maximum depth of a tree
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log = True),  # Minimum sum of instance weight
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),  # Subsample ratio of training instances
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),  # Subsample ratio of columns
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log = True),  # L2 regularization term
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),  # L1 regularization term
        'n_estimators': trial.suggest_int('n_estimators', 40, 700)  # Number of boosting rounds
    }

    np.random.seed(30)
    n_bags = 1
    seeds = np.random.randint(0, 19937, size=n_bags)
    cv_results = []

    for seed in seeds:
        skfold = StratifiedKFold(n_splits=6, shuffle=True, random_state = seed)
    
        for train_ids, valid_ids in skfold.split(X, y):
            
            X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
            X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

            X_train = column_transformers.fit_transform(X_train)
            X_valid = column_transformers.transform(X_valid)
        
            model = XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss', early_stopping_rounds=50)
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose = 500)
        
            y_val_pred = model.predict(X_valid)
            accuracy = accuracy_score(y_valid, y_val_pred)
            cv_results.append(accuracy)
    
    final_score = np.min([np.mean(cv_results), np.median(cv_results)])
    return final_score

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(xgb_objective, n_trials = 50, n_jobs= -1)

print("Best hyperparameters: ", study.best_params)

In [None]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(xgb_objective, n_trials = 50, n_jobs= -1)

print("Best hyperparameters: ", study.best_params)

In [None]:
def cat_objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 6e-1, log = True),  # Learning rate
        #'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log = True),  # Minimum loss reduction
        'depth': trial.suggest_int('depth', 3, 12),  # Maximum depth of a tree
        #'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log = True),  # Minimum sum of instance weight
        #'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Only allowed for bernoulli bootstrap type
        #'rsm': trial.suggest_float('rsm', 0.5, 1.0),  # Subsample ratio of columns
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 5, log = True),  # L2 regularization term
        #'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log = True),  # L1 regularization term
        #'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),  # Balance of positive and negative weights
        'iterations': trial.suggest_int('iterations', 40, 500),  # Number of boosting rounds
        #'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-2, 1, log = True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["Lossguide", "SymmetricTree"]),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Plain"]), # Much faster than ordered
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bernoulli"]),

    }
    # Cross-validation with StratifiedKFold
    np.random.seed(42)
    n_bags = 1
    seeds = np.random.randint(0, 19937, size=n_bags)
    print(seeds)
    cv_results = []

    for seed in seeds:
        skfold = StratifiedKFold(n_splits=6, shuffle=True, random_state = seed)
    
        for train_ids, valid_ids in skfold.split(X, y):
            
            X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
            X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

            X_train = column_transformers.fit_transform(X_train)
            X_valid = column_transformers.transform(X_valid)
        

            #model = instantiate_learner(trial)
            model = CatBoostClassifier(**params, eval_metric='MultiClass', early_stopping_rounds=50, classes_count = 3, leaf_estimation_iterations = 1, auto_class_weights = 'Balanced')
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose = 100)
        
            y_val_pred = model.predict(X_valid)
            accuracy = accuracy_score(y_valid, y_val_pred)
            cv_results.append(accuracy)
    
    #mean_accuracy = np.mean(cv_results)
    final_score = np.min([np.mean(cv_results), np.median(cv_results)])
    return final_score

In [None]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(cat_objective, n_trials = 20, n_jobs= -1)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

In [None]:
vis.plot_optimization_history(study).show()

In [None]:
vis.plot_parallel_coordinate(study).show()

# Evaluating our Tuned models

In [None]:
skfold = StratifiedKFold(n_splits= 5, shuffle=True, random_state= 42) 

In [None]:
def cross_validate_score(model, data, folds = 5, target='Target'): #include_original=True):
    X = data.drop(columns = target)
    y = data[target]
    
    skfold = StratifiedKFold(n_splits= folds, shuffle=True, random_state= 42) 

    # Initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X), 3))  # Adjust for class probabilities
    train_scores, val_scores = [], []
    
    # Training model and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
        # Define train set
        X_train, y_train = X.iloc[train_idx].reset_index(drop=True), y.iloc[train_idx].reset_index(drop=True)
        
        # Define validation set
        X_val, y_val = X.iloc[val_idx].reset_index(drop=True), y.iloc[val_idx].reset_index(drop=True)

        
        X_train = column_transformers.fit_transform(X_train)
        X_val = column_transformers.transform(X_val)


        #if include_original:
        #    X_train = pd.concat([pipe_original.drop(label, axis=1), X_train]).reset_index(drop=True)
        #    y_train = pd.concat([pipe_original[label], y_train]).reset_index(drop=True)
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        train_preds = model.predict(X_train)
        val_preds_proba = model.predict_proba(X_val)  # Get class probabilities
        
        # Store validation predictions
        val_predictions[val_idx] = val_preds_proba
        
        # Evaluate model for a fold
        val_preds = np.argmax(val_preds_proba, axis=1)  # Get predicted classes for accuracy score
        train_score = accuracy_score(y_train, train_preds)
        val_score = accuracy_score(y_val, val_preds)
        
        print(f'Fold {fold}: {val_score:.5f}')
        
        # Append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    
    # Refit the model on the entire dataset, including the original data
    #if include_original:
    #    X_full = pd.concat([pipe_original.drop(label, axis=1), X]).reset_index(drop=True)
    #    y_full = pd.concat([pipe_original[label], y]).reset_index(drop=True)

    X_full = column_transformers.fit_transform(X)
    y_full = y


    
    model.fit(X_full, y_full)
    
    test_data = column_transformers.transform(test)

    # Make final predictions on the test set
    test_predictions_proba = model.predict_proba(test_data)  # Get class probabilities
    
    print(f'Val Score: {np.mean(val_scores):.7f} ± {np.std(val_scores):.7f} | Train Score: {np.mean(train_scores):.7f} ± {np.std(train_scores):.7f} | {target}')
    
    return val_scores, val_predictions, test_predictions_proba

In [None]:
X 

In [None]:
test

In [None]:
# Initialize our dataframes
cv_summary, oof_predictions_df, submission_predictions_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

In [None]:
xgb_tuned = XGBClassifier(**xgb_params, random_state= 42)
xgb2_tuned = XGBClassifier(**xgb2_params, random_state= 42)
xgb3_tuned = XGBClassifier(**xgb3_params, random_state= 42)

lgbm_tuned = LGBMClassifier(**lgbm_params, random_state= 42, n_jobs = -1)
lgbm2_tuned = LGBMClassifier(**lgbm2_params, random_state= 42, n_jobs = -1)

#cat_tuned = CatBoostClassifier(**cat_params, random_state= 42, classes_count = 3, leaf_estimation_iterations = 1, auto_class_weights = 'Balanced')
#cat2_tuned = CatBoostClassifier(**cat2_params, random_state= 42, classes_count = 3, leaf_estimation_iterations = 1, auto_class_weights = 'Balanced')
#cat3_tuned = CatBoostClassifier(**cat3_params, random_state= 42, classes_count = 3, leaf_estimation_iterations = 1, auto_class_weights = 'Balanced')



cv_summary['xgb'], oof_predictions_df[['xgb_0', 'xgb_1', 'xgb_2']], submission_predictions_df[['xgb_0', 'xgb_1', 'xgb_2']] = cross_validate_score(xgb_tuned, train, 5, 'Target')
cv_summary['xgb2'], oof_predictions_df[['xgb2_0', 'xgb2_1', 'xgb2_2']], submission_predictions_df[['xgb2_0', 'xgb2_1', 'xgb2_2']] = cross_validate_score(xgb2_tuned, train, 5, 'Target')
cv_summary['xgb3'], oof_predictions_df[['xgb3_0', 'xgb3_1', 'xgb3_2']], submission_predictions_df[['xgb3_0', 'xgb3_1', 'xgb3_2']] = cross_validate_score(xgb3_tuned, train, 5, 'Target')

cv_summary['lgbm'], oof_predictions_df[['lgbm_0', 'lgbm_1', 'lgbm_2']], submission_predictions_df[['lgbm_0', 'lgbm_1', 'lgbm_2']] = cross_validate_score(lgbm_tuned, train, 5, 'Target')
cv_summary['lgbm2'], oof_predictions_df[['lgbm2_0', 'lgbm2_1', 'lgbm2_2']], submission_predictions_df[['lgbm2_0', 'lgbm2_1', 'lgbm2_2']] = cross_validate_score(lgbm2_tuned, train, 5, 'Target')

#cv_summary['cat'], oof_predictions_df[['cat_0', 'cat_1', 'cat_2']], submission_predictions_df[['cat_0', 'cat_1', 'cat_2']] = cross_validate_score(cat_tuned, train, 5, 'Target')
#cv_summary['cat2'], oof_predictions_df[['cat2_0', 'cat2_1', 'cat2_2']], submission_predictions_df[['cat2_0', 'cat2_1', 'cat2_2']] = cross_validate_score(cat2_tuned, train, 5, 'Target')
#cv_summary['cat3'], oof_predictions_df[['cat3_0', 'cat3_1', 'cat3_2']], submission_predictions_df[['cat3_0', 'cat3_1', 'cat3_2']] = cross_validate_score(cat3_tuned, train, 5, 'Target')

In [None]:
submission_predictions_df

In [None]:
transposed_df = cv_summary.transpose()
transposed_df.columns = ['fold1','fold2','fold3','fold4','fold5']
transposed_df['Mean'] = transposed_df.mean(axis=1)
transposed_df['Std'] = transposed_df.std(axis=1)
transposed_df.sort_values(by = 'Mean', ascending=False)

In [None]:
transposed_df.to_csv('results_df.csv')

In [None]:
# Model diversity check

sns.set_theme(font_scale=1.1)
correlation_train = oof_predictions_df.corr()
mask = np.triu(correlation_train.corr())
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_train,
            annot=True,
            fmt='.3f',
            cmap='coolwarm',
            square=True,
            mask=mask,
            linewidths=1,
            cbar=False);

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def objective(trial):

    # L1 regularization weight.
    n_estimators = trial.suggest_int("n_estimators", 10, 200)

    alpha1 = trial.suggest_float("alpha", 1e-8, 1.0, log=True)
    # sampling ratio for training data.
    subsample1 = trial.suggest_float("subsample", 0.2, 0.9)
    # sampling according to each tree.
    colsample_bytree1 = trial.suggest_float("colsample_bytree", 0.2, 0.9)
    # maximum depth of the tree, signifies complexity of the tree.
    max_depth1 = trial.suggest_int("max_depth", 2, 8)
    # minimum child weight, larger the term more conservative the tree.
    min_child_weight1 = trial.suggest_int("min_child_weight", 1, 10)
    # learning rate
    learning_rate1 =  trial.suggest_float("learning_rate", 1e-3, 0.5, log=True)
    # defines how selective algorithm is.
    gamma1 = trial.suggest_float("gamma", 1e-8, 1.0, log=True)


    xgb_regressor = XGBClassifier(n_estimators = n_estimators, alpha=alpha1,subsample=subsample1,colsample_bytree=colsample_bytree1,
                           max_depth=max_depth1,min_child_weight =min_child_weight1,learning_rate=learning_rate1,gamma=gamma1,
                           random_state = 42)
    

    
#    ss = cv
    score = cross_val_score(xgb_regressor, oof_predictions_df, train['Target'], scoring= 'accuracy',  cv=skfold)
    #score = min(score.mean(), score.median())
    score = score.mean()
    return score

study = optuna.create_study(direction="maximize", sampler = TPESampler(seed=42))
study.optimize(objective, n_trials=150)

print("Best hyperparameters: ", study.best_params)

In [None]:
vis.plot_parallel_coordinate(study).show() 
#vis.plot_optimization_history(study).show()

In [None]:
meta_model_params = {'n_estimators': 72, 
                     'alpha': 0.2439166256290594, 
                     'subsample': 0.7004885619418898, 
                     'colsample_bytree': 0.4962219686084267, 
                     'max_depth': 3, 
                     'min_child_weight': 8, 
                     'learning_rate': 0.024205698506541935, 
                     'gamma': 0.05069193639944559}



meta_model_params2 = {'n_estimators': 64, 
                      'alpha': 8.893508024579523e-05, 
                      'subsample': 0.42838999154433505, 
                      'colsample_bytree': 0.5997680464656635, 
                      'max_depth': 3, 
                      'min_child_weight': 1, 
                      'learning_rate': 0.022483059484740258, 
                      'gamma': 2.981326645695844e-06}



meta_model_params3 = {'n_estimators': 84, 
                      'alpha': 9.437057473670953e-06, 
                      'subsample': 0.596306741974016, 
                      'colsample_bytree': 0.6112978036333462, 
                      'max_depth': 5, 
                      'min_child_weight': 8, 
                      'learning_rate': 0.00399970773887522, 
                      'gamma': 0.0005857095911477838}

In [None]:
from xgboost import XGBClassifier
meta_model = XGBClassifier(**meta_model_params3, random_state= 42, objective = 'multi:softmax')
meta_model.fit(oof_predictions_df, train['Target'])

In [None]:
oof_predictions_df.head(50)

#0.938479 + 0.004268 + 0.057253

In [None]:
submission_predictions_df.head(50)

In [None]:
preds_test =  meta_model.predict(submission_predictions_df)
submission = pd.DataFrame({'id': test.index,
                       'Target': preds_test})

submission['Target'].value_counts()

In [None]:
target_dict = {
    2: 'Enrolled',
    0: 'Dropout',
    1: 'Graduate'
}
# Replace the values in the "Target" column
submission['Target'] = submission['Target'].replace(target_dict)
submission.to_csv('academic-success-predictions_ensemble2.csv', index = False)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve,auc

In [None]:
# Confusion matrix
conf_matrix = confusion_matrix(test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification report
class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print('Classification Report:')
print(class_report)

# Results
lgbm1 + xgb3

In [None]:
oof_predictions_df

In [None]:
initial_preprocess = make_pipeline(
        make_column_transformer(
        (
            StandardScaler(),
            no_transform_cols.to_list(),  # Columns not requiring transformation
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.square, feature_names_out="one-to-one"),#, validate=False),
                StandardScaler(),
            ),
            square_transform_cols.to_list(),  # Columns to be squared and then scaled
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols.to_list(),  # Columns to be transformed using Yeo-Johnson
        ),
        #(
        #    OneHotEncoder(sparse_output = False, handle_unknown = "use_encoded_value", unknown_value=-1),
        #    ONE_HOT_COLUMNS,  # Columns to be one-hot encoded
        #),
        remainder="passthrough",  # Pass through columns not specified above
        verbose_feature_names_out=False,
    ),
)

In [None]:
submission_predictions_df

In [None]:
numeric_data = train.select_dtypes("number")
numeric_cols = numeric_data.drop("Target", axis=1).columns.tolist()
n_cols = 5
n_rows, axes = get_n_rows_axes(len(numeric_cols))


numeric_cols = [col for col in numeric_cols if col not in binary_columns]

In [None]:
positive_features = list(train[numeric_cols].describe().T.query("min > 0").index)
zero_features = list(train[numeric_cols].describe().T.query("min == 0").index)
negative_features = list(train[numeric_cols].describe().T.query("min < 0").index)

In [None]:
r2_scores = defaultdict(tuple)

for feature in numeric_cols:
    orig = train[feature].dropna()
    if feature in positive_features:
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(np.log1p(orig), rvalue=True)
        _, (*_, R_square) = probplot(np.square(orig), rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)
    elif feature in zero_features:
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(np.log1p(orig), rvalue=True)
        _, (*_, R_square) = probplot(np.square(orig), rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)
    elif feature == 'GDP':
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(orig, rvalue=True)
        _, (*_, R_square) = probplot(orig, rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)
    elif feature == 'Inflation Rate':
        _, (*_, R_orig) = probplot(orig, rvalue=True)
        _, (*_, R_log1p) = probplot(orig, rvalue=True)
        _, (*_, R_square) = probplot(orig, rvalue=True)
        _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)

    r2_scores[feature] = (
        R_orig * R_orig,
        R_log1p * R_log1p,
        R_square * R_square,
        R_yeojohn * R_yeojohn
    )

r2_scores = pd.DataFrame(
    r2_scores, index=("Original", "Log1p", "Square", "YeoJohnson")
).T

r2_scores["HighestScore"] = r2_scores[["Original", "Log1p", "Square", "YeoJohnson"]].max(axis = 1)
r2_scores["Winner"] = r2_scores.idxmax(axis=1)


def highlight_max(s):
    is_max = s == s.max()
    return [f'background-color: {TEAL}' if v else '' for v in is_max]

r2_scores['Improvement'] = r2_scores['HighestScore'] - r2_scores['Original']
r2_scores.style.set_table_styles(DF_STYLE).apply(highlight_max, subset= ["Original", "Log1p", "Square", "YeoJohnson"], axis=1).background_gradient(cmap = DF_CMAP2, subset = 'Improvement').format(precision = 3)

In [None]:
no_transform_cols = r2_scores.query("Winner == 'Original'").index
log1p_transform_cols = r2_scores.query("Winner == 'Log1p'").index
square_transform_cols = r2_scores.query("Winner == 'Square'").index
yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson'").index

In [None]:
pd.set_option('display.max_columns', 500)
train.describe().loc['max'].head(50)

In [None]:
initial_preprocess = make_pipeline(
    make_column_transformer(
        (
            StandardScaler(),
            no_transform_cols,
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.log1p, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log1p_transform_cols,
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.square, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            square_transform_cols,
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols,
        ),
#        (
#            make_pipeline(
#                SimpleImputer(strategy="most_frequent"),
#                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
#            ),
#            make_column_selector(dtype_include=object),  # type: ignore
        remainder="passthrough",
        verbose_feature_names_out=False,
        ),
    )

In [None]:
X = train.drop(target, axis=1)
y = train[target]

In [None]:
X.shape[1]

In [None]:
n_bags = 10
n_folds = 5
np.random.seed(42)
seeds = np.random.randint(0, 19937, size=n_bags)

classifiers = ['rf', 'cat', 'lgbm', 'xgb']
forest_info_average = np.zeros(X.shape[1]) #number of features
cat_info_average = np.zeros(X.shape[1])
lgbm_info_average = np.zeros(X.shape[1])
xgb_info_average = np.zeros(X.shape[1])


for selected_classifier in classifiers:
    for seed in seeds:
        skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

        for train_ids, valid_ids in skfold.split(X, y):
            X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
            X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

            X_train = initial_preprocess.fit_transform(X_train)
            X_valid = initial_preprocess.transform(X_valid)


        if selected_classifier == 'rf':
            classifier = RandomForestClassifier(class_weight="balanced", random_state = seed, n_jobs = -1, max_features = 1)
            classifier.fit(X_train, y_train)
            print(f"forest Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            forest_info = classifier.feature_importances_
            forest_info = forest_info / forest_info.sum()
            forest_info_average += forest_info


        elif selected_classifier == 'cat':
            classifier = CatBoostClassifier(random_state = seed, auto_class_weights= "Balanced", classes_count= 3, thread_count= -1, rsm = 1)
            classifier.fit(X_train, y_train)
            print(f"cat Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            cat_info = classifier.feature_importances_
            cat_info = cat_info / cat_info.sum()
            cat_info_average += cat_info


        elif selected_classifier == 'lgbm':
            classifier = LGBMClassifier(random_state = seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', max_depth = 15, n_estimators= 200, verbose = 10, colsample_bytree= 1, reg_alpha = 1, reg_lambda = 1)
            classifier.fit(X_train, y_train)
            print(f"lgbm Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            lgbm_info = classifier.feature_importances_
            lgbm_info = lgbm_info / lgbm_info.sum()
            lgbm_info_average += lgbm_info


        elif selected_classifier == 'xgb':
            classifier = XGBClassifier(random_state=seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', max_depth = 5, n_estimators= 200, reg_alpha = 2, colsample_bytree = 0.5)
            classifier.fit(X_train, y_train)
            print(f"xgb Accuracy: {accuracy_score(y_valid, classifier.predict(X_valid))}")
            xgb_info = classifier.feature_importances_
            xgb_info = xgb_info / xgb_info.sum()
            xgb_info_average += xgb_info

    forest_info_average = forest_info_average/n_bags/n_folds
    cat_info_average = cat_info_average/n_bags/n_folds
    lgbm_info_average = lgbm_info_average/n_bags/n_folds
    xgb_info_average = xgb_info_average/n_bags/n_folds
            




"""
forest_pipeline = make_pipeline(
    initial_preprocess,
    RandomForestClassifier(random_state= 42)
).fit(X_train, y_train)
print(f"forest Accuracy: {accuracy_score(y_test, forest_pipeline.predict(X_test))}")
forest_info = forest_pipeline[-1].feature_importances_
forest_info = forest_info / forest_info.sum()



cat_pipeline = make_pipeline(
    initial_preprocess,
    CatBoostClassifier(random_seed = 42, auto_class_weights= "Balanced", classes_count= 3, thread_count= -1, max_depth= 10, n_estimators= 100, verbose = 0)
).fit(X_train, y_train)
print(f"cat Accuracy: {accuracy_score(y_test, cat_pipeline.predict(X_valid))}")
cat_info = cat_pipeline[-1].feature_importances_
cat_info = cat_info / cat_info.sum()




lgbm_pipeline = make_pipeline(
    initial_preprocess,
    LGBMClassifier(random_state=42, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', max_depth = 10, n_estimators= 500, verbose = 1),
).fit(X_train,y_train)
print(f"lgbm Accuracy: {accuracy_score(y_test, lgbm_pipeline.predict(X_test))}")
lgbm_info = lgbm_pipeline[-1].feature_importances_
lgbm_info = lgbm_info / lgbm_info.sum()



xgb_pipeline = make_pipeline(
    initial_preprocess,
    XGBClassifier(random_state=42, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', max_depth = 10, n_estimators= 1000, reg_alpha = 1),
).fit(X_train,y_train)
print(f"xgb Accuracy: {accuracy_score(y_test, xgb_pipeline.predict(X_test))}")
xgb_info = xgb_pipeline[-1].feature_importances_
xgb_info = xgb_info / xgb_info.sum()


"""




#mutual_info = mutual_info_classif(
#    X=initial_preprocess.fit_transform(X), y=y, random_state=42
#)
#mutual_info = mutual_info / np.sum(mutual_info)



importances = pd.DataFrame(
    [forest_info, cat_info, lgbm_info, xgb_info],
    columns=initial_preprocess.get_feature_names_out(),
    index=["FOREST", "CAT","LGBM", "XGB"],
).T

In [None]:
importances_melted_frame = (
    importances.melt(
        var_name="Method",
        value_name="Importance",
        ignore_index=False,
    )
    .reset_index()
    .rename(columns={"index": "Feature"})
    .round(4)
)

fig = px.bar(
    importances_melted_frame,
    x="Importance",
    y="Feature",
    color="Importance",
    facet_col="Method",
    facet_col_spacing=0.07,
    height=2000,
    width=2000,
    color_continuous_scale=color_map,
    title="Normalised Feature Importances (Three Different Default Methods)",
)
fig.update_annotations(font_size=14)
fig.update_yaxes(
    matches=None,
    showticklabels=True,
    categoryorder="total ascending",
    tickfont_size=8,
)
fig.update_xaxes(matches=None)
fig.update_traces(width=0.7)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    coloraxis_colorbar=dict(
        orientation="h",
        title_side="bottom",
        yanchor="bottom",
        xanchor="center",
        title=None,
        y=-0.2,
        x=0.5,
    ),
)
fig.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from collections import defaultdict

n_bags = 1
n_folds = 5

np.random.seed(42)
seeds = np.random.randint(0, 19937, size=n_bags)

X = train.drop("Target", axis=1)
y = train['Target']



param_distributions = {
            "votingclassifier__lgbm__max_depth": [7,9,11],
            #"votingclassifier__lgbm__n_estimators": [50,100,150],
            #"votingclassifier__lgbm__reg_alpha": [0.01,0.1,1],
            #"lgbmclassifier__num_leaves": lgbm_params["num_leaves"],
            #"lgbmclassifier__n_estimators": lgbm_params["n_estimators"],
            #"lgbmclassifier__learning_rate": lgbm_params["learning_rate"]
            #"votingclassifier__xgb__max_depth": [7,9,11],
            #"votingclassifier__lgbm__n_estimators": [50,100,150],
            #"votingclassifier__xgb__reg_alpha": [0.01,0.1,1]
            #"xgb__classifier__n_estimators": xgb_params["n_estimators"],
            #"xgb__classifier__learning_rate": xgb_params["learning_rate"],
            #"votingclassifier__svc__C": [1],
        }


classifiers = defaultdict(object)
fold_accuracies = []
val_results_data = []#pd.DataFrame(columns=["Bag", "Fold", "Best Parameters", "Validation Accuracy"])
cv_results_data = []


for bag, seed in enumerate(seeds):
    skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    print(f"Beginning Bag {bag}")
    for fold, (train_ids, valid_ids) in enumerate(skfold.split(X, y)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]
        print(f"fold {fold}")
        current_ensemble = make_pipeline(
            initial_preprocess,  # Example preprocessing step (replace with your own)
            VotingClassifier(
                [   #("rf", RandomForestClassifier(class_weight="balanced", random_state = seed, n_jobs = -1, max_features = 1)),
                    ("lgbm", LGBMClassifier(random_state=seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass', reg_alpha= 1, reg_lambda= 1)),
                    ("xgb", XGBClassifier(random_state=seed, n_jobs = -1,verbose = 2, reg_alpha = 1, reg_lambda = 1)),
                    #("cat", CatBoostClassifier(random_state = seed, auto_class_weights= "Balanced", classes_count= 3, thread_count= -1, verbose = 10)),
                    #("svc", SVC(random_state=seed, class_weight = 'balanced'))
                ],  
                voting="hard",
                #weights=(0.25, 0.25, 0.5),
            ),
        )





        random_search = GridSearchCV(
            estimator=current_ensemble,
            #param_distributions=param_distributions,
            param_grid = param_distributions,
            #n_iter=4,  # Adjust as needed
            scoring="accuracy",
            cv=skfold,
            #random_state=seed,
            n_jobs=-1,
            return_train_score = True,
            verbose = 3
        )


        random_search.fit(X_train, y_train)
        cv_results = random_search.cv_results_
        cv_results_data.append(cv_results)


        best_estimator = random_search.best_estimator_
        classifiers[f"Voting Bag: {bag} Fold: {fold}"] = best_estimator





        # Evaluate on validation set
        y_pred_valid = best_estimator.predict(X_valid)
        fold_accuracy = accuracy_score(y_valid, y_pred_valid)
        fold_accuracies.append(fold_accuracy)
        print(f"Bag: {bag}, Fold: {fold} - Validation Accuracy: {fold_accuracy:.4f}")



        results = [bag, fold, random_search.best_params_, fold_accuracy]
        val_results_data.append(results)

        #results_df = results_df.append({
        #    "Bag": bag,
        #    "Fold": fold,
        #    "Best Parameters": random_search.best_params_,
        #    "Validation Accuracy": fold_accuracy
        #}, ignore_index=True)


X_test = test
y_pred_test = best_estimator.predict(X_test)
#test_accuracy = accuracy_score(y_test, y_pred_test)
#print(f"Accuracy on test set: {test_accuracy:.4f}")

# Optionally, you can also compute mean accuracy across all folds
mean_accuracy = np.mean(fold_accuracies)
print(f"Mean Cross-Validation Accuracy: {mean_accuracy:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from collections import defaultdict

from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

n_bags = 1
n_folds = 5

np.random.seed(42)
seeds = np.random.randint(0, 19937, size=n_bags)

X = train.drop("Target", axis=1)
y = train['Target']



param_distributions = {
            "votingclassifier__lgbm__max_depth": [11],
#            "votingclassifier__lgbm__n_estimators": [30,50,70],
#            "votingclassifier__lgbm__reg_alpha": [0.1,1,2],
#            "votingclassifier__xgb__reg_lambda": [0.1,1,2],
#            "lgbmclassifier__num_leaves": lgbm_params["num_leaves"],
#            "votingclassifier__lgbm__learning_rate": [0.01, 0.1]
#
            #"votingclassifier__xgb__max_depth": [2,3,4,5,6],
#            "votingclassifier__xgb__n_estimators": [50,100],
#            "votingclassifier__xgb__reg_alpha": [0.5,1,2],
#            "votingclassifier__xgb__reg_lambda": [0.5,1,2],

#            "votingclassifier__svc__C": [0.,1,1.5],
        }


classifiers = defaultdict(object)
fold_accuracies = []
val_results_data = []#pd.DataFrame(columns=["Bag", "Fold", "Best Parameters", "Validation Accuracy"])



for bag, seed in enumerate(seeds):
    skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    print(f"Beginning Bag {bag}")





    #for fold, (train_ids, valid_ids) in enumerate(skfold.split(X, y)):
      #  X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
     #   X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]
      #  print(f"fold {fold}")


    current_ensemble = make_pipeline(
        initial_preprocess,  # Example preprocessing step (replace with your own)
        VotingClassifier(
            [   #("rf", RandomForestClassifier(class_weight="balanced", random_state = seed, n_jobs = -1, max_features = 1)),
                ("lgbm", LGBMClassifier(random_state=seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass',reg_alpha= 1, reg_lambda= 1)),
                ("xgb", XGBClassifier(random_state=seed, n_jobs = -1, verbose = 2,reg_alpha= 1, reg_lambda= 1)),
                #("cat", CatBoostClassifier(random_state = seed, auto_class_weights= "Balanced", classes_count= 3, thread_count= -1, verbose = 10)),
                #("svc", SVC(random_state=seed, class_weight = 'balanced'))
            ],  
            voting="hard",
            #weights=(0.25, 0.25, 0.5),
        ),
    )





    random_search = HalvingGridSearchCV(
        estimator=current_ensemble,
        #param_distributions=param_distributions,
        param_grid = param_distributions,
        #n_iter=4,  # Adjust as needed
        scoring="accuracy",
        cv=skfold,
        #random_state=seed,
        n_jobs=-1,
        return_train_score = True,
        verbose = 3
    )


    random_search.fit(X, y)
    cv_results = random_search.cv_results_
    cv_results_data.append(cv_results)


    best_estimator = random_search.best_estimator_
    #classifiers[f"Voting Bag: {bag} Fold: {fold}"] = best_estimator
    classifiers[f"Bag: {bag}"] = best_estimator




        # Evaluate on validation set
        #y_pred_valid = best_estimator.predict(X_valid)
        #fold_accuracy = accuracy_score(y_valid, y_pred_valid)
        #fold_accuracies.append(fold_accuracy)
        #print(f"Bag: {bag}, Fold: {fold} - Validation Accuracy: {fold_accuracy:.4f}")



        #results = [bag, fold, random_search.best_params_, fold_accuracy]
        #val_results_data.append(results)

        #results_df = results_df.append({
        #    "Bag": bag,
        #    "Fold": fold,
        #    "Best Parameters": random_search.best_params_,
        #    "Validation Accuracy": fold_accuracy
        #}, ignore_index=True)


X_test = test
y_pred_test = best_estimator.predict(X_test)
#test_accuracy = accuracy_score(y_test, y_pred_test)
#print(f"Accuracy on test set: {test_accuracy:.4f}")

# Optionally, you can also compute mean accuracy across all folds
#mean_accuracy = np.mean(fold_accuracies)
#print(f"Mean Cross-Validation Accuracy: {mean_accuracy:.4f}")

In [None]:
cv_results_dataframe = pd.DataFrame(cv_results)
cv_results_dataframe.sort_values(by = 'mean_test_score', ascending = False).head(3)

In [None]:
cv_results_dataframe.sort_values(by = 'mean_test_score', ascending = False).params[0]

In [None]:
#pd.set_option('display.max_colwidth', 500)
#results_dataframe = pd.DataFrame(val_results_data).rename(columns = {0:'bag', 1: 'fold', 2:'best_parameters', 3:'validation_accuracy'})
#results_dataframe

In [None]:
lgbm_params = {
  #  "max_depth": 14,
 #   "num_leaves": 9,
 #   "min_child_samples": 17,
 #   "n_estimators": 200,
 #   "learning_rate": 0.1,
 #   "colsample_bytree": 0.4,
 #   "min_split_gain": 1e-4,
 #   "reg_alpha": 1e-2,
 #   "reg_lambda": 5e-3,
}

xgb_params = {
    "max_depth": 3,
 #   "n_estimators": 200,
 #   "learning_rate": 0.4,
 #   "subsample": 0.6,
 #   "min_child_weight": 0.1,
 #   "max_delta_step": 0.35,
 #   "colsample_bytree": 0.3,
 #   "colsample_bylevel": 0.7,
 #   "min_split_loss": 1e-4,
 #   "reg_alpha": 2e-3,
 #   "reg_lambda": 6e-2,
}

In [None]:








final_ensemble = make_pipeline(
            initial_preprocess,  # Example preprocessing step (replace with your own)
            VotingClassifier(
                [   ("rf", RandomForestClassifier(class_weight="balanced", random_state = seed, n_jobs = -1)),
                    ("lgbm", LGBMClassifier(random_state=seed, class_weight = 'balanced', n_jobs = -1, objective = 'multiclass')),
                    ("xgb", XGBClassifier(random_state=seed, n_jobs = -1, verbose = 2, **xgb_params)),

                ],  
                voting="hard",
                #weights=(0.25, 0.25, 0.5),
            ),
        )

final_ensemble.fit(X, y)

predictions = final_ensemble.predict(test)

In [None]:
test_ids = test.index

submission_data = pd.DataFrame({"id": test_ids,
              "Target": predictions}).set_index('id')


target_dict = {
    2: 'Enrolled',
    0: 'Dropout',
    1: 'Graduate'
}


# Replace the values in the "Target" column
submission_data['Target'] = submission_data['Target'].replace(target_dict)

submission_data.to_csv('academic-success-predictions3.csv')