## Import Packages

In [None]:
import math
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest, StackingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy.stats import pointbiserialr, chi2_contingency, uniform, randint
from itertools import combinations

from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.combine import SMOTEENN

## Pipeline Classes

In [None]:
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    col_to_drop : list-like of column labels
        Columns removed by ``transform``.
    """

    def __init__(self, col_to_drop):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() / repr() can find it.
        self.col_to_drop = col_to_drop

    @property
    def columns_to_drop(self):
        """Backward-compatible alias for ``col_to_drop``."""
        return self.col_to_drop

    @columns_to_drop.setter
    def columns_to_drop(self, value):
        self.col_to_drop = value

    def fit(self, X, y= None):
        """Nothing is learned; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return the result of dropping the configured columns from ``X``."""
        return X.drop(columns= self.col_to_drop)

class OneHotEncoding(BaseEstimator, TransformerMixin):
    """Replace categorical DataFrame columns with numeric encodings.

    Two modes, selected by ``binary``:

    * ``binary=False`` (default): each column is replaced, in place, by one
      indicator column per non-missing category seen during ``fit``, named
      ``"<column>_<category>"``. Missing values get no indicator of their own,
      so a missing row is all zeros across that column's indicators. A column
      with exactly two categories and no missing values is reduced to a single
      indicator (for the second category in sorted order).
    * ``binary=True``: each column must hold one or two distinct, non-missing
      values, which are mapped to ``0`` / ``1`` in sorted order.

    Parameters
    ----------
    columns : iterable of column labels
        Columns to encode.
    binary : bool, default False
        Selects the encoding mode described above.

    Attributes
    ----------
    encoders : dict
        Per column: the fitted ``OneHotEncoder`` (one-hot mode) or the
        value -> code mapping (binary mode).
    new_column_names : dict
        Per column (one-hot mode): names of the indicator columns emitted.
    kept_indices : dict
        Per column (one-hot mode): positions, in the encoder's output, of the
        indicator columns that are emitted.
    """

    def __init__(self, columns, binary= False):
        self.columns = columns
        self.binary = binary
        self.encoders = {}
        self.new_column_names = {}
        self.kept_indices = {}

    def fit(self, X, y= None):
        """Learn the categories of every configured column from ``X``.

        Raises
        ------
        ValueError
            In binary mode, if a column contains NaN or does not have one or
            two distinct values.
        """
        # Start clean so refitting never mixes in state from an earlier fit.
        self.encoders = {}
        self.new_column_names = {}
        self.kept_indices = {}

        for column in self.columns:
            if self.binary:
                categories = X[column].drop_duplicates().tolist()
                # Validate before sorting: sorting strings together with NaN
                # would raise a TypeError and hide the intended message.
                self.binary_check(column, categories)
                self.encoders[column] = {value: code for code, value in enumerate(sorted(categories))}
                continue

            encoder = OneHotEncoder(dtype= np.int64)
            encoder.fit(X[[column]])
            categories = list(encoder.categories_[0])

            # Output positions are taken from the fitted encoder itself, so the
            # generated names can never drift out of step with the matrix.
            kept = [position for position, value in enumerate(categories) if not pd.isna(value)]
            if len(categories) == 2 and len(kept) == 2:
                # Truly binary feature: one indicator carries all the information.
                kept = kept[1:]

            self.encoders[column] = encoder
            self.kept_indices[column] = kept
            self.new_column_names[column] = [f"{column}_{categories[position]}" for position in kept]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every configured column encoded.

        In one-hot mode the indicator columns take the place of the original
        column. In binary mode values not seen during ``fit`` become NaN
        (the result of ``Series.map`` with the fitted mapping).
        """
        X_transformed = X.copy()
        for column in self.columns:
            if self.binary:
                X_transformed[column] = X_transformed[column].map(self.encoders[column])
                continue

            ohe_matrix = self.encoders[column].transform(X_transformed[[column]]).toarray()
            start = X_transformed.columns.get_loc(column)
            targets = zip(self.kept_indices[column], self.new_column_names[column])
            for offset, (position, new_col) in enumerate(targets):
                X_transformed.insert(start + offset, new_col, ohe_matrix[:, position])
            X_transformed.drop(columns= [column], inplace= True)
        return X_transformed

    def binary_check(self, column, categories):
        """Raise ``ValueError`` unless ``categories`` is 1-2 non-NaN values."""
        if self.has_nan(categories):
            raise ValueError(f"Can't perform binary encoding to column {column} because there are NaN values.")
        if len(categories) > 2 or len(categories) == 0:
            raise ValueError(f"Can't perform binary encoding to column {column} because the number of categories to binary encode is wrong.")

    def has_nan(self, value_list):
        """Return True if any float in ``value_list`` is NaN."""
        return any(math.isnan(x) for x in value_list if isinstance(x, float))

class TargetMeanEncoding(BaseEstimator, TransformerMixin):
    """Replace categories with a smoothed mean of the target.

    For a category with ``n`` rows and target mean ``mu`` the encoding is
    ``(mu * n + global_mean * m) / (n + m)``, i.e. the category mean shrunk
    towards the global mean with strength ``m``.

    Parameters
    ----------
    columns : iterable of column labels
        Columns to encode.
    y : pandas.Series
        Target values. Rows are matched to ``X`` by index label.
    m : number, default 10
        Smoothing strength (weight given to the global mean).

    Attributes
    ----------
    encoding_dict : dict
        Per column: mapping category -> encoded value, learned in ``fit``.
    global_mean_ : float
        Target mean over the fitted rows; used as the prior and as the fill
        value for missing / unseen categories.
    """

    def __init__(self, columns, y, m= 10):
        self.columns = columns
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() / repr() can find it.
        self.y = y
        self.m = m
        self.encoding_dict = {}

    @property
    def y_target(self):
        """Backward-compatible alias for ``y``."""
        return self.y

    @y_target.setter
    def y_target(self, value):
        self.y = value

    def fit(self, X, y= None):
        """Learn one encoded value per category for every configured column.

        The ``y`` argument is ignored; the target given to the constructor is used.
        """
        self.encoding_dict = {}

        # The prior must come from the rows being fitted only; otherwise a
        # target Series that also covers held-out rows leaks into the encoding.
        fitted_target = self.y_target[self.y_target.index.isin(X.index)]
        if fitted_target.empty:
            fitted_target = self.y_target
        self.global_mean_ = fitted_target.mean()

        for column in self.columns:
            smoothed_mean = self.smooth_mean(X[column], self.global_mean_)
            # The smoothed value is constant within a category, so taking one
            # value per group pairs each category with its own encoding even
            # when two categories happen to share the same value.
            self.encoding_dict[column] = smoothed_mean.groupby(X[column]).first().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with configured columns target-encoded.

        Missing values and categories without a learned encoding are replaced
        by the global target mean.
        """
        X_transformed = X.copy()
        global_mean = getattr(self, "global_mean_", None)
        if global_mean is None:
            # Not fitted yet: fall back to the mean of the stored target.
            global_mean = self.y_target.mean()

        for column in self.columns:
            if column in self.encoding_dict:
                X_transformed[column] = X_transformed[column].map(self.encoding_dict[column]).fillna(global_mean)
            else:
                X_transformed[column] = X_transformed[column].fillna(global_mean)

        return X_transformed

    def smooth_mean(self, values, mean):
        """Return the row-wise smoothed target mean for the categories in ``values``.

        ``mean`` is the global target mean used as the prior. Rows whose
        category is missing yield NaN.
        """
        encoded_mean = self.y_target.groupby(values).mean()
        counts = values.map(values.value_counts())
        return (values.map(encoded_mean) * counts + mean * self.m) / (counts + self.m)

class DataFrameImputer(BaseEstimator, TransformerMixin):
    """Wrap an imputer so that ``transform`` hands back a DataFrame.

    ``imputer_class`` is instantiated once, at construction time, with
    ``imputer_kwargs``; the resulting object is kept in ``self.imputer``.
    """

    def __init__(self, imputer_class, **imputer_kwargs):
        self.imputer_class = imputer_class
        self.imputer_kwargs = imputer_kwargs
        self.imputer = imputer_class(**imputer_kwargs)

    def fit(self, X, y= None):
        """Fit the wrapped imputer on ``X`` (and ``y``) and return ``self``."""
        self.imputer.fit(X, y)
        return self

    def transform(self, X):
        """Impute ``X`` and re-attach its original index and column labels."""
        imputed_values = self.imputer.transform(X)
        frame = pd.DataFrame(imputed_values, index= X.index, columns= X.columns)
        return frame
    
class MulticollinearityEliminator(BaseEstimator, TransformerMixin):
    """Drop one feature out of every highly correlated pair.

    For each pair of features whose absolute correlation exceeds ``threshold``,
    the feature with the weaker univariate Gini (``|2 * AUC - 1|`` against the
    target) is marked for removal.

    Parameters
    ----------
    y : array-like
        Target passed to ``roc_auc_score`` together with each feature column of
        the ``X`` given to ``fit``.
        NOTE(review): presumably must be row-aligned with that ``X`` - confirm.
    threshold : float, default 0.9
        Absolute correlation above which a pair counts as collinear.

    Attributes
    ----------
    to_drop : list or None
        Columns removed by ``transform``; ``None`` until ``fit`` has run.
    """

    def __init__(self, y, threshold= 0.9):
        self.y = y
        self.threshold = threshold
        self.to_drop = None

    def fit(self, X, y= None):
        """Decide which columns to drop. The ``y`` argument is ignored."""
        corr_matrix = X.corr().abs()
        columns = list(corr_matrix.columns)
        corr_values = corr_matrix.to_numpy()

        # Rank by the magnitude of the Gini: a strongly negative score is just
        # as informative as a strongly positive one.
        strength = {feature: abs(2 * roc_auc_score(self.y, X[feature]) - 1) for feature in columns}

        to_drop = set()
        for i, j in combinations(range(len(columns)), 2):
            feature_i, feature_j = columns[i], columns[j]
            # A feature that is already out must not knock out further ones.
            if feature_i in to_drop or feature_j in to_drop:
                continue
            if corr_values[i, j] > self.threshold:
                # On a tie the earlier feature is the one dropped.
                weaker = feature_j if strength[feature_i] > strength[feature_j] else feature_i
                to_drop.add(weaker)

        # Keep column order so the result is deterministic.
        self.to_drop = [feature for feature in columns if feature in to_drop]
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``."""
        X_reduced = X.drop(columns= self.to_drop)
        return X_reduced

## Gather Data

In [None]:
# Load the labelled dataset (semicolon-separated). The path is blank in this
# copy of the notebook and has to be filled in before the cell can run.
df = pd.read_csv("", sep= ';')
# Preview the first five rows.
df.head(5)

In [None]:
# Training and Validation Data
# Use the 'ID' column as the index so it is not treated as a feature.
df.set_index('ID', inplace= True)

# Every column but the last is a feature; the last column is the label.
X, y = df.iloc[:, :-1], df.iloc[:,-1]
# Stratified 90/10 split, so both parts keep the label proportions of y.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.1, random_state= 42, stratify= y)

print("\n\033[1mTraining Data information: \033[0m\n")
X_train.info()
print("\n\033[1mTraining Label information: \033[0m\n")
y_train.info()
label_count_tr = y_train.value_counts()
print("\n\033[1mIn the Training Dataset:\033[0m\n")
print(f"Percentage of positively labeled data: {((label_count_tr[1] / (label_count_tr[0] + label_count_tr[1])) * 100):.4f}%")

print("\n\033[1mValidation Data information: \033[0m\n")
X_val.info()
print("\n\033[1mValidation Label information: \033[0m\n")
y_val.info()
label_count_ts = y_val.value_counts()
print("\n\033[1mIn the Validation Dataset: \033[0m\n")
# Same 4-decimal formatting as the training line, so the two are comparable.
print(f"Percentage of positively labeled data: {((label_count_ts[1] / (label_count_ts[0] + label_count_ts[1])) * 100):.4f}%")

## Data Analysis

### Null Check

In [None]:
# Split the full dataset by label so missing-value patterns can be compared.
positive_labels = df[df['Target'] == 1]
negative_labels = df[df['Target'] == 0]
print("Null Check for positive labels: \n")
# One cell per (row, column); missing entries are drawn in the contrasting colour.
sns.heatmap(positive_labels.isna(), cbar= False)

In [None]:
print("Null Check for negative labels: \n")
# Same missing-value map as above, for the rows labelled 0.
sns.heatmap(negative_labels.isna(), cbar= False)

### Correlations

In [None]:
def corr_matrix_info(df, target= 'Target'):
    """Plot the correlation matrix of ``df`` and print correlations with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the ``target`` column.
    target : str, default 'Target'
        Name of the column whose correlations are printed, sorted from the
        highest to the lowest value.
    """
    # Computed once and reused for both the heatmap and the ranking.
    corr_matrix = df.corr()

    plt.figure(figsize= (30, 24))
    sns.heatmap(corr_matrix, annot= True, cmap= 'coolwarm', fmt= '.2f')
    plt.title('Correlation Matrix')
    plt.show()

    corr_with_target = corr_matrix[target].sort_values(ascending= False)
    print(corr_with_target)

##### Categorical Features

In [None]:
# Number of distinct values of each of the first 20 columns of X.
# NOTE(review): presumably these 20 columns are the categorical features - confirm.
unique_categories = {feature: X[feature].nunique() for feature in X.columns[:20]}

for feature, count in unique_categories.items():
    print(f"Feature {feature} has {count} unique values.")

In [None]:
# Correlation analysis restricted to the first 20 columns:
#   - columns with fewer than 8 distinct values are one-hot encoded,
#   - columns with 8 or more distinct values are target-mean encoded,
#   - every column from position 20 onwards is dropped.
cat_corr_pipeline = Pipeline([
    ("One-Hot Encoder", OneHotEncoding([column for column, count in unique_categories.items() if count < 8])),
    ("Target-Mean Encoder", TargetMeanEncoding([column for column, count in unique_categories.items() if count >= 8], y)),
    ("Feature Dropper", FeatureDropper(X.columns[20:]))
])
# Fitted on the full X / y: this cell is exploratory analysis, not model training.
df_cat_corr = cat_corr_pipeline.fit_transform(X)
corr_matrix_info(pd.concat([df_cat_corr, y], axis= 1))

##### Numeric Features

In [None]:
# Correlation analysis of the remaining columns: drop the first 20 columns
# and correlate what is left with the target.
num_corr_pipeline = Pipeline([
    ("Feature Dropper", FeatureDropper(X.columns[:20]))
])
df_num_corr = num_corr_pipeline.fit_transform(X)
corr_matrix_info(pd.concat([df_num_corr, y], axis= 1))

##### All Features

In [None]:
# Correlation analysis over every feature: encode the first 20 columns
# (one-hot below 8 distinct values, target-mean otherwise), then fill the
# remaining missing values with the column mean.
all_corr_pipeline = Pipeline([
    ("One-Hot Encoder", OneHotEncoding([column for column, count in unique_categories.items() if count < 8])),
    ("Target-Mean Encoder", TargetMeanEncoding([column for column, count in unique_categories.items() if count >= 8], y)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean'))    
])
df_all_corr = all_corr_pipeline.fit_transform(X)
corr_matrix_info(pd.concat([df_all_corr, y], axis= 1))

### Outliers

In [None]:
# Outlier inspection: drop the first 20 columns and mean-impute the rest.
outlier_pipeline = Pipeline([
    ("Feature Dropper", FeatureDropper(X.columns[:20])),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean')),
])
df_out = outlier_pipeline.fit_transform(X)
plt.figure(figsize= (20, 10))
sns.boxplot(data= df_out)
# NOTE(review): seaborn's catplot is presumably figure-level and opens its own
# figure, in which case the tick rotation below applies to that figure rather
# than to the boxplot above - confirm which plot was meant to be rotated.
sns.catplot(data= df_out)
plt.xticks(rotation= 90)
plt.show()

### Class Distribution

In [None]:
def plot_class_distribution(df, gini_scores, plot_type= 'histogram', threshold= 0.30):
    """Plot, per class, the distribution of every feature with a strong Gini.

    A feature is plotted when the absolute value of its entry in
    ``gini_scores`` is at least ``threshold``. ``plot_type`` is one of
    'histogram', 'kde' or 'boxplot'; any other value raises ``ValueError``
    once the first selected feature is reached. When no feature qualifies a
    message is printed and nothing is drawn.
    """
    selected = []
    for name, score in gini_scores.items():
        if abs(score) >= threshold:
            selected.append(name)

    if len(selected) == 0:
        print(f"No features with GINI scores greater than {threshold}.")
        return

    # Box plots put the feature value on the y axis; the other two show a density.
    y_axis_label = 'Value' if plot_type == 'boxplot' else 'Density'

    for name in selected:
        plt.figure(figsize= (5, 3))
        if plot_type == 'boxplot':
            sns.boxplot(data= df, x= 'Target', y= name)
        elif plot_type == 'kde':
            sns.kdeplot(data= df, x= name, hue= 'Target', common_norm= False)
        elif plot_type == 'histogram':
            sns.histplot(data= df, x= name, hue= 'Target', kde= True, element= "step", stat= "density", common_norm= False)
        else:
            raise ValueError(f"Invalid plot_type: {plot_type}. Choose from 'histogram', 'kde', 'boxplot'.")
        plt.title(f"Class Distribution for {name} (Gini Score: {gini_scores[name]:.4f})")
        plt.xlabel(name)
        plt.ylabel(y_axis_label)
        plt.show()

def plot_feature_scatter(df, gini_scores, pairs= None, threshold= 0.30, target= 'Target'):
    """Scatter-plot pairs of features, coloured by the target class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the ``target`` column.
    gini_scores : dict
        Feature name -> Gini score.
    pairs : list of (str, str), optional
        Feature pairs to plot. When omitted, every pair of features whose
        absolute Gini is at least ``threshold`` is plotted.
    threshold : float, default 0.30
        Minimum absolute Gini for a feature to be paired automatically.
    target : str, default 'Target'
        Column used to colour the points.

    Pairs naming a column that is absent from ``df`` are skipped. When there
    is nothing to plot a message is printed instead.
    """
    if pairs is None:
        filtered_features = [feature for feature, gini in gini_scores.items() if abs(gini) >= threshold]
        pairs = list(combinations(filtered_features, 2))
    if not pairs:
        print(f"No features with GINI scores greater than {threshold}.")
        return

    for feature_x, feature_y in pairs:
        if feature_x not in df.columns or feature_y not in df.columns:
            continue
        plt.figure(figsize= (5, 3))
        # The label column is named 'Target' throughout the notebook; the
        # previous upper-case 'TARGET' did not match it.
        sns.scatterplot(data= df, x= feature_x, y= feature_y, hue= target, palette= 'coolwarm', alpha= 0.7)
        plt.title(f"Scatter Plot of {feature_x} vs {feature_y}")
        plt.xlabel(feature_x)
        plt.ylabel(feature_y)
        plt.show()
            
def calculate_gini(df, y, prnt= False):
    """Return ``{column: 2 * AUC - 1}`` for every column of ``df`` against ``y``.

    With ``prnt=True`` the scores are also printed, highest first.
    """
    scores = {column: 2 * roc_auc_score(y, df[column]) - 1 for column in df.columns}

    if not prnt:
        return scores

    ranked = sorted(scores.items(), key= lambda pair: pair[1], reverse= True)
    print("Gini Scores:")
    for column, value in ranked:
        print(f"{column}:\t{value}")
    return scores

In [None]:
# Encode and impute the training features, then score every resulting column
# by its univariate Gini against the training labels.
allf_pipeline = Pipeline([
    ("One-Hot Encoder", OneHotEncoding([column for column, count in unique_categories.items() if count < 8])),
    ("Target-Mean Encoder", TargetMeanEncoding([column for column, count in unique_categories.items() if count >= 8], y)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean'))    
])
X_train_allf = allf_pipeline.fit_transform(X_train)
gini_scores = calculate_gini(X_train_allf, y_train, prnt= True)

In [None]:
# Put the label next to the encoded features so the plots can colour by it.
X_train_allf_target = pd.concat([X_train_allf, y_train], axis= 1)
# KDE per class for every feature passing the function's default Gini threshold.
plot_class_distribution(X_train_allf_target, gini_scores, plot_type= 'kde')

In [None]:
# Pairwise scatter plots of the features whose absolute Gini is at least 0.32.
plot_feature_scatter(X_train_allf_target, gini_scores, threshold= 0.32)

## Model Training

In [None]:
def print_my_models(X_train, y_train, X_test, y_test,
                    rf= True, xg= True, lr= True, random_grid= True, scoring= 'roc_auc', cv= 5):
    """Tune and evaluate the selected model families on one dataset.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
    rf, xg, lr : bool
        Enable Random Forest, XGBoost and Logistic Regression respectively.
        A disabled model is neither built nor announced.
    random_grid : bool
        Passed to ``train_test_model``: randomized search when True,
        exhaustive grid search otherwise.
    scoring, cv :
        Passed through to the hyper-parameter search.
    """
    search_options = dict(random_grid= random_grid, scoring= scoring, cv= cv)

    # Random Forest
    if rf:
        print("\n\033[1m\033[4mRandom Forest results: \033[0m\n")
        rf_model = RandomForestClassifier(random_state= 42)
        rf_param_grid = {'n_estimators': [50, 100, 200, 500],
                         'max_depth': [None, 5, 10, 15],
                         'min_samples_split': [2, 5, 8, 10],
                         'min_samples_leaf': [1, 2, 4]}
        train_test_model(X_train, y_train, X_test, y_test, rf_model,
                         param_grid= rf_param_grid, **search_options)

    # XGBoosting
    if xg:
        print("\n\033[1m\033[4mXGBoosting results: \033[0m\n")
        xgb_model = xgb.XGBClassifier(random_state= 42)
        xgb_param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
                          'max_depth': [3, 5, 7, 10],
                          'min_child_weight': [1, 3, 5, 7],
                          'gamma': [0.0, 0.1, 0.2, 0.3],
                          'colsample_bytree': [0.3, 0.5, 0.8, 1.0],
                          'reg_alpha': [0, 0.1, 1],
                          'reg_lambda': [1, 1.5, 2]}
        train_test_model(X_train, y_train, X_test, y_test, xgb_model,
                         param_grid= xgb_param_grid, **search_options)

    # Logistic Regression
    if lr:
        print("\n\033[1m\033[4mLogistic Regression results: \033[0m\n")
        lr_model = LogisticRegression(random_state= 42)
        lr_param_grid = {'C': np.logspace(-4, 4, 20),
                         'solver': ['lbfgs', 'newton-cholesky', 'newton-cg', 'sag', 'saga'],
                         'max_iter': [750, 1000, 1500, 2000, 3000, 5000]}
        train_test_model(X_train, y_train, X_test, y_test, lr_model,
                         param_grid= lr_param_grid, **search_options)

def train_test_model(X_train, y_train, X_test, y_test, model, param_grid, random_grid, scoring, cv,
                     n_iter= 15, n_jobs= 4, random_state= 42):
    """Tune ``model``, then report its training and validation performance.

    Parameters
    ----------
    X_train, y_train : data used for the cross-validated search.
    X_test, y_test : held-out data used for the final report.
    model : estimator exposing ``predict_proba``.
    param_grid : dict
        Search space; used as ``param_distributions`` for the randomized
        search and as ``param_grid`` for the exhaustive one.
    random_grid : bool
        Randomized search when True, exhaustive grid search otherwise.
    scoring, cv :
        Passed to the search object.
    n_iter : int, default 15
        Number of candidates sampled by the randomized search.
    n_jobs : int, default 4
        Parallel jobs used by the search.
    random_state : int or None, default 42
        Seed of the randomized search, so repeated runs pick the same candidates.

    The decision threshold is chosen by ``optimize_threshold`` on the training
    probabilities and then applied unchanged to the validation set.
    """
    if random_grid:
        grid = RandomizedSearchCV(estimator= model, param_distributions= param_grid, scoring= scoring,
                                  n_iter= n_iter, n_jobs= n_jobs, cv= cv, verbose= 1, random_state= random_state)
    else:
        grid = GridSearchCV(estimator= model, param_grid= param_grid, scoring= scoring, n_jobs= n_jobs, cv= cv, verbose= 4)

    grid.fit(X_train, y_train)
    best_params = grid.best_params_
    best_model = grid.best_estimator_
    cv_results = grid.cv_results_
    print(f"Best hyperparameters: {best_params}")
    print(f"Best estimator: {best_model}")
    print(f"Model Cross-Validation AUC: {cv_results['mean_test_score'][grid.best_index_]:.4f} with S.D. {cv_results['std_test_score'][grid.best_index_]}\n")

    y_prob_tr = best_model.predict_proba(X_train)[:, 1]
    y_pred_tr, best_threshold = optimize_threshold(best_model, y_prob_tr, y_train)
    # AUC is computed once per split and reused for the Gini line.
    auc_tr = roc_auc_score(y_train, y_prob_tr)
    print(f"Training Set Accuracy: {accuracy_score(y_train, y_pred_tr):.4f}\nTraining Set AUC: {auc_tr:.4f}")
    print(f"\n\033[1mTraining Set GINI: {(2 * auc_tr - 1):.4f}\033[0m\n")

    y_prob_ts = best_model.predict_proba(X_test)[:, 1]
    y_pred_ts = (y_prob_ts >= best_threshold).astype(int)
    auc_ts = roc_auc_score(y_test, y_prob_ts)
    print(f"Validation Set Accuracy: {accuracy_score(y_test, y_pred_ts):.4f}\nValidation Set AUC: {auc_ts:.4f}")
    print(f"\n\033[1mValidation Set GINI: {(2 * auc_ts - 1):.4f}\033[0m\n")

    print(f"Prediction Report:\n {classification_report(y_test, y_pred_ts)}")
    print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred_ts)}\n")

def optimize_threshold(model, y_prob, y, thresholds= None):
    """Pick the probability cut-off that maximises F1 of the positive class.

    Parameters
    ----------
    model :
        Unused; kept so existing callers keep working.
    y_prob : array-like
        Predicted probability of class 1 for each row.
    y : array-like
        True labels; rows equal to ``1`` form the positive class.
    thresholds : iterable of float, optional
        Candidate cut-offs. Defaults to 0.00, 0.05, ..., 0.95.

    Returns
    -------
    (predictions, threshold)
        ``predictions`` is ``(y_prob >= threshold)`` cast to int. If no
        candidate reaches an F1 above zero, the threshold stays at 0.5.
        On ties the earliest candidate wins.
    """
    if thresholds is None:
        thresholds = np.arange(0.0, 1.0, 0.05)

    probabilities = np.asarray(y_prob)
    is_positive = np.asarray(y) == 1

    best_threshold = 0.5
    best_f1_score = 0.0
    for threshold in thresholds:
        predicted_positive = probabilities >= threshold
        true_pos = np.count_nonzero(predicted_positive & is_positive)
        false_pos = np.count_nonzero(predicted_positive & ~is_positive)
        false_neg = np.count_nonzero(~predicted_positive & is_positive)
        # F1 = 2TP / (2TP + FP + FN); defined as 0 when there is nothing to count.
        denominator = 2 * true_pos + false_pos + false_neg
        f1_minority = 2 * true_pos / denominator if denominator else 0.0
        if f1_minority > best_f1_score:
            best_f1_score = f1_minority
            best_threshold = threshold
    print(f"Best Threshold: {best_threshold}")
    print(f"Best F1 Score for the minority class: {best_f1_score}")
    best_y_pred = (y_prob >= best_threshold).astype(int)
    return best_y_pred, best_threshold

def plot_dimensionality_reduction(X, y, dim= 2, label= "PCA Projection"):
    """Project ``X`` with PCA and scatter-plot it, one colour/marker per class.

    Parameters
    ----------
    X : array-like of features.
    y : array-like of class labels, one per row of ``X``. Only the first two
        distinct labels are drawn (two colours / markers are defined).
    dim : int, default 2
        Number of PCA components to plot; must be 2 or 3.
    label : str, default "PCA Projection"
        Title of the plot.

    Raises
    ------
    ValueError
        If ``dim`` is neither 2 nor 3.
    """
    if dim not in (2, 3):
        raise ValueError(f"Invalid dim: {dim}. Choose 2 or 3.")

    pca = PCA(n_components= dim)
    X_reduced = pca.fit_transform(X)
    classes = np.asarray(y)

    colors = ['#1F77B4', '#FF7F0E']
    markers = ['o', 's']

    fig = plt.figure()
    if dim == 2:
        ax = fig.add_subplot(111)
    else:
        ax = fig.add_subplot(111, projection= '3d')

    for l, c, m in zip(np.unique(classes), colors, markers):
        mask = classes == l
        # One coordinate array per plotted component.
        coordinates = [X_reduced[mask, axis] for axis in range(dim)]
        ax.scatter(*coordinates, c= c, label= l, marker= m)

    ax.set_title(label)
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    if dim == 3:
        ax.set_zlabel("PCA Component 3")
    ax.legend(loc= 'upper right')
    plt.show()
        
def sample_and_train(X_train, y_train, X_test, y_test, 
                     rus= True, tl= True, ros= True, smote= True, smoteenn= True):
    """Resample the training data with each enabled sampler and train on it.

    For every enabled strategy the training set is resampled and handed to
    ``print_my_models`` together with the untouched test set. Strategies run
    in the order: Random Under Sampling, Tomek Links, Random Over Sampling,
    SMOTE, SMOTEENN.
    """
    # (enabled, name used in the banner, factory) - the factory is only called
    # for enabled strategies, so disabled samplers are never constructed.
    strategies = (
        (rus, "Random Under", lambda: RandomUnderSampler()),    # under-sampling
        (tl, "Tomek Links", lambda: TomekLinks()),              # under-sampling
        (ros, "Random Over", lambda: RandomOverSampler()),      # over-sampling
        (smote, "SMOTE", lambda: SMOTE()),                      # over-sampling
        (smoteenn, "SMOTEENN", lambda: SMOTEENN()),             # mixed
    )

    for enabled, title, make_sampler in strategies:
        if not enabled:
            continue
        print(f"\n\033[1m\033[4mPerforming {title} Sampling... \033[0m\n")
        sampler = make_sampler()
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        print_my_models(X_resampled, y_resampled, X_test, y_test)

In [None]:
# The target encoders below are given y_train, not the full y: they are fitted
# on X_train, and the full y would let validation labels into the encoding.

#### Encoding 1
# One-hot for low-cardinality columns, target-mean for the rest, mean imputation.

enc1_pipeline = Pipeline([
    ("One-Hot Encoder", OneHotEncoding([column for column, count in unique_categories.items() if count < 8])),
    ("Target-Mean Encoder", TargetMeanEncoding([column for column, count in unique_categories.items() if count >= 8], y_train)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean'))
])
X_train_enc1 = enc1_pipeline.fit_transform(X_train)
X_val_enc1 = enc1_pipeline.transform(X_val)

#### Encoding 1 + M
# Encoding 1 followed by removal of highly correlated features.

enc1m_pipeline = Pipeline([
    ("One-Hot Encoder", OneHotEncoding([column for column, count in unique_categories.items() if count < 8])),
    ("Target-Mean Encoder", TargetMeanEncoding([column for column, count in unique_categories.items() if count >= 8], y_train)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean')),
    ("Multicollinearity", MulticollinearityEliminator(y_train))
])
X_train_enc1m = enc1m_pipeline.fit_transform(X_train)
X_val_enc1m = enc1m_pipeline.transform(X_val)

#### Encoding 2
# Target-mean encoding for all of the first 20 columns, then mean imputation.

enc2_pipeline = Pipeline([
    ("Target-Mean Encoder", TargetMeanEncoding(X_train.columns[:20], y_train)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean'))
])
X_train_enc2 = enc2_pipeline.fit_transform(X_train)
X_val_enc2 = enc2_pipeline.transform(X_val)

#### Encoding 2 + M
# Encoding 2 followed by removal of highly correlated features.

enc2m_pipeline = Pipeline([
    ("Target-Mean Encoder", TargetMeanEncoding(X_train.columns[:20], y_train)),
    ("Imputer", DataFrameImputer(imputer_class= SimpleImputer, strategy= 'mean')),
    ("Multicollinearity", MulticollinearityEliminator(y_train))
])
X_train_enc2m = enc2m_pipeline.fit_transform(X_train)
X_val_enc2m = enc2m_pipeline.transform(X_val)

#### Methods to handle the imbalanced dataset
- Cost Sensitive Training
- Under Sampling => Random Under Sampler, Tomek Links
- Over Sampling => Random Over Sampler, SMOTE
- Mix Sampling => SMOTEENN
- One-Class Classification
- Ensembling Techniques

##### Cost Sensitive Training

"+ Loss" fields in Excel

RandomForestClassifier(..., class_weight= 'balanced')

XGBClassifier(..., scale_pos_weight= negative_labels / positive_labels)

LogisticRegression(..., class_weight= 'balanced')

##### Under Sampling, Over Sampling, Mix Sampling

"Samplers" fields in Excel

Under Samplers used: Random Under Sampler (RUS), Tomek Links (TL)

Over Samplers used: Random Over Sampler (ROS), SMOTE

Mix Samplers used: SMOTEENN

In [None]:
# PCA views (2 and 3 components) of the encoded training data before any resampling.
plot_dimensionality_reduction(X_train_enc1, y_train, dim= 2, label= "PCA Projection 2D")
plot_dimensionality_reduction(X_train_enc1, y_train, dim= 3, label= "PCA Projection 3D")

In [None]:
# Same PCA views after each under-sampling strategy, to compare how each one
# changes the class layout. All samplers use their default settings.
print("\n\033[1m\033[4mPerforming Random Under Sampling... \033[0m\n")
random_undersampler = RandomUnderSampler()
X_resampled_rus, y_resampled_rus = random_undersampler.fit_resample(X_train_enc1, y_train)
plot_dimensionality_reduction(X_resampled_rus, y_resampled_rus, dim= 2, label= "PCA Projection 2D")
plot_dimensionality_reduction(X_resampled_rus, y_resampled_rus, dim= 3, label= "PCA Projection 3D")
print("\n\033[1m\033[4mPerforming Near Miss Sampling... \033[0m\n")
nearmiss_sampler = NearMiss()
X_resampled_nms, y_resampled_nms = nearmiss_sampler.fit_resample(X_train_enc1, y_train)
plot_dimensionality_reduction(X_resampled_nms, y_resampled_nms, dim= 2, label= "PCA Projection 2D")
plot_dimensionality_reduction(X_resampled_nms, y_resampled_nms, dim= 3, label= "PCA Projection 3D")
print("\n\033[1m\033[4mPerforming Tomek Links Sampling... \033[0m\n")
tomeklinks_sampler = TomekLinks()
X_resampled_tls, y_resampled_tls = tomeklinks_sampler.fit_resample(X_train_enc1, y_train)
plot_dimensionality_reduction(X_resampled_tls, y_resampled_tls, dim= 2, label= "PCA Projection 2D")
plot_dimensionality_reduction(X_resampled_tls, y_resampled_tls, dim= 3, label= "PCA Projection 3D")

##### One-Class Classification

In [None]:
# One-class models are fitted on the rows labelled 1 only; validation rows
# with a non-negative decision score are then predicted as class 1.
X_minority = X_train_enc1[y_train == 1]

## OneClassSVM
ocsvm = OneClassSVM(kernel= 'rbf', gamma= 'auto', nu= 0.1)
ocsvm.fit(X_minority)
y_scores = ocsvm.decision_function(X_val_enc1)
y_pred = (y_scores >= 0).astype(int)
# Gini = 2 * AUC - 1, computed from the raw decision scores.
gini = 2 * roc_auc_score(y_val, y_scores) - 1
print("For One Class SVM: \n")
# The figures below are computed on the validation split (X_val_enc1 / y_val).
print(f"Test set GINI score: {gini}\n")
print(f"Classification Report: \n{classification_report(y_val, y_pred)}\n")
print(f"Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}\n")

## Isolation Forest
isolation_forest = IsolationForest(n_estimators= 100, contamination= 0.1, random_state= 42)
isolation_forest.fit(X_minority)
# y_scores / y_pred / gini are reused, so after this cell they hold the
# Isolation Forest results.
y_scores = isolation_forest.decision_function(X_val_enc1)
y_pred = (y_scores >= 0).astype(int)
gini = 2 * roc_auc_score(y_val, y_scores) - 1
print("For Isolation Forest: \n")
print(f"Test set GINI score: {gini}\n")
print(f"Classification Report: \n{classification_report(y_val, y_pred)}\n")
print(f"Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}\n")

##### Ensembling Techniques

In [None]:
#### Stacking Classifier

# Only the final (meta) estimator is tuned; the base models use fixed
# hyper-parameters.
param_grid = {
    'final_estimator__C': np.logspace(-4, 4, 20),
    'final_estimator__solver': ['lbfgs', 'newton-cholesky', 'newton-cg', 'sag', 'saga'],
    'final_estimator__max_iter': [750, 1000, 1500, 2000, 3000, 5000]
}
base_models = [
    ("rf", RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=500, random_state=42)),
    ("xgb", xgb.XGBClassifier(colsample_bytree=0.3, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=3,  random_state=42, reg_alpha= 1, reg_lambda= 1)),
    ("lr", LogisticRegression(C=3792.690190732246, max_iter=5000, random_state=42))
]
# Logistic regression combines the base models' outputs.
meta_model = LogisticRegression()
stacking_clf = StackingClassifier(estimators= base_models, final_estimator= meta_model, cv= 5, n_jobs= -1)
train_test_model(X_train_enc1, y_train, X_val_enc1, y_val, stacking_clf, param_grid, random_grid= True, scoring= 'roc_auc', cv= 5)

In [None]:
#### Voting Classifier

# Every entry lists a single value, so the "search" only has one candidate:
# the grid is used to set the base models' hyper-parameters through
# train_test_model rather than to explore alternatives.
param_grid = {
    'rf__max_depth': [10], 'rf__min_samples_leaf': [4], 'rf__min_samples_split': [10], 'rf__n_estimators': [500], 'rf__random_state': [42],
    'xgb__colsample_bytree': [0.3], 'xgb__gamma': [0.2], 'xgb__learning_rate': [0.05], 'xgb__max_depth': [5], 'xgb__min_child_weight': [3], 'xgb__reg_alpha': [1], 'xgb__reg_lambda': [1],
    'lr__C': [3792.690190732246], 'lr__max_iter': [5000], 'lr__random_state': [42]
}
base_models = [
    ("rf", RandomForestClassifier()),
    ("xgb", xgb.XGBClassifier()),
    ("lr", LogisticRegression())
]
# Soft voting: the ensemble works from the base models' predicted probabilities.
voting_clf = VotingClassifier(estimators= base_models, voting= 'soft')
train_test_model(X_train_enc1, y_train, X_val_enc1, y_val, voting_clf, param_grid= param_grid, random_grid= True, scoring= 'roc_auc', cv= 5)

### Best Models

In [None]:
def train_one_model(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` as given and print its training / validation metrics.

    No hyper-parameter search is done. The decision threshold is selected by
    ``optimize_threshold`` on the training probabilities and then reused,
    unchanged, for the validation predictions.
    """
    model.fit(X_train, y_train)

    # --- training split ---
    y_prob_tr = model.predict_proba(X_train)[:, 1]
    y_pred_tr, best_threshold = optimize_threshold(model, y_prob_tr, y_train)
    train_accuracy = accuracy_score(y_train, y_pred_tr)
    print(f"Training Set Accuracy: {train_accuracy:.4f}\n")
    train_gini = 2 * roc_auc_score(y_train, y_prob_tr) - 1
    print(f"\n\033[1mTraining Set GINI: {train_gini:.4f}\033[0m\n")

    # --- validation split ---
    y_prob_ts = model.predict_proba(X_test)[:, 1]
    y_pred_ts = (y_prob_ts >= best_threshold).astype(int)
    validation_accuracy = accuracy_score(y_test, y_pred_ts)
    print(f"Validation Set Accuracy: {validation_accuracy:.4f}\n")
    validation_gini = 2 * roc_auc_score(y_test, y_prob_ts) - 1
    print(f"\n\033[1mValidation Set GINI: {validation_gini:.4f}\033[0m\n")

    report = classification_report(y_test, y_pred_ts)
    print(f"Prediction Report:\n {report}")
    matrix = confusion_matrix(y_test, y_pred_ts)
    print(f"Confusion Matrix:\n {matrix}\n")

##### XGBoost

In [None]:
# Class counts in the training labels. These names previously held DataFrames
# in the Null Check cells and are rebound to integers here.
negative_labels = y_train.value_counts()[0]
positive_labels = y_train.value_counts()[1]
# Cost-sensitive XGBoost: positives are weighted by the negative/positive ratio.
model = xgb.XGBClassifier(colsample_bytree=0.8, gamma=0.3, learning_rate=0.05, max_depth=3, min_child_weight=1,  random_state=42, reg_alpha= 1, reg_lambda= 2, scale_pos_weight= negative_labels / positive_labels)
train_one_model(X_train_enc2m, y_train, X_val_enc2m, y_val, model)

In [None]:
# XGBoost without class weighting, on the "Encoding 1" features.
model = xgb.XGBClassifier(colsample_bytree=0.3, gamma=0.2, learning_rate=0.05, max_depth=5, min_child_weight=3,  random_state=42, reg_alpha= 1, reg_lambda= 1)
train_one_model(X_train_enc1, y_train, X_val_enc1, y_val, model)

##### Random Forest

In [None]:
# Random Forest with fixed hyper-parameters, on the "Encoding 1" features.
model = RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=500, random_state=42)
train_one_model(X_train_enc1, y_train, X_val_enc1, y_val, model)

##### Logistic Regression

In [None]:
# Logistic Regression trained on Tomek-Links-resampled "Encoding 1" data;
# the validation set itself is left untouched.
model = LogisticRegression(C=29.763514416313132, max_iter=750, random_state=42, solver='newton-cg')
tomeklinks_undersampler = TomekLinks()
X_resampled_tl, y_resampled_tl = tomeklinks_undersampler.fit_resample(X_train_enc1, y_train)
train_one_model(X_resampled_tl, y_resampled_tl, X_val_enc1, y_val, model)

### Probabilities and Predictions on Test Data

In [None]:
# Load the test data (semicolon-separated). The path is blank in this copy of
# the notebook and has to be filled in before the cell can run.
df_test = pd.read_csv("", sep= ';')
df_test.set_index('ID', inplace= True)

# Refit the class-weighted XGBoost configuration on the "Encoding 2 + M"
# training features.
negative_labels = y_train.value_counts()[0]
positive_labels = y_train.value_counts()[1]
xgb_best_model = xgb.XGBClassifier(colsample_bytree=0.8, gamma=0.3, learning_rate=0.05, max_depth=3, min_child_weight=1,  random_state=42, reg_alpha= 1, reg_lambda= 2, scale_pos_weight= negative_labels / positive_labels)
xgb_best_model.fit(X_train_enc2m, y_train)

# The decision threshold is chosen on the training probabilities.
y_prob_tr = xgb_best_model.predict_proba(X_train_enc2m)[:, 1]
y_pred_tr, best_threshold = optimize_threshold(xgb_best_model, y_prob_tr, y_train)
print(f"Training Set Accuracy: {accuracy_score(y_train, y_pred_tr):.4f}\n")
print(f"\n\033[1mTraining Set GINI: {(2 * roc_auc_score(y_train, y_prob_tr) - 1):.4f}\033[0m\n")

# Apply the already-fitted encoding pipeline to the test rows.
# NOTE(review): assumes df_test has the same feature columns as X_train - confirm.
X_test = enc2m_pipeline.transform(df_test)
# Transposed view for display (features as rows).
X_test.T

In [None]:
# Score the test rows and apply the threshold chosen on the training data.
y_prob_test = xgb_best_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_prob_test >= best_threshold).astype(int)

# Build the result directly on the test index instead of copying the whole
# test frame and then discarding every column (assigning into such a
# column-less slice may also trigger pandas' SettingWithCopyWarning).
df_final = pd.DataFrame(
    {"Probabilities": y_prob_test, "Predictions": y_pred_test},
    index= df_test.index,
)

In [None]:
# Display the predictions frame for a visual check.
df_final

In [None]:
# Write the predictions, keeping the index as the first column. The output
# path is blank in this copy of the notebook and has to be filled in.
df_final.to_csv("", index= True)
print("Predictions and Probabilities saved.")

###### Results

In [None]:
# Load the labelled test file (path blank in this copy of the notebook).
test_df = pd.read_csv("", sep= ';')
# NOTE(review): this frame is not indexed by 'ID', so the comparison in the
# next cell presumably relies on its rows being in the same order as df_test - confirm.
target = test_df['Target']

In [None]:
# Final test metrics: Gini (2 * AUC - 1) from the probabilities; accuracy,
# report and confusion matrix from the thresholded predictions.
print(f"Test GINI score: {2 * roc_auc_score(target, y_prob_test)- 1}")
print(f"Test Accuracy: {accuracy_score(target, y_pred_test)}")
print(f"Prediction Report:\n {classification_report(target, y_pred_test)}")
print(f"Confusion Matrix:\n {confusion_matrix(target, y_pred_test)}\n")