In [34]:
import sys

sys.path.insert(
    0,
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Python-Real-World-Machine-Learning\Module 2\Chapter 5",
)

import warnings

warnings.filterwarnings("ignore")

# Basic Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sampling
from modAL.uncertainty import uncertainty_sampling

# Modelling
# Classification
import statsmodels.api as sm
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.svm import SVC

# Semi-Supervised Learning
from sklearn.semi_supervised import (
    LabelPropagation,
    LabelSpreading,
    SelfTrainingClassifier,
)
from modAL.models import ActiveLearner

# Chapter 5
from SelfLearning import SelfLearningModel
from scikitWQDA import WQDA

# Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    roc_curve,
    accuracy_score,
    confusion_matrix,
    plot_confusion_matrix,
    ConfusionMatrixDisplay,
    log_loss,
)

# Ensembling
from sklearn.ensemble import *

# Balancing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Binning
import woeBinningPandas

# Create Unique ID
import uuid

import os

## Data Preprocessing

In [None]:
def data_preprocessing(
    df,
    accepted_flag,
    target,
    train_ratio,
    data_dir="C:/Users/Asus/Desktop/Repo/MasterThesis_RI/Data_09_05/",
):
    """
    Load the original dataset, split it into accepts and rejects, give every
    row a unique id (usable later for merging) and split the rejects into a
    development part and a test part.

    Parameters
    ----------
    df : file name of the original dataset (csv format), resolved inside ``data_dir``
    accepted_flag : name of the accepted flag column; Binary: 1 if accepted, 0 if rejected
    target : name of the target column
    train_ratio : share of the rejects used for development; Continuous (0,1)
    data_dir : directory that contains ``df``; defaults to the location that
        was previously hard-coded, so existing calls behave as before

    Return
    ------
    dfr_dev_with_label : development rejects, target column still present
    dfr_test_with_label : test rejects, target column still present
    dfa : accepted data, target column renamed to "target"
    dfr : all rejected data (target keeps its original column name)
    dfr_dev2 : development rejects without the target column
    dfr_test2 : test rejects without the target column

    """

    # Load data
    data = pd.read_csv(os.path.join(data_dir, df))

    # Accepted: keep flagged rows, drop the flag and align the target name
    dfa = (
        data[data[accepted_flag] == 1]
        .drop(columns=[accepted_flag])
        .rename(columns={target: "target"})
    )
    ## One random UUID per row, which can later be used for merging
    dfa["id"] = [uuid.uuid4() for _ in range(len(dfa))]

    # Rejected: same treatment, but the target keeps its original name
    dfr = data[data[accepted_flag] == 0].drop(columns=[accepted_flag])
    dfr["id"] = [uuid.uuid4() for _ in range(len(dfr))]

    # Dev/Test split of the rejects: fixed-seed shuffle, then cut by position
    shuffled = dfr.sample(frac=1, random_state=42)
    train_size = int(train_ratio * len(shuffled))
    dfr_dev_with_label = shuffled.iloc[:train_size]
    dfr_test_with_label = shuffled.iloc[train_size:]

    # Unlabel the rejects (i.e. drop the target)
    dfr_dev2 = dfr_dev_with_label.drop(columns=[target])
    dfr_test2 = dfr_test_with_label.drop(columns=[target])

    return dfr_dev_with_label, dfr_test_with_label, dfa, dfr, dfr_dev2, dfr_test2

In [36]:
def select_columns_rejects_without_id(
    r_dev, r_test, r_dev_mod=None, r_test_mod=None, n_columns=9
):
    """
    Keep only the leading (modelling) columns of the rejects datasets.

    Parameters
    ----------
    r_dev : rejects development data; Dataframe
    r_test : rejects testing data; Dataframe
    r_dev_mod : ignored; kept only so existing four-argument calls still work
    r_test_mod : ignored; kept only so existing four-argument calls still work
    n_columns : number of leading columns to keep, selected by position
        (default 9, the value that was previously hard-coded)

    Return
    ------
    r_dev_mod : first ``n_columns`` columns of r_dev
    r_test_mod : first ``n_columns`` columns of r_test

    """
    # The selection is positional, so it relies on the column order of the inputs
    return r_dev.iloc[:, :n_columns], r_test.iloc[:, :n_columns]

In [37]:
def select_columns_rejects_with_id(
    r_dev, r_test, r_dev_mod_id=None, r_test_mod_id=None, n_columns=10
):
    """
    Keep the leading columns of the rejects datasets (modelling columns + id).

    Parameters
    ----------
    r_dev : rejects development data; Dataframe
    r_test : rejects testing data; Dataframe
    r_dev_mod_id : ignored; kept only so existing four-argument calls still work
    r_test_mod_id : ignored; kept only so existing four-argument calls still work
    n_columns : number of leading columns to keep, selected by position
        (default 10, the value that was previously hard-coded)

    Return
    ------
    r_dev_mod_id : first ``n_columns`` columns of r_dev
    r_test_mod_id : first ``n_columns`` columns of r_test

    """
    # Positional selection: presumably the id is the last kept column - verify
    # against the column order produced upstream.
    return r_dev.iloc[:, :n_columns], r_test.iloc[:, :n_columns]

In [38]:
def create_X_y(data):
    """
    Separate a dataframe into explanatory columns and the "target" column.

    Parameters
    ----------
    data : dataframe containing a column named "target"

    Return
    ------
    X : dataframe with every column except "target"
    y : dataframe holding only the "target" column

    """
    # A single boolean mask over the column labels drives both selections
    is_target = data.columns == "target"
    features = data.loc[:, ~is_target]
    labels = data.loc[:, is_target]
    return features, labels

In [39]:
def split(X, y):
    """
    Split the data into training and testing sample (80 / 20, fixed seed).

    Parameters
    ----------
    X : data; Dataframe
    y : labels

    Return
    ------
    X_train : training modelling fields
    X_test : test modelling fields
    y_train : training labels, as a dataframe with the single column "target"
    y_test : testing labels

    """
    # Train-Test Split.
    # Fix: split the arguments that were passed in; the previous version read
    # the notebook globals X_res / y_res and silently ignored X and y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=7
    )
    columns = X_train.columns

    # Re-wrap the training parts as dataframes with explicit column names
    X_train = pd.DataFrame(data=X_train, columns=columns)
    y_train = pd.DataFrame(data=y_train, columns=["target"])

    return X_train, X_test, y_train, y_test

In [40]:
def log_reg(X_train, y_train, X_test):
    """
    Fit the baseline classifier and predict labels for the test data.

    Despite the function name, the estimator is a LightGBM classifier with
    default settings.

    Parameters
    ----------
    X_train : training modelling fields
    y_train : training labels (flattened to one dimension before fitting)
    X_test : test modelling fields

    Return
    ------
    classifier : fitted LGBMClassifier
    predicted : predicted labels for X_test

    """
    classifier = LGBMClassifier()
    flat_labels = y_train.values.ravel()
    classifier.fit(X_train, flat_labels)
    predicted = classifier.predict(X_test)
    return classifier, predicted

In [43]:
def isolation_forest(
    X_train, r_dev_mod, r_test_mod, feature_columns=None, random_state=None
):
    """
    Filter the outliers from the rejected samples.

    An isolation forest is fitted on the accepts; every reject row the forest
    predicts as -1 is dropped. The input dataframes are left untouched.

    Parameters
    ----------
    X_train : accepts training data; Dataframe. It is passed to ``fit`` as is,
        so its columns have to correspond to ``feature_columns``.
    r_dev_mod : rejects modelling data prior to outlier treatment; Dataframe
    r_test_mod : rejects testing data prior to outlier treatment; Dataframe
    feature_columns : columns used for scoring and kept in the output;
        defaults to the four columns that were previously hard-coded
    random_state : seed forwarded to IsolationForest; None (default) keeps the
        previous unseeded behaviour

    Return
    ------
    r_dev_mod : rejects modelling data post outlier treatment; Dataframe
    r_test_mod : rejects testing data post outlier treatment; Dataframe

    """
    if feature_columns is None:
        feature_columns = ["known_col_0", "known_col_1", "known_col_3", "known_col_4"]
    else:
        feature_columns = list(feature_columns)

    # Build Isolation forest model
    isf = IsolationForest(
        n_estimators=50,
        max_samples="auto",
        contamination=0.02,
        max_features=1.0,
        random_state=random_state,
    )
    isf.fit(X_train)

    def _keep_inliers(frame):
        # Score only the modelling columns and keep the rows not flagged as -1.
        # Selecting first yields a new frame, so the caller's dataframe no
        # longer receives temporary "scores" / "anomaly" columns.
        features = frame[feature_columns]
        return features[isf.predict(features) != -1]

    return _keep_inliers(r_dev_mod), _keep_inliers(r_test_mod)

In [41]:
def pred(y_test, X_test, X_test_3, model):
    """
    Attach the model's positive-class probability to the labelled test set.

    Parameters
    ----------
    y_test : testing labels; Dataframe
    X_test : testing data, merged with the labels on the index; Dataframe
    X_test_3 : data handed to ``model.predict_proba``; its rows must be in the
        same order as y_test because the scores are assigned y_test's index
    model : fitted classifier exposing ``predict_proba``

    Return
    ------
    pred_test1 : y_test and X_test joined on the index, plus the column
        "prediction_beforeRI" (second column of ``predict_proba``)

    """
    # Test set with labels
    test_labels = pd.merge(
        y_test,
        X_test,
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Predictions on testset (probability of the second class)
    test_pred = model.predict_proba(X_test_3)[:, 1]

    # Join predictions with the labelled test set
    pred_test_kgb = pd.DataFrame(
        data=test_pred, columns=["prediction_beforeRI"], index=y_test.index.copy()
    )
    pred_test1 = pd.merge(
        test_labels,
        pred_test_kgb[["prediction_beforeRI"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

    return pred_test1

In [42]:
def all_metrics(pred_label, true_label, model):
    """
    Print the weighted F1 score and plot the confusion matrix of a model.

    Parameters
    ----------
    pred_label : predicted label of the model
    true_label : true label
    model : fitted model; its ``classes_`` define the matrix labels and the
        model itself is included in the printed line

    Return
    ------
    None

    """
    # scikit-learn metrics take (y_true, y_pred). The previous version passed
    # the predictions first, which weighted the F1 score by predicted support
    # and transposed the confusion matrix under its "True"/"Predicted" axes.

    # F1 score
    f1_stat = f1_score(true_label, pred_label, average="weighted")

    # Confusion matrix
    cm = confusion_matrix(true_label, pred_label, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    fig = disp.plot()

    print("F1_stat ", model, "is: ", f1_stat, fig)

In [44]:
def predictions1(model):
    """
    Score the development rejects and turn the scores into binary labels.

    Reads the notebook-level variables ``r_dev_mod`` (data that is scored),
    ``r_dev_mod_id`` (same rows including the "id" column) and
    ``conservative_dr`` (the cut-off is the ``1 - conservative_dr`` quantile
    of the scores).

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``

    Return
    ------
    outcome : dataframe with the columns "id" and "prediction_beforeRI"
        (0 below the cut-off, 1 otherwise)

    """
    scores = pd.DataFrame(
        data=model.predict_proba(r_dev_mod)[:, 1],
        columns=["prediction2"],
        index=r_dev_mod.index.copy(),
    )

    # Set cut-off
    cutoff = scores["prediction2"].quantile(q=1 - conservative_dr)

    def _to_label(score):
        return 0 if (score < cutoff) else 1

    scores["prediction_beforeRI"] = scores["prediction2"].map(_to_label)

    # Bring the ids in through an index join
    with_ids = r_dev_mod_id.merge(
        scores[["prediction_beforeRI"]],
        how="inner",
        left_index=True,
        right_index=True,
    )
    return with_ids[["id", "prediction_beforeRI"]]

In [45]:
def print_results(data, state, prediction):
    """
    Print how many rows are classified correctly and how many are not.

    Only rows whose target and prediction are both 0 or 1 are counted.

    Parameters
    ----------
    data : dataframe with a "target" column and the prediction column
    state : text inserted into the printed lines (e.g. "before RI")
    prediction : name of the binary prediction column

    """
    actual = data.target
    predicted = data[prediction]

    # Agreement on either class / disagreement in either direction
    agree = ((actual == 1) & (predicted == 1)) | ((actual == 0) & (predicted == 0))
    disagree = ((actual == 1) & (predicted == 0)) | ((actual == 0) & (predicted == 1))

    n_correct = data[agree].shape[0]
    n_wrong = data[disagree].shape[0]

    print("The number of accurately classified cases ", state, " is: ", n_correct)
    print("The number of misclassified cases ", state, " is: ", n_wrong)

In [46]:
def log_loss_fun(category, data, y_true, y_pred):
    """
    Print the log loss between two columns of a dataframe.

    Parameters
    ----------
    category : text printed in front of the value
    data : dataframe holding both columns
    y_true : name of the column with the true labels
    y_pred : name of the column with the predictions

    """
    loss = log_loss(data[y_true], data[y_pred])
    print(category, " :", loss)

In [47]:
def evaluation(ri_data):  # ri1_train, ri2_train, etc..
    """
    Retrain the model on accepts plus rejects with inferred labels and compare
    it with the original model on the accepts test set.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``r_dev_mod``,
    ``significant_columns``, ``y_test``, ``X_test``, ``X_test_3``,
    ``pred_test1`` and ``conservative_dr``.

    Parameters
    ----------
    ri_data : dataframe indexed like r_dev_mod with a "prediction_beforeRI"
        column, which is used as the target of the rejects

    Return
    ------
    pred_test_final : dataframe with "target", "prediction_beforeRI"
        (continuous), "prediction_baseline" (binary, retrained model) and
        "prediction_beforeRI_binary" (binary, original model)

    """
    # TRAIN NEW
    # Accepts with their observed labels
    train_accepts = pd.merge(
        X_train,
        y_train[["target"]],
        how="left",
        left_index=True,
        right_index=True,
    )

    # Rejects with the inferred label, renamed so both parts share "target"
    train_rejects = pd.merge(
        r_dev_mod,
        ri_data[["prediction_beforeRI"]],
        how="left",
        left_index=True,
        right_index=True,
    ).rename(columns={"prediction_beforeRI": "target"})

    # Concatenate Train Accepts and Train Rejects
    train_new = pd.concat([train_accepts, train_rejects])

    # Retrain KGB Model
    X_new = train_new[significant_columns]
    y_new = train_new["target"]
    # NOTE(review): only the 80 % training part of this split is used; the
    # held-out 20 % is never evaluated.
    X_train_new, _, y_train_new, _ = train_test_split(
        X_new, y_new, test_size=0.2, random_state=42
    )

    logreg2 = LGBMClassifier()
    logreg2.fit(X_train_new, y_train_new.values.ravel())

    # Predictions of the retrained model on the accepts test set
    pred_test2 = pred(y_test, X_test, X_test_3, logreg2).rename(
        columns={"prediction_beforeRI": "prediction_baseline"}
    )

    # Merge original and baseline predictions
    pred_test_final = pd.merge(
        pred_test1[["target", "prediction_beforeRI"]],
        pred_test2[["prediction_baseline"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Binary predictions: each score column is cut at its own
    # (1 - conservative_dr) quantile; below the cut-off -> 0, otherwise -> 1
    q1 = pred_test_final["prediction_beforeRI"].quantile(q=1 - conservative_dr)
    q2 = pred_test_final["prediction_baseline"].quantile(q=1 - conservative_dr)
    pred_test_final["prediction_beforeRI_binary"] = pred_test_final[
        "prediction_beforeRI"
    ].apply(lambda x: 0 if (x < q1) else 1)
    # The continuous baseline score is overwritten by its binary version
    pred_test_final["prediction_baseline"] = pred_test_final[
        "prediction_baseline"
    ].apply(lambda x: 0 if (x < q2) else 1)

    return pred_test_final

In [None]:
def evaluation_rejects(ri_data):  # ri1_train, ri2_train, etc..
    """
    Retrain the model on accepts plus rejects with inferred labels and compare
    it with the original model on the labelled rejects test set.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``r_dev_mod``,
    ``significant_columns``, ``dfr_test_with_label_y``,
    ``dfr_test_with_label_X``, ``pred_rej1`` and ``conservative_dr``.

    Parameters
    ----------
    ri_data : dataframe indexed like r_dev_mod with a "prediction_beforeRI"
        column, which is used as the target of the rejects

    Return
    ------
    pred_test_final : dataframe with "target", "prediction_beforeRI"
        (continuous), "prediction_baseline" (binary, retrained model) and
        "prediction_beforeRI_binary" (binary, original model)

    """
    # TRAIN NEW
    # Accepts with their observed labels
    train_accepts = pd.merge(
        X_train,
        y_train[["target"]],
        how="left",
        left_index=True,
        right_index=True,
    )

    # Rejects with the inferred label, renamed so both parts share "target"
    train_rejects = pd.merge(
        r_dev_mod,
        ri_data[["prediction_beforeRI"]],
        how="left",
        left_index=True,
        right_index=True,
    ).rename(columns={"prediction_beforeRI": "target"})

    # Concatenate Train Accepts and Train Rejects
    train_new = pd.concat([train_accepts, train_rejects])

    # Retrain KGB Model
    X_new = train_new[significant_columns]
    y_new = train_new["target"]
    # NOTE(review): only the 80 % training part of this split is used; the
    # held-out 20 % is never evaluated.
    X_train_new, _, y_train_new, _ = train_test_split(
        X_new, y_new, test_size=0.2, random_state=42
    )

    logreg2 = LGBMClassifier()
    logreg2.fit(X_train_new, y_train_new.values.ravel())

    # Predictions of the retrained model on the labelled rejects test set
    pred_rej2 = pred(
        dfr_test_with_label_y, dfr_test_with_label_X, dfr_test_with_label_X, logreg2
    ).rename(columns={"prediction_beforeRI": "prediction_baseline"})

    # Merge original and baseline predictions
    pred_test_final = pd.merge(
        pred_rej1[["target", "prediction_beforeRI"]],
        pred_rej2[["prediction_baseline"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Binary predictions: each score column is cut at its own
    # (1 - conservative_dr) quantile; below the cut-off -> 0, otherwise -> 1
    q1 = pred_test_final["prediction_beforeRI"].quantile(q=1 - conservative_dr)
    q2 = pred_test_final["prediction_baseline"].quantile(q=1 - conservative_dr)
    pred_test_final["prediction_beforeRI_binary"] = pred_test_final[
        "prediction_beforeRI"
    ].apply(lambda x: 0 if (x < q1) else 1)
    # The continuous baseline score is overwritten by its binary version
    pred_test_final["prediction_baseline"] = pred_test_final[
        "prediction_baseline"
    ].apply(lambda x: 0 if (x < q2) else 1)

    return pred_test_final

## Semi-Supervised Learning

In [48]:
def ssl_prep(
    X_accept,
    y_accept,
    X_reject,
    reject_fraction=0.25,
    random_state=None,
    feature_columns=None,
):
    """
    Build the combined accepts + rejects dataset for semi-supervised learning.

    Parameters
    ----------

    X_accept : training data of accepted population
    y_accept : training labels of accepted population (column "target")
    X_reject : training data of rejected population
    reject_fraction : share of the rejects that is randomly drawn and appended
        (default 0.25, the value that was previously hard-coded)
    random_state : seed for drawing the rejects; None (default) keeps the
        previous unseeded behaviour
    feature_columns : modelling columns kept in the output; defaults to the
        four columns that were previously hard-coded

    Return
    ------
    df : data of accepted and rejected population with the modelling columns
        and "unlabel", which holds the accept's target (1 / 0) or -1 for rejects

    """
    if feature_columns is None:
        feature_columns = ["known_col_0", "known_col_1", "known_col_3", "known_col_4"]
    else:
        feature_columns = list(feature_columns)

    # Merge explanatory and target in accepts
    accepts = pd.merge(
        X_accept, y_accept, how="left", left_index=True, right_index=True
    )
    # Create accept flag
    accepts["Flag1"] = "Accept"

    # Draw a random subset of the rejects: shuffle, then cut by position
    shuffle_df = X_reject.sample(frac=1, random_state=random_state)
    train_size = int(reject_fraction * len(shuffle_df))
    train_rejects = shuffle_df.iloc[:train_size]

    # Merge accepts and rejects; rejects get NaN in "Flag1" and "target"
    df = pd.concat([accepts, train_rejects])

    # If accepted use accept label, if rejected use -1 (default value for unlabelled entries)
    conditions = [
        (df["Flag1"] == "Accept") & (df["target"] == 1),
        (df["Flag1"] == "Accept") & (df["target"] == 0),
    ]
    choices = [1, 0]

    # New target is called unlabel
    df["unlabel"] = np.select(conditions, choices, -1)

    # Select columns for modelling
    return df[feature_columns + ["unlabel"]]

In [49]:
def ssl_split(df, target):
    """
    Separate the combined dataset into modelling data and labels.

    Parameters
    ----------

    df : dataframe of accepted and rejected population, including data and labels
    target : string name of the target column (e.g. "target")

    Return
    ------
    X_train : dataframe with every column except ``target``
    y_train : dataframe holding only the ``target`` column

    """
    # One boolean mask over the column labels drives both selections
    label_mask = df.columns == target
    features = df.loc[:, ~label_mask]
    labels = df.loc[:, label_mask]
    return features, labels

In [50]:
def ssl_model_selftraining(X_train, y_train, model):
    """
    Fit a self-training model that wraps a default LightGBM classifier.

    Parameters
    ----------

    X_train : training data of accepted and rejected population
    y_train : training labels of accepted population (0,1) and rejected population (-1)
    model : semi-supervised model class (Self-Training Classifier); it is
        called with the base classifier as its only argument

    Return
    ------

    ssl : whatever ``fit`` of the wrapped model returns (the trained model)

    """
    wrapped = model(LGBMClassifier())
    # Fit on array copies so the original frames are not handed to the model
    features = np.copy(X_train)
    targets = np.copy(y_train)
    return wrapped.fit(features, targets)

In [63]:
def ssl_model_label(X_train, y_train, model):
    """
    Fit a semi-supervised model that needs no base estimator.

    Parameters
    ----------

    X_train : training data of accepted and rejected population
    y_train : training labels of accepted population (0,1) and rejected population (-1)
    model : semi-supervised model class (Label Propagation, Label Spreading);
        it is instantiated without arguments

    Return
    ------

    ssl : whatever ``fit`` of the instantiated model returns (the trained model)

    """
    estimator = model()
    # Fit on array copies so the original frames are not handed to the model
    features = np.copy(X_train)
    targets = np.copy(y_train)
    return estimator.fit(features, targets)

In [52]:
def ssl_predictions_ds(X_test, estimators):
    """
    Combine several estimators and compare the result with the original model.

    Reads the notebook-level ``pred_test1`` (columns "id", "target",
    "prediction_beforeRI") and calls ``combine_using_Dempster_Schafer``.

    Parameters
    ----------

    X_test : testing data the combined prediction is made for
    estimators : estimators handed to ``combine_using_Dempster_Schafer``

    Return
    ------

    Dataframe with the continuous predictions before / after RI and their
    binary versions; each score column is cut at its own 25th percentile
    (below -> 0, otherwise -> 1)

    """
    combined = combine_using_Dempster_Schafer(X_test, estimators)

    after = pd.DataFrame(
        data=combined,
        columns=["prediction_afterRI"],
        index=X_test.index.copy(),
    )
    before = pred_test1[["id", "target", "prediction_beforeRI"]]

    # Index join of the original and the combined predictions
    joined = before.merge(
        after[["prediction_afterRI"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

    cut_before = joined["prediction_beforeRI"].quantile(q=0.25)
    cut_after = joined["prediction_afterRI"].quantile(q=0.25)

    def _label_before(score):
        return 0 if (score < cut_before) else 1

    def _label_after(score):
        return 0 if (score < cut_after) else 1

    joined["prediction_beforeRI_binary"] = joined["prediction_beforeRI"].map(
        _label_before
    )
    joined["prediction_afterRI_binary"] = joined["prediction_afterRI"].map(
        _label_after
    )
    return joined

In [53]:
def ssl_predictions_oth(ssl, X_test):
    """
    Score the accepts test set with the SSL model and join the result to the
    baseline outcome.

    Reads the notebook-level ``conservative_dr`` (the cut-off is the
    ``1 - conservative_dr`` quantile of the SSL scores) and ``outcome_a``
    (columns "target", "prediction_baseline", "prediction_beforeRI_binary").

    Parameters
    ----------

    ssl : trained semi-supervised learning model exposing ``predict_proba``
    X_test : testing data the predictions are made for

    Return
    ------

    Dataframe with "target", "prediction_baseline",
    "prediction_beforeRI_binary" and the binary "prediction_ssl"

    """
    ssl_scores = pd.DataFrame(
        data=ssl.predict_proba(X_test)[:, 1],
        columns=["prediction_ssl_cont"],
        index=X_test.index.copy(),
    )

    # Set quantile
    cutoff = ssl_scores["prediction_ssl_cont"].quantile(q=1 - conservative_dr)

    def _to_label(score):
        return 0 if (score < cutoff) else 1

    ssl_scores["prediction_ssl"] = ssl_scores["prediction_ssl_cont"].map(_to_label)

    # Merge Baseline and SSL prediction
    reference = outcome_a[["target", "prediction_baseline", "prediction_beforeRI_binary"]]
    return reference.merge(
        ssl_scores[["prediction_ssl"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

In [None]:
def ssl_predictions_oth_rej(ssl, dfr_test_with_label_X):
    """
    Score the rejects test set with the SSL model and join the result to the
    baseline outcome.

    Reads the notebook-level ``conservative_dr`` (the cut-off is the
    ``1 - conservative_dr`` quantile of the SSL scores) and ``outcome_b``
    (columns "target", "prediction_baseline", "prediction_beforeRI_binary").

    Parameters
    ----------

    ssl : trained semi-supervised learning model exposing ``predict_proba``
    dfr_test_with_label_X : rejects testing data the predictions are made for

    Return
    ------

    Dataframe with "target", "prediction_baseline",
    "prediction_beforeRI_binary" and the binary "prediction_ssl"

    """
    ssl_scores = pd.DataFrame(
        data=ssl.predict_proba(dfr_test_with_label_X)[:, 1],
        columns=["prediction_ssl_cont"],
        index=dfr_test_with_label_X.index.copy(),
    )

    # Set quantile
    cutoff = ssl_scores["prediction_ssl_cont"].quantile(q=1 - conservative_dr)

    def _to_label(score):
        return 0 if (score < cutoff) else 1

    ssl_scores["prediction_ssl"] = ssl_scores["prediction_ssl_cont"].map(_to_label)

    # Merge Baseline and SSL prediction
    reference = outcome_b[["target", "prediction_baseline", "prediction_beforeRI_binary"]]
    return reference.merge(
        ssl_scores[["prediction_ssl"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

In [54]:
def ssl_predictions_oth2(ssl, X_test):
    """
    Score the test set with the SSL model and compare it with the original model.

    Reads the notebook-level ``pred_test1`` (columns "id", "target",
    "prediction_beforeRI").

    Parameters
    ----------

    ssl : trained semi-supervised learning model exposing ``predict_proba``
    X_test : testing data the predictions are made for

    Return
    ------

    Dataframe with the continuous predictions before / after RI and their
    binary versions; each score column is cut at its own 25th percentile
    (below -> 0, otherwise -> 1)

    """
    after = pd.DataFrame(
        data=ssl.predict_proba(X_test)[:, 1],
        columns=["prediction_afterRI"],
        index=X_test.index.copy(),
    )
    before = pred_test1[["id", "target", "prediction_beforeRI"]]

    # Index join of the original and the SSL predictions
    joined = before.merge(
        after[["prediction_afterRI"]],
        how="inner",
        left_index=True,
        right_index=True,
    )

    cut_before = joined["prediction_beforeRI"].quantile(q=0.25)
    cut_after = joined["prediction_afterRI"].quantile(q=0.25)

    def _label_before(score):
        return 0 if (score < cut_before) else 1

    def _label_after(score):
        return 0 if (score < cut_after) else 1

    joined["prediction_beforeRI_binary"] = joined["prediction_beforeRI"].map(
        _label_before
    )
    joined["prediction_afterRI_binary"] = joined["prediction_afterRI"].map(
        _label_after
    )
    return joined

In [55]:
def ssl_predictions_al(al, X_test):
    """
    Predict with the active learner and compare it with the original model.

    Reads the notebook-level ``pred_test1`` (columns "id", "target",
    "prediction_beforeRI") and prints log loss and classification counts via
    ``log_loss_fun`` and ``print_results``.

    Parameters
    ----------

    al : trained active learner exposing ``predict``
    X_test : testing data the predictions are made for

    Return
    ------

    Dataframe with the predictions before / after RI and their binary
    versions; each column is cut at its own 25th percentile
    (below -> 0, otherwise -> 1)

    """
    # Make Predictions.
    # Fix: use the learner and the data that were passed in. The previous
    # version ignored ``al`` and scored the notebook globals ``regressor`` /
    # ``X_test_3`` while labelling the result with X_test's index.
    y_pred = al.predict(X_test)

    # Convert y_pred array to pandas dataframe
    pred_test = pd.DataFrame(
        data=y_pred,
        columns=["prediction_afterRI"],
        index=X_test.index.copy(),
    )
    a1 = pred_test1[["id", "target", "prediction_beforeRI"]]  # hard-coded for now
    a2 = pred_test[["prediction_afterRI"]]  # hard-coded for now

    # Merge a1 and a2
    a1_a2_inner = pd.merge(
        a1,
        a2,
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Make binary predictions: cut each column at its 25th percentile
    q1 = a1_a2_inner["prediction_beforeRI"].quantile(q=0.25)
    q2 = a1_a2_inner["prediction_afterRI"].quantile(q=0.25)

    a1_a2_inner["prediction_beforeRI_binary"] = a1_a2_inner[
        "prediction_beforeRI"
    ].apply(lambda x: 0 if (x < q1) else 1)
    a1_a2_inner["prediction_afterRI_binary"] = a1_a2_inner["prediction_afterRI"].apply(
        lambda x: 0 if (x < q2) else 1
    )

    # Log Loss
    log_loss_fun("Before", a1_a2_inner, "target", "prediction_beforeRI")
    log_loss_fun("After", a1_a2_inner, "target", "prediction_afterRI")

    # Numbers of accurately classified and misclassified cases
    print_results(a1_a2_inner, "before RI", "prediction_beforeRI_binary")
    print_results(a1_a2_inner, "after RI", "prediction_afterRI_binary")
    return a1_a2_inner

## Active Learning

In [56]:
def regression_std(regressor, X):
    """
    Return the position of the sample with the largest predictive standard
    deviation, as reported by ``regressor.predict(X, return_std=True)``.
    """
    _mean, spread = regressor.predict(X, return_std=True)
    most_uncertain = np.argmax(spread)
    return most_uncertain

In [57]:
def active_learning(n, X_train, y_train, r_dev_mod, query_fraction=0.2):
    """
    Train an active learner (LightGBM + uncertainty sampling) on the accepts.

    Parameters
    ----------
    n : number of randomly drawn accepts used for the initial fit
    X_train : accepts training data; Dataframe
    y_train : accepts training labels
    r_dev_mod : rejects modelling data; only its length is used, to size the
        number of query rounds
    query_fraction : number of query rounds as a share of len(r_dev_mod)
        (default 0.2, the value that was previously hard-coded)

    Return
    ------
    learner : the taught ActiveLearner

    """
    # Unseeded draw from numpy's global random state
    initial_idx = np.random.choice(range(len(X_train)), size=n, replace=False)

    learner = ActiveLearner(
        estimator=LGBMClassifier(),
        query_strategy=uncertainty_sampling,
        X_training=X_train.iloc[initial_idx],
        y_training=y_train.iloc[initial_idx],
    )

    # The previous version first queried r_dev_mod and discarded the result;
    # that call is removed because the rejects have no labels to teach with.

    # active learning
    # NOTE(review): the queried row is not removed from the pool, so the same
    # accept can be selected in several rounds.
    n_queries = int(query_fraction * len(r_dev_mod))
    for _ in range(n_queries):
        query_idx, _instance = learner.query(X_train)
        learner.teach(X_train.iloc[query_idx], y_train.iloc[query_idx])
    return learner

In [58]:
def active_learning2(X_train, y_train, r_dev_mod, fraction):
    """
    Train an active learner (LightGBM + uncertainty sampling) that is
    initialised with all accepts.

    Parameters
    ----------
    X_train : accepts training data; Dataframe
    y_train : accepts training labels
    r_dev_mod : rejects modelling data; only its length is used, to size the
        number of query rounds
    fraction : number of query rounds as a share of len(r_dev_mod)

    Return
    ------
    learner : the taught ActiveLearner

    """
    # Drawing len(X_train) rows without replacement is a random permutation of
    # all accepts (unseeded, numpy's global random state)
    initial_idx = np.random.choice(
        range(len(X_train)), size=len(X_train), replace=False
    )

    learner = ActiveLearner(
        estimator=LGBMClassifier(),
        query_strategy=uncertainty_sampling,
        X_training=X_train.iloc[initial_idx],
        y_training=y_train.iloc[initial_idx],
    )

    # The previous version first queried r_dev_mod and discarded the result;
    # that call is removed because the rejects have no labels to teach with.

    # active learning
    # NOTE(review): the queried row is not removed from the pool, so the same
    # accept can be selected in several rounds.
    n_queries = int(fraction * len(r_dev_mod))
    for _ in range(n_queries):
        query_idx, _instance = learner.query(X_train)
        learner.teach(X_train.iloc[query_idx], y_train.iloc[query_idx])
    return learner

#### Active Learning with Self-Training Classifier

In [None]:
def active_learning3(X_train, y_train, r_dev_mod, fraction, random_state=None):
    """Fit an uncertainty-sampling ``ActiveLearner`` around a self-training LightGBM.

    The estimator is ``SelfTrainingClassifier(base_estimator=LGBMClassifier())``.
    It is first trained on every row of ``X_train`` / ``y_train`` (in a random
    order) and is then taught one queried batch per iteration, always querying
    ``X_train``.

    Parameters
    ----------
    X_train, y_train : positionally indexable via ``.iloc``
        Data used for the initial fit and as the query pool.
    r_dev_mod : sized object
        Only its length is used: it determines the number of query rounds.
    fraction : float
        The number of query rounds is ``int(fraction * len(r_dev_mod))``.
    random_state : int or None, default None
        Seed for the initial shuffle. ``None`` keeps using NumPy's global
        random state, as before.

    Returns
    -------
    The trained ``ActiveLearner``.

    Notes
    -----
    Queried rows are never removed from ``X_train``, so each round re-adds rows
    the learner has seen before.

    NOTE(review): only ``X_train`` / ``y_train`` reach the estimator. If
    ``y_train`` holds no unlabeled (-1) entries the self-training wrapper has
    nothing to pseudo-label -- confirm that this is intended.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_rows = len(X_train)
    # Sampling all rows without replacement yields a random ordering of them.
    shuffled_rows = rng.choice(n_rows, size=n_rows, replace=False)

    learner = ActiveLearner(
        estimator=SelfTrainingClassifier(
            base_estimator=LGBMClassifier()
        ),
        query_strategy=uncertainty_sampling,
        X_training=X_train.iloc[shuffled_rows],
        y_training=y_train.iloc[shuffled_rows],
    )

    # The earlier throw-away `learner.query(r_dev_mod)` was dropped: its result
    # was never used, it only cost one scoring pass over the whole reject set.
    n_queries = int(fraction * len(r_dev_mod))
    for _round in range(n_queries):
        query_idx, _instances = learner.query(X_train)
        learner.teach(X_train.iloc[query_idx], y_train.iloc[query_idx])
    return learner

## Kickout measure

In [59]:
# def flag_df(df):

#     # Flag kicked out bad cases (want more of these)
#     if (
#         df["target"] == 1
#         and df["prediction_beforeRI_binary"] == 0
#         and df["prediction_ssl"] == 1
#     ):
#         return "KB"

#     # Flag kicked out good cases (want less of these)
#     elif (
#         df["target"] == 0
#         and df["prediction_beforeRI_binary"] == 0
#         and df["prediction_ssl"] == 1
#     ):
#         return "KG"

#     # Flag kicked in good cases (want more of these)
#     elif (
#         df["target"] == 0
#         and df["prediction_beforeRI_binary"] == 1
#         and df["prediction_ssl"] == 0
#     ):
#         return "IG"

#     # Flag kicked in bad cases (want less of these)
#     elif (
#         df["target"] == 1
#         and df["prediction_beforeRI_binary"] == 1
#         and df["prediction_ssl"] == 0
#     ):
#         return "IB"

In [60]:
# def kickout(df):

#     # Counts of kickout bad and kickout good
#     counts = df["Flag"].value_counts()
#     if "KB" in df.values:
#         kb = counts.KB  # want more of these
#     else:
#         kb = 0
#     if "KG" in df.values:
#         kg = counts.KG  # want less of these
#     else:
#         kg = 0

#     if "IG" in df.values:
#         ig = counts.IG  # want more of these
#     else:
#         ig = 0

#     if "IB" in df.values:
#         ib = counts.IB  # want less of these
#     else:
#         ib = 0

#     # Counts of number of actual bad cases
#     sb = df[df["target"] == 1].shape[0]
#     sg = df[df["target"] == 0].shape[0]

#     # Target
#     counts_target = df["target"].value_counts()
#     total_bads = counts_target[0]
#     total_goods = counts_target[1]

#     total_bads = df[df["target"] == 1].shape[0]
#     total_goods = df[df["target"] == 0].shape[0]
#     pb = total_bads / (total_bads + total_goods)
#     pg = total_goods / (total_bads + total_goods)

#     # Calculate kickout metric
#     kickout = (((kb / pb) - (kg / (1 - pb))) / sb) * (pb * pb)
#     kickin = (((ig / pg) - (ib / (1 - pg))) / sg) * (pg * pg)
#     weighted_total = kickout + kickin

#     return weighted_total

In [None]:
def flag_df_beforeRI(df):
    """Label one record by comparing its target with the before-RI binary prediction.

    ``df`` is only indexed by key, so any mapping-like record works (e.g. a
    single DataFrame row) as long as it exposes ``"target"`` and
    ``"prediction_beforeRI_binary"``.

    Returns
    -------
    "CB" : target 1, prediction 1  (presumably "correct bad")
    "IB" : target 1, prediction 0  (presumably "incorrect bad")
    "CG" : target 0, prediction 0  (presumably "correct good")
    "IG" : target 0, prediction 1  (presumably "incorrect good")
    None : any other combination of values
    """
    actual = df["target"]

    # Target 1: the prediction either agrees (CB) or misses it (IB).
    if actual == 1:
        predicted = df["prediction_beforeRI_binary"]
        if predicted == 1:
            return "CB"
        if predicted == 0:
            return "IB"
        return None

    # Target 0: the prediction either agrees (CG) or raises a false alarm (IG).
    if actual == 0:
        predicted = df["prediction_beforeRI_binary"]
        if predicted == 0:
            return "CG"
        if predicted == 1:
            return "IG"

    return None

In [None]:
def kickout_beforeRI(df):
    """Combine the CB/IB/CG/IG flag counts of ``df`` into one weighted score.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``"Flag"`` column holding the labels ``"CB"``, ``"IB"``,
        ``"CG"``, ``"IG"`` (presumably produced by applying
        ``flag_df_beforeRI`` row-wise -- confirm against the caller) and a
        ``"target"`` column in which 1 marks a bad and 0 a good case.

    Returns
    -------
    float
        ``kickout + kickin`` with

        * ``kickout = ((CB / pb - IB / pb) / total_bads) * pb ** 2``
        * ``kickin  = ((CG / pg - IG / pg) / total_goods) * pg ** 2``

        where ``pb`` / ``pg`` are the shares of bad / good targets. A class
        that does not occur in ``target`` contributes 0.

    Raises
    ------
    ZeroDivisionError
        If ``df`` contains no row with target 0 or 1.
    """
    flag_counts = df["Flag"].value_counts()
    # Read the counts from the Flag column only. The former `"CB" in df.values`
    # test scanned every column, so a matching string in an unrelated column
    # made the subsequent `counts.CB` lookup fail.
    cb = int(flag_counts.get("CB", 0))  # want more of these
    ib = int(flag_counts.get("IB", 0))  # want less of these
    cg = int(flag_counts.get("CG", 0))  # want more of these
    ig = int(flag_counts.get("IG", 0))  # want less of these

    # Target
    total_bads = int((df["target"] == 1).sum())
    total_goods = int((df["target"] == 0).sum())
    total = total_bads + total_goods
    if total == 0:
        raise ZeroDivisionError("kickout_beforeRI: df has no rows with target 0 or 1")
    pb = total_bads / total
    pg = total_goods / total

    # Without any bad (good) case the corresponding term has nothing to
    # measure; treat it as 0 instead of dividing by zero.
    if total_bads:
        kickout = (((cb / pb) - (ib / pb)) / total_bads) * (pb ** 2)
    else:
        kickout = 0.0
    if total_goods:
        kickin = (((cg / pg) - (ig / pg)) / total_goods) * (pg ** 2)
    else:
        kickin = 0.0
    return kickout + kickin

In [None]:
def flag_df_baseline(df):
    """Label one record by comparing its target with the baseline prediction.

    ``df`` is only indexed by key, so any mapping-like record works (e.g. a
    single DataFrame row) as long as it exposes ``"target"`` and
    ``"prediction_baseline"``.

    Returns
    -------
    "CB" : target 1, prediction 1  (presumably "correct bad")
    "IB" : target 1, prediction 0  (presumably "incorrect bad")
    "CG" : target 0, prediction 0  (presumably "correct good")
    "IG" : target 0, prediction 1  (presumably "incorrect good")
    None : any other combination of values
    """
    actual = df["target"]
    is_bad = actual == 1
    if not (is_bad or actual == 0):
        # Neither class: nothing to flag, the prediction is not even read.
        return None

    # Pick the label pair for this target class, then choose by prediction.
    if is_bad:
        label_if_one, label_if_zero = "CB", "IB"
    else:
        label_if_one, label_if_zero = "IG", "CG"

    predicted = df["prediction_baseline"]
    if predicted == 1:
        return label_if_one
    if predicted == 0:
        return label_if_zero
    return None

In [None]:
def kickout_baseline(df):
    """Combine the CB/IB/CG/IG flag counts of ``df`` into one weighted score.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``"Flag"`` column holding the labels ``"CB"``, ``"IB"``,
        ``"CG"``, ``"IG"`` (presumably produced by applying
        ``flag_df_baseline`` row-wise -- confirm against the caller) and a
        ``"target"`` column in which 1 marks a bad and 0 a good case.

    Returns
    -------
    float
        ``kickout + kickin`` with

        * ``kickout = ((CB / pb - IB / pb) / total_bads) * pb ** 2``
        * ``kickin  = ((CG / pg - IG / pg) / total_goods) * pg ** 2``

        where ``pb`` / ``pg`` are the shares of bad / good targets. A class
        that does not occur in ``target`` contributes 0.

    Raises
    ------
    ZeroDivisionError
        If ``df`` contains no row with target 0 or 1.
    """
    flag_counts = df["Flag"].value_counts()
    # Read the counts from the Flag column only. The former `"CB" in df.values`
    # test scanned every column, so a matching string in an unrelated column
    # made the subsequent `counts.CB` lookup fail.
    cb = int(flag_counts.get("CB", 0))  # want more of these
    ib = int(flag_counts.get("IB", 0))  # want less of these
    cg = int(flag_counts.get("CG", 0))  # want more of these
    ig = int(flag_counts.get("IG", 0))  # want less of these

    # Target
    total_bads = int((df["target"] == 1).sum())
    total_goods = int((df["target"] == 0).sum())
    total = total_bads + total_goods
    if total == 0:
        raise ZeroDivisionError("kickout_baseline: df has no rows with target 0 or 1")
    pb = total_bads / total
    pg = total_goods / total

    # Without any bad (good) case the corresponding term has nothing to
    # measure; treat it as 0 instead of dividing by zero.
    if total_bads:
        kickout = (((cb / pb) - (ib / pb)) / total_bads) * (pb ** 2)
    else:
        kickout = 0.0
    if total_goods:
        kickin = (((cg / pg) - (ig / pg)) / total_goods) * (pg ** 2)
    else:
        kickin = 0.0
    return kickout + kickin

In [None]:
def flag_df_ssl(df):
    """Label one record by comparing its target with the SSL prediction.

    ``df`` is only indexed by key, so any mapping-like record works (e.g. a
    single DataFrame row) as long as it exposes ``"target"`` and
    ``"prediction_ssl"``.

    Returns
    -------
    "CB" : target 1, prediction 1  (presumably "correct bad")
    "IB" : target 1, prediction 0  (presumably "incorrect bad")
    "CG" : target 0, prediction 0  (presumably "correct good")
    "IG" : target 0, prediction 1  (presumably "incorrect good")
    None : any other combination of values
    """
    # (target value, prediction value, label) -- checked in this order.
    outcomes = (
        (1, 1, "CB"),
        (1, 0, "IB"),
        (0, 0, "CG"),
        (0, 1, "IG"),
    )
    for wanted_target, wanted_prediction, label in outcomes:
        if df["target"] == wanted_target and df["prediction_ssl"] == wanted_prediction:
            return label
    return None

In [None]:
def kickout_ssl(df):
    """Combine the CB/IB/CG/IG flag counts of ``df`` into one weighted score.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``"Flag"`` column holding the labels ``"CB"``, ``"IB"``,
        ``"CG"``, ``"IG"`` (presumably produced by applying ``flag_df_ssl``
        row-wise -- confirm against the caller) and a ``"target"`` column in
        which 1 marks a bad and 0 a good case.

    Returns
    -------
    float
        ``kickout + kickin`` with

        * ``kickout = ((CB / pb - IB / pb) / total_bads) * pb ** 2``
        * ``kickin  = ((CG / pg - IG / pg) / total_goods) * pg ** 2``

        where ``pb`` / ``pg`` are the shares of bad / good targets. A class
        that does not occur in ``target`` contributes 0.

    Raises
    ------
    ZeroDivisionError
        If ``df`` contains no row with target 0 or 1.
    """
    flag_counts = df["Flag"].value_counts()
    # Read the counts from the Flag column only. The former `"CB" in df.values`
    # test scanned every column, so a matching string in an unrelated column
    # made the subsequent `counts.CB` lookup fail.
    cb = int(flag_counts.get("CB", 0))  # want more of these
    ib = int(flag_counts.get("IB", 0))  # want less of these
    cg = int(flag_counts.get("CG", 0))  # want more of these
    ig = int(flag_counts.get("IG", 0))  # want less of these

    # Target
    total_bads = int((df["target"] == 1).sum())
    total_goods = int((df["target"] == 0).sum())
    total = total_bads + total_goods
    if total == 0:
        raise ZeroDivisionError("kickout_ssl: df has no rows with target 0 or 1")
    pb = total_bads / total
    pg = total_goods / total

    # Without any bad (good) case the corresponding term has nothing to
    # measure; treat it as 0 instead of dividing by zero.
    if total_bads:
        kickout = (((cb / pb) - (ib / pb)) / total_bads) * (pb ** 2)
    else:
        kickout = 0.0
    if total_goods:
        kickin = (((cg / pg) - (ig / pg)) / total_goods) * (pg ** 2)
    else:
        kickin = 0.0
    return kickout + kickin