In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import sys
import warnings

warnings.filterwarnings("ignore")

# Basic Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import *

# Modelling
# Classification
import statsmodels.api as sm

# from sklearn import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import f1_score, log_loss, matthews_corrcoef, roc_auc_score

# Visualization
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [3]:
def data_preprocessing(
    df,
    accepted_flag,
    target,
    train_ratio,
    data_dir="C:/Users/Asus/Desktop/Repo/MasterThesis_RI/02_Data/",
    random_state=42,
):
    """
    Load the original dataset, split it into accepts and rejects and split the
    rejects into a development (train) and a test part, each of them once with
    and once without the label.

    Parameters
    ----------

    df : file name of the original dataset (csv format), resolved against ``data_dir``
    accepted_flag : name of the accepted flag; Binary: 1 if accepted, 0 if rejected
    target : name of the target column
    train_ratio : share of the rejects used for training; Continuous (0,1)
    data_dir : directory that contains the csv file; defaults to the location
        that used to be hard-coded so existing calls keep working
    random_state : seed used to shuffle the rejects before the train / test split

    Return
    ------
    The six dataframes are returned in exactly this order:

    dfr_dev_with_label : rejected training data with label (label column named "target")
    dfr_test_with_label : rejected testing data with label (label column named "target")
    dfa : accepted data (label column named "target")
    dfr : all rejected data, unshuffled, label column still under its original name
    dfr_dev2 : rejected training data without label
    dfr_test2 : rejected testing data without label

    """
    # Local import keeps this cell self-contained (the import cell does not provide os)
    import os

    # Load data
    data = pd.read_csv(os.path.join(data_dir, df))

    # Accepted

    ## Create separate dataset with accepts and rename the target variable as "target"
    dfa = (
        data[data[accepted_flag] == 1]
        .drop([accepted_flag], axis=1)
        .rename(columns={target: "target"})
    )

    # Rejected

    ## Create separate dataset with rejects
    dfr = data[data[accepted_flag] == 0].drop([accepted_flag], axis=1)

    ## Train/Test Split
    ### Shuffle the dataset
    shuffle_df = dfr.sample(frac=1, random_state=random_state)
    ### Define a size for the train set
    train_size = int(train_ratio * len(shuffle_df))
    ### Split the dataset by position
    dfr_dev = shuffle_df.iloc[:train_size]
    dfr_test = shuffle_df.iloc[train_size:]

    ## Unlabelled copies of the rejects (i.e. target dropped)
    dfr_dev2 = dfr_dev.drop([target], axis=1)
    dfr_test2 = dfr_test.drop([target], axis=1)

    ## Labelled copies of the rejects with the target renamed to "target"
    dfr_dev_with_label = dfr_dev.rename(columns={target: "target"})
    dfr_test_with_label = dfr_test.rename(columns={target: "target"})

    return dfr_dev_with_label, dfr_test_with_label, dfa, dfr, dfr_dev2, dfr_test2

<IPython.core.display.Javascript object>

In [4]:
def create_X_y(data):
    """
    Separate a dataset into its modelling fields and its label column.

    Parameters
    ----------
    data : Dataframe containing a column named "target"

    Return
    ------
    X : every column except "target"; Dataframe
    y : only the "target" column; Dataframe

    """
    # Boolean mask over the columns marking the label column
    is_target = data.columns == "target"

    features = data.loc[:, ~is_target]
    labels = data.loc[:, is_target]

    return features, labels

<IPython.core.display.Javascript object>

In [5]:
def split(X, y, test_size=0.2, random_state=42):
    """
    Split the data into training and testing sample

    Parameters
    ----------
    X : data; Dataframe
    y : labels
    test_size : share of the observations held out for testing (default 0.2)
    random_state : seed passed to train_test_split (default 42)

    Return
    ------
    X_train : training modelling fields
    X_test : test modelling fields
    y_train : training labels
    y_test : testing labels

    """
    # Train-Test Split on the data that was passed in. The previous version
    # ignored its arguments and silently split the globals X_res / y_res.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    columns = X_train.columns

    # Make sure the training parts are dataframes with the expected column names
    X_train = pd.DataFrame(data=X_train, columns=columns)
    y_train = pd.DataFrame(data=y_train, columns=["target"])

    return X_train, X_test, y_train, y_test

<IPython.core.display.Javascript object>

In [6]:
def train_ssl():
    """
    Self-training loop: repeatedly fit a random forest, pseudo-label the rejects
    the model is most certain about and move them into the training data.

    Reads X_train, y_train, r_dev and significant_columns from the notebook's
    global namespace (none of them is modified; working copies are used).

    The loop runs while at least 70% of the original rejects are still
    unlabelled, or until an iteration yields no certain prediction.

    Return
    ------
    KGB_new : the random forest fitted in the last executed iteration, i.e.
        before the pseudo-labels selected in that iteration were appended
    """
    # Make copies of the dataframes
    X_train_iter = X_train.copy()
    y_train_iter = y_train.copy()
    r_dev_iter = r_dev.copy()

    iteration = 0
    while 0.7 * len(r_dev) <= len(r_dev_iter):
        iteration = iteration + 1
        print("Iteration Nr {}".format(iteration))
        print(len(r_dev))
        # Build random forest on the current (accepts + pseudo-labelled rejects) data
        KGB_new = RandomForestClassifier(random_state=42).fit(
            X_train_iter, y_train_iter
        )

        # Make predictions on the remaining rejected data
        pred = pd.DataFrame(
            data=KGB_new.predict_proba(r_dev_iter)[:, 1],
            columns=["target"],
            index=r_dev_iter.index.copy(),
        )

        # Choose the most certain predictions: strictly outside the 5% / 95% quantiles
        scores = pred["target"]
        lq = scores.quantile(q=0.05)
        uq = scores.quantile(q=0.95)
        pred["certain"] = ((scores < lq) | (scores > uq)).astype("int64")

        # If PD is high, apply default status
        pred["target"] = (scores > uq).astype("int64")

        # Pick only the certain predictions
        certain = pred[pred["certain"] == 1]
        if certain.empty:
            # Tied scores can leave nothing strictly outside the quantiles; the
            # reject pool would then never shrink and the loop would never end.
            print("No certain predictions left, stopping.")
            break

        # Y TRAIN
        y_train_iter = pd.concat((y_train_iter, certain[["target"]]))

        # X TRAIN: significant columns of the certain rejects, selected by index
        certain_features = r_dev_iter.loc[certain.index, significant_columns]
        X_train_iter = pd.concat((X_train_iter, certain_features))

        # Remove the pseudo-labelled rows from the rejected data
        r_dev_iter = r_dev_iter.drop(certain.index, axis="index")

        print(len(r_dev_iter))
    return KGB_new

<IPython.core.display.Javascript object>

In [7]:
def train_one_iter():
    """
    Fit a single random forest on the accepted training data.

    Reads X_train and y_train from the notebook's global namespace.

    Return
    ------
    KGB1 : fitted RandomForestClassifier (random_state=42)
    """
    # The rejects are not needed here, so the former unused copy of r_dev (and
    # with it the dependency on that global) is gone. The training data is
    # passed to fit directly; the former defensive copies were never modified.
    KGB1 = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    return KGB1

<IPython.core.display.Javascript object>

In [8]:
def predict_rejects(model, r_dev):
    """
    Pseudo-label the rejects with an existing model and refit on accepts + rejects.

    Reads X_train, y_train and conservative_dr from the notebook's global namespace.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba
    r_dev : rejected training data without label; Dataframe

    Return
    ------
    KGB_baseline_new : RandomForestClassifier (random_state=42) fitted on the
        training data extended by the pseudo-labelled rejects
    """

    def _to_class(score):
        # Scores below the cutoff become 0, everything else becomes 1
        return 0 if (score < cutoff) else 1

    # Probability of class 1 for every reject, keyed by the reject's index
    scores = pd.Series(
        model.predict_proba(r_dev)[:, 1],
        index=r_dev.index.copy(),
        name="pred",
    )

    # Binary labels based on the (1 - conservative_dr) quantile of the scores
    cutoff = scores.quantile(q=1 - conservative_dr)
    pseudo_labels = scores.apply(_to_class).to_frame(name="target")

    # Append the pseudo-labelled rejects to the training data
    labels_extended = pd.concat((y_train, pseudo_labels))
    features_extended = pd.concat((X_train, r_dev))

    # Fit new model
    return RandomForestClassifier(random_state=42).fit(
        features_extended, labels_extended
    )

<IPython.core.display.Javascript object>

In [9]:
def evaluate_test_accepts(model, X_test):
    """
    Score the test accepts and attach the true label and a binary prediction.

    Reads y_test and conservative_dr from the notebook's global namespace.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba
    X_test : test modelling fields of the accepts; Dataframe

    Return
    ------
    Dataframe with the columns "pred" (probability of class 1), "target"
    (label taken from y_test, joined on the index) and "prediction_baseline"
    (0 below the cutoff, 1 otherwise)
    """

    def _to_class(score):
        # Scores below the cutoff become 0, everything else becomes 1
        return 0 if (score < cutoff) else 1

    # Probability of class 1, keyed by the index of X_test
    scores = pd.Series(
        model.predict_proba(X_test)[:, 1],
        index=X_test.index.copy(),
        name="pred",
    )

    # Keep only the observations that have a label (inner join on the index)
    scored = pd.merge(
        scores,
        y_test["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Binary predictions: the cutoff is the (1 - conservative_dr) quantile of the scores
    cutoff = scored["pred"].quantile(q=1 - conservative_dr)
    scored["prediction_baseline"] = scored["pred"].apply(_to_class)
    return scored

<IPython.core.display.Javascript object>

In [10]:
def evaluate_test_rejects(model, r_test):
    """
    Score the test rejects and attach the true label and a binary prediction.

    Reads dfr_test_with_label and conservative_dr from the notebook's global
    namespace.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba
    r_test : rejected testing data without label; Dataframe

    Return
    ------
    Dataframe with the columns "pred" (probability of class 1), "target"
    (label taken from dfr_test_with_label, joined on the index) and
    "prediction_baseline" (0 below the cutoff, 1 otherwise)
    """

    def _to_class(score):
        # Scores below the cutoff become 0, everything else becomes 1
        return 0 if (score < cutoff) else 1

    # Probability of class 1, keyed by the index of r_test
    scores = pd.Series(
        model.predict_proba(r_test)[:, 1],
        index=r_test.index.copy(),
        name="pred",
    )

    # Keep only the observations that have a label (inner join on the index)
    scored = pd.merge(
        scores,
        dfr_test_with_label["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Binary predictions: the cutoff is the (1 - conservative_dr) quantile of the scores
    cutoff = scored["pred"].quantile(q=1 - conservative_dr)
    scored["prediction_baseline"] = scored["pred"].apply(_to_class)
    return scored

<IPython.core.display.Javascript object>

In [11]:
def evaluate_test_combined(model, X_test, r_test):
    """
    Score test accepts and test rejects together and attach the true label and
    a binary prediction.

    Reads y_test, dfr_test_with_label and conservative_dr from the notebook's
    global namespace.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba
    X_test : test modelling fields of the accepts; Dataframe
    r_test : rejected testing data without label; Dataframe

    Return
    ------
    Dataframe (accepts first, then rejects) with the columns "pred"
    (probability of class 1), "target" (true label) and "prediction_baseline"
    (0 below the cutoff, 1 otherwise)
    """
    # Concatenate Test Accepts and Test Rejects and score them in one go
    ta_tr = pd.concat([X_test, r_test], axis=0)
    scores = model.predict_proba(ta_tr)[:, 1]
    n_accepts = len(X_test)

    accept_scores = pd.Series(
        scores[:n_accepts], index=X_test.index.copy(), name="pred"
    )
    reject_scores = pd.Series(
        scores[n_accepts:], index=r_test.index.copy(), name="pred"
    )

    # Attach the labels per population BEFORE concatenating. Joining after the
    # concatenation pairs an accept's score with a reject's label (and vice
    # versa) whenever both populations share an index label.
    accepts_scored = pd.merge(
        accept_scores,
        y_test["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )
    rejects_scored = pd.merge(
        reject_scores,
        dfr_test_with_label["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )
    pred_test2 = pd.concat([accepts_scored, rejects_scored], axis=0)

    # Binary predictions: the cutoff is the (1 - conservative_dr) quantile of
    # the combined scores
    q1 = pred_test2["pred"].quantile(q=1 - conservative_dr)
    pred_test2["prediction_baseline"] = (pred_test2["pred"] >= q1).astype("int64")
    return pred_test2

<IPython.core.display.Javascript object>

In [12]:
def flag_df_baseline(df):
    """
    Classify one observation by comparing its label with its binary prediction.

    Parameters
    ----------
    df : a single row (anything indexable by "target" and "prediction_baseline")

    Return
    ------
    "CB" : target 1, prediction 1
    "IB" : target 1, prediction 0
    "CG" : target 0, prediction 0
    "IG" : target 0, prediction 1
    None : any other combination of values
    """
    actual = df["target"]

    # Actual bads
    if actual == 1:
        predicted = df["prediction_baseline"]
        if predicted == 1:
            return "CB"
        if predicted == 0:
            return "IB"

    # Actual goods
    if actual == 0:
        predicted = df["prediction_baseline"]
        if predicted == 0:
            return "CG"
        if predicted == 1:
            return "IG"

    return None

<IPython.core.display.Javascript object>

In [13]:
def kickout_baseline(df):
    """
    Weighted kickout / kickin score computed from the flagged predictions.

    Parameters
    ----------
    df : Dataframe with the columns "Flag" (values "CB", "IB", "CG", "IG")
        and "target" (1 = bad, 0 = good)

    Return
    ------
    weighted_total : kickout + kickin, where
        kickout = (((CB / pb) - (IB / pb)) / total_bads) * pb ** 2
        kickin = (((CG / pg) - (IG / pg)) / total_goods) * pg ** 2
        and pb / pg are the shares of bads / goods. A term whose population is
        empty contributes 0, and an input without bads and goods returns 0.0.
    """
    # Counts per flag. Only the "Flag" column is consulted: a label that merely
    # shows up in some other column must not be looked up in the counts.
    counts = df["Flag"].value_counts()
    cb = counts.get("CB", 0)  # want more of these
    ib = counts.get("IB", 0)  # want less of these
    cg = counts.get("CG", 0)  # want more of these
    ig = counts.get("IG", 0)  # want less of these

    # Target
    total_bads = df[df["target"] == 1].shape[0]
    total_goods = df[df["target"] == 0].shape[0]
    total = total_bads + total_goods
    if total == 0:
        # Nothing to evaluate; avoids dividing by zero below
        return 0.0
    pb = total_bads / total
    pg = total_goods / total

    # Each term divides by its own population size and share, so it is only
    # defined when that population is non-empty
    if total_bads:
        kickout = (((cb / pb) - (ib / pb)) / total_bads) * (pb ** 2)
    else:
        kickout = 0.0
    if total_goods:
        kickin = (((cg / pg) - (ig / pg)) / total_goods) * (pg ** 2)
    else:
        kickin = 0.0

    weighted_total = kickout + kickin
    return weighted_total

<IPython.core.display.Javascript object>

In [17]:
def standard_evaluation(pred_data):
    """
    Compute F1 score and AUC of the binary predictions.

    Parameters
    ----------
    pred_data : Dataframe with the columns "target" (true label) and
        "prediction_baseline" (binary prediction)

    Return
    ------
    f1 : F1 score of "prediction_baseline" against "target"
    auc : ROC AUC of "prediction_baseline" against "target"
    """
    # The former concatenation of the globals X_test and r_test was never used;
    # it only made this function fail when those globals were missing.
    f1 = f1_score(pred_data["target"], pred_data["prediction_baseline"])
    # NOTE(review): the AUC is computed on the thresholded 0/1 predictions, not
    # on a continuous score column. Kept as is so reported numbers do not
    # change; confirm this is intended.
    auc = roc_auc_score(pred_data["target"], pred_data["prediction_baseline"])
    return f1, auc

<IPython.core.display.Javascript object>

In [16]:
def castToList(x):
    """
    Cast x to a list.

    A list is returned unchanged (same object), a string becomes a
    one-element list, any other iterable is expanded with list() and a
    non-iterable value is wrapped in a one-element list.
    """
    # Strings are iterable but must stay in one piece
    if isinstance(x, str):
        return [x]
    if isinstance(x, list):
        return x

    try:
        as_list = list(x)
    except TypeError:
        # Not iterable: wrap the value itself
        as_list = [x]
    return as_list


<IPython.core.display.Javascript object>