In [184]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [185]:
import sys
import warnings

warnings.filterwarnings("ignore")

# Basic Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Modelling
# Classification
import statsmodels.api as sm

# from sklearn import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Model Selection
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import f1_score, log_loss, matthews_corrcoef, roc_auc_score

# Visualization
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Preprocessing

First, basic data preprocessing to obtain accepted and rejected training and test samples separately. Save rejected data in two versions: with and without lables. The rejected data without labels is needed for the semi-supervised model. The rejected data without labels is needed to perform evaluation. 

In [186]:
def data_preprocessing(df, accepted_flag, target, train_ratio):
    """
    The goal of this function is to load the original dataset, split it into accepts and rejects,
    add ids, which can later be used for merging. For the rejects to further perform train / test split

    Parameters
    ----------

    df : name of the original dataset in quotation marks, csv format
    accepted_flag: name of the accepted flag; Binary: 1 if accepted, 0 if rejected
    target : name of the target column
    train_ratio : percentage used for training; Continuous (0,1)

    Return
    ------
    a : accepted data
    r : rejected data
    r_dev : rejected trainining data without label
    r_test : rejected testing data without label
    dfr_dev_with_label: rejected training data with label
    dft_test_with_label: rejected training data with label

    """
    # Load data
    data = pd.read_csv("C:/Users/Asus/Desktop/Repo/MasterThesis_RI/Sample_18_06/" + df)

    # Accepted

    ## Create separate dataset with accepts
    dfa = data[data[accepted_flag] == 1]
    dfa = dfa.drop([accepted_flag], axis=1)
    ## Rename target variable as "target"
    dfa = dfa.rename(columns={target: "target"})
    ## Add id to the dataset, which can later be used for merging
    # dfa["id"] = dfa.index.to_series().map(lambda x: uuid.uuid4())

    # Rejected

    ## Create separate dataset with accepts
    dfr = data[data[accepted_flag] == 0]
    dfr = dfr.drop([accepted_flag], axis=1)
    ## Add id to the dataset, which can later be used for merging
    #     dfr["id"] = dfr.index.to_series().map(lambda x: uuid.uuid4())
    ## Train/Test Split (without labels)
    ### Shuffle the dataset
    shuffle_df = dfr.sample(frac=1, random_state=42)
    ### Define a size for the train set
    train_size = int(train_ratio * len(shuffle_df))
    ### Split the dataset
    dfr_dev = shuffle_df[:train_size]
    dfr_test = shuffle_df[train_size:]
    ## Save a copy of the rejected data with label
    dfr_dev_with_label = dfr_dev
    dfr_test_with_label = dfr_test
    ## Unlabel the rejects (i.e. drop the target) and save a copy of the rejeted data without label
    dfr_dev2 = dfr_dev_with_label.drop([target], axis=1)
    dfr_test2 = dfr_test_with_label.drop([target], axis=1)
    # Rename target variable
    dfr_dev_with_label = dfr_dev_with_label.rename(columns={target: "target"})
    dfr_test_with_label = dfr_test_with_label.rename(columns={target: "target"})

    return dfr_dev_with_label, dfr_test_with_label, dfa, dfr, dfr_dev2, dfr_test2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [187]:
dfr_dev_with_label, dfr_test_with_label, a, r, r_dev, r_test = data_preprocessing(
    "linear_10dr_20rr.csv", "is_selected", "y", 0.8
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The below two functions continue the data preprocessing. Used to create feature and target data and to split into train and test samples.

In [188]:
def create_X_y(data):
    """
    Undersample the data

    Parameters
    ----------
    data : Dataframe

    Return
    ------
    X_res : undersampled data; Dataframe
    y_res : undersampled labels; Dataframe

    """
    # Create X and y
    X = data.loc[:, data.columns != "target"]
    y = data.loc[:, data.columns == "target"]

    return X, y

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [189]:
def split(X, y):
    """
    Split the data into training and testing sample

    Parameters
    ----------
    X : data
    y : labels

    Return
    ------
    X_train : training modelling fields
    X_test : test modelling fields
    y_train : training labels
    y_test : testing labels

    """
    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.2, random_state=42
    )
    columns = X_train.columns

    # Columns
    X_train = pd.DataFrame(data=X_train, columns=columns)
    y_train = pd.DataFrame(data=y_train, columns=["target"])

    return X_train, X_test, y_train, y_test

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [190]:
X_res, y_res = create_X_y(a)
X_train, X_test, y_train, y_test = split(X_res, y_res)
dfr_test_with_label_X = dfr_test_with_label.loc[:, dfr_test_with_label.columns != "y"]
dfr_test_with_label_y = dfr_test_with_label.loc[:, dfr_test_with_label.columns == "y"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Feature Selection

First, we select the features that will end up in the model. The selection of columns below is subject to iteration based on the modelling outcomes from the logistic regression, i.e. significance (p-values).

In [191]:
significant_columns = ["col_0", "col_1"]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [192]:
X_train2 = X_train.copy()
y_train2 = y_train.copy()
X_test2 = X_test.copy()
y_test2 = y_test.copy()
r_dev2 = r_dev.copy()
r_test2 = r_test.copy()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [193]:
# Primary datasets
X_train = X_train[significant_columns]
X_test = X_test[significant_columns]
r_dev = r_dev[significant_columns]
r_test = r_test[significant_columns]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [194]:
# Build Logistic regression
# Statmodels
X_in = sm.add_constant(X_train.astype(float))
logit_model = sm.Logit(y_train, X_in)
result3 = logit_model.fit()
print(result3.summary2())

Optimization terminated successfully.
         Current function value: 0.122138
         Iterations 10
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.422    
Dependent Variable: target           AIC:              3132.7362
Date:               2021-06-20 16:19 BIC:              3155.1078
No. Observations:   12800            Log-Likelihood:   -1563.4  
Df Model:           2                LL-Null:          -2706.2  
Df Residuals:       12797            LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     10.0000                                     
------------------------------------------------------------------
         Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
------------------------------------------------------------------
const   -7.9963     0.3191   -25.0550   0.0000   -8.6218   -7.3708
col_0   -5.4237     0.2767   -19.6020   0.0000   -5.9660   -4.8814
co

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [195]:
# Calculate Default Rates
dr = len(y_test[y_test["target"] == 1]) / (
    len(y_test[y_test["target"] == 1]) + len(y_test[y_test["target"] == 0])
)
conservative_dr = (
    1.25
    * len(y_test[y_test["target"] == 1])
    / (len(y_test[y_test["target"] == 1]) + len(y_test[y_test["target"] == 0]))
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Isolation Forest

For rejects, another step of data preporcessing is applied via Isolation Forest model. The goal is to remove outliers. The isolation forest is trained on all accepts and is used to evaluate the similarity of the rejects. Then the rejects that are found to be the most and least similar to the accepts are dropped. The contaimination parameter determines how many observations are excluded.

In [196]:
# def isolation_forest(X_train, r_dev, r_test):
#     """
#     The goal of this function is to filter the outliers from the rejected sample.

#     Parameters
#     ----------
#     X_train: accepts training data; Dataframe
#     r_dev_mod: rejects modelling data prior outlier treatment; Dataframe
#     r_test_mod: rejects testinf data prior outlier treatment; Dataframe

#     Return
#     ------
#     r_dev_mod: rejects modelling data post outlier treatment; Dataframe
#     r_test_mod: rejects training data prior outlier treatment; Dataframe

#     """

#     # Build Isolation forest model
#     isf = IsolationForest(
#         n_estimators=50,
#         max_samples="auto",
#         contamination=float(0.005),
#         max_features=1.0,
#     )
#     isf.fit(X_train)
#     rej_isf = isf.predict(r_dev)
#     # Add scores and anomaly columns to rejected train
#     r_dev["scores"] = isf.decision_function(r_dev)
#     r_dev["anomaly"] = isf.predict(r_dev[significant_columns])
#     # Print number of non-outliers and outliers
#     print("Rejected Train. Number of non-outliers is:", np.sum(r_dev["anomaly"] == 1))
#     print("Rejected Train. Number of outliers is:", np.sum(r_dev["anomaly"] == -1))
#     # Drop all outliers
#     r_dev = r_dev[r_dev.anomaly != -1]
#     # Delete columns related to the outliers
#     r_dev = r_dev[significant_columns]

#     # Add scores and anomaly columns to rejected test
#     r_test["scores"] = isf.decision_function(r_test)
#     r_test["anomaly"] = isf.predict(r_test[significant_columns])
#     # Print number of non-outliers and outliers
#     print("Rejected Test. Number of non-outliers is:", np.sum(r_test["anomaly"] == 1))
#     print("Rejected Test. Number of outliers is:", np.sum(r_test["anomaly"] == -1))
#     # Drop all outliers
#     r_test = r_test[r_test.anomaly != -1]
#     # Delete columns related to the outliers
#     r_test = r_test[significant_columns]

#     return r_dev, r_test

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [197]:
# r_dev, r_test = isolation_forest(X_train, r_dev, r_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Self Training for the Most Certain examples with %DR as stopping criterion

In [198]:
X_train_iter = X_train.copy()
y_train_iter = y_train.copy()
r_dev_iter = r_dev.copy()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [199]:
f1_scores = []
log_losses = []
mccs = []
iterations = []

iteration = 0
while 0.7 * len(r_dev) < len(r_dev_iter):
    iteration = iteration + 1
    print("Iteration Nr {}".format(iteration))
    print(len(r_dev))
    # Build logistic regression
    KGB_new = RandomForestClassifier(random_state=42).fit(X_train_iter, y_train_iter)
    # KGB_new = LogisticRegression(fit_intercept=False, penalty="none").fit(X_train_iter, y_train_iter)

    # Scores
    mcc = matthews_corrcoef(y_test, KGB_new.predict(X_test))
    mccs.append(mcc)

    print("MCC: ", mcc)

    # Make predictions on the rejected data
    pred = KGB_new.predict_proba(r_dev_iter)[:, 1]
    pred = pd.DataFrame(
        data=pred,
        columns=["target"],
        index=r_dev_iter.index.copy(),
    )

    # Choose the most certain predictions
    lq = pred["target"].quantile(q=0.15)
    uq = pred["target"].quantile(q=0.95)
    pred["certain"] = pred["target"].apply(lambda x: 1 if (x < lq or x > uq) else 0)

    # If PD is high, apply default status
    pred["target"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)
    # pred["target"] = pred["target"].apply(lambda x: 1 if (x > np.random.uniform()) else 0)

    # If PD is low, be conservative and apply non-default status only to some examples
    #     pred["target"] = pred["target"].apply(
    #         lambda x: 0 if (x < np.random.uniform()) else 1
    #     )

    # Pick only the certain predictions and concatenate them to the dev set
    # Y TRAIN
    certain = pred[pred["certain"] == 1]
    certain2 = certain["target"].to_frame()
    y_train_iter = pd.concat((y_train_iter, certain2))

    # print(len(certain))

    # Get significant columns of the rejects based on index
    certain_features = pd.merge(
        certain["target"],
        r_dev_iter[significant_columns],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # X TRAIN
    certain_features = certain_features.loc[:, certain_features.columns != "target"]
    X_train_iter = pd.concat((X_train_iter, certain_features))

    # Remove certain columns from rejected data
    rows = certain_features.index
    r_dev_iter = r_dev_iter.drop(rows, axis="index")

    print(len(r_dev_iter))

Iteration Nr 1
3200
MCC:  0.2797647164110619
3041
Iteration Nr 2
3200
MCC:  0.26297930390656493
2449
Iteration Nr 3
3200
MCC:  0.3018988042051179
1973


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Do only 1 iteration as baseline Model

In [200]:
# Create copies of the data that can be overwritten in the function below
X_train_iter1 = X_train.copy()
y_train_iter1 = y_train.copy()
r_dev_iter1 = r_dev.copy()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [201]:
def predict_rejects(model, r_dev):
    # Make predictions on the Train Rejects
    pred_test = model.predict_proba(r_dev)[:, 1]
    # pred_test = model.predict(r_dev)
    pred_test = pd.DataFrame(
        data=pred_test,
        columns=["pred"],
        index=r_dev.index.copy(),
    )

    # Make binary predictions based on cutoff DR
    q1 = pred_test["pred"].quantile(q=1 - conservative_dr)
    pred_test["target"] = pred_test["pred"].apply(lambda x: 0 if (x < q1) else 1)
    pred_test = pred_test["target"].to_frame()

    # Add new rows to df
    y_train_new = pd.concat((y_train, pred_test))
    X_train_new = pd.concat((X_train, r_dev))

    # Fit new model
    # KGB_baseline_new = LogisticRegression(fit_intercept=False, penalty="none").fit(X_train_new, y_train_new)
    # KGB_baseline_new = LinearRegression().fit(X_train_new, y_train_new)
    KGB_baseline_new = RandomForestClassifier(random_state=42).fit(
        X_train_new, y_train_new
    )
    # KGB_baseline_new = DecisionTreeClassifier().fit(X_train_iter, y_train_iter)
    return KGB_baseline_new

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [202]:
def evaluate_test_accepts(model, X_test):
    pred_test = model.predict_proba(X_test)[:, 1]
    # pred_test = model.predict(X_test)
    pred_test = pd.DataFrame(
        data=pred_test,
        columns=["pred"],
        index=X_test.index.copy(),
    )

    # Merge with Target
    pred_test2 = pd.merge(
        pred_test["pred"],
        y_test["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Make binary predictions based on cutoff 50percentile of the distribution
    q1 = pred_test2["pred"].quantile(q=1 - conservative_dr)
    pred_test2["prediction_baseline"] = pred_test2["pred"].apply(
        lambda x: 0 if (x < q1) else 1
    )
    return pred_test2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [203]:
def evaluate_test_rejects(model, r_test):
    pred_test = model.predict_proba(r_test)[:, 1]
    # pred_test = model.predict(r_test)
    pred_test = pd.DataFrame(
        data=pred_test,
        columns=["pred"],
        index=r_test.index.copy(),
    )

    # Merge with Target
    pred_test2 = pd.merge(
        pred_test["pred"],
        dfr_test_with_label["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Make binary predictions based on cutoff 50percentile of the distribution
    q1 = pred_test2["pred"].quantile(q=1 - conservative_dr)
    pred_test2["prediction_baseline"] = pred_test2["pred"].apply(
        lambda x: 0 if (x < q1) else 1
    )
    return pred_test2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [204]:
def evaluate_test_combined(model, X_test, r_test):
    # Attach target to X_test and r_test

    r_test_target = pd.merge(
        r_test,
        dfr_test_with_label["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    X_test_target = pd.merge(
        X_test,
        y_test["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Concatenate labels
    ta_tr_labels = pd.concat([X_test_target, r_test_target], axis=0)

    # Concatenate Test Accepts and Test Rejects
    ta_tr = pd.concat([X_test, r_test], axis=0)

    pred_test = model.predict_proba(ta_tr)[:, 1]
    # pred_test = model.predict(r_test)
    pred_test = pd.DataFrame(
        data=pred_test,
        columns=["pred"],
        index=ta_tr.index.copy(),
    )

    # Merge with Target
    pred_test2 = pd.merge(
        pred_test["pred"],
        ta_tr_labels["target"],
        how="inner",
        left_index=True,
        right_index=True,
    )

    # Make binary predictions based on cutoff 50percentile of the distribution
    q1 = pred_test2["pred"].quantile(q=1 - conservative_dr)
    pred_test2["prediction_baseline"] = pred_test2["pred"].apply(
        lambda x: 0 if (x < q1) else 1
    )
    return pred_test2

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [205]:
def flag_df_baseline(df):

    # Flag kicked out bad cases (want more of these)
    if df["target"] == 1 and df["prediction_baseline"] == 1:
        return "CB"

    # Flag kicked out good cases (want less of these)
    elif df["target"] == 1 and df["prediction_baseline"] == 0:
        return "IB"

    # Flag kicked in good cases (want more of these)
    elif df["target"] == 0 and df["prediction_baseline"] == 0:
        return "CG"

    # Flag kicked in bad cases (want less of these)
    elif df["target"] == 0 and df["prediction_baseline"] == 1:
        return "IG"

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [206]:
def kickout_baseline(df):

    # Counts of kickout bad and kickout good
    counts = df["Flag"].value_counts()
    if "CB" in df.values:
        cb = counts.CB  # want more of these
    else:
        cb = 0
    if "IB" in df.values:
        ib = counts.IB  # want less of these
    else:
        ib = 0

    if "CG" in df.values:
        cg = counts.CG  # want more of these
    else:
        cg = 0

    if "IG" in df.values:
        ig = counts.IG  # want less of these
    else:
        ig = 0

    # Target
    total_bads = df[df["target"] == 1].shape[0]
    total_goods = df[df["target"] == 0].shape[0]
    pb = total_bads / (total_bads + total_goods)
    pg = total_goods / (total_bads + total_goods)

    kickout = (((cb / pb) - (ib / pb)) / total_bads) * (pb ** 2)
    kickin = (((cg / pg) - (ig / pg)) / total_goods) * (pg ** 2)
    weighted_total = kickout + kickin
    return weighted_total

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [236]:
def standard_evaluation(pred_data):
    # Concatenate X_test and r_test
    ta_tr = pd.concat([X_test, r_test], axis=0)
    # Make predictions
    f1 = f1_score(pred_data["target"], pred_data["prediction_baseline"])
    auc = roc_auc_score(pred_data["target"], pred_data["prediction_baseline"])
    return f1, auc

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 1. Prediction before RI

Step 1: KGB Model ($m_{1}$)  <br>
Step 2: Score Test Accepts and Test Rejects

In [248]:
pred_test_a = evaluate_test_accepts(KGB1, X_test)
pred_test_r = evaluate_test_rejects(KGB1, r_test)
pred_test_combined = evaluate_test_combined(KGB1, X_test, r_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [249]:
pred_test_a["Flag"] = pred_test_a.apply(flag_df_baseline, axis=1)
# predictions_accepts_beforeRI = [round(kickout_baseline(pred_test_a).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [250]:
pred_test_r["Flag"] = pred_test_r.apply(flag_df_baseline, axis=1)
predictions_rejects_beforeRI = [round(kickout_baseline(pred_test_r).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [251]:
pred_test_combined["Flag"] = pred_test_combined.apply(flag_df_baseline, axis=1)
predictions_combined_beforeRI = [
    round(kickout_baseline(pred_test_combined).tolist(), 3)
]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [252]:
predictions_accepts_beforeRI

[0.865]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [253]:
f1, auc = standard_evaluation(pred_test_combined)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [254]:
auc

0.6728601109222342

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 2. Predicions Baseline

Step 1: KGB Model ($m_{1}$)  <br>
Step 2: Infer status of each reject <br> 
Step 3: Redevelop KGB mdoel with inferred rejects ($m_{2}$)  <br>
Step 4: Score Test Accepts and Test Rejects

In [255]:
KGB_baseline_new = predict_rejects(KGB1, r_dev)
pred_test_a = evaluate_test_accepts(KGB_baseline_new, X_test)
pred_test_r = evaluate_test_rejects(KGB_baseline_new, r_test)
pred_test_combined = evaluate_test_combined(KGB_baseline_new, X_test, r_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [256]:
pred_test_a["Flag"] = pred_test_a.apply(flag_df_baseline, axis=1)
predictions_accepts_base = [round(kickout_baseline(pred_test_a).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [257]:
pred_test_r["Flag"] = pred_test_r.apply(flag_df_baseline, axis=1)
predictions_rejects_base = [round(kickout_baseline(pred_test_r).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [258]:
pred_test_combined["Flag"] = pred_test_combined.apply(flag_df_baseline, axis=1)
predictions_combined_base = [round(kickout_baseline(pred_test_combined).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
predictions_accepts_base

[0.9]

<IPython.core.display.Javascript object>

In [259]:
f1, auc = standard_evaluation(pred_test_combined)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [260]:
auc

0.6229914661996256

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 3. KGB model of best iteration

Step 1: KGB Model ($m_{1}$)  <br>
Step 2: Infer status of each reject <br> 
Step 3: Choose the most certain predictions (0.05q and 0.95q of the predicted probailities) <br>
Step 4: Add the most certain predictions to the training sample
Step 5: Redevelop KGB Model ($m_{2}$)  <br>
Step 6: Repeat Step 5 until convergence - best F1 score ($m_{i}$)  <br> 
Step 7: Score Test Accepts and Test Rejects

In [261]:
pred_test_a = evaluate_test_accepts(KGB_new, X_test)
pred_test_r = evaluate_test_rejects(KGB_new, r_test)
pred_test_combined = evaluate_test_combined(KGB_new, X_test, r_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [262]:
pred_test_a["Flag"] = pred_test_a.apply(flag_df_baseline, axis=1)
predictions_accepts_iter = [round(kickout_baseline(pred_test_a).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [263]:
pred_test_r["Flag"] = pred_test_r.apply(flag_df_baseline, axis=1)
predictions_rejects_iter = [round(kickout_baseline(pred_test_r).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [264]:
pred_test_combined["Flag"] = pred_test_combined.apply(flag_df_baseline, axis=1)
predictions_combined_iter = [round(kickout_baseline(pred_test_combined).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [39]:
predictions_accepts_iter

[0.904]

<IPython.core.display.Javascript object>

In [265]:
f1, auc = standard_evaluation(pred_test_combined)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [266]:
auc

0.6763028211336047

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 4. New model

Step 1: KGB Model ($m_{1}$)  <br>
Step 2: Infer status of each reject <br> 
Step 3: Choose the most certain predictions (0.05q and 0.95q of the predicted probailities) <br>
Step 4: Add the most certain predictions to the training sample <br>
Step 5: Redevelop KGB Model ($m_{2}$)  <br>
Step 6: Repeat Step 5 until convergence - best F1 score ($m_{i}$) <br> 
Step 7: Infer status of each reject with ($m_{i}$) <br> 
Step 8: Redevelop KGB mdoel with inferred rejects ($m_{final}$) <br> 
Step 9: Score Test Accepts and Test Rejects

In [267]:
KGB_baseline_new = predict_rejects(KGB_new, r_dev)
pred_test_a = evaluate_test_accepts(KGB_baseline_new, X_test)
pred_test_r = evaluate_test_rejects(KGB_baseline_new, r_test)
pred_test_combined = evaluate_test_combined(KGB_baseline_new, X_test, r_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [268]:
pred_test_a["Flag"] = pred_test_a.apply(flag_df_baseline, axis=1)
predictions_accepts_new = [round(kickout_baseline(pred_test_a).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [269]:
pred_test_r["Flag"] = pred_test_r.apply(flag_df_baseline, axis=1)
predictions_rejects_new = [round(kickout_baseline(pred_test_r).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [270]:
pred_test_combined["Flag"] = pred_test_combined.apply(flag_df_baseline, axis=1)
predictions_combined_new = [round(kickout_baseline(pred_test_combined).tolist(), 3)]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [44]:
predictions_accepts_new

[0.902]

<IPython.core.display.Javascript object>

In [271]:
f1, auc = standard_evaluation(pred_test_combined)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [272]:
auc

0.6337196723844847

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Parcelling

In [45]:
significant_columns = ["col_0", "col_1"]

<IPython.core.display.Javascript object>

In [46]:
# Create copies of the dataframes
X_train_parc = X_train2[significant_columns]
y_train_parc = y_train2
X_test = X_test2[significant_columns]
r_dev = r_dev2[significant_columns]

<IPython.core.display.Javascript object>

In [47]:
# Build a model on the accepted
KGB1 = LinearRegression().fit(X_train_parc, y_train_parc)
# Score the test
pred_test = KGB1.predict(X_test)
pred_test_acc = pd.DataFrame(
    data=pred_test,
    columns=["pred"],
    index=X_test.index.copy(),
)

NameError: name 'LinearRegression' is not defined

<IPython.core.display.Javascript object>

In [None]:
# Create 10 score bands
pred_test_acc["score_band"] = pd.qcut(pred_test_acc["pred"].values, 10)
pred_test_acc["nr_band"] = pd.qcut(pred_test_acc["pred"].values, 10, labels=False)

In [None]:
# Attach target
df = pd.merge(
    pred_test_acc,
    y_test,
    how="inner",
    left_index=True,
    right_index=True,
)

In [None]:
# Select rows with target = 1
df_bad = df[df["target"] == 1]
# Select rows with target = 0
df_good = df[df["target"] == 0]

In [None]:
# Count nr of bads in each interval
df_bad = df_bad.groupby("nr_band").size().sort_values().reset_index(name="nr_bad")
df_good = df_good.groupby("nr_band").size().sort_values().reset_index(name="nr_good")

In [None]:
# Merge counts with original data
df = pd.merge(df, df_bad, on="nr_band", how="outer")
df = pd.merge(df, df_good, on="nr_band", how="outer")

In [None]:
# Replace NaN with 0.1
df["nr_bad"] = df["nr_bad"].fillna(0.1)

In [None]:
# Calculate %good and %bads
df["perc_good"] = df["nr_good"] / (df["nr_bad"] + df["nr_good"])
df["perc_bad"] = df["nr_bad"] / (df["nr_bad"] + df["nr_good"])

In [None]:
# Get distinct score bands
df.drop_duplicates(subset=["score_band"]).sort_values(by="nr_band")

In [None]:
# make a copy of the r_dev data
r_dev_parc = r_dev.copy()
r_test_parc = r_test.copy()

In [None]:
# Score test rejects
pred_test_rej = KGB1.predict(r_test_parc)
pred_test_rej = pd.DataFrame(
    data=pred_test_rej,
    columns=["pred"],
    index=r_test_parc.index.copy(),
)

In [None]:
def rej_scoring(x):
    if x <= -0.0938:
        return 0
    elif x > -0.0938 and x <= -0.0359:
        return 1
    elif x > -0.0359 and x <= -0.0158:
        return 2
    elif x > -0.0158 and x <= 0.0638:
        return 3
    elif x > 0.0638 and x <= 0.118:
        return 4
    elif x > 0.118 and x <= 0.164:
        return 5
    elif x > 0.164 and x <= 0.22:
        return 6
    elif x > 0.22 and x <= 0.28:
        return 7
    elif x > 0.28 and x <= 0.343:
        return 8
    elif x > 0.343 and x <= 0.448:
        return 9

In [None]:
# Apply these scores to the rejects
pred_test_rej["nr_band"] = pred_test_rej["pred"].apply(rej_scoring)

In [None]:
pred_test_rej

In [None]:
pred_test_rej.isnull().sum(axis=0)

In [None]:
# Check how well known_col_0 discriminates goods and bads. Use as "score"
reg1 = LogisticRegression(fit_intercept=True, penalty="none").fit(
    X_train_parc[["col_0"]], y_train_parc
)
f1_score(y_test, reg1.predict(X_test[["col_0"]]), average="weighted")

In [None]:
# Create 10 score bands
X_train_parc["score_band"] = pd.qcut(X_train_parc["col_0"].values, 10)
X_train_parc["nr_band"] = pd.qcut(X_train_parc["col_0"].values, 10, labels=False)

In [None]:
# Attach target
df = pd.merge(
    X_train_parc,
    y_train_parc,
    how="inner",
    left_index=True,
    right_index=True,
)

In [None]:
# Select rows with target = 1
df_bad = df[df["target"] == 1]
# Select rows with target = 0
df_good = df[df["target"] == 0]

In [None]:
# Count nr of bads in each interval
df_bad = df_bad.groupby("nr_band").size().sort_values().reset_index(name="nr_bad")
df_good = df_good.groupby("nr_band").size().sort_values().reset_index(name="nr_good")

In [None]:
# Merge counts with original data
df = pd.merge(df, df_bad, on="nr_band")
df = pd.merge(df, df_good, on="nr_band")

In [None]:
# Calculate %good and %bads
df["perc_good"] = df["nr_good"] / (df["nr_bad"] + df["nr_good"])
df["perc_bad"] = df["nr_bad"] / (df["nr_bad"] + df["nr_good"])

In [None]:
# Get distinct score bands
df.drop_duplicates(subset=["score_band"]).sort_values(by="nr_band")

In [None]:
# make a copy of the r_dev data
r_dev_parc = r_dev.copy()

In [None]:
def rej_scoring(x):
    if x <= 0.19:
        return 0
    elif x > 0.19 and x <= 0.361:
        return 1
    elif x > 0.361 and x <= 0.437:
        return 2
    elif x > 0.437 and x <= 0.509:
        return 3
    elif x > 0.509 and x <= 0.58:
        return 4
    elif x > 0.58 and x <= 0.648:
        return 5
    elif x > 0.648 and x <= 0.716:
        return 6
    elif x > 0.716 and x <= 0.789:
        return 7
    elif x > 0.789 and x <= 0.862:
        return 8
    elif x > 0.862 and x <= 0.929:
        return 9

In [None]:
# Apply these scores to the rejects
r_dev_parc["nr_band"] = r_dev_parc["col_0"].apply(rej_scoring)

In [None]:
# Create another copy of the rejects
r_dev2 = r_dev_parc.copy()

In [None]:
# Count number of rejects in each band
r_dev_parc = (
    r_dev_parc.groupby("nr_band").size().sort_values().reset_index(name="nr_rejects")
)

In [None]:
# attach rej. counts to original data
df = pd.merge(df, r_dev_parc, on="nr_band")

In [None]:
# infer nr rejects
df["inf_good"] = round((df["nr_rejects"] * df["perc_good"]), 0)
df["inf_bad"] = round((df["nr_rejects"] * df["perc_bad"]), 0)

In [None]:
# Augmentation factor
# Drop duplicates
df["aug_factor"] = (df["nr_bad"] + df["nr_good"] + df["nr_rejects"]) / (
    df["nr_good"] + df["nr_bad"]
)

In [None]:
# Create Augmented nr of good and bad
df["inf_bad_aug"] = round((df["inf_bad"] * df["aug_factor"]), 0)
df["inf_good_aug"] = round((df["nr_rejects"] - df["inf_bad_aug"]), 0)

In [None]:
# Inferred probabilities
df["inf_p_good"] = round((df["inf_good_aug"] / df["nr_rejects"]), 2)
df["inf_p_bad"] = round((df["inf_bad_aug"] / df["nr_rejects"]), 2)

In [None]:
df_n = df.drop_duplicates(subset=["score_band"]).sort_values(by="nr_band")

In [None]:
df_n

## 5. Random Parcelling

In [None]:
# Assign randomly given number of bads in each interval with a bad status with their given probabilities

df_list = []
for i in range(0, 2):
    data = r_dev2[r_dev2["nr_band"] == i]
    data["target"] = np.random.choice(
        [0, 1],
        len(data),
        p=[float(df_n[["inf_p_good"]].iloc[1]), float(df_n[["inf_p_bad"]].iloc[1])],
    )
    df_list.append(data)

In [None]:
# Concatenate all dataframes
new_rej = pd.concat(df_list)

In [None]:
significant_columns = ["col_1"]

In [None]:
# Add new rows to df

# Split X and y
new_rej_X, new_rej_y = create_X_y(new_rej)
new_rej_X = new_rej_X[significant_columns]

# Extend training set
y_train_new = pd.concat((y_train, new_rej_y))
X_train_new = pd.concat((X_train, new_rej_X))

# Fit new model
KGB_baseline_new = LogisticRegression(fit_intercept=False, penalty="none").fit(
    X_train_new, y_train_new
)

In [None]:
pred_test_a = evaluate_test_accepts(KGB_baseline_new, X_test[significant_columns])
pred_test_r = evaluate_test_rejects(KGB_baseline_new, r_test[significant_columns])

In [None]:
pred_test_a["Flag"] = pred_test_a.apply(flag_df_baseline, axis=1)
predictions_accepts_rand_parc = [round(kickout_baseline(pred_test_a).tolist(), 3)]

In [None]:
pred_test_r["Flag"] = pred_test_r.apply(flag_df_baseline, axis=1)
predictions_rejects_rand_parc = [round(kickout_baseline(pred_test_r).tolist(), 3)]

## 6. Non-Random Parcelling

In [None]:
# Band 0

In [None]:
# r_dev_iter0 = r_dev2[r_dev2["nr_band"] == 0]
# X_train_iter0 = X_train_parc[X_train_parc["nr_band"] == 0]
# y_train_iter0 = pd.merge(
#     X_train_iter0["known_col_0"],
#     y_train_parc,
#     how="inner",
#     left_index=True,
#     right_index=True,
# )
# X_train_iter0 = X_train_iter0[significant_columns]
# X_test = X_test[significant_columns]
# # Drop unnecessary columns
# y_train_iter0 = y_train_iter0.drop(["known_col_0"], axis=1)

In [None]:
# mccs = []
# for iteration in range(1, 10):  # Change to how many iterrations you like
#     print("Iteration Nr {}".format(iteration))
#     # Build logistic regression
#     KGB1 = LogisticRegression(fit_intercept=False, penalty="none").fit(
#         X_train_iter0, y_train_iter0
#     )

#     # Scores
#     #     f1_stat = f1_score(y_test, KGB1.predict(X_test), average="weighted")
#     #     f1_scores.append(f1_stat)

#     #     logloss = log_loss(y_test, KGB1.predict(X_test), eps=1e-15)
#     #     log_losses.append(logloss)

#     #     print("F1: ", f1_stat)

#     mcc = matthews_corrcoef(y_test, KGB1.predict(X_test))
#     mccs.append(mcc)

#     print("MCC: ", mcc)

#     # Make predictions on the rejected data
#     pred = KGB1.predict_proba(r_dev_iter0[significant_columns])[:, 1]
#     pred = pd.DataFrame(
#         data=pred,
#         columns=["target"],
#         index=r_dev_iter0.index.copy(),
#     )

#     # Choose the most certain predictions
#     lq = pred["target"].quantile(q=0.05)
#     uq = pred["target"].quantile(q=0.95)
#     pred["certain"] = pred["target"].apply(lambda x: 1 if (x < lq or x > uq) else 0)

#     # If PD is high, apply default status
#     pred["target"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)

#     # Pick only the certain predictions and concatenate them to the dev set
#     # Y TRAIN
#     certain = pred[pred["certain"] == 1]
#     certain2 = certain["target"].to_frame()
#     y_train_iter0 = pd.concat((y_train_iter0, certain2))

#     # Get significant columns of the rejects based on index
#     certain_features = pd.merge(
#         certain["target"],
#         r_dev_iter0[significant_columns],
#         how="inner",
#         left_index=True,
#         right_index=True,
#     )

#     # X TRAIN
#     certain_features = certain_features.loc[:, certain_features.columns != "target"]
#     X_train_iter0 = pd.concat((X_train_iter0, certain_features))

#     # Remove certain columns from rejected data
#     rows = certain_features.index
#     r_dev_iter0 = r_dev_iter0.drop(rows, axis="index")
#     df_list.append(df)

In [None]:
# plt.plot(mccs, label="MCCs")

In [None]:
# Assign non-randomly given number of bads in each interval with a bad status with their given probabilities

df_list = []
f1_scores = []
mccs = []
for i in range(0, 9):
    # Prepare the data
    r_dev_iter = r_dev2[r_dev2["nr_band"] == i]
    X_train_iter = X_train_parc[X_train_parc["nr_band"] == 0]
    y_train_iter = pd.merge(
        X_train_iter["known_col_0"],
        y_train_parc,
        how="inner",
        left_index=True,
        right_index=True,
    )
    X_train_iter = X_train_iter[significant_columns]
    X_test = X_test[significant_columns]
    # Drop unnecessary columns
    y_train_iter = y_train_iter.drop(["known_col_0"], axis=1)

    for iteration in range(1, 11):  # Change to how many iterrations you like
        print("Iteration Nr {}".format(iteration))
        # Build logistic regression
        KGB1 = LogisticRegression(fit_intercept=False, penalty="none").fit(
            X_train_iter, y_train_iter
        )

        # Scores
        #         f1_stat = f1_score(y_test, KGB1.predict(X_test), average="weighted")
        #         f1_scores.append(f1_stat)

        #         logloss = log_loss(y_test, KGB1.predict(X_test), eps=1e-15)
        #         log_losses.append(logloss)

        #         print("F1: ", f1_stat)

        mcc = matthews_corrcoef(y_test, KGB1.predict(X_test))
        mccs.append(mcc)

        print("MCC: ", mcc)

        # Make predictions on the rejected data
        pred = KGB1.predict_proba(r_dev_iter[significant_columns])[:, 1]
        pred = pd.DataFrame(
            data=pred,
            columns=["target"],
            index=r_dev_iter.index.copy(),
        )

        # Choose the most certain predictions
        lq = pred["target"].quantile(q=0.05)
        uq = pred["target"].quantile(q=0.95)
        #pred["certain"] = pred["target"].apply(lambda x: 1 if (x < lq or x > uq) else 0)
        pred["certain"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)

        # If PD is high, apply default status
        pred["target"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)
#         pred["target"] = pred["target"].apply(
#             lambda x: 1 if (x > np.random.uniform()) else 0
#         )

        #If PD is low, be conservative and apply non-default status only to some examples
        #pred["target"] = pred["target"].apply(lambda x: 0 if (x < np.random.uniform()) else 1)


        # Pick only the certain predictions and concatenate them to the dev set
        # Y TRAIN
        certain = pred[pred["certain"] == 1]
        certain2 = certain["target"].to_frame()
        y_train_iter = pd.concat((y_train_iter, certain2))

        # Get significant columns of the rejects based on index
        certain_features = pd.merge(
            certain["target"],
            r_dev_iter[significant_columns],
            how="inner",
            left_index=True,
            right_index=True,
        )

        # X TRAIN
        certain_features = certain_features.loc[:, certain_features.columns != "target"]
        X_train_iter = pd.concat((X_train_iter, certain_features))

        # Remove certain columns from rejected data
        rows = certain_features.index
        r_dev_iter = r_dev_iter.drop(rows, axis="index")
        df_list.append(df)

In [None]:
len(mccs)

In [None]:
# Assign non-randomly given number of bads in each interval with a bad status with their given probabilities

df_list = []
f1_scores = []
for i in range(0, 9):
    r_dev_iter = r_dev2[r_dev2["nr_band"] == i]
    for iteration in range(1, 10):  # Change to how many iterrations you like
        print("Iteration Nr {}".format(iteration))
        # Build logistic regression
        # KGB1 = LogisticRegression(fit_intercept=False, penalty="none").fit(X_train_iter, y_train_iter)
        KGB1 = RandomForestClassifier().fit(X_train_iter, y_train_iter)

        # Scores
        f1_stat = f1_score(y_test, KGB1.predict(X_test), average="weighted")
        f1_scores.append(f1_stat)

        logloss = log_loss(y_test, KGB1.predict(X_test), eps=1e-15)
        log_losses.append(logloss)

        print("F1: ", f1_stat)

        # Make predictions on the rejected data
        pred = KGB1.predict_proba(r_dev_iter[significant_columns])[:, 1]
        pred = pd.DataFrame(
            data=pred,
            columns=["target"],
            index=r_dev_iter.index.copy(),
        )

        # Choose the most certain predictions
        lq = pred["target"].quantile(q=0.05)
        uq = pred["target"].quantile(q=0.95)
        #pred["certain"] = pred["target"].apply(lambda x: 1 if (x < lq or x > uq) else 0)
        pred["certain"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)

        # If PD is high, apply default status
        pred["target"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)
#         pred["target"] = pred["target"].apply(
#             lambda x: 1 if (x > np.random.uniform()) else 0
#         )

        #If PD is low, be conservative and apply non-default status only to some examples
        #pred["target"] = pred["target"].apply(lambda x: 0 if (x < np.random.uniform()) else 1)

        # Pick only the certain predictions and concatenate them to the dev set
        # Y TRAIN
        certain = pred[pred["certain"] == 1]
        certain2 = certain["target"].to_frame()
        y_train_iter = pd.concat((y_train_iter, certain2))

        # Get significant columns of the rejects based on index
        certain_features = pd.merge(
            certain["target"],
            r_dev_iter[significant_columns],
            how="inner",
            left_index=True,
            right_index=True,
        )

        # X TRAIN
        certain_features = certain_features.loc[:, certain_features.columns != "target"]
        X_train_iter = pd.concat((X_train_iter, certain_features))

        # Remove certain columns from rejected data
        rows = certain_features.index
        r_dev_iter = r_dev_iter.drop(rows, axis="index")
        df_list.append(df)

In [None]:
# Save the iteration of the model where max MCC score is reached
max_value0 = max(mccs[0:9])
max_index0 = mccs.index(max_value0)
max_value1 = max(mccs[10:19])
max_index1 = 10 + mccs.index(max_value1)
max_value2 = max(mccs[20:29])
max_index2 = 20 + mccs.index(max_value2)
max_value3 = max(mccs[30:39])
max_index3 = 30 + mccs.index(max_value3)
max_value4 = max(mccs[40:49])
max_index4 = 40 + mccs.index(max_value4)
max_value5 = max(mccs[50:59])
max_index5 = 50 + mccs.index(max_value5)
max_value6 = max(mccs[60:69])
max_index6 = 60 + mccs.index(max_value6)
max_value7 = max(mccs[70:79])
max_index7 = 70 + mccs.index(max_value7)
max_value8 = max(mccs[80:89])
max_index8 = 80 + mccs.index(max_value8)
print(max_index7)

In [None]:
### max_value1 = max(mccs[10:19])

In [None]:
max_index1 = mccs.index(max_value1)

In [None]:
max_index1

In [None]:
max_value = max(mccs[0:9])
max_index = mccs.index(max_value)
print(max_index)

In [None]:
# Assign non-randomly given number of bads in each interval with a bad status with their given probabilities

df_list = []
f1_scores = []
for i in range(0, 9):
    r_dev_iter = r_dev2[r_dev2["nr_band"] == i]
    for iteration in range(1, 10):  # Change to how many iterrations you like
        print("Iteration Nr {}".format(iteration))
        # Build logistic regression
        # KGB1 = LogisticRegression(fit_intercept=False, penalty="none").fit(X_train_iter, y_train_iter)
        KGB1 = RandomForestClassifier().fit(X_train_iter, y_train_iter)

        # Scores
        f1_stat = f1_score(y_test, KGB1.predict(X_test), average="weighted")
        f1_scores.append(f1_stat)

        logloss = log_loss(y_test, KGB1.predict(X_test), eps=1e-15)
        log_losses.append(logloss)

        print("F1: ", f1_stat)

        # Make predictions on the rejected data
        pred = KGB1.predict_proba(r_dev_iter[significant_columns])[:, 1]
        pred = pd.DataFrame(
            data=pred,
            columns=["target"],
            index=r_dev_iter.index.copy(),
        )

        # Choose the most certain predictions
        lq = pred["target"].quantile(q=0.05)
        uq = pred["target"].quantile(q=0.95)
        #pred["certain"] = pred["target"].apply(lambda x: 1 if (x < lq or x > uq) else 0)
        pred["certain"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)

        # If PD is high, apply default status
        pred["target"] = pred["target"].apply(lambda x: 1 if (x > uq) else 0)
#         pred["target"] = pred["target"].apply(
#             lambda x: 1 if (x > np.random.uniform()) else 0
#         )

        #If PD is low, be conservative and apply non-default status only to some examples
        #pred["target"] = pred["target"].apply(lambda x: 0 if (x < np.random.uniform()) else 1)


        # Pick only the certain predictions and concatenate them to the dev set
        # Y TRAIN
        certain = pred[pred["certain"] == 1]
        certain2 = certain["target"].to_frame()
        y_train_iter = pd.concat((y_train_iter, certain2))

        # Get significant columns of the rejects based on index
        certain_features = pd.merge(
            certain["target"],
            r_dev_iter[significant_columns],
            how="inner",
            left_index=True,
            right_index=True,
        )

        # X TRAIN
        certain_features = certain_features.loc[:, certain_features.columns != "target"]
        X_train_iter = pd.concat((X_train_iter, certain_features))

        # Remove certain columns from rejected data
        rows = certain_features.index
        r_dev_iter = r_dev_iter.drop(rows, axis="index")
        df_list.append(df)

## Combine predictions

In [None]:
my_list_accepts = [
    predictions_accepts_beforeRI,
    predictions_accepts_base,
    predictions_accepts_iter,
    predictions_accepts_new,
    # predictions_accepts_rand_parc,
]
df_pred_accepts = pd.DataFrame(my_list_accepts).transpose()
df_pred_accepts = df_pred_accepts.rename(
    columns={
        0: "Before RI",
        1: "Baseline",
        2: "Iteration n",
        3: "Self-Training",
        # 4: "Rand Parcelling",
    },
)

In [None]:
my_list_rejects = [
    predictions_rejects_beforeRI,
    predictions_rejects_base,
    predictions_rejects_iter,
    predictions_rejects_new,
    # predictions_rejects_rand_parc,
]
df_pred_rejects = pd.DataFrame(my_list_rejects).transpose()
df_pred_rejects = df_pred_rejects.rename(
    columns={
        0: "Before RI",
        1: "Baseline",
        2: "Iteration n",
        3: "Self-Training",
        # 4: "Rand Parcelling",
    },
)

In [None]:
my_list_combined = [
    predictions_combined_beforeRI,
    predictions_combined_base,
    predictions_combined_iter,
    predictions_combined_new,
    # predictions_accepts_rand_parc,
]
df_pred_combined = pd.DataFrame(my_list_combined).transpose()
df_pred_combined = df_pred_combined.rename(
    columns={
        0: "Before RI",
        1: "Baseline",
        2: "Iteration n",
        3: "Self-Training",
        # 4: "Rand Parcelling",
    },
)

In [None]:
df_pred_accepts

In [None]:
df_pred_rejects

In [None]:
df_pred_combined