In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Generate Data

In [2]:
import pandas as pd
import numpy as np

# Generate Data
from sklearn.datasets import make_classification
from targen.data import target

<IPython.core.display.Javascript object>

## Get random Gaussian Data

In [3]:
# Get random Gaussian data with 18 features
n_features = 18
X, dummy = make_classification(n_samples=20000, n_features=n_features)

<IPython.core.display.Javascript object>

In [4]:
# Half of the features are known and half hidden
known_cols = [f"known_col_{ix}" for ix in range(int(n_features / 2))]
if n_features % 2 == 0:
    hidden_cols = [f"hidden_col_{ix}" for ix in range(int(n_features / 2))]

else:
    hidden_cols = [f"hidden_col_{ix}" for ix in range(int(n_features / 2) + 1)]
columns = known_cols + hidden_cols

<IPython.core.display.Javascript object>

In [5]:
# Map data to dataframe
data = pd.DataFrame(X, columns=columns)

<IPython.core.display.Javascript object>

## Add Correlations between the features

In [6]:
# Add correlation between the columns
# theta_pi_2 is the angle of rotation (expressed in pi/2). When setting it to 1, you will add 100% correaltion between the variables.
def add_correlation_between_columns(df, col_1, col_2, theta_pi_2=1):

    theta = theta_pi_2 * 0.5 * np.pi

    data_out = df.copy()
    data_out[col_1] = -1 * np.cos(theta) * df[col_1] + np.sin(theta * df[col_2])
    data_out[col_2] = np.cos(theta) * df[col_1] + np.sin(theta * df[col_2])

    return data_out

<IPython.core.display.Javascript object>

In [7]:
# Data with new columns
new_data = (
    data.pipe(
        add_correlation_between_columns,
        col_1="known_col_0",
        col_2="known_col_1",
        theta_pi_2=1,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="known_col_2",
        col_2="known_col_3",
        theta_pi_2=-0.9,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="known_col_2",
        col_2="hidden_col_0",
        theta_pi_2=-0.8,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="hidden_col_1",
        col_2="known_col_1",
        theta_pi_2=0.4,
    )
)

<IPython.core.display.Javascript object>

In [8]:
new_data.corr()[
    [
        "known_col_0",
        "known_col_1",
        "known_col_2",
        "known_col_3",
        "hidden_col_0",
        "hidden_col_1",
    ]
]

Unnamed: 0,known_col_0,known_col_1,known_col_2,known_col_3,hidden_col_0,hidden_col_1
known_col_0,1.0,0.65893,0.006247,0.004253,0.00653,0.65305
known_col_1,0.65893,1.0,0.194076,-0.552586,-0.189407,-0.139093
known_col_2,0.006247,0.194076,1.0,-0.319668,0.761344,-0.187233
known_col_3,0.004253,-0.552586,-0.319668,1.0,0.321272,0.562098
known_col_4,0.005265,0.001213,0.001616,0.003132,0.003804,0.005962
known_col_5,-0.006122,-0.003203,0.013627,-0.006891,0.008275,-0.004482
known_col_6,-0.007004,-0.000766,0.005518,-0.0039,0.001359,-0.008507
known_col_7,0.008092,-0.002571,0.003704,0.015054,0.008266,0.013161
known_col_8,-0.011336,-0.450205,-0.166227,0.503013,0.181723,0.438455
hidden_col_0,0.00653,-0.189407,0.761344,0.321272,1.0,0.199314


<IPython.core.display.Javascript object>

## Generate X-y relationship (feature-target)

In [9]:
expressions_x_y = {
    "linear": "-0.5*known_col_0 + 2.*known_col_1 + 0.7*known_col_4 -0.4*hidden_col_0",
    "non_linear": "-0.7*known_col_3**1.5 + 0.2*sin(known_col_1)+ 0.9*log(hidden_col_4) -0.1*hidden_col_2**2",
    "interaction": "0.05*known_col_3*known_col_4 -0.1*(known_col_4/hidden_col_1)",
    "uniform_noise": {"weight": 0.5},
}

<IPython.core.display.Javascript object>

## Generate accept-reject relationship

In [10]:
expressions_accept_reject = {
    "linear": "-14*known_col_4 + 2.*known_col_5 + 0.9*hidden_col_6 -2.1*known_col_3",
    "non_linear": "-0.9*known_col_3**0.5 + 0.2*sin(known_col_5) -2.5*log(hidden_col_5)-0.05*hidden_col_6**2.5",
    #     'interaction': '0.01*known_col_3*known_col_4 -0.05*(hidden_col_5/hidden_col_6)',
    "uniform_noise": {"weight": 0.1},
}

<IPython.core.display.Javascript object>

In [11]:
def expressions(weight_x_y, weight_ar):
    # Generate X-y relationship (feature-target)
    expressions_x_y = {
        "linear": "-0.5*known_col_0 + 2.*known_col_1 + 0.7*known_col_4 -0.4*hidden_col_0",
        "non_linear": "-0.7*known_col_3**1.5 + 0.2*sin(known_col_1)+ 0.9*log(hidden_col_4) -0.1*hidden_col_2**2",
        "interaction": "0.05*known_col_3*known_col_4 -0.1*(known_col_4/hidden_col_1)",
        "uniform_noise": {"weight": weight_x_y},
    }
    # Generate accept-reject relationship
    expressions_accept_reject = {
        "linear": "-14*known_col_4 + 2.*known_col_5 + 0.9*hidden_col_6 -2.1*known_col_3",
        "non_linear": "-0.9*known_col_3**0.5 + 0.2*sin(known_col_5) -2.5*log(hidden_col_5)-0.05*hidden_col_6**2.5",
        # "interaction": "0.01*known_col_3*known_col_4 -0.05*(hidden_col_5/hidden_col_6)",
        "uniform_noise": {"weight": weight_ar},
    }
    return expressions_x_y, expressions_accept_reject

<IPython.core.display.Javascript object>

### Wholesale Expressions

In [103]:
expressions_x_y_1, expressions_accept_reject_1 = expressions(0.01, 0.01)
expressions_x_y_2, expressions_accept_reject_2 = expressions(0.75, 0.75)

<IPython.core.display.Javascript object>

### All Others Expressions

In [53]:
expressions_x_y_1, expressions_accept_reject_1 = expressions(0.01, 0.01)
expressions_x_y_2, expressions_accept_reject_2 = expressions(0.25, 0.25)
expressions_x_y_3, expressions_accept_reject_3 = expressions(0.75, 0.75)

<IPython.core.display.Javascript object>

## Apply targen and get the new dataset and Put all data together

In [54]:
def apply_targen(dr, ar, namedata, expressions_x_y, expressions_accept_reject):
    X_y = target.get_target_and_contributions(
        data, expressions=expressions_x_y, imbalance=dr, drop_features=False
    )
    X_acc_reject = target.get_target_and_contributions(
        data, expressions=expressions_accept_reject, imbalance=ar, drop_features=False
    )
    X_acc_reject = X_acc_reject.rename(columns={"y": "is_accepted"})
    namedata = pd.concat([X_y, X_acc_reject["is_accepted"]], axis=1).drop(
        [col for col in X_y.columns if col.startswith("score")], axis=1
    )
    return namedata

<IPython.core.display.Javascript object>

## Check the default rate per samples (kgb vs rejected)

In [55]:
def check(namedata):
    kgb = namedata[namedata["is_accepted"] == 1]
    rejected = namedata[namedata["is_accepted"] == 0]
    dr = kgb["y"].value_counts(normalize=True)
    ar = rejected["y"].value_counts(normalize=True)
    return dr, ar

<IPython.core.display.Javascript object>

## Get the modelling dataset

In [56]:
def get_data(namedata):
    namedata = namedata[
        [col for col in namedata.columns if col.startswith("known")]
        + ["y", "is_accepted"]
    ]
    return namedata

<IPython.core.display.Javascript object>

### Wholesale and Secured Loans

In [104]:
wholesale_and_secured_scen1 = apply_targen(
    0.03,
    0.95,
    "wholesale_and_secured_scen1",
    expressions_x_y_1,
    expressions_accept_reject_1,
)
check(wholesale_and_secured_scen1)
wholesale_and_secured_scen1 = get_data(wholesale_and_secured_scen1)
wholesale_and_secured_scen1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\wholesale_and_secured_scen1.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [105]:
wholesale_and_secured_scen2 = apply_targen(
    0.03,
    0.95,
    "wholesale_and_secured_scen2",
    expressions_x_y_2,
    expressions_accept_reject_2,
)
check(wholesale_and_secured_scen2)
wholesale_and_secured_scen2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\wholesale_and_secured_scen2.csv",
    index=False,
)

<IPython.core.display.Javascript object>

### Consumer Loans/SME/ Credit Cards Scenario 1

In [59]:
cons_scen1_1 = apply_targen(
    0.05, 0.9, "cons_scen1", expressions_x_y_1, expressions_accept_reject_1
)
cons_scen1_1 = get_data(cons_scen1_1)
cons_scen1_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen1_1.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [60]:
cons_scen1_2 = apply_targen(
    0.05, 0.9, "cons_scen1", expressions_x_y_2, expressions_accept_reject_2
)
cons_scen1_2 = get_data(cons_scen1_2)
cons_scen1_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen1_2.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [61]:
cons_scen1_3 = apply_targen(
    0.05, 0.9, "cons_scen1", expressions_x_y_3, expressions_accept_reject_3
)
cons_scen1_3 = get_data(cons_scen1_3)
cons_scen1_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen1_3.csv",
    index=False,
)

<IPython.core.display.Javascript object>

### Consumer Loans/SME/ Credit Cards Scenario 2

In [62]:
cons_scen2_1 = apply_targen(
    0.1, 0.9, "cons_scen2", expressions_x_y_1, expressions_accept_reject_1
)
cons_scen2_1 = get_data(cons_scen2_1)
cons_scen2_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen2_1.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [63]:
cons_scen2_2 = apply_targen(
    0.1, 0.9, "cons_scen2", expressions_x_y_2, expressions_accept_reject_2
)
cons_scen2_2 = get_data(cons_scen2_2)
cons_scen2_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen2_2.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [64]:
cons_scen2_3 = apply_targen(
    0.1, 0.9, "cons_scen2", expressions_x_y_3, expressions_accept_reject_3
)
cons_scen2_3 = get_data(cons_scen2_3)
cons_scen2_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen2_3.csv",
    index=False,
)

<IPython.core.display.Javascript object>

### Consumer Loans/SME/ Credit Cards Scenario 3

In [65]:
cons_scen3_1 = apply_targen(
    0.05, 0.8, "cons_scen3", expressions_x_y_1, expressions_accept_reject_1
)
cons_scen3_1 = get_data(cons_scen3_1)
cons_scen3_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen3_1.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [66]:
cons_scen3_2 = apply_targen(
    0.05, 0.8, "cons_scen3", expressions_x_y_2, expressions_accept_reject_2
)
cons_scen3_2 = get_data(cons_scen3_2)
cons_scen3_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen3_2.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [67]:
cons_scen3_3 = apply_targen(
    0.05, 0.8, "cons_scen3", expressions_x_y_3, expressions_accept_reject_3
)
cons_scen3_3 = get_data(cons_scen3_3)
cons_scen3_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen3_3.csv",
    index=False,
)

<IPython.core.display.Javascript object>

### Consumer Loans/SME/ Credit Cards Scenario 4

In [68]:
cons_scen4_1 = apply_targen(
    0.1, 0.8, "cons_scen4", expressions_x_y_1, expressions_accept_reject_1
)
cons_scen4_1 = get_data(cons_scen4_1)
cons_scen4_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen4_1.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [69]:
cons_scen4_2 = apply_targen(
    0.1, 0.8, "cons_scen4", expressions_x_y_2, expressions_accept_reject_2
)
cons_scen4_2 = get_data(cons_scen4_2)
cons_scen4_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen4_2.csv",
    index=False,
)

<IPython.core.display.Javascript object>

In [70]:
cons_scen4_3 = apply_targen(
    0.1, 0.8, "cons_scen4", expressions_x_y_3, expressions_accept_reject_3
)
cons_scen4_3 = get_data(cons_scen4_3)
cons_scen4_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\cons_scen4_3.csv",
    index=False,
)

<IPython.core.display.Javascript object>

### MFI

In [71]:
mfi_1 = apply_targen(0.2, 0.5, "mfi", expressions_x_y_1, expressions_accept_reject_1)
mfi_1 = get_data(mfi_1)
mfi_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\mfi_1.csv", index=False
)

<IPython.core.display.Javascript object>

In [72]:
mfi_2 = apply_targen(0.2, 0.5, "mfi", expressions_x_y_2, expressions_accept_reject_2)
mfi_2 = get_data(mfi_2)
mfi_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\mfi_2.csv", index=False
)

<IPython.core.display.Javascript object>

In [73]:
mfi_3 = apply_targen(0.2, 0.5, "mfi", expressions_x_y_3, expressions_accept_reject_3)
mfi_3 = get_data(mfi_3)
mfi_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\mfi_3.csv", index=False
)

<IPython.core.display.Javascript object>

### Kozodoi DR and RR

In [74]:
paper_1 = apply_targen(0.4, 0.7, "mfi", expressions_x_y_1, expressions_accept_reject_1)
paper_1 = get_data(paper_1)
paper_1.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\paper_1.csv", index=False
)

<IPython.core.display.Javascript object>

In [75]:
paper_2 = apply_targen(0.4, 0.7, "mfi", expressions_x_y_2, expressions_accept_reject_2)
paper_2 = get_data(paper_2)
paper_2.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\paper_2.csv", index=False
)

<IPython.core.display.Javascript object>

In [76]:
paper_3 = apply_targen(0.4, 0.7, "mfi", expressions_x_y_3, expressions_accept_reject_3)
paper_3 = get_data(paper_3)
paper_3.to_csv(
    r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\Data_28_04\paper_3.csv", index=False
)

<IPython.core.display.Javascript object>

## Test Noise vs Predictiveness

We expect that there should be monotonic decrease of AUC after introduction of more noise

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score

<IPython.core.display.Javascript object>

In [78]:
def test_logreg(accepted, data):
    # accepted = 1, rejected = 0
    df = data[data["is_accepted"] == accepted]
    df = df.drop(["is_accepted"], axis=1)

    # Labels
    X = df.loc[:, df.columns != "y"]
    y = df.loc[:, df.columns == "y"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=7
    )
    logreg = LogisticRegression(fit_intercept=True, penalty="none")
    logreg.fit(X, y.values.ravel())
    logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test))
    return logit_roc_auc

<IPython.core.display.Javascript object>

### Test

In [91]:
print("cons_scen1_1:", test_logreg(1, cons_scen1_1))
print("cons_scen1_2:", test_logreg(1, cons_scen1_2))
print("cons_scen1_3:", test_logreg(1, cons_scen1_3))

cons_scen1_1: 0.6418304040432722
cons_scen1_2: 0.5760471364024912
cons_scen1_3: 0.5157112577801769


<IPython.core.display.Javascript object>

In [92]:
print("cons_scen1_1:", test_logreg(0, cons_scen1_1))
print("cons_scen1_2:", test_logreg(0, cons_scen1_2))
print("cons_scen1_3:", test_logreg(0, cons_scen1_3))

cons_scen1_1: 0.7308465490283672
cons_scen1_2: 0.6419783667740071
cons_scen1_3: 0.5290919230411748


<IPython.core.display.Javascript object>

In [93]:
print("cons_scen2_1:", test_logreg(1, cons_scen2_1))
print("cons_scen2_2:", test_logreg(1, cons_scen2_2))
print("cons_scen2_3:", test_logreg(1, cons_scen2_3))

cons_scen2_1: 0.7026084982214785
cons_scen2_2: 0.6239881897654913
cons_scen2_3: 0.5297393402048937


<IPython.core.display.Javascript object>

In [94]:
print("cons_scen2_1:", test_logreg(0, cons_scen2_1))
print("cons_scen2_2:", test_logreg(0, cons_scen2_2))
print("cons_scen2_3:", test_logreg(0, cons_scen2_3))

cons_scen2_1: 0.7651578073089701
cons_scen2_2: 0.6268920160549348
cons_scen2_3: 0.5354710385148573


<IPython.core.display.Javascript object>

In [95]:
print("cons_scen3_1:", test_logreg(1, cons_scen3_1))
print("cons_scen3_2:", test_logreg(1, cons_scen3_2))
print("cons_scen3_3:", test_logreg(1, cons_scen3_3))

cons_scen3_1: 0.6782477568207023
cons_scen3_2: 0.5979462996528524
cons_scen3_3: 0.5246570263255468


<IPython.core.display.Javascript object>

In [96]:
print("cons_scen3_1:", test_logreg(0, cons_scen3_1))
print("cons_scen3_2:", test_logreg(0, cons_scen3_2))
print("cons_scen3_3:", test_logreg(0, cons_scen3_3))

cons_scen3_1: 0.6650555327025915
cons_scen3_2: 0.613940036877068
cons_scen3_3: 0.5434782608695653


<IPython.core.display.Javascript object>

In [97]:
print("cons_scen4_1:", test_logreg(1, cons_scen4_1))
print("cons_scen4_2:", test_logreg(1, cons_scen4_2))
print("cons_scen4_3:", test_logreg(1, cons_scen4_3))

cons_scen4_1: 0.6950931013431013
cons_scen4_2: 0.6410681924619739
cons_scen4_3: 0.5319787625076577


<IPython.core.display.Javascript object>

In [98]:
print("cons_scen4_1:", test_logreg(0, cons_scen4_1))
print("cons_scen4_2:", test_logreg(0, cons_scen4_2))
print("cons_scen4_3:", test_logreg(0, cons_scen4_3))

cons_scen4_1: 0.7060022583625405
cons_scen4_2: 0.6376932620729147
cons_scen4_3: 0.5684027192538906


<IPython.core.display.Javascript object>

In [99]:
print("mfi_1:", test_logreg(1, mfi_1))
print("mfi_2:", test_logreg(1, mfi_2))
print("mfi_3:", test_logreg(1, mfi_3))

mfi_1: 0.7079523834200727
mfi_2: 0.6689478523578091
mfi_3: 0.5537916371367824


<IPython.core.display.Javascript object>

In [100]:
print("mfi_1:", test_logreg(0, mfi_1))
print("mfi_2:", test_logreg(0, mfi_2))
print("mfi_3:", test_logreg(0, mfi_3))

mfi_1: 0.7669304737989211
mfi_2: 0.7198472868515721
mfi_3: 0.5860070874437314


<IPython.core.display.Javascript object>

In [101]:
print("paper_1:", test_logreg(1, paper_1))
print("paper_2:", test_logreg(1, paper_2))
print("paper_3:", test_logreg(1, paper_3))

paper_1: 0.7780299927055427
paper_2: 0.7487272191452484
paper_3: 0.6396352084866728


<IPython.core.display.Javascript object>

In [102]:
print("paper_1:", test_logreg(0, paper_1))
print("paper_2:", test_logreg(0, paper_2))
print("paper_3:", test_logreg(0, paper_3))

paper_1: 0.7876438583135552
paper_2: 0.7733047572201531
paper_3: 0.6596320346320347


<IPython.core.display.Javascript object>

For all datasets above there is monotinic trend, as expected. For wholesale there is inversion between scenario 1 and 2 with the original scenarios - new ones are applied.

In [106]:
print("wholesale_and_secured_scen1:", test_logreg(1, wholesale_and_secured_scen1))
print("wholesale_and_secured_scen2:", test_logreg(1, wholesale_and_secured_scen2))

wholesale_and_secured_scen1: 0.6420344607743511
wholesale_and_secured_scen2: 0.5603354978354979


<IPython.core.display.Javascript object>

In [107]:
print("wholesale_and_secured_scen1:", test_logreg(0, wholesale_and_secured_scen1))
print("wholesale_and_secured_scen2:", test_logreg(0, wholesale_and_secured_scen2))

wholesale_and_secured_scen1: 0.6402664692820133
wholesale_and_secured_scen2: 0.5974358974358974


<IPython.core.display.Javascript object>