In [21]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Generate Data

In [22]:
import pandas as pd
import numpy as np

# Generate Data
from sklearn.datasets import make_classification
from targen.data import target

<IPython.core.display.Javascript object>

## Get random Gaussian Data

In [23]:
# Get random Gaussian data with 18 features
n_features = 18
X, dummy = make_classification(n_samples=10000, n_features=n_features)

<IPython.core.display.Javascript object>

In [24]:
# Half of the features are known and half hidden
known_cols = [f"known_col_{ix}" for ix in range(int(n_features / 2))]
if n_features % 2 == 0:
    hidden_cols = [f"hidden_col_{ix}" for ix in range(int(n_features / 2))]

else:
    hidden_cols = [f"hidden_col_{ix}" for ix in range(int(n_features / 2) + 1)]
columns = known_cols + hidden_cols

<IPython.core.display.Javascript object>

In [25]:
# Map data to dataframe
data = pd.DataFrame(X, columns=columns)

<IPython.core.display.Javascript object>

## Add Correlations between the features

In [26]:
# Add correlation between the columns
# theta_pi_2 is the angle of rotation (expressed in pi/2). When setting it to 1, you will add 100% correaltion between the variables.
def add_correlation_between_columns(df, col_1, col_2, theta_pi_2=1):

    theta = theta_pi_2 * 0.5 * np.pi

    data_out = df.copy()
    data_out[col_1] = -1 * np.cos(theta) * df[col_1] + np.sin(theta * df[col_2])
    data_out[col_2] = np.cos(theta) * df[col_1] + np.sin(theta * df[col_2])

    return data_out

<IPython.core.display.Javascript object>

In [27]:
# Data with new columns
new_data = (
    data.pipe(
        add_correlation_between_columns,
        col_1="known_col_0",
        col_2="known_col_1",
        theta_pi_2=1,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="known_col_2",
        col_2="known_col_3",
        theta_pi_2=-0.9,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="known_col_2",
        col_2="hidden_col_0",
        theta_pi_2=-0.8,
    )
    .pipe(
        add_correlation_between_columns,
        col_1="hidden_col_1",
        col_2="known_col_1",
        theta_pi_2=0.4,
    )
)

<IPython.core.display.Javascript object>

In [28]:
new_data.corr()[
    [
        "known_col_0",
        "known_col_1",
        "known_col_2",
        "known_col_3",
        "hidden_col_0",
        "hidden_col_1",
    ]
]

Unnamed: 0,known_col_0,known_col_1,known_col_2,known_col_3,hidden_col_0,hidden_col_1
known_col_0,1.0,0.337964,-0.012207,-0.00013,-0.019268,0.356009
known_col_1,0.337964,1.0,-0.00336,0.002254,-0.000997,-0.75911
known_col_2,-0.012207,-0.00336,1.0,-0.275623,0.813755,-0.005044
known_col_3,-0.00013,0.002254,-0.275623,1.0,0.276206,-0.002354
known_col_4,0.001752,-0.011962,-0.002393,-0.003712,-0.004437,0.013258
known_col_5,0.005117,0.009189,0.002074,-0.019802,-0.010552,-0.005537
known_col_6,0.002111,0.000863,-0.008765,-0.006724,-0.010625,0.000747
known_col_7,-0.004473,-0.00571,0.003634,-0.01174,-0.003303,0.002623
known_col_8,-0.013877,0.005188,0.002422,-0.003551,-0.002063,-0.014639
hidden_col_0,-0.019268,-0.000997,0.813755,0.276206,1.0,-0.012332


<IPython.core.display.Javascript object>

## Generate X-y relationship (feature-target)

In [29]:
expressions_x_y = {
    "linear": "-0.5*known_col_0 + 2.*known_col_1 + 0.7*known_col_4 -0.4*hidden_col_0",
    "non_linear": "-0.7*known_col_3**1.5 + 0.2*sin(known_col_1)+ 0.9*log(hidden_col_4) -0.1*hidden_col_2**2",
    "interaction": "0.05*known_col_3*known_col_4 -0.1*(known_col_4/hidden_col_1)",
    "uniform_noise": {"weight": 0.5},
}

<IPython.core.display.Javascript object>

## Generate accept-reject relationship

In [30]:
expressions_accept_reject = {
    "linear": "-14*known_col_4 + 2.*known_col_5 + 0.9*hidden_col_6 -2.1*known_col_3",
    "non_linear": "-0.9*known_col_3**0.5 + 0.2*sin(known_col_5) -2.5*log(hidden_col_5)-0.05*hidden_col_6**2.5",
    #     'interaction': '0.01*known_col_3*known_col_4 -0.05*(hidden_col_5/hidden_col_6)',
    "uniform_noise": {"weight": 0.1},
}

<IPython.core.display.Javascript object>

## Apply targen and get the new dataset

In [34]:
X_y = target.get_target_and_contributions(
    data, expressions=expressions_x_y, imbalance=0.05, drop_features=False
)
X_acc_reject = target.get_target_and_contributions(
    data, expressions=expressions_accept_reject, imbalance=0.7, drop_features=False
)
X_acc_reject = X_acc_reject.rename(columns={"y": "is_accepted"})

<IPython.core.display.Javascript object>

## Put all data together

In [35]:
dataset = pd.concat([X_y, X_acc_reject["is_accepted"]], axis=1).drop(
    [col for col in X_y.columns if col.startswith("score")], axis=1
)

<IPython.core.display.Javascript object>

## Check the default rate per samples (kgb vs rejected)

In [37]:
kgb = dataset[dataset["is_accepted"] == 1]
rejected = dataset[dataset["is_accepted"] == 0]

<IPython.core.display.Javascript object>

In [38]:
kgb["y"].value_counts(normalize=True)

0    0.956143
1    0.043857
Name: y, dtype: float64

<IPython.core.display.Javascript object>

In [39]:
rejected["y"].value_counts(normalize=True)

0    0.935667
1    0.064333
Name: y, dtype: float64

<IPython.core.display.Javascript object>

## Get the modelling dataset

In [54]:
model_ds = dataset[
    [col for col in dataset.columns if col.startswith("known")] + ["y", "is_accepted"]
]

<IPython.core.display.Javascript object>

In [65]:
model_ds.to_csv(r"C:\Users\Asus\Desktop\Repo\MasterThesis_RI\model_ds.csv", index=False)

<IPython.core.display.Javascript object>