In [1]:
import pandas as pd
import numpy as np
from scipy.linalg import toeplitz
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from typing import Union

In [2]:
def data_sim(n=100,
             intercept=-5,
             linear_vars=10,
             noise_vars=0,
             corr_vars=0,
             corr_type="AR1",
             corr_value=0,
             mislabel=0,
             surg_err=0.05,
             bin_var_p=0,
             bin_coef=0,
             outcome="classification",
             regression_err=None,
             seed=4763546,
             ):
    """
    Simulate a tabular data set with a binary or continuous target.

    This function is for the most part a direct translation of the twoClassSim function from the R package caret.
    Full credit for the approach used for simulating binary classification data goes to the authors and contributors
    of caret.

    There are some modifications from the R implementation:
    1. The ordinal outcome option has not been translated
    2. The addition of another linear feature (``Linear1_prime``) that is a copy of ``Linear1`` with a small amount
    of noise has been added to allow for the study of variable surrogacy
    3. Option for a binary predictor (``Binary1``) and surrogate (``Binary1_prime = 1 - Binary1``) has also been added
    4. Toggle option for regression versus classification has also been added

    Source:
        Caret: Kuhn, M. (2008). Caret package. Journal of Statistical Software, 28(5)
        https://rdrr.io/cran/caret/man/twoClassSim.html

    Column order of the result: TwoFactor1, TwoFactor2, Linear1..k, Nonlinear1..3, Noise1..k, Corr1..k,
    Linear1_prime, Binary1, Binary1_prime, target (optional groups are omitted when their count/probability is 0).

    Note: the function seeds numpy's *global* random generator, so repeated calls with the same ``seed`` return
    identical data and any later ``np.random`` draws in the caller continue from that seeded stream.

    :param n: number of observations
    :param intercept: value for the intercept which can be modified to generate class imbalance
    :param linear_vars: number of linear features
    :param noise_vars: number of noise features (i.e., do not contribute to the linear predictor)
    :param corr_vars: number of correlated noise features
    :param corr_type: type of correlation for correlated noise features: "exch" (exchangeable) or "AR1"
        (auto-regressive); only checked when corr_vars > 0
    :param corr_value: correlation for correlated noise features
    :param mislabel: proportion of rows (0 < mislabel < 1) whose class probability is flipped to 1 - p before the
        target is drawn; values outside that open interval disable mislabelling
    :param surg_err: standard deviation of the noise added to Linear1 to build Linear1_prime
    :param bin_var_p: prevalence, i.e. P(Binary1 == 1), for a binary feature to include in linear predictor;
        0 disables the binary feature
    :param bin_coef: coefficient for the impact of binary feature on linear predictor
    :param outcome: can be either classification for a binary outcome or regression for a continuous outcome
    :param regression_err: standard deviation of the error used in simulating a regression outcome (required when
        outcome is "regression")
    :param seed: value passed to ``np.random.seed``; the default reproduces the historical fixed seed
    :return: data frame containing the simulated features and a ``target`` column
    :raises ValueError: if ``outcome`` or ``corr_type`` is not one of the supported values, or if
        ``regression_err`` is missing for a regression outcome
    """

    # Fail fast on unsupported options instead of hitting an unbound variable or
    # silently returning a frame without a target further down.
    if outcome not in ("classification", "regression"):
        raise ValueError("outcome must be 'classification' or 'regression', got {!r}".format(outcome))
    if outcome == "regression" and regression_err is None:
        raise ValueError("regression_err must be provided when outcome='regression'")
    if corr_vars > 0 and corr_type not in ("exch", "AR1"):
        raise ValueError("corr_type must be 'exch' or 'AR1', got {!r}".format(corr_type))

    # set seed (global generator, as before; now configurable)
    np.random.seed(seed=seed)

    # NOTE: the order of the random draws below is kept fixed so that a given seed
    # keeps producing the same data set.

    # two correlated normal features
    sigma = np.array([[2, 1.3], [1.3, 2]])
    mu = [0, 0]
    feature_blocks = [
        pd.DataFrame(np.random.multivariate_normal(mu, sigma, size=n), columns=["TwoFactor1", "TwoFactor2"])
    ]

    # linear features
    lin_cols = ['Linear' + str(x) for x in range(1, linear_vars + 1)]
    if linear_vars > 0:
        feature_blocks.append(pd.DataFrame(np.random.normal(size=(n, linear_vars)), columns=lin_cols))

    # features for non-linear terms: Nonlinear1 ~ U(-1, 1), Nonlinear2/3 ~ U(0, 1)
    feature_blocks.append(pd.DataFrame({'Nonlinear1': np.random.uniform(low=-1.0, high=1.0, size=n)}))
    feature_blocks.append(pd.DataFrame(np.random.uniform(size=(n, 2)), columns=['Nonlinear2', 'Nonlinear3']))

    # independent noise variables as needed
    if noise_vars > 0:
        noise_cols = ['Noise' + str(x) for x in range(1, noise_vars + 1)]
        feature_blocks.append(pd.DataFrame(np.random.normal(size=(n, noise_vars)), columns=noise_cols))

    # correlated noise features
    if corr_vars > 0:
        if corr_type == "exch":
            # constant off-diagonal correlation, unit diagonal
            vc = corr_value * np.ones((corr_vars, corr_vars))
            np.fill_diagonal(vc, 1)
        else:
            # AR1: correlation decays as corr_value ** |i - j|
            vc = toeplitz(corr_value ** np.arange(corr_vars))

        corr_cols = ['Corr' + str(x) for x in range(1, corr_vars + 1)]
        feature_blocks.append(pd.DataFrame(np.random.multivariate_normal(np.zeros(corr_vars), vc, size=n),
                                           columns=corr_cols))

    tmp_data = pd.concat(feature_blocks, axis=1)

    # surrogate linear feature: noisy copy of Linear1
    if linear_vars > 0:
        tmp_data['Linear1_prime'] = tmp_data['Linear1'] + np.random.normal(0, surg_err, size=n)

    # binary feature; equals 1 with probability bin_var_p so that bin_var_p really is
    # the prevalence (the condition used to be inverted, giving prevalence 1 - bin_var_p)
    if bin_var_p > 0:
        tmp_data['Binary1'] = np.where(np.random.uniform(size=n) <= bin_var_p, 1, 0)

    # contributions to the linear predictor: interaction terms (coefficients 4, 4, 2)
    # plus the three non-linear terms
    lp = intercept + 4 * tmp_data.TwoFactor1 + 4 * tmp_data.TwoFactor2 + 2 * tmp_data.TwoFactor1 * tmp_data.TwoFactor2 \
         + tmp_data.Nonlinear1 ** 3 + 2 * np.exp(-6 * (tmp_data.Nonlinear1 - 0.3) ** 2) + \
         2 * np.sin(np.pi * tmp_data.Nonlinear2 * tmp_data.Nonlinear3)

    if linear_vars > 0:
        # coefficients run from 2.5 down to 0.25, with every second one negated
        lin_coeff = np.linspace(10, 1, num=linear_vars) / 4
        lin_coeff[1::2] *= -1
        lp = lp + tmp_data[lin_cols].dot(lin_coeff)

    if bin_var_p > 0:
        lp = lp + bin_coef * tmp_data['Binary1']
        # perfectly anti-correlated surrogate of the binary feature
        tmp_data['Binary1_prime'] = 1 - tmp_data['Binary1']

    if outcome == 'classification':

        # convert to a probability; a plain writable array so rows can be addressed by position
        prob = np.array(1 / (1 + np.exp(-lp)), dtype=float)

        # add mislabelling if desired: flip the class probability of a random subset of rows
        if (mislabel > 0) and (mislabel < 1):
            # np.random.choice needs an integer sample size; np.floor alone returns a float
            n_flip = int(np.floor(n * mislabel))
            shuffle_rows = np.random.choice(n, n_flip, replace=False)
            prob[shuffle_rows] = 1 - prob[shuffle_rows]

        # generate target: 1 with probability prob
        tmp_data['target'] = np.where(prob <= np.random.uniform(size=n), 0, 1)

    else:

        # continuous outcome: linear predictor plus normal error
        tmp_data['target'] = np.random.normal(lp, regression_err, size=n)

    return tmp_data


def scale_var(df: pd.DataFrame,
              feature_name: str,
              min_: Union[int, float] = 0,
              max_: Union[int, float] = 1) -> np.array:
    """
    Min-max scale a single column of a data frame into the range ``[min_, max_]``.

    The input frame is not modified; the caller decides where to store the result.

    :param df: data frame holding the column to rescale
    :param feature_name: name of the column to rescale
    :param min_: lower bound of the target range
    :param max_: upper bound of the target range
    :return: one-dimensional array with one scaled value per row of ``df``
    """
    rescaler = MinMaxScaler(feature_range=(min_, max_))
    # The scaler is fitted on a one-column frame (double brackets) and hands back
    # an (n_rows, 1) array, which is flattened to 1-D before returning.
    column_2d = rescaler.fit_transform(df[[feature_name]])
    return column_2d.ravel()


In [3]:
# Simulate 500 observations with a continuous target. Every argument is spelled
# out so the full simulation set-up is visible in one place.
df = data_sim(
    n=500,
    intercept=0,
    # feature counts: 3 linear features, 1 pure-noise feature, no correlated noise
    linear_vars=3,
    noise_vars=1,
    corr_vars=0,
    corr_type="AR1",
    corr_value=0.4,
    # no mislabelling and no binary feature
    mislabel=0,
    surg_err=0.05,
    bin_var_p=0,
    bin_coef=0,
    # continuous outcome: linear predictor plus normal error with sd 0.2
    outcome="regression",
    regression_err=0.2,
)

In [4]:
# Give the simulated columns drilling-domain names (the frame is renamed in place).
df.rename(
    columns={
        # Higher weight on the bit increases the risk of danger.
        "TwoFactor1": "Weight on bit [kg]",
        # A higher drill rate leaves drilling engineers less time to observe real-time
        # data and adjust the drilling parameters, leading to a higher risk of incident
        # (although drilling faster is more economic).
        "TwoFactor2": "Drill rate [m/s]",
        # Lowest point of the well.
        "Linear1": "Vertical depth of operation [m]",
        # Current position of the drilling bit.
        "Linear1_prime": "Bit depth [m]",
        # Mud and soil density need to be equal to avoid well collapse (formation falling
        # into the well and blocking the pipe) or mud loss (mud flowing into the formation).
        "Linear2": "Mud density [kg/m3]",
        # Diameter of the hole (the diameter diminishes as depth increases).
        "Linear3": "Hole diameter [m]",
        # Temperature at the drilling bit.
        "Noise1": "Temperature [C]",
        "target": "Failure likelihood (%)",
    },
    inplace=True,
)



In [5]:
# Target [lower, upper] range for each renamed column; every listed column is
# min-max rescaled into its range below.
scaling_dict = {
    "Weight on bit [kg]": [100, 500],
    "Drill rate [m/s]": [0.1, 1],
    "Vertical depth of operation [m]": [0, 1500],
    "Mud density [kg/m3]": [0.5, 4],
    "Hole diameter [m]": [0.5, 10],
    "Temperature [C]": [0, 40],
    "Bit depth [m]": [0, 1500],
    "Failure likelihood (%)": [0, 100],
}

# Each column is scaled independently of the others.
for column, (lower, upper) in scaling_dict.items():
    df.loc[:, column] = scale_var(df, column, lower, upper)

In [6]:
# Final dataframe: preview the first rows after renaming and rescaling.
df.head()

Unnamed: 0,Weight on bit [kg],Drill rate [m/s],Vertical depth of operation [m],Mud density [kg/m3],Hole diameter [m],Nonlinear1,Nonlinear2,Nonlinear3,Temperature [C],Bit depth [m],Failure likelihood (%)
0,239.302549,0.660993,886.156784,2.949484,0.5,-0.030589,0.822986,0.917557,21.582837,902.435896,23.084672
1,212.767755,0.647397,567.530485,3.338703,4.617729,0.090338,0.741262,0.831136,27.133111,551.297785,16.010499
2,256.828737,0.441749,1125.905909,2.503559,6.177708,-0.915346,0.14937,0.53727,10.448471,1144.445348,18.217468
3,230.388689,0.457263,1297.342216,2.137124,6.089012,-0.885661,0.131785,0.941753,18.143884,1309.752689,20.377512
4,264.516013,0.498232,731.8454,2.531889,5.303489,0.275052,0.447095,0.750145,13.733294,739.591496,21.080085
