# Libraries

In [12]:
import pandas as pd
import numpy as np
from functools import reduce
import pickle
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer


# Load Data

In [13]:
# Folder holding the pickled frames, relative to the notebook's working directory.
# Forward slashes are accepted on Windows as well as POSIX, whereas the previous
# r".\df_data\..." literals only resolved on Windows.
DATA_DIR = "./df_data"

# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(f"{DATA_DIR}/df.pkl")
df1 = pd.read_pickle(f"{DATA_DIR}/df1.pkl")
df2 = pd.read_pickle(f"{DATA_DIR}/df2.pkl")
df3 = pd.read_pickle(f"{DATA_DIR}/df3.pkl")
df4 = pd.read_pickle(f"{DATA_DIR}/df4.pkl")

In [14]:
# Frames to pivot, in order, and the matching 1-based numbers that appear
# in the "df{i}_COMPLETED" / "df{i}_TERMINATED" column names.
dfis = [df1, df2, df3, df4]
iss = list(range(1, len(dfis) + 1))

# Pivots

## Functions

In [15]:
def fun_sparse_all(dfs, List):
    """Count levels per study status for several frames and merge the counts.

    Every frame in ``dfs`` is numbered from 1. For each selected column of a
    frame, the rows are counted per (level, ``Study_Status_Bin``) pair; the
    per-column tables are stacked and their ``COMPLETED`` / ``TERMINATED``
    columns are renamed to ``df{i}_COMPLETED`` / ``df{i}_TERMINATED``.
    Finally the per-frame tables are outer-merged on ``value``.

    Parameters
    ----------
    dfs : iterable of DataFrame
        Frames that each contain a ``Study_Status_Bin`` column.
    List : bool
        When ``List == True`` the columns whose name contains ``'_List'`` or
        ``'Study_Status_Bin'`` are used; otherwise those whose name contains
        ``'_Bin'`` or ``'_Categ'``.

    Returns
    -------
    DataFrame
        One row per ``"<column> = <level>"`` label (column ``value``) followed
        by the count columns of every frame. Labels missing from a frame get
        NaN in that frame's columns.
    """

    def _level_counts(frame, col):
        # Rows per level of `col`, split by study status; absent pairs count as 0.
        table = frame.pivot_table(
            index=col,
            columns="Study_Status_Bin",
            aggfunc="size",
            fill_value=0,
            observed=False
        ).reset_index()

        # Replace the level column with a self-describing label placed first.
        table['value'] = col + ' = ' + table[col].astype(str)
        table = table.drop(columns=[col])
        ordered = ['value'] + [c for c in table.columns if c != 'value']
        return table[ordered]

    def _outer_on_value(left, right):
        # Outer join: the frames do not necessarily share the same levels for every variable.
        return pd.merge(left, right, on="value", how="outer")

    per_frame = []

    for number, frame in enumerate(dfs, start=1):
        if List == True:
            markers = ('_List', 'Study_Status_Bin')
        else:
            markers = ('_Bin', '_Categ')
        wanted = [name for name in frame.columns if any(marker in name for marker in markers)]

        stacked = pd.concat(
            [_level_counts(frame, name) for name in wanted],
            axis=0,
            ignore_index=True,
        )

        per_frame.append(
            stacked.rename(columns={
                "COMPLETED": f"df{number}_COMPLETED",
                "TERMINATED": f"df{number}_TERMINATED"
            })
        )

    return reduce(_outer_on_value, per_frame)


- The terminated sample is left as is after the train-test split.
- The train set is going to be reduced even further.

In [16]:
def fun_train_test(pivot_sparse, train_size=0.8, test_size=0.2):
    """Scale a table of counts into its expected train and test shares.

    No rows are sampled: every numeric column of ``pivot_sparse`` is multiplied
    by the given fraction and rounded to a whole number, which mimics a
    perfectly even split of every level. The earlier comments mentioned a
    0.9 / 0.1 split while the code applied 0.8 / 0.2; the fractions are now
    explicit parameters whose defaults keep the 0.8 / 0.2 behaviour.

    Parameters
    ----------
    pivot_sparse : DataFrame
        Table of counts; non-numeric columns (e.g. ``value``) are copied unchanged.
    train_size : float, default 0.8
        Fraction of every count assigned to the train table.
    test_size : float, default 0.2
        Fraction of every count assigned to the test table.

    Returns
    -------
    (DataFrame, DataFrame)
        The train table and the test table. ``pivot_sparse`` itself is not modified.

    Notes
    -----
    As a side effect the module-level name ``numeric`` is (re)assigned to the
    numeric column labels of ``pivot_sparse``, so code that reads that global
    keeps working.
    """
    global numeric

    numeric = pivot_sparse.select_dtypes(include="number").columns

    # Work on copies so the caller's table keeps the full counts.
    pivot_sparse_train = pivot_sparse.copy()
    pivot_sparse_test = pivot_sparse.copy()

    pivot_sparse_train[numeric] = np.round(pivot_sparse_train[numeric] * train_size, 0)
    pivot_sparse_test[numeric] = np.round(pivot_sparse_test[numeric] * test_size, 0)
    return pivot_sparse_train, pivot_sparse_test


In [17]:
def fun_res_scenario(iss, pivot_sparse_train, pivot_sparse_test):
    """Return, per frame number, the factor that balances COMPLETED against TERMINATED.

    For each ``i`` in ``iss`` the train table's ``df{i}_COMPLETED`` count on the
    ``'Study_Status_Bin = COMPLETED'`` row and its ``df{i}_TERMINATED`` count on
    the ``'Study_Status_Bin = TERMINATED'`` row are read, and the factor ``x``
    solving ``(completed / terminated) * x = 1`` is computed.

    Parameters
    ----------
    iss : iterable of int
        Frame numbers used in the ``df{i}_...`` column names.
    pivot_sparse_train : DataFrame
        Train table with a ``value`` column and the ``df{i}_...`` count columns.
    pivot_sparse_test : DataFrame
        Accepted for signature compatibility; not used.

    Returns
    -------
    list of numpy.ndarray
        One array per frame number, holding the factor for every matching row.
    """

    def _factor(i):
        labels = pivot_sparse_train['value']
        completed = pivot_sparse_train.loc[
            labels == 'Study_Status_Bin = COMPLETED', f'df{i}_COMPLETED'
        ].values
        terminated = pivot_sparse_train.loc[
            labels == 'Study_Status_Bin = TERMINATED', f'df{i}_TERMINATED'
        ].values
        # Inverse of the completed-to-terminated ratio.
        return 1/(completed/terminated)

    return [_factor(i) for i in iss]

In [18]:
def fun_zeros(pivot_sparse_train, pivot_sparse_test, count, numeric_cols=None):
    """Select the sparse level rows of the train and test tables.

    A row is selected when any of its numeric columns is below ``count`` and
    its ``value`` label is neither ``'Study_Status_Bin = COMPLETED'`` nor
    ``'Study_Status_Bin = TERMINATED'`` (the class-total rows).

    Each table is filtered with its own ``value`` labels. Previously the test
    table was filtered with the train table's labels, which is only correct
    when both tables list the same labels on the same index. The numeric
    columns are likewise taken from each table instead of a module-level
    global, so the function no longer depends on ``fun_train_test`` having
    been called first.

    Parameters
    ----------
    pivot_sparse_train, pivot_sparse_test : DataFrame
        Tables with a ``value`` column and numeric count columns.
    count : number
        Threshold; counts strictly below it mark a row as sparse.
        NaN counts never compare below the threshold.
    numeric_cols : list-like of column labels, optional
        Columns to check. Defaults to every numeric column of the table
        being filtered.

    Returns
    -------
    (DataFrame, DataFrame)
        Sparse rows of the train table and sparse rows of the test table.
    """

    def _sparse_rows(pivot):
        if numeric_cols is None:
            cols = pivot.select_dtypes(include="number").columns
        else:
            cols = numeric_cols
        labels = pivot['value']
        below = (pivot[cols] < count).any(axis=1)
        not_status = (labels != 'Study_Status_Bin = COMPLETED') & \
                     (labels != 'Study_Status_Bin = TERMINATED')
        return pivot[below & not_status]

    zero_train = _sparse_rows(pivot_sparse_train)
    zero_test = _sparse_rows(pivot_sparse_test)
    return zero_train, zero_test


## Categ

In [19]:
# Level counts of the *_Bin / *_Categ columns for every frame.
pivot_sparse = fun_sparse_all(dfis, False)

# Expected train / test shares of those counts.
pivot_sparse_train, pivot_sparse_test = fun_train_test(pivot_sparse)

# Per-frame factor that brings COMPLETED down to the size of TERMINATED in the train share.
xis = fun_res_scenario(iss, pivot_sparse_train, pivot_sparse_test)

# Rescale only the COMPLETED columns of the train share; TERMINATED stays as is.
for i, xi in zip(iss, xis):
    completed_col = f'df{i}_COMPLETED'
    pivot_sparse_train[completed_col] = np.round(pivot_sparse_train[completed_col] * xi, 0)

# Levels with fewer than 10 observations in any column are considered sparse.
zero_train, zero_test = fun_zeros(pivot_sparse_train, pivot_sparse_test, 10)

display(pivot_sparse_train)


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
0,Adverse_Bin = No,3045.0,2958.0,3449.0,3750.0,1253.0,1558.0,1386.0,1530.0
1,Adverse_Bin = Yes,777.0,865.0,2597.0,2296.0,1105.0,800.0,461.0,318.0
2,Adverse_System_Categ = 0–1,3095.0,3009.0,3608.0,3934.0,1305.0,1609.0,1482.0,1621.0
3,Adverse_System_Categ = 2–22,727.0,814.0,2438.0,2112.0,1053.0,749.0,365.0,226.0
4,Arm_Categ = 1–2,2404.0,2646.0,4303.0,4707.0,1802.0,1896.0,1518.0,1618.0
5,Arm_Categ = 3–32,,,,,556.0,462.0,329.0,230.0
6,Arm_Categ = 3–43,,,1743.0,1338.0,,,,
7,Arm_Categ = 3–44,1418.0,1177.0,,,,,,
8,City_Categ = 0–1,2803.0,2366.0,3289.0,3590.0,1067.0,1190.0,1447.0,1510.0
9,City_Categ = 2–1064,,,,,1291.0,1167.0,,


## List

In [20]:
def fun_lists(dfi):
    """Expand list-valued ``*_List`` columns into one indicator column per class.

    Every column whose name contains ``'_List'`` and that holds at least one
    Python ``list`` is binarised with ``MultiLabelBinarizer``; the original
    column is dropped and the indicator columns, named ``"<column>_<class>"``,
    are appended. ``*_List`` columns without any list value are left unchanged.

    Parameters
    ----------
    dfi : DataFrame
        Frame to expand; it is not modified in place.

    Returns
    -------
    DataFrame
        The frame with the list columns replaced by indicator columns.
    """
    list_named = [name for name in dfi.columns if '_List' in name]

    for col in list_named:
        holds_lists = dfi[col].apply(lambda x: isinstance(x, list)).any()
        if not holds_lists:
            continue

        binarizer = MultiLabelBinarizer()
        encoded = binarizer.fit_transform(dfi[col])
        indicator_names = [f"{col}_{cls}" for cls in binarizer.classes_]
        dummies = pd.DataFrame(encoded, columns=indicator_names, index=dfi.index)

        remaining = dfi.drop(columns=[col])
        dfi = pd.concat([remaining, dummies], axis=1)

    return dfi

# Replace list-valued *_List columns with indicator columns in every frame.
df1 = fun_lists(df1)
df2 = fun_lists(df2)
df3 = fun_lists(df3)
df4 = fun_lists(df4)

# Combined pivot over the *_List columns (plus Study_Status_Bin).
pivot_sparse_list = fun_sparse_all([df1, df2, df3, df4], True)

# Expected train / test shares of those counts.
pivot_sparse_train_list, pivot_sparse_test_list = fun_train_test(pivot_sparse_list)

# `xis` computed in the Categ section is reused here rather than recomputed
# (per the original note, the sample sizes are the same as above).
for i, xi in zip(iss, xis):
    completed_col = f'df{i}_COMPLETED'
    pivot_sparse_train_list[completed_col] = np.round(pivot_sparse_train_list[completed_col] * xi, 0)

# Levels with fewer than 10 observations in any column are considered sparse.
zero_train_list, zero_test_list = fun_zeros(pivot_sparse_train_list, pivot_sparse_test_list, 10)

display(pivot_sparse_train_list)


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
0,Adverse_List_Adv_Death = 0,3433.0,3264.0,4589.0,4637.0,1642.0,1842.0,1579.0,1659.0
1,Adverse_List_Adv_Death = 1,389.0,558.0,1457.0,1409.0,716.0,515.0,268.0,188.0
2,Adverse_List_Adv_None = 0,796.0,879.0,2648.0,2355.0,1125.0,823.0,512.0,370.0
3,Adverse_List_Adv_None = 1,3026.0,2943.0,3398.0,3690.0,1233.0,1534.0,1335.0,1477.0
4,Adverse_List_Adv_Serious = 0,3432.0,3169.0,4093.0,4327.0,1400.0,1702.0,1578.0,1691.0
...,...,...,...,...,...,...,...,...,...
171,Sex_List = ALL,3151.0,3418.0,5323.0,5409.0,2080.0,2094.0,1575.0,1603.0
172,Sex_List = FEMALE,164.0,201.0,474.0,398.0,206.0,196.0,214.0,204.0
173,Sex_List = MALE,506.0,204.0,250.0,239.0,72.0,68.0,58.0,40.0
174,Study_Status_Bin = COMPLETED,3822.0,0.0,6046.0,0.0,2358.0,0.0,1847.0,0.0


In [21]:
# Rows of the Bin/Categ pivot flagged by fun_zeros with threshold 10: train table first, then test table.
display(zero_train, zero_test)

Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
24,Continent_Categ = 2–6,275.0,112.0,554.0,202.0,638.0,146.0,76.0,9.0
31,Covid_19_Bin = Yes,67.0,22.0,122.0,70.0,76.0,41.0,21.0,7.0


# Zero Rows

In [22]:
# Rows of the List pivot flagged by fun_zeros with threshold 10: train table first, then test table.
display(zero_train_list, zero_test_list)

Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
149,Intervention_Type_List_RADIATION = 1,46.0,128.0,119.0,211.0,13.0,30.0,2.0,6.0


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
13,Adverse_System_List_Adv_Syst_Death = 1,25.0,5.0,46.0,19.0,28.0,7.0,67.0,16.0
15,Adverse_System_List_Adv_Syst_Ear = 1,222.0,66.0,478.0,137.0,300.0,55.0,61.0,6.0
23,Adverse_System_List_Adv_Syst_Hepatobiliary = 1,182.0,63.0,518.0,143.0,509.0,77.0,78.0,8.0
25,Adverse_System_List_Adv_Syst_Immune System = 1,181.0,61.0,415.0,117.0,329.0,48.0,62.0,5.0
35,Adverse_System_List_Adv_Syst_Neoplasms = 1,200.0,74.0,563.0,151.0,613.0,81.0,96.0,7.0
53,"Adverse_System_List_Adv_Syst_Surgical, Medical...",62.0,12.0,203.0,39.0,190.0,27.0,59.0,7.0
80,"Conditions_Detail_List_Health Care, Therapeuti...",70.0,8.0,75.0,19.0,51.0,7.0,64.0,14.0
118,Intervention_Model_List = FACTORIAL,29.0,4.0,33.0,7.0,27.0,6.0,28.0,7.0
120,Intervention_Model_List = SEQUENTIAL,486.0,171.0,142.0,88.0,16.0,5.0,18.0,3.0
133,Intervention_Type_List_BEHAVIORAL = 1,60.0,7.0,108.0,17.0,56.0,6.0,40.0,8.0


# Merge Categories for sparse data. 
- Some categories have already been merged in the Thesis_data file, based on the whole dataset (df). 
- Those were obviously very small compared to the dataset. 
- Here, the levels are examined under an idealized scenario: a perfectly even train-test split and a resample applied evenly across feature samples. Thus, there is no randomness that could put more or less data of a feature in completed vs terminated, or in the train vs test set. 
- This way, the levels that are at extreme risk of sparsity are going to be merged into larger ones. 
