# Libraries

In [176]:
import pandas as pd
import numpy as np
from functools import reduce
import pickle
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer


# Load Data

In [177]:
df = pd.read_pickle(r".\df_data\df.pkl")
df1 = pd.read_pickle(r".\df_data\df1.pkl")
df2 = pd.read_pickle(r".\df_data\df2.pkl")
df3 = pd.read_pickle(r".\df_data\df3.pkl")
df4 = pd.read_pickle(r".\df_data\df4.pkl")

In [178]:
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

# Pivots

## Functions

In [179]:
def fun_sparse_all(dfs, List):

    all_pivots = []

    for i, dfi in enumerate(dfs, start=1):
        if List == True:
            cols = [col for col in dfi.columns if '_List' in col or 'Study_Status_Bin' in col]
        else:
            cols = [col for col in dfi.columns if '_Bin' in col or '_Categ' in col]

        pivots = [] 

        for col in cols:
            pivot = dfi.pivot_table(
                index=col,
                columns="Study_Status_Bin",
                aggfunc="size",
                fill_value=0,
                observed=False
            ).reset_index()

            pivot['value'] = col + ' = ' + pivot[col].astype(str)
            pivot.drop(columns=[col], inplace=True)

            final_cols = ['value'] + [c for c in pivot.columns if c != 'value']
            pivot = pivot[final_cols]

            pivots.append(pivot)

        pivot_sparse = pd.concat(pivots, axis=0, ignore_index=True)

        pivot_sparse = pivot_sparse.rename(columns={
            "COMPLETED": f"df{i}_COMPLETED",
            "TERMINATED": f"df{i}_TERMINATED"
        })

        all_pivots.append(pivot_sparse)

    pivot_merged = reduce(lambda left, right: pd.merge(left, right, on="value", how="outer"), all_pivots) 
    # sos 'outer' because may not all df have the same levels in some variables.
    return pivot_merged


- Terminated sample is going to be left as is after train test split.
- Train set ois going to be even more reduced.

In [180]:
def fun_pivot(pivot_sparse):
    global numeric

    # Train set - test spit both Completed - Terminated samples will be altered
    pivot_sparse_train = pivot_sparse.copy()
    pivot_sparse_test = pivot_sparse.copy()
    numeric = pivot_sparse.select_dtypes(include = "number").columns
    # Train-Test Split Scenario that X_Train will be 0.9 of total sample. (See models file)
    # Train set - 0.9 of dataset
    pivot_sparse_train[numeric] = np.round(pivot_sparse_train[numeric]* 0.88, 0) # train size 
    # Test set - 0.1 of dataset - Sample alters stop here
    pivot_sparse_test[numeric] = np.round(pivot_sparse_test[numeric]* 0.12 , 0)
    return pivot_sparse_train, pivot_sparse_test


In [181]:
def fun_res_scenario(iss, pivot_sparse_train, pivot_sparse_test):
    xis = []
    for i in iss:    
        dfi_comp = pivot_sparse_train.loc[pivot_sparse_train['value'] == 'Study_Status_Bin = COMPLETED', f'df{i}_COMPLETED']
        dfi_term = pivot_sparse_train.loc[pivot_sparse_train['value'] == 'Study_Status_Bin = TERMINATED', f'df{i}_TERMINATED']

        # When Resample upcoming train set, it must be: (df1_comp/df1_term)*x = 1
        xi = 1/(dfi_comp.values/dfi_term.values)  # 0.25695 etc. 
        xis.append(xi)

    return xis

In [182]:
def fun_zeros(pivot_sparse_train, pivot_sparse_test, count):
    zero_train = pivot_sparse_train[((pivot_sparse_train[numeric] < count).any(axis=1)) &\
                                    (pivot_sparse_train['value'] != 'Study_Status_Bin = COMPLETED')&\
                                    (pivot_sparse_train['value'] != 'Study_Status_Bin = TERMINATED')]
    zero_test = pivot_sparse_test[((pivot_sparse_test[numeric] < count).any(axis=1)) &\
                                    (pivot_sparse_train['value'] != 'Study_Status_Bin = COMPLETED')&\
                                    (pivot_sparse_train['value'] != 'Study_Status_Bin = TERMINATED')]
    return zero_train, zero_test


## Categ

In [183]:
pivot_sparse = fun_sparse_all(dfis, False)
pivot_sparse

pivot_sparse_train, pivot_sparse_test = fun_pivot(pivot_sparse)
pivot_sparse_train
pivot_sparse_test

xis = fun_res_scenario(iss, pivot_sparse_train, pivot_sparse_test)
xis

pivot_sparse_train['df1_COMPLETED'] = np.round(pivot_sparse_train['df1_COMPLETED']*xis[0], 0)
pivot_sparse_train['df2_COMPLETED'] = np.round(pivot_sparse_train['df2_COMPLETED']*xis[1], 0)
pivot_sparse_train['df3_COMPLETED'] = np.round(pivot_sparse_train['df3_COMPLETED']*xis[2], 0)
pivot_sparse_train['df4_COMPLETED'] = np.round(pivot_sparse_train['df4_COMPLETED']*xis[3], 0)

zero_train, zero_test = fun_zeros(pivot_sparse_train, pivot_sparse_test, 10)

display(pivot_sparse_train)


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
0,Adverse_Bin = No,3350.0,3253.0,3793.0,4125.0,1378.0,1713.0,1525.0,1683.0
1,Adverse_Bin = Yes,855.0,951.0,2857.0,2526.0,1215.0,880.0,507.0,349.0
2,Adverse_System_Categ = 0–1,3405.0,3310.0,3969.0,4327.0,1436.0,1770.0,1630.0,1783.0
3,Adverse_System_Categ = 2–26,800.0,895.0,,,,,,
4,Adverse_System_Categ = 2–27,,,2681.0,2323.0,1157.0,824.0,402.0,249.0
5,Arm_Categ = 1–2,2646.0,2910.0,4733.0,5178.0,1982.0,2086.0,1670.0,1779.0
6,Arm_Categ = 3–32,,,,,611.0,508.0,362.0,253.0
7,Arm_Categ = 3–43,,,1916.0,1472.0,,,,
8,Arm_Categ = 3–44,1560.0,1294.0,,,,,,
9,City_Categ = 0–1,3084.0,2603.0,3617.0,3949.0,1173.0,1309.0,1592.0,1661.0


## List

In [184]:
def fun_lists(dfi):
    for col in [c for c in dfi.columns if '_List' in c]:
        if dfi[col].apply(lambda x: isinstance(x, list)).any():
            mlb = MultiLabelBinarizer()
            dummies = pd.DataFrame(mlb.fit_transform(dfi[col]), 
                                   columns=[f"{col}_{cls}" for cls in mlb.classes_],
                                   index=dfi.index)
            dfi = pd.concat([dfi.drop(columns=[col]), dummies], axis=1)
    return dfi

df1 = fun_lists(df1)
df2 = fun_lists(df2)
df3 = fun_lists(df3)
df4 = fun_lists(df4)

# Combined Pivot
pivot_sparse_list = fun_sparse_all([df1, df2, df3, df4], True)

# display(pivot_sparse_list)
pivot_sparse_train_list, pivot_sparse_test_list = fun_pivot(pivot_sparse_list)
pivot_sparse_train_list

# xis = fun_res_scenario(iss, pivot_sparse_train_list, pivot_sparse_test_list) No need to rerun it. shapes same as above.

pivot_sparse_train_list['df1_COMPLETED'] = np.round(pivot_sparse_train_list['df1_COMPLETED']*xis[0], 0)
pivot_sparse_train_list['df2_COMPLETED'] = np.round(pivot_sparse_train_list['df2_COMPLETED']*xis[1], 0)
pivot_sparse_train_list['df3_COMPLETED'] = np.round(pivot_sparse_train_list['df3_COMPLETED']*xis[2], 0)
pivot_sparse_train_list['df4_COMPLETED'] = np.round(pivot_sparse_train_list['df4_COMPLETED']*xis[3], 0)

zero_train_list, zero_test_list = fun_zeros(pivot_sparse_train_list, pivot_sparse_test_list, 10)

display(pivot_sparse_train_list)


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
0,Adverse_List_Adv_Death = 0,3777.0,3590.0,5048.0,5100.0,1805.0,2027.0,1737.0,1825.0
1,Adverse_List_Adv_Death = 1,428.0,614.0,1602.0,1550.0,788.0,567.0,295.0,207.0
2,Adverse_List_Adv_None = 0,876.0,967.0,2912.0,2591.0,1237.0,906.0,563.0,407.0
3,Adverse_List_Adv_None = 1,3329.0,3238.0,3738.0,4059.0,1356.0,1688.0,1469.0,1624.0
4,Adverse_List_Adv_Serious = 0,3776.0,3486.0,4502.0,4760.0,1539.0,1873.0,1736.0,1860.0
...,...,...,...,...,...,...,...,...,...
203,Sex_List = ALL,3467.0,3759.0,5854.0,5950.0,2287.0,2303.0,1733.0,1764.0
204,Sex_List = FEMALE,181.0,221.0,521.0,437.0,226.0,216.0,235.0,224.0
205,Sex_List = MALE,557.0,224.0,274.0,263.0,79.0,75.0,64.0,44.0
206,Study_Status_Bin = COMPLETED,4205.0,0.0,6650.0,0.0,2593.0,0.0,2032.0,0.0


In [185]:
display(zero_train, zero_test)

Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
25,Continent_Categ = 2–6,165.0,67.0,332.0,121.0,383.0,88.0,46.0,6.0
32,Covid_19_Bin = Yes,40.0,13.0,73.0,42.0,46.0,24.0,13.0,4.0


In [186]:
display(zero_train_list, zero_test_list)

Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
25,Adverse_System_List_Adv_Syst_Genetic = 1,20.0,33.0,111.0,80.0,128.0,89.0,15.0,5.0
47,Adverse_System_List_Adv_Syst_Pregnancy/ Perina...,14.0,4.0,97.0,33.0,119.0,66.0,19.0,11.0
61,Adverse_System_List_Adv_Syst_Social = 1,11.0,10.0,54.0,27.0,46.0,26.0,11.0,1.0
86,Conditions_Detail_List_Education = 1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
92,Conditions_Detail_List_Health Care = 1,6.0,7.0,18.0,21.0,10.0,10.0,18.0,23.0
94,Conditions_Detail_List_Health Care Economics a...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
100,Conditions_Detail_List_Information Science = 1,2.0,2.0,5.0,2.0,1.0,0.0,0.0,0.0
167,Intervention_Type_List_DIAGNOSTIC_TEST = 1,9.0,11.0,20.0,18.0,5.0,3.0,7.0,11.0
173,Intervention_Type_List_GENETIC = 1,13.0,24.0,18.0,29.0,3.0,3.0,1.0,4.0
179,Intervention_Type_List_RADIATION = 1,50.0,141.0,131.0,232.0,14.0,33.0,3.0,6.0


Study_Status_Bin,value,df1_COMPLETED,df1_TERMINATED,df2_COMPLETED,df2_TERMINATED,df3_COMPLETED,df3_TERMINATED,df4_COMPLETED,df4_TERMINATED
9,Adverse_System_List_Adv_Syst_Blood/ Lymphatic = 1,228.0,86.0,563.0,200.0,333.0,62.0,70.0,9.0
13,Adverse_System_List_Adv_Syst_Death = 1,15.0,3.0,27.0,11.0,17.0,4.0,40.0,10.0
15,Adverse_System_List_Adv_Syst_Ear = 1,133.0,39.0,287.0,82.0,180.0,33.0,37.0,4.0
17,Adverse_System_List_Adv_Syst_Endocrine = 1,76.0,30.0,204.0,63.0,128.0,23.0,27.0,3.0
19,Adverse_System_List_Adv_Syst_Eye = 1,207.0,61.0,430.0,123.0,273.0,44.0,61.0,7.0
25,Adverse_System_List_Adv_Syst_Genetic = 1,13.0,4.0,43.0,11.0,80.0,12.0,10.0,1.0
27,Adverse_System_List_Adv_Syst_Hepatobiliary = 1,109.0,38.0,311.0,86.0,306.0,46.0,47.0,5.0
29,Adverse_System_List_Adv_Syst_Immune System = 1,108.0,37.0,249.0,70.0,197.0,29.0,37.0,3.0
35,Adverse_System_List_Adv_Syst_Investigations = 1,283.0,92.0,617.0,212.0,382.0,72.0,65.0,9.0
37,Adverse_System_List_Adv_Syst_Metabolism/ Nutri...,245.0,90.0,608.0,207.0,402.0,72.0,80.0,9.0


# Merge Categories for sparse data. 
- Some categorie have been merged from Thesis_data file from whole dataset (df). 
- Those were obviously much less compared to dataset. 
- Here it has been done an example of levels, after a perfect scenario of even train-test split and a resample across featture samples. Thus, there is no randomness to include more or less data of a feature in completed vs terminated and in train vs test set. 
- This way the levels that are in extreme risk of sparsity are going to be merged to greater ones. 

In [None]:
cond_map = {
    
}

df['Conditions_Detail_List'] = df['Conditions_Detail_List'].apply(lambda row: [cond_map.get(val, val) for val in row])

display(df["Funder_Industry_Bin"].value_counts(dropna=False))  

In [188]:
values = zero_train_list['value'].to_list() +  zero_test_list['value'].to_list()
values = list(set(values))
values

['Adverse_System_List_Adv_Syst_Genetic = 1',
 'Conditions_Detail_List_Information Science = 1',
 'Adverse_System_List_Adv_Syst_Blood/ Lymphatic = 1',
 'Intervention_Type_List_BEHAVIORAL = 1',
 'Intervention_Type_List_BIOLOGICAL = 1',
 'Intervention_Type_List_GENETIC = 1',
 'Adverse_System_List_Adv_Syst_Metabolism/ Nutrition = 1',
 'Conditions_Detail_List_Chemical Disorders = 1',
 'Adverse_System_List_Adv_Syst_Psychiatric/ Social = 1',
 'Conditions_Detail_List_Health Care = 1',
 'Intervention_Type_List_RADIATION = 1',
 'Conditions_Detail_List_Education = 1',
 'Adverse_System_List_Adv_Syst_Product Issues = 1',
 'Intervention_Model_List = SEQUENTIAL',
 'Conditions_Detail_List_Hereditary, Neonatal, Abnormalities = 1',
 'Continents_List_South America = 1',
 'Adverse_System_List_Adv_Syst_Death = 1',
 'Conditions_Detail_List_Otorhinolaryngologic = 1',
 'Conditions_Detail_List_Health Care Economics and Organizations = 1',
 'Primary_Purpose_List = SUPPORTIVE_CARE',
 'Adverse_System_List_Adv_Sys