# Libraries

In [863]:
# Stat Libs
import pandas as pd
from pandas.api.types import CategoricalDtype
import pickle
import re

# Stat Libs
import statsmodels.api as sm
from itertools import product
from functools import reduce

# Statistical libs
from sklearn.preprocessing import MultiLabelBinarizer


# Load Data

In [864]:
df1 = pd.read_pickle(r".\df_vizual\df1_vizual.pkl")
df2 = pd.read_pickle(r".\df_vizual\df2_vizual.pkl")
df3 = pd.read_pickle(r".\df_vizual\df3_vizual.pkl")
df4 = pd.read_pickle(r".\df_vizual\df4_vizual.pkl")

p_cl1 = pd.read_pickle(r".\p_values\p_cl1_vizual.pkl")
p_cl2 = pd.read_pickle(r".\p_values\p_cl2_vizual.pkl")
p_cl3 = pd.read_pickle(r".\p_values\p_cl3_vizual.pkl")
p_cl4 = pd.read_pickle(r".\p_values\p_cl4_vizual.pkl")

In [865]:
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

# Dummies
"_List” data: List element data, were encoded through dummy creation. The problem was that list-element data cannot be used from models. Firstly, list-element rows were exploded to one row per element of list. However, this inflates sample size, as rows increase and duplicate for same nct_id if more than one element occur in a list-row. For this reason, these data were then grouped by nct_id, so number of rows remained the same as initial datasets (df0, df1, df2, df3, df4, df5).

In [866]:
# Alternatively for List element columns 
mlb = MultiLabelBinarizer()

In [867]:
## Dummies
def fun_dum_enc(dfi, cols):
    for col in cols:  
        df_expl = dfi.copy()
        df_expl = df_expl.explode(col)

        df_expl[col] = df_expl[col].astype('category') 
        df_expl[col] = df_expl[col].cat.remove_unused_categories()
        df_expl[col] = df_expl[col].astype('str') #str cause of error in encoding. After astype(cat) so to drop unused categories

        dummies = pd.get_dummies(df_expl[col], drop_first = False, dtype = int, prefix = col , prefix_sep='_')
        
        dummies.index = df_expl.index # ensure same indexing with df_expl
        dummies = dummies.groupby(dummies.index).sum()

        dfi = pd.concat([dfi.drop(columns = [col], axis = 1), dummies], axis = 1)  
    return dfi

### Cols
def fun_dum_cols(dfis):
    dum_cols = []
    for dfi in dfis: # loop inputed in case dfis have not all the same columns. # * Plus not to run function into funtion.
        dum_cols = dum_cols + [[col for col in dfi.columns if 'list' in col.lower()]]
    return dum_cols

dum_cols = fun_dum_cols(dfis) 

# Apply
# * loop so not to run function into function
df1 = fun_dum_enc(df1, dum_cols[0])
df2 = fun_dum_enc(df2, dum_cols[1])
df3 = fun_dum_enc(df3, dum_cols[2])
df4 = fun_dum_enc(df4, dum_cols[3])


# Example
display(dum_cols[0])
df1[[col for col in df1.columns if 'list' in col.lower()]].head()


['Sex_List',
 'Age_List',
 'Intervention_Type_List',
 'Intervention_Route_List',
 'Conditions_Detail_List',
 'Adverse_List',
 'Adverse_System_List',
 'Allocation_List',
 'Intervention_Model_List',
 'Masking_List',
 'Masking_Detail_List',
 'Primary_Purpose_List',
 'Continents_List']

Unnamed: 0,Sex_List_ALL,Sex_List_FEMALE,Sex_List_MALE,Age_List_ADULT,Age_List_CHILD,Age_List_OLDER_ADULT,Intervention_Type_List_BEHAVIORAL,Intervention_Type_List_BIOLOGICAL,Intervention_Type_List_COMBINATION_PRODUCT,Intervention_Type_List_DEVICE,...,Primary_Purpose_List_SCREENING,Primary_Purpose_List_SUPPORTIVE_CARE,Primary_Purpose_List_TREATMENT,Continents_List_Africa,Continents_List_Asia,Continents_List_Cont_Other,Continents_List_Europe,Continents_List_North America,Continents_List_Oceania,Continents_List_South America
0,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


## Drop First
- Drop first was not done through get_dummies command, so to choose the column to drop, based on its characterisic. e.g., drop phase 0 is preferred that to drop phase 3.


In [868]:
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]


In [869]:

def fun_drop_first(dfi):
    dum_cols = [col for col in dfi.columns if '_list' in col.lower()]
    
    keywords = ['other', 'none', 'sex_list_all', 'age_list_child', 'na_randomize', 'single_grou']
    drop_cols = [col for col in dum_cols if any(key in col.lower() for key in keywords) and '_x_' not in col]
    
    dfi = dfi.drop(columns = drop_cols, axis = 1)
    return dfi

df1 = fun_drop_first(df1)
df2 = fun_drop_first(df2)
df3 = fun_drop_first(df3)
df4 = fun_drop_first(df4)

# Example Check
df1.columns.values.tolist()

['Study_Status_Bin',
 'Funder_Industry_Bin',
 'Completion_Gap_Log',
 'Placebo_Bin',
 'Standard_Care_Bin',
 'Healthy_Bin',
 'Covid_19_Bin',
 'Adverse_Counts_Log',
 'Adverse_Bin',
 'Adverse_System_Counts_Log',
 'Arm_Counts_Log',
 'Countries_Counts_Log',
 'City_Counts_Log',
 'Enrollment_Counts_Log',
 'Intervention_Type_Categ',
 'Intervention_Route_Categ',
 'Sex_List_FEMALE',
 'Sex_List_MALE',
 'Age_List_ADULT',
 'Age_List_OLDER_ADULT',
 'Intervention_Type_List_BEHAVIORAL',
 'Intervention_Type_List_BIOLOGICAL',
 'Intervention_Type_List_COMBINATION_PRODUCT',
 'Intervention_Type_List_DEVICE',
 'Intervention_Type_List_DIAGNOSTIC_TEST',
 'Intervention_Type_List_DIETARY_SUPPLEMENT',
 'Intervention_Type_List_DRUG',
 'Intervention_Type_List_GENETIC',
 'Intervention_Type_List_PROCEDURE',
 'Intervention_Type_List_RADIATION',
 'Intervention_Route_List_Injection',
 'Intervention_Route_List_Oral',
 'Intervention_Route_List_Surgical',
 'Intervention_Route_List_Topical',
 'Conditions_Detail_List_Bacteri

# Binary

In [870]:
# Binary Encoding
def fun_bin_enc(dfi, cols):
    dfi = dfi.copy()
    for col in cols:
        cats = CategoricalDtype(categories = sorted(dfi[col].dropna().unique()), ordered = False)
        dfi[col] = dfi[col].astype(cats).cat.codes
    return dfi


### Cols
def fun_bin_cols(dfis):
    bin_cols = []
    for dfi in dfis:
        bin_cols = bin_cols + [[col for col in dfi.columns if '_categ' in col.lower() or '_bin' in col.lower()]] 
    return bin_cols

# Apply
bin_cols = fun_bin_cols(dfis)

df1 = fun_bin_enc(df1, bin_cols[0])
df2 = fun_bin_enc(df2, bin_cols[1])
df3 = fun_bin_enc(df3, bin_cols[2])
df4 = fun_bin_enc(df4, bin_cols[3])

# Example
display(df1['Study_Status_Bin'].value_counts())  # Completed = 0, Terminated = 1
display(bin_cols[0])  # bin_cols[0] --> bin_cols of df1
display(df1[bin_cols[0]]) 

Study_Status_Bin
0    22546
1     4778
Name: count, dtype: int64

['Study_Status_Bin',
 'Funder_Industry_Bin',
 'Placebo_Bin',
 'Standard_Care_Bin',
 'Healthy_Bin',
 'Covid_19_Bin',
 'Adverse_Bin',
 'Intervention_Type_Categ',
 'Intervention_Route_Categ']

Unnamed: 0,Study_Status_Bin,Funder_Industry_Bin,Placebo_Bin,Standard_Care_Bin,Healthy_Bin,Covid_19_Bin,Adverse_Bin,Intervention_Type_Categ,Intervention_Route_Categ
0,0,1,0,0,1,0,0,0,0
1,0,0,1,0,1,0,0,0,0
2,0,1,0,0,1,0,1,0,0
3,0,1,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
27319,0,1,0,0,1,0,0,0,0
27320,1,1,0,0,0,0,0,0,0
27321,0,0,1,0,0,0,0,0,1
27322,0,0,1,0,0,0,1,0,0


# Interactions

In [871]:
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

display(df1.shape[1])
display(df2.shape[1])
display(df3.shape[1])
display(df4.shape[1])

118

118

118

118

In [872]:
# Interaction Continues_x_Categorical
col = 'Conditions_Detail_List_Neoplasms'
df4[f'Enrollment_x_{col}'] = df4['Enrollment_Counts_Log'] * df4[col]

inter_cols = [col for col in df4.columns if '_x_' in col]  # i have the world other in too many data levels !!
display(len(inter_cols))  # Must have created 1

# Intreaction of Categorical_x_Binary
df4[inter_cols].columns
def fun_inter(dfi, col1, col2, stip):

    cols1 = [col for col in df1.columns if col1 in col]  # All dfs have the same columns
    cols2 = [col for col in df1.columns if col2 in col]

    for col1, col2 in product(cols1, cols2):
        inter_col1 = f"{col1}_x_{col2}"
        dfi[inter_col1] = dfi[col1] * dfi[col2]
        inter_col = re.sub(stip, "", inter_col1)
        dfi.rename(columns = {inter_col1: inter_col}, inplace=True)
        
    return dfi
    
df1 = fun_inter(df1, 'Intervention_Type_List', 'Funder_Industry_Bin', r"Intervention_Type_List_|_Bin" )
df2 = fun_inter(df2, 'Intervention_Type_List', 'Funder_Industry_Bin', r"Intervention_Type_List_|_Bin")
df3 = fun_inter(df3, 'Intervention_Type_List', 'Funder_Industry_Bin', r"Intervention_Type_List_|_Bin")
df4 = fun_inter(df4, 'Intervention_Type_List', 'Funder_Industry_Bin', r"Intervention_Type_List_|_Bin")

inter_cols = [col for col in df2.columns if '_x_' in col]  # i have the world other in too many data levels !!
display(len(inter_cols))  # Funder (3 levels) x INTERV_TYPE (11 levels) = 33

dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

# Pivot
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

pivots_inter = []

for i, dfi in zip(iss, dfis):
    pivot = dfi.pivot_table(
        index = "Study_Status_Bin",
        values = [col for col in dfi.columns if '_x_' in col],
        aggfunc = "sum", 
        observed = False)
    
    pivot_inter_1 = pivot.T
    pivot_inter_1.columns = [f"df{i}_{outcome}" for outcome in pivot_inter_1.columns]  # Optional: label by df index
    
    pivots_inter.append(pivot_inter_1)

pivot_inter1 = pivots_inter[0]
pivot_inter2 = pivots_inter[1]
pivot_inter3 = pivots_inter[2]
pivot_inter4 = pivots_inter[3]

pivot_inter = pd.concat(pivots_inter, axis=1)
pivot_inter['Sum_Counts'] = pivot_inter.sum(axis = 1)
pivot_inter.sort_values(by = 'Sum_Counts')

pivot_inter

1

10

Unnamed: 0,df1_0,df1_1,df2_0,df2_1,df3_0,df3_1,df4_0,df4_1,Sum_Counts
BEHAVIORAL_x_Funder_Industry,7.0,1.0,35.0,11.0,14.0,3.0,7.0,3.0,81.0
BIOLOGICAL_x_Funder_Industry,1471.0,419.0,1308.0,450.0,1193.0,146.0,264.0,17.0,5268.0
COMBINATION_PRODUCT_x_Funder_Industry,92.0,30.0,70.0,29.0,69.0,14.0,16.0,5.0,325.0
DEVICE_x_Funder_Industry,189.0,35.0,179.0,66.0,207.0,53.0,185.0,42.0,956.0
DIAGNOSTIC_TEST_x_Funder_Industry,13.0,2.0,9.0,8.0,9.0,1.0,3.0,0.0,45.0
DIETARY_SUPPLEMENT_x_Funder_Industry,62.0,4.0,96.0,14.0,45.0,10.0,34.0,4.0,269.0
DRUG_x_Funder_Industry,12980.0,2112.0,7163.0,2490.0,6056.0,1253.0,1548.0,285.0,33887.0
GENETIC_x_Funder_Industry,40.0,19.0,33.0,17.0,12.0,2.0,2.0,0.0,125.0
PROCEDURE_x_Funder_Industry,53.0,39.0,75.0,44.0,82.0,18.0,26.0,9.0,346.0
RADIATION_x_Funder_Industry,46.0,30.0,34.0,44.0,7.0,8.0,6.0,2.0,177.0


# p_values

In [873]:
p_cls = [p_cl1[['Variable', 'df1-Stat_Sig']], 
         p_cl2[['Variable', 'df2-Stat_Sig']], 
         p_cl3[['Variable', 'df3-Stat_Sig']], 
         p_cl4[['Variable', 'df4-Stat_Sig']]]

p_cl = reduce(lambda left, right: pd.merge(left, right, on='Variable', how='outer'), p_cls)
p_cl[p_cl['Variable'].isin(df1.columns)]

Unnamed: 0,Variable,df1-Stat_Sig,df2-Stat_Sig,df3-Stat_Sig,df4-Stat_Sig
32,Adverse_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
33,Adverse_Counts_Log,-,Stat_Sig,Stat_Sig,Stat_Sig
34,Adverse_System_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
47,City_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
48,Completion_Gap_Log,Stat_Sig,Stat_Sig,Stat_Sig,-
49,Countries_Counts_Log,-,Stat_Sig,Stat_Sig,Stat_Sig
50,Covid_19_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
60,Enrollment_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
65,Funder_Industry_Bin,Stat_Sig,-,Stat_Sig,Stat_Sig
70,Healthy_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig


# Save Dfs

In [874]:
df1.to_pickle(r".\df_dummies\df1_dummies.pkl")
df2.to_pickle(r".\df_dummies\df2_dummies.pkl")
df3.to_pickle(r".\df_dummies\df3_dummies.pkl")
df4.to_pickle(r".\df_dummies\df4_dummies.pkl")


In [875]:
display(df1.shape)
display(df2.shape)
display(df3.shape)
display(df4.shape)


(27324, 128)

(29260, 128)

(16503, 128)

(13246, 129)