# Libraries

In [226]:
# Stat Libs
import pandas as pd
from pandas.api.types import CategoricalDtype
import pickle
import re

# Stat Libs
import statsmodels.api as sm
from itertools import product
from functools import reduce

# Statistical libs
from sklearn.preprocessing import MultiLabelBinarizer


# Load Data

In [227]:
# Load the four analysis frames and their matching p-value tables.
# Forward slashes are used because backslash-separated relative paths only
# resolve on Windows, while forward slashes resolve on Windows and POSIX alike.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# produced by a trusted step of this project.
df1 = pd.read_pickle("./df_vizual/df1_vizual.pkl")
df2 = pd.read_pickle("./df_vizual/df2_vizual.pkl")
df3 = pd.read_pickle("./df_vizual/df3_vizual.pkl")
df4 = pd.read_pickle("./df_vizual/df4_vizual.pkl")

p_cl1 = pd.read_pickle("./p_values/p_cl1_vizual.pkl")
p_cl2 = pd.read_pickle("./p_values/p_cl2_vizual.pkl")
p_cl3 = pd.read_pickle("./p_values/p_cl3_vizual.pkl")
p_cl4 = pd.read_pickle("./p_values/p_cl4_vizual.pkl")

In [228]:
# Frames collected in a fixed order (df1..df4), plus the matching 1-based labels.
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

# Dummies
"_List" data: list-valued columns were encoded by creating dummy variables, because list-element data cannot be used directly by models. First, each list-valued row was exploded into one row per list element. However, this inflates the sample size, because rows are duplicated for the same nct_id whenever a list contains more than one element. For this reason, the exploded data were then grouped back by nct_id, so that the number of rows stays the same as in the initial datasets (df0, df1, df2, df3, df4, df5).

In [229]:
# Alternative encoder for list-element columns.
# NOTE(review): `mlb` is not referenced anywhere else in the visible notebook —
# confirm whether it is still needed.
mlb = MultiLabelBinarizer()

In [230]:
## Dummies
def fun_dum_enc(dfi, cols):
    """Replace list-valued columns with 0/1 dummy columns, one row per original row.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Input frame; it is not modified. Its index is used to regroup the
        exploded rows, so it is assumed to identify each row uniquely.
    cols : iterable of str
        Names of the columns to encode. Each one is removed from the result
        and replaced by columns named ``f"{col}_{value}"``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the dummy columns of each
        encoded column (appended in the order given by ``cols``).
    """
    for col in cols:
        # Only the column being encoded has to be exploded; copying and
        # exploding the whole frame for every column is unnecessary work.
        exploded = dfi[col].explode()

        exploded = exploded.astype('category')
        exploded = exploded.cat.remove_unused_categories()
        # str because of an error in encoding; done after astype('category')
        # so that unused categories are dropped first.
        exploded = exploded.astype('str')

        dummies = pd.get_dummies(exploded, drop_first=False, dtype=int, prefix=col, prefix_sep='_')

        # Collapse the exploded rows back to one row per original index label.
        # clip: a row holding a repeated entry such as ['UNSPES', 'UNSPES']
        # would otherwise be double counted by sum().
        dummies = dummies.groupby(dummies.index).sum().clip(upper=1)

        dfi = pd.concat([dfi.drop(columns=[col]), dummies], axis=1)
    return dfi

### Cols
def fun_dum_cols(dfis):
    """Return, for each frame in ``dfis``, the list of its column names containing '_List'.

    The result has one inner list per frame, in the same order as ``dfis``,
    so frames with differing columns each get their own list.
    """
    return [
        [name for name in frame.columns if '_List' in name]
        for frame in dfis
    ]

dum_cols = fun_dum_cols(dfis)

# Apply
# Each frame is encoded with its own column list (dum_cols[k] belongs to dfis[k]).
df1, df2, df3, df4 = (
    fun_dum_enc(frame, frame_cols) for frame, frame_cols in zip(dfis, dum_cols)
)


# Example
display(dum_cols[0])
df1[[col for col in df1.columns if '_list' in col.lower()]] #.head()  # Transposed for better view


['Sex_List',
 'Age_List',
 'Intervention_Type_List',
 'Intervention_Route_List',
 'Conditions_Detail_List',
 'Adverse_List',
 'Adverse_System_List',
 'Allocation_List',
 'Intervention_Model_List',
 'Masking_List',
 'Masking_Detail_List',
 'Primary_Purpose_List',
 'Continents_List']

Unnamed: 0,Sex_List_ALL,Sex_List_FEMALE,Sex_List_MALE,Age_List_ADULT,Age_List_CHILD,Age_List_OLDER_ADULT,Intervention_Type_List_BEHAVIORAL,Intervention_Type_List_BIOLOGICAL,Intervention_Type_List_DEVICE,Intervention_Type_List_DIETARY_SUPPLEMENT,...,Primary_Purpose_List_BASIC_SCIENCE,Primary_Purpose_List_DIAGNOSTIC,Primary_Purpose_List_PREVENTION,Primary_Purpose_List_PRIM_PURP_OTHER,Primary_Purpose_List_SUPPORTIVE_CARE,Primary_Purpose_List_TREATMENT,Continents_List_Asia,Continents_List_Cont_Other,Continents_List_Europe,Continents_List_North America
0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21824,1,0,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
21825,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
21826,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
21827,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


## Drop First
- Drop-first was not applied through the `get_dummies` command, so that the column to drop could be chosen based on its characteristics; e.g., dropping phase 0 is preferred to dropping phase 3.


In [231]:
# Rebuild the list so that it refers to the current df1..df4 objects.
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

In [232]:

def fun_drop_first(dfi):
    """Drop the hand-picked dummy columns from ``dfi`` and return the result.

    A column is dropped when its lower-cased name contains '_list' and at
    least one of the keywords below, unless the name contains '_x_'.
    """
    keywords = ['other', 'none', 'sex_list_all', 'age_list_child', 'na_randomize', 'single_grou']

    def _is_dropped(name):
        lowered = name.lower()
        if '_list' not in lowered or '_x_' in name:
            return False
        return any(key in lowered for key in keywords)

    to_drop = [name for name in dfi.columns if _is_dropped(name)]
    return dfi.drop(columns=to_drop)

# Remove the chosen reference-level dummies from every frame.
df1, df2, df3, df4 = map(fun_drop_first, (df1, df2, df3, df4))

# Example Check
df1.columns.values.tolist()

['Study_Status_Bin',
 'Funder_Industry_Bin',
 'Placebo_Bin',
 'Standard_Care_Bin',
 'Healthy_Bin',
 'Adverse_Counts_Log',
 'Adverse_System_Counts_Log',
 'Arm_Counts_Log',
 'Country_Counts_Log',
 'Continent_Counts_Log',
 'City_Counts_Log',
 'Enrollment_Counts_Log',
 'Sex_List_FEMALE',
 'Sex_List_MALE',
 'Age_List_ADULT',
 'Age_List_OLDER_ADULT',
 'Intervention_Type_List_BEHAVIORAL',
 'Intervention_Type_List_BIOLOGICAL',
 'Intervention_Type_List_DEVICE',
 'Intervention_Type_List_DIETARY_SUPPLEMENT',
 'Intervention_Type_List_DRUG',
 'Intervention_Type_List_INTERV_UNSPES',
 'Intervention_Type_List_PROCEDURE',
 'Intervention_Route_List_Injection',
 'Intervention_Route_List_Oral',
 'Intervention_Route_List_Surgical',
 'Intervention_Route_List_Topical',
 'Conditions_Detail_List_Bacterial Infections and Mycoses',
 'Conditions_Detail_List_Cardiovascular',
 'Conditions_Detail_List_Chemical Disorders',
 'Conditions_Detail_List_Digestive System, Nutritional, Metabolic',
 'Conditions_Detail_List_En

# Binary

In [233]:
# Binary Encoding
def fun_bin_enc(dfi, cols):
    """Return a copy of ``dfi`` in which each column in ``cols`` holds integer category codes.

    For every column the non-missing unique values are sorted and numbered
    from 0 in that order; values that are not among the categories
    (missing values) receive the code -1.
    """
    def _to_codes(series):
        levels = sorted(series.dropna().unique())
        dtype = CategoricalDtype(categories=levels, ordered=False)
        return series.astype(dtype).cat.codes

    encoded = dfi.copy()
    for name in cols:
        encoded[name] = _to_codes(encoded[name])
    return encoded


### Cols
def fun_bin_cols(dfis):
    """Return, for each frame in ``dfis``, the names of its columns to binary-encode.

    A column is selected when its name contains '_categ' (compared
    case-insensitively) or '_Bin'. The previous test looked for the mixed-case
    '_Categ' inside a lower-cased name, which can never match, so '_Categ'
    columns were silently skipped.

    The result has one inner list per frame, in the same order as ``dfis``.
    """
    return [
        [col for col in dfi.columns if '_categ' in col.lower() or '_Bin' in col]
        for dfi in dfis
    ]

# Apply
# Build the column lists from the current frames: `dfis` was last assigned
# before fun_drop_first reassigned df1..df4, so it still refers to the frames
# as they were before columns were dropped.
bin_cols = fun_bin_cols([df1, df2, df3, df4])

df1 = fun_bin_enc(df1, bin_cols[0])
df2 = fun_bin_enc(df2, bin_cols[1])
df3 = fun_bin_enc(df3, bin_cols[2])
df4 = fun_bin_enc(df4, bin_cols[3])

# Example
display(df1['Study_Status_Bin'].value_counts())  # Completed = 0, Terminated = 1
display(bin_cols[0])  # bin_cols[0] --> bin_cols of df1
display(df1[bin_cols[0]]) 

Study_Status_Bin
0    18646
1     3183
Name: count, dtype: int64

['Study_Status_Bin',
 'Funder_Industry_Bin',
 'Placebo_Bin',
 'Standard_Care_Bin',
 'Healthy_Bin']

Unnamed: 0,Study_Status_Bin,Funder_Industry_Bin,Placebo_Bin,Standard_Care_Bin,Healthy_Bin
0,0,1,0,0,1
1,0,0,1,0,1
2,0,1,0,0,0
3,0,1,0,0,1
4,1,1,1,1,0
...,...,...,...,...,...
21824,0,1,0,0,0
21825,0,1,0,0,1
21826,0,1,0,0,1
21827,0,0,1,0,0


# Interactions

In [234]:
# Rebuild the list so that it refers to the current df1..df4 objects.
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]


In [235]:
# Interaction Continuous_x_Categorical
# The snippet below is wrapped in a string literal, so it is not executed.
'''col = 'Conditions_Detail_List_Neoplasms'
df1['Enrollment_x_Neoplasms_Counts_Log'] = df1['Enrollment_Counts_Log'] * df1[col]
df2['Enrollment_x_Neoplasms_Counts_Log'] = df2['Enrollment_Counts_Log'] * df2[col]
df3['Enrollment_x_Neoplasms_Counts_Log'] = df3['Enrollment_Counts_Log'] * df3[col]
df4['Enrollment_x_Neoplasms_Counts_Log'] = df4['Enrollment_Counts_Log'] * df4[col]'''

# Interaction of Categorical_x_Binary
def fun_inter(dfi, col1, col2, stip):
    """Append pairwise product (interaction) columns to a copy of ``dfi``.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Input frame; it is not modified.
    col1, col2 : str
        Substrings used to select the two groups of columns: every column
        whose name contains ``col1`` is paired with every column whose name
        contains ``col2``.
    stip : str
        Regular expression; every match is removed from the raw interaction
        name ``f"{left}_x_{right}"`` to obtain the final column name.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by one product
        column per (left, right) pair, in ``itertools.product`` order.
    """
    # Both groups are resolved before any column is added, so freshly created
    # interaction columns can never be picked up as inputs.
    cols1 = [col for col in dfi.columns if col1 in col]
    cols2 = [col for col in dfi.columns if col2 in col]

    # Distinct loop names keep the `col1` / `col2` arguments from being shadowed.
    inter_cols = [
        (dfi[left] * dfi[right]).rename(re.sub(stip, "", f"{left}_x_{right}"))
        for left, right in product(cols1, cols2)
    ]

    if not inter_cols:
        return dfi.copy()

    # One concat instead of inserting and renaming column by column; this also
    # leaves the caller's frame untouched.
    return pd.concat([dfi, *inter_cols], axis=1)
    
# Build both interaction families frame by frame; the frames are independent
# of each other, so processing one frame fully before the next is equivalent.
frames_inter = []
for frame in (df1, df2, df3, df4):
    # Intervention type x industry funder.
    frame = fun_inter(frame, 'Intervention_Type_List', 'Funder_Industry_Bin', r"Intervention_Type_List_|_Bin")

    # Append "_List" to the interaction columns just created.
    frame = frame.rename(columns = {col: col + "_List" if "_x_Funder_Industry" in col else col for col in frame.columns})

    # Enrollment (log) x condition detail.
    frame = fun_inter(frame, 'Enrollment_Counts_Log', 'Conditions_Detail_List', r"Conditions_Detail_List|_Counts")

    frames_inter.append(frame)

df1, df2, df3, df4 = frames_inter


In [236]:
dfis = [df1, df2, df3, df4]
iss = [1, 2, 3, 4]

# Pivot: per frame, sum every interaction column within each Study_Status_Bin level.
pivots_inter = []

for i, dfi in zip(iss, dfis):
    inter_values = [col for col in dfi.columns if '_x_' in col]
    sums_by_status = dfi.pivot_table(
        index = "Study_Status_Bin",
        values = inter_values,
        aggfunc = "sum",
        observed = False).T

    # Label each outcome column with the frame it came from.
    sums_by_status.columns = [f"df{i}_{outcome}" for outcome in sums_by_status.columns]
    pivots_inter.append(sums_by_status)

pivot_inter1, pivot_inter2, pivot_inter3, pivot_inter4 = pivots_inter

# Checks
inter_cols = [col for col in df4.columns if '_x_' in col]  # the word "other" appears in too many data levels
display(len(inter_cols))  # number of interaction columns present in df4

pivot_inter = pd.concat(pivots_inter, axis=1)
pivot_inter['Sum_Counts'] = pivot_inter.sum(axis = 1)
pivot_inter.sort_values(by = 'Sum_Counts')


28

Unnamed: 0,df1_0,df1_1,df2_0,df2_1,df3_0,df3_1,df4_0,df4_1,Sum_Counts
BEHAVIORAL_x_Funder_Industry_List,4.0,1.0,32.0,10.0,14.0,2.0,7.0,3.0,73.0
DIETARY_SUPPLEMENT_x_Funder_Industry_List,44.0,4.0,69.0,13.0,36.0,9.0,34.0,4.0,213.0
PROCEDURE_x_Funder_Industry_List,43.0,14.0,61.0,15.0,78.0,14.0,26.0,9.0,260.0
INTERV_UNSPES_x_Funder_Industry_List,138.0,32.0,88.0,47.0,93.0,24.0,27.0,7.0,456.0
DEVICE_x_Funder_Industry_List,148.0,20.0,112.0,42.0,181.0,44.0,185.0,42.0,774.0
BIOLOGICAL_x_Funder_Industry_List,1103.0,224.0,872.0,233.0,1125.0,124.0,264.0,17.0,3962.0
"Enrollment_Log_x__Wounds, Injuries",436.617394,54.33869,972.846593,148.711942,887.733188,130.051091,1217.926541,194.372227,4042.597667
Enrollment_Log_x__Chemical Disorders,938.826852,59.486189,1383.475841,143.817476,646.582592,84.31241,809.164414,106.736817,4172.402591
"Enrollment_Log_x__Health Care, Therapeutics",1041.991745,43.851619,1092.901654,135.442873,1026.15846,81.81172,1375.154222,148.853345,4946.165637
"Enrollment_Log_x__Phenomena, Processes",1344.264162,64.079995,1537.344431,204.213515,1528.431877,138.232219,1988.246141,178.274233,6983.086573


# p_values

In [237]:
# Keep only the variable name and the significance flag of each p-value table.
p_cls = [
    p_cl1[['Variable', 'df1-Stat_Sig']],
    p_cl2[['Variable', 'df2-Stat_Sig']],
    p_cl3[['Variable', 'df3-Stat_Sig']],
    p_cl4[['Variable', 'df4-Stat_Sig']],
]

# Outer-merge the tables one after another on 'Variable'.
p_cl = p_cls[0]
for p_next in p_cls[1:]:
    p_cl = pd.merge(p_cl, p_next, on='Variable', how='outer')

p_cl[p_cl['Variable'].isin(df1.columns)]

Unnamed: 0,Variable,df1-Stat_Sig,df2-Stat_Sig,df3-Stat_Sig,df4-Stat_Sig
26,Adverse_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
27,Adverse_System_Counts_Log,-,Stat_Sig,Stat_Sig,Stat_Sig
38,City_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
39,Continent_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
40,Country_Counts_Log,-,Stat_Sig,Stat_Sig,Stat_Sig
48,Enrollment_Counts_Log,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
53,Funder_Industry_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
55,Healthy_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig
76,Placebo_Bin,Stat_Sig,Stat_Sig,Stat_Sig,-
85,Standard_Care_Bin,Stat_Sig,Stat_Sig,Stat_Sig,Stat_Sig


# Save Dfs

In [238]:
# Persist the encoded frames.
# Forward slashes are used because backslash-separated relative paths only
# resolve on Windows, while forward slashes resolve on Windows and POSIX alike.
df1.to_pickle("./df_dummies/df1_dummies.pkl")
df2.to_pickle("./df_dummies/df2_dummies.pkl")
df3.to_pickle("./df_dummies/df3_dummies.pkl")
df4.to_pickle("./df_dummies/df4_dummies.pkl")


In [239]:
# Show the (rows, columns) shape of each frame, in order df1..df4.
for frame in (df1, df2, df3, df4):
    display(frame.shape)


(21829, 123)

(21228, 123)

(13966, 123)

(13246, 123)