# Libraries

In [23]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# Stat Libs
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Plotting Libs
import seaborn as sns

# Preprocessing / encoding libs
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder

# Load Data

In [24]:
# Read the seven pickled frames produced by the visualisation step, in order:
# the six numbered frames first, then the un-numbered `df`.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
df0, df1, df2, df3, df4, df5, df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df0_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df1_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df2_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df3_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df4_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df5_vizual.pkl",
        r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df_vizual.pkl",
    )
)

In [25]:
# Suffixes of the frame names (df0 ... df5, and '' for the un-numbered df).
iss = [*range(6), '']
# The frames themselves, in the same order as `iss`.
# These are references to the objects bound at this point; later rebinding of
# df0 ... df (e.g. `df0 = df0.dropna()`) does not update this list.
dfis = [df0, df1, df2, df3, df4, df5, df]

## Dropna

In [15]:
# Remove every row that has at least one missing value, for all seven frames.
df0, df1, df2, df3, df4, df5, df = (
    frame.dropna() for frame in (df0, df1, df2, df3, df4, df5, df)
)

# Encode

## Study Status

In [16]:
# Study Status bin_map
# Category order defines the integer code: TERMINATED -> 0, COMPLETED -> 1.
status_type = CategoricalDtype(['TERMINATED', 'COMPLETED'], ordered=False)

def fun_y_enc(dfi):
    """Return a copy of ``dfi`` whose 'Study Status' column holds integer codes.

    The codes are the positions of each label in ``status_type``
    (TERMINATED -> 0, COMPLETED -> 1). Any value that is not one of those two
    labels is encoded as -1, which is how pandas codes a non-category value.
    The input frame is left untouched.
    """
    status_codes = dfi['Study Status'].astype(status_type).cat.codes
    encoded = dfi.copy()
    encoded['Study Status'] = status_codes
    return encoded

# Encode the target column of the six numbered frames.
df0, df1, df2, df3, df4, df5 = (
    fun_y_enc(frame) for frame in (df0, df1, df2, df3, df4, df5)
)
# Class balance of the encoded target in df0 (shown as the cell output).
df0['Study Status'].value_counts()

Study Status
0    1748
1     488
Name: count, dtype: int64

## Binary_Encode

In [17]:
# Binary Encoding
def fun_bin_enc(dfi, cols):
    """Return a copy of ``dfi`` with each column in ``cols`` integer-coded.

    For every listed column the distinct non-null values are sorted and each
    value is replaced by its position in that sorted list. Missing values
    receive the code -1. The input frame is not modified.
    """
    encoded = dfi.copy()
    for name in cols:
        levels = sorted(encoded[name].dropna().unique())
        level_dtype = CategoricalDtype(categories=levels, ordered=False)
        encoded[name] = encoded[name].astype(level_dtype).cat.codes
    return encoded


### Cols
def fun_bin_cols(dfis):
    """Collect, per frame, the column names that contain 'categ' or 'bin'.

    The match is a case-insensitive substring test. Returns a list with one
    inner list of column names per frame, in the same order as ``dfis``.
    """
    def _is_bin_col(name):
        lowered = name.lower()
        return 'categ' in lowered or 'bin' in lowered

    return [[name for name in frame.columns if _is_bin_col(name)] for frame in dfis]

bin_cols = fun_bin_cols(dfis)
display(bin_cols[1])  # entry i of bin_cols belongs to the i-th frame of dfis; [1] is df1

# Integer-code the categ/bin columns of the six numbered frames, pairing each
# frame with its own column list (zip stops after the six frames given).
df0, df1, df2, df3, df4, df5 = (
    fun_bin_enc(frame, frame_cols)
    for frame, frame_cols in zip((df0, df1, df2, df3, df4, df5), bin_cols)
)

display(df0.head())

['City_Categ',
 'Country_Categ',
 'Continent_Categ',
 'Completion_Gap_Categ',
 'Placebo_Bin',
 'Standard_Care_Bin',
 'Healthy_Bin',
 'Covid_19_Bin',
 'Adverse_Bin']

Unnamed: 0,Adverse_Counts_Log,Adverse_System_Counts_Log,Arm_Counts,Intervention_Type_Categ,Intervention_Method_Counts,City_Counts,Country_Counts,Continent_Counts,Completion_Gap_Categ,Start_Date_Year_Categ,...,Adverse_List,Adverse_Bin,Adverse_System_List,Allocation_Dum,Intervention_Model_Dum,Masking_Dum,Masking_Detail_List,Primary_Purpose_Dum,Continents_List,Study Status
0,0.0,0.0,1.0,0,1,1,1,1,1,0,...,[Adv_None],0,[Adv_Sy_None],NOT_APPLICABLE,SINGLE_GROUP,MASK_NONE,[MASK_DET_NONE],TREATMENT,[North America],0
1,0.0,0.0,2.0,0,2,1,1,1,1,0,...,[Adv_None],0,[Adv_Sy_None],NON_RANDOMIZED,PARALLEL,MASK_NONE,[MASK_DET_NONE],TREATMENT,[Europe],0
2,0.0,0.0,1.0,0,1,1,1,1,1,0,...,[Adv_None],0,[Adv_Sy_None],NOT_APPLICABLE,SINGLE_GROUP,MASK_NONE,[MASK_DET_NONE],TREATMENT,[North America],0
3,0.0,0.0,1.0,0,1,1,1,1,0,1,...,[Adv_None],0,[Adv_Sy_None],NOT_APPLICABLE,SINGLE_GROUP,MASK_NONE,[MASK_DET_NONE],TREATMENT,[Europe],0
4,0.0,0.0,1.0,0,1,1,1,1,1,1,...,[Adv_None],0,[Adv_Sy_None],NOT_APPLICABLE,SINGLE_GROUP,MASK_NONE,[MASK_DET_NONE],SCREENING,[North America],0


## Dummies_Encode

In [None]:
def fun_dum_enc(dfi, cols):
    """Replace each column in ``cols`` with 0/1-count indicator columns.

    Columns may hold scalars or lists of labels. List cells are exploded so
    every label gets its own indicator, and the indicators are then summed
    back to one row per original index value. New columns are named
    ``<col>_<label>`` and appended to the right of the frame; the source
    column is removed. The input frame is not modified.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Frame to encode. Its index is used to regroup exploded rows, so it
        is assumed to be unique -- TODO confirm for all callers.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns (``dfi`` itself if ``cols`` is empty).
    """
    for col in cols:
        # Explode only the column being encoded. Exploding the whole frame
        # duplicates every other column for each list element, for no benefit.
        exploded = dfi[col].explode()

        # Category round-trip drops unused categories; the final cast to str
        # is kept because encoding the raw values raised errors.
        labels = exploded.astype('category').cat.remove_unused_categories().astype('str')

        dummies = pd.get_dummies(labels, drop_first=False, dtype=int, prefix=col, prefix_sep='_')

        # Exploded rows share the index of the row they came from: collapse
        # them back to one row per original index value.
        dummies = dummies.groupby(dummies.index).sum()

        dfi = pd.concat([dfi.drop(columns=[col]), dummies], axis=1)
    return dfi

### Cols
def fun_dum_cols(dfis):
    """Collect, per frame, the column names that contain 'list' or 'dum'.

    The match is a case-insensitive substring test. Each frame is inspected
    separately because the frames may not all share the same columns.
    Returns one inner list of names per frame, in the order of ``dfis``.
    """
    def _is_dum_col(name):
        lowered = name.lower()
        return 'list' in lowered or 'dum' in lowered

    return [[name for name in frame.columns if _is_dum_col(name)] for frame in dfis]

dum_cols = fun_dum_cols(dfis)
display(dum_cols[0])  # entry i of dum_cols belongs to the i-th frame of dfis; [0] is df0

# Apply: each frame is encoded with its own column list, computed above, so
# that the column lookup is not repeated inside the encoder.
df0 = fun_dum_enc(df0, dum_cols[0])
df1 = fun_dum_enc(df1, dum_cols[1])
df2 = fun_dum_enc(df2, dum_cols[2])
df3 = fun_dum_enc(df3, dum_cols[3])
df4 = fun_dum_enc(df4, dum_cols[4])
df5 = fun_dum_enc(df5, dum_cols[5])

# A stray bare identifier used to sit here and raised NameError, which
# aborted the cell before the preview below was shown.
display(df0.head())

['Sex_Dum',
 'Age_List',
 'Funder_Type_Dum',
 'Sponsor_Collab_List',
 'Intervention_Type_List',
 'Intervention_Method_List',
 'Conditions_List',
 'Conditions_Spec_List',
 'Adverse_List',
 'Adverse_System_List',
 'Allocation_Dum',
 'Intervention_Model_Dum',
 'Masking_Dum',
 'Masking_Detail_List',
 'Primary_Purpose_Dum',
 'Continents_List']

Unnamed: 0,Adverse_Counts_Log,Adverse_System_Counts_Log,Arm_Counts,Intervention_Type_Categ,Intervention_Method_Counts,City_Counts,Country_Counts,Continent_Counts,Completion_Gap_Categ,Start_Date_Year_Categ,...,Primary_Purpose_Dum_SCREENING,Primary_Purpose_Dum_SUPPORTIVE_CARE,Primary_Purpose_Dum_TREATMENT,Continents_List_Africa,Continents_List_Asia,Continents_List_Cont_None,Continents_List_Europe,Continents_List_North America,Continents_List_Oceania,Continents_List_South America
0,0.0,0.0,1.0,0,1,1,1,1,1,0,...,0,0,1,0,0,0,0,1,0,0
1,0.0,0.0,2.0,0,2,1,1,1,1,0,...,0,0,1,0,0,0,1,0,0,0
2,0.0,0.0,1.0,0,1,1,1,1,1,0,...,0,0,1,0,0,0,0,1,0,0
3,0.0,0.0,1.0,0,1,1,1,1,0,1,...,0,0,1,0,0,0,1,0,0,0
4,0.0,0.0,1.0,0,1,1,1,1,1,1,...,1,0,0,0,0,0,0,1,0,0


### Drop First

In [19]:
def fun_drop_first(dfi):
    """Drop the reference-level indicator columns from an encoded frame.

    Among the columns whose name contains 'list' or 'dum' (case-insensitive),
    every column whose name also contains 'other', 'none' or 'unknown' is
    dropped. In addition, the explicit baselines 'Sex_Dum_ALL' and
    'Age_List_CHILD' are dropped; these are the groups that have no
    other/none level to use as reference.

    The explicit baselines are dropped only when present. Indicator columns
    are created per frame from the values that occur in it, so a frame may
    legitimately lack one of them, and an unconditional drop would raise
    KeyError.

    Returns a new frame; the input is not modified.
    """
    keywords = ('other', 'none', 'unknown')
    baseline_cols = ('Sex_Dum_ALL', 'Age_List_CHILD')  # Only columns without other/none values

    drop_cols = []
    for col in dfi.columns:
        lowered = col.lower()
        is_dummy = 'list' in lowered or 'dum' in lowered
        if is_dummy and any(key in lowered for key in keywords):
            drop_cols.append(col)

    # Add the hard-coded baselines that actually exist in this frame.
    drop_cols += [col for col in baseline_cols if col in dfi.columns and col not in drop_cols]

    return dfi.drop(columns=drop_cols)

# Remove the reference-level indicator columns from the six numbered frames.
df0, df1, df2, df3, df4, df5 = (
    fun_drop_first(frame) for frame in (df0, df1, df2, df3, df4, df5)
)


# Save dfs

In [20]:
# Persist every frame next to its source pickle, under the *_dummies name.
# NOTE(review): in the visible cells `df` only goes through dropna, not the
# encoders -- confirm that saving it as df_dummies.pkl is intended.
for _frame, _target in (
    (df0, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df0_dummies.pkl"),
    (df1, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df1_dummies.pkl"),
    (df2, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df2_dummies.pkl"),
    (df3, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df3_dummies.pkl"),
    (df4, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df4_dummies.pkl"),
    (df5, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df5_dummies.pkl"),
    (df, r"C:\Users\Eugenia\OneDrive\Documents\THESIS\df_dummies.pkl"),
):
    _frame.to_pickle(_target)