# Libraries

In [24]:
# Stat Libs
import pandas as pd
from pandas.api.types import CategoricalDtype
from scipy import stats
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

# Plotting Libs
import seaborn as sns
import matplotlib.pyplot as plt 

# Statistical libs
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder

# Load Data

In [25]:
df0 = pd.read_pickle(r".\df_vizual\df0_vizual.pkl")
df1 = pd.read_pickle(r".\df_vizual\df1_vizual.pkl")
df2 = pd.read_pickle(r".\df_vizual\df2_vizual.pkl")
df3 = pd.read_pickle(r".\df_vizual\df3_vizual.pkl")
df4 = pd.read_pickle(r".\df_vizual\df4_vizual.pkl")
df5 = pd.read_pickle(r".\df_vizual\df5_vizual.pkl")
df = pd.read_pickle(r".\df_vizual\df_vizual.pkl")

In [26]:
df0.shape

(2304, 26)

In [27]:
dfis = [df0, df1, df2, df3, df4, df5, df]
iss = [0, 1, 2, 3, 4, 5, '']

## Dropna

In [28]:
df0 = df0.dropna()
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()
df4 = df4.dropna()
df5 = df5.dropna()
df = df.dropna()

# Encode

## Dummies

In [29]:
## Dummies
def fun_dum_enc(dfi, cols):
    for col in cols:  
        df_expl = dfi.copy()
        df_expl = df_expl.explode(col)

        df_expl[col] = df_expl[col].astype('category') 
        df_expl[col] = df_expl[col].cat.remove_unused_categories()
        df_expl[col] = df_expl[col].astype('str') #str cause of error in encoding. After astype(cat) so to drop unused categories

        dummies = pd.get_dummies(df_expl[col], drop_first = False, dtype = int, prefix = col , prefix_sep='_')
        
        dummies.index = df_expl.index # ensure same indexing with df_expl
        dummies = dummies.groupby(dummies.index).sum()

        dfi = pd.concat([dfi.drop(columns = [col], axis = 1), dummies], axis = 1)  
    return dfi

### Cols
def fun_dum_cols(dfis):
    dum_cols = []
    for dfi in dfis: # loop inputed in case dfis have not all the same columns. # * Plus not to run function into funtion.
        dum_cols = dum_cols + [[col for col in dfi.columns if 'list' in col.lower()]]
    return dum_cols

dum_cols = fun_dum_cols(dfis) 

# Apply
df0 = fun_dum_enc(df0, dum_cols[0]) # * loop so not to run function into function
df1 = fun_dum_enc(df1, dum_cols[1])
df2 = fun_dum_enc(df2, dum_cols[2])
df3 = fun_dum_enc(df3, dum_cols[3])
df4 = fun_dum_enc(df4, dum_cols[4])
df5 = fun_dum_enc(df5, dum_cols[5])
df = fun_dum_enc(df, dum_cols[6])

# Example
display(dum_cols[0])
df0[[col for col in df0.columns if 'list' in col.lower()]].head()


['Age_List',
 'Masking_List',
 'Masking_Detail_List',
 'Primary_Purpose_List',
 'Adverse_List',
 'Sex_List',
 'Intervention_Model_List',
 'Conditions_Detail_List',
 'Intervention_Method_List',
 'Intervention_Type_List',
 'Allocation_List',
 'Funder_Type_List',
 'Continents_List']

Unnamed: 0,Age_List_ADULT,Age_List_CHILD,Age_List_OLDER_ADULT,Masking_List_DOUBLE,Masking_List_MASK_NONE,Masking_List_QUADRUPLE,Masking_List_SINGLE,Masking_List_TRIPLE,Masking_Detail_List_CARE_PROVIDER,Masking_Detail_List_INVESTIGATOR,...,Funder_Type_List_GOVERM,Funder_Type_List_HEALTH,Funder_Type_List_INDUSTRY,Continents_List_Africa,Continents_List_Asia,Continents_List_Cont_None,Continents_List_Europe,Continents_List_North America,Continents_List_Oceania,Continents_List_South America
0,1,0,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,1,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,1,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


### Drop First

In [30]:
# Checking the keywords of col names / Check which col names to drop if no kewwords (e.g 'none', 'other', etc.)
df0[[col for col in df0.columns if 'list' in col.lower()]].columns.tolist()
def fun_drop_first(dfi):
    dum_cols = [col for col in dfi.columns if 'list' in col.lower()]
    keywords = ['other', 'none', 
                'Sex_List_ALL', 'Age_List_CHILD', 'phase0', 'na_randomized', 'single_group']
    drop_cols = [col for col in dum_cols if any(key in col.lower() for key in keywords)]

    dfi = dfi.drop(columns = drop_cols, axis = 1)
    return dfi

df0 = fun_drop_first(df0)
df1 = fun_drop_first(df1)
df2 = fun_drop_first(df2)
df3 = fun_drop_first(df3)
df4 = fun_drop_first(df4)
df5 = fun_drop_first(df5)
df = fun_drop_first(df)

# Example Check
df.columns.values.tolist()

['Placebo_Bin',
 'Covid_19_Bin',
 'Healthy_Bin',
 'Intervention_Method_Counts',
 'Enrollment_Counts_Log',
 'City_Categ',
 'Adverse_Detail_Counts',
 'Standard_Care_Bin',
 'Completion_Gap_Categ',
 'Arm_Counts',
 'Adverse_Bin',
 'Study_Status_Bin',
 'Intervention_Type_Counts',
 'Age_List_ADULT',
 'Age_List_CHILD',
 'Age_List_OLDER_ADULT',
 'Masking_List_DOUBLE',
 'Masking_List_QUADRUPLE',
 'Masking_List_SINGLE',
 'Masking_List_TRIPLE',
 'Masking_Detail_List_CARE_PROVIDER',
 'Masking_Detail_List_INVESTIGATOR',
 'Masking_Detail_List_OUTCOMES_ASSESSOR',
 'Masking_Detail_List_PARTICIPANT',
 'Primary_Purpose_List_BASIC_SCIENCE',
 'Primary_Purpose_List_DEVICE_FEASIBILITY',
 'Primary_Purpose_List_DIAGNOSTIC',
 'Primary_Purpose_List_HEALTH_SERVICES_RESEARCH',
 'Primary_Purpose_List_PREVENTION',
 'Primary_Purpose_List_SCREENING',
 'Primary_Purpose_List_SUPPORTIVE_CARE',
 'Primary_Purpose_List_TREATMENT',
 'Sex_List_ALL',
 'Sex_List_FEMALE',
 'Sex_List_MALE',
 'Intervention_Model_List_CROSSOVER',
 

## Binary

In [31]:
display(df['Study_Status_Bin'].value_counts())

# Binary Encoding
def fun_bin_enc(dfi, cols):
    dfi = dfi.copy()
    for col in cols:
        cats = CategoricalDtype(categories = sorted(dfi[col].dropna().unique()), ordered = False)
        dfi[col] = dfi[col].astype(cats).cat.codes
    return dfi


### Cols
def fun_bin_cols(dfis):
    bin_cols = []
    for dfi in dfis:
        bin_cols = bin_cols + [[col for col in dfi.columns if 'categ' in col.lower() or 'bin' in col.lower()]]
    return bin_cols

bin_cols = fun_bin_cols(dfis)

# Apply
df0 = fun_bin_enc(df0, bin_cols[0])
df1 = fun_bin_enc(df1, bin_cols[1])
df2 = fun_bin_enc(df2, bin_cols[2])
df3 = fun_bin_enc(df3, bin_cols[3])
df4 = fun_bin_enc(df4, bin_cols[4])
df5 = fun_bin_enc(df5, bin_cols[5])
df = fun_bin_enc(df, bin_cols[6])

# Example
display(bin_cols[0])  # bin_cols[0] --> bin_cols of df0
display(df0[bin_cols[0]]) 
display(df['Study_Status_Bin'].value_counts())  # Completed = 0, Terminated = 1

Study_Status_Bin
COMPLETED     148180
TERMINATED     25361
Name: count, dtype: int64

['Placebo_Bin',
 'Intervention_Type_Categ',
 'Covid_19_Bin',
 'Healthy_Bin',
 'Standard_Care_Bin',
 'Completion_Gap_Categ',
 'Adverse_Bin',
 'Study_Status_Bin']

Unnamed: 0,Placebo_Bin,Intervention_Type_Categ,Covid_19_Bin,Healthy_Bin,Standard_Care_Bin,Completion_Gap_Categ,Adverse_Bin,Study_Status_Bin
0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
2299,0,0,0,0,0,0,0,0
2300,0,0,0,1,0,0,0,0
2301,0,0,0,0,1,1,0,0
2302,0,0,0,0,0,0,0,0


Study_Status_Bin
0    148180
1     25361
Name: count, dtype: int64

# Save Dfs

In [32]:
df0.to_pickle(r".\df_dummies\df0_dummies.pkl")
df1.to_pickle(r".\df_dummies\df1_dummies.pkl")
df2.to_pickle(r".\df_dummies\df2_dummies.pkl")
df3.to_pickle(r".\df_dummies\df3_dummies.pkl")
df4.to_pickle(r".\df_dummies\df4_dummies.pkl")
df5.to_pickle(r".\df_dummies\df5_dummies.pkl")
df.to_pickle(r".\df_dummies\df_dummies.pkl")

In [33]:
display(df0.shape)
display(df1.shape)
display(df2.shape)
display(df3.shape)
display(df4.shape)
display(df5.shape)
display(df.shape)

(2236, 90)

(26045, 115)

(28409, 115)

(16093, 115)

(12813, 90)

(95731, 115)

(173541, 120)