# Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import re
from collections import Counter
import matplotlib.pyplot as plt 
import seaborn as sns

# Plotting Libs
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# Statistical libs
import scipy
import pymannkendall as mk
import statsmodels.api as sm
from scipy import stats
from scipy.stats import norm, skew, chi2_contingency, chi2, chisquare, chi, ttest_rel, ttest_ind, ttest_1samp, ttest_ind_from_stats, ttest_ind_from_stats, ttest_1samp
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,summarize)
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder
from collections import Counter
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler

# Load Data

In [31]:
# Load the six frames from their pickle files in one pass.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df0, df1, df2, df3, df4, df5 = (
    pd.read_pickle(path)
    for path in (
        "df0_vizual.pkl",
        "df1_vizual.pkl",
        "df2_vizual.pkl",
        "df3_vizual.pkl",
        "df4_vizual.pkl",
        "df5_vizual.pkl",
    )
)

In [32]:
# Keep the frames together in one list, with a matching list of positional ids.
dfis = [
    df0,
    df1,
    df2,
    df3,
    df4,
    df5,
]
iss = list(range(len(dfis)))

# Dummies

In [21]:
# Display the column labels of df0.
df0.columns

Index(['Study Status', 'Sex', 'Age_List', 'Funder_Type', 'Sponsor_Collab_List',
       'Study_Documents_List', 'Document_Counts', 'Document_Counts_Bin',
       'Intervention_Method_List', 'Intervention_Type_List', 'Placebo_Bin',
       'Standard_Care_Bin', 'Healthy_Bin', 'Covid_19_Bin', 'Conditions_List',
       'Adverse_List', 'Adverse_System_List', 'Termination', 'Allocation',
       'Intervention_Model', 'Masking', 'Masking_Detail_List',
       'Primary_Purpose', 'Outcomes_List', 'Continents_List', 'Enrollment_Log',
       'Document_Counts_Categ', 'Adverse_Counts_Log', 'Completion_Gap_Categ',
       'Intervention_Counts_Categ', 'Comorbidity_Counts_Categ',
       'Adverse_System_Counts_Categ', 'City_Counts_Categ',
       'Country_Counts_Categ', 'Continent_Counts_Categ', 'Arm_Counts_Categ'],
      dtype='object')

## Binary

In [24]:
def bin_enc(dfi, cols):
    """Encode two-level columns of ``dfi`` as 0/1, modifying ``dfi`` in place.

    For each column the distinct non-missing values are sorted; the first
    maps to 0 and the second to 1. The literal labels ``'No'`` and ``'Yes'``
    always map to 0 and 1 respectively, taking precedence over the sorted
    positions. Every value that is not in the mapping (missing values, or a
    third-or-later level) becomes NaN.

    Args:
        dfi: DataFrame whose columns are re-encoded in place.
        cols: Iterable of column names to encode.

    Returns:
        The same ``dfi`` object that was passed in.
    """
    yes_no = {'No': 0, 'Yes': 1}
    for col in cols:
        cats = sorted(dfi[col].dropna().unique())
        # Slice instead of indexing so a column with zero or one level
        # (e.g. all NaN, or only 'Yes') no longer raises IndexError.
        mapping = dict(zip(cats[:2], (0, 1)))
        # Applied last so 'No'/'Yes' override the sorted positions.
        mapping.update(yes_no)
        dfi[col] = dfi[col].map(mapping)
    return dfi

In [35]:
# Columns handed to bin_enc for 0/1 encoding.
cols = [
    'Document_Counts_Bin',
    'Placebo_Bin',
    'Standard_Care_Bin',
    'Healthy_Bin',
    'Covid_19_Bin',
    'Document_Counts_Categ',
    'Completion_Gap_Categ',
    'Intervention_Counts_Categ',
    'Comorbidity_Counts_Categ',
    'Adverse_System_Counts_Categ',
    'City_Counts_Categ',
    'Country_Counts_Categ',
    'Continent_Counts_Categ',
    'Arm_Counts_Categ',
]

# Encode every frame in order (df0 first, df5 last) and rebind the results.
df0, df1, df2, df3, df4, df5 = (
    bin_enc(frame, cols) for frame in (df0, df1, df2, df3, df4, df5)
)

df0.head()

Unnamed: 0,Study Status,Document_Counts_Bin,Placebo_Bin,Standard_Care_Bin,Healthy_Bin,Covid_19_Bin,Enrollment_Log,Document_Counts_Categ,Adverse_Counts_Log,Completion_Gap_Categ,...,Outcomes_List_OTHER_PRE_SPECIFIED,Outcomes_List_POST_HOC,Outcomes_List_PRIMARY,Outcomes_List_SECONDARY,Continents_List_Asia,Continents_List_Europe,Continents_List_None,Continents_List_North America,Continents_List_Oceania,Continents_List_South America
0,COMPLETED,0,0,0,0,0,3.850148,,0.0,1,...,0,0,0,0,0,1,0,0,0,0
1,COMPLETED,0,0,0,1,0,5.288267,,0.0,1,...,0,0,0,0,0,0,0,1,0,0
2,COMPLETED,0,0,0,1,0,2.944439,,0.0,1,...,0,0,0,0,0,1,0,0,0,0
3,TERMINATED,0,1,0,1,0,1.386294,,0.0,0,...,0,0,1,1,0,0,0,1,0,0
4,TERMINATED,0,0,0,0,0,0.0,,0.0,0,...,0,0,0,0,0,0,1,0,0,0


## Dummies

In [33]:
def dum_enc(dfi, cols):
    """Replace each column in ``cols`` with integer dummy (indicator) columns.

    A cell may hold a single label or a list of labels. The column is
    exploded so each label gets its own row, one-hot encoded with the first
    category dropped, and then summed back to one row per original index
    label. New columns are named ``<col>_<label>``.

    Args:
        dfi: Source DataFrame; it is not modified.
        cols: Iterable of column names to encode.

    Returns:
        A new DataFrame with every column in ``cols`` removed and its dummy
        columns appended on the right.
    """
    for col in cols:
        # Explode only the column being encoded; exploding the whole frame
        # would copy every other column once per list element.
        labels = dfi[col].explode()
        dummies = pd.get_dummies(
            labels, drop_first=True, dtype=int, prefix=col, prefix_sep='_'
        )
        # Collapse the exploded rows back to one row per original index label.
        dummies = dummies.groupby(dummies.index).sum()
        dfi = pd.concat([dfi.drop(columns=[col]), dummies], axis=1)
    return dfi


In [34]:
# Columns handed to dum_enc for dummy (indicator) encoding.
cols = [
    'Sex',
    'Age_List',
    'Funder_Type',
    'Sponsor_Collab_List',
    'Study_Documents_List',
    'Document_Counts',
    'Intervention_Method_List',
    'Intervention_Type_List',
    'Conditions_List',
    'Adverse_List',
    'Adverse_System_List',
    'Allocation',
    'Intervention_Model',
    'Masking',
    'Masking_Detail_List',
    'Primary_Purpose',
    'Outcomes_List',
    'Continents_List',
]

# Encode every frame in order (df0 first, df5 last) and rebind the results.
df0, df1, df2, df3, df4, df5 = (
    dum_enc(frame, cols) for frame in (df0, df1, df2, df3, df4, df5)
)

df0.head()


Unnamed: 0,Study Status,Document_Counts_Bin,Placebo_Bin,Standard_Care_Bin,Healthy_Bin,Covid_19_Bin,Enrollment_Log,Document_Counts_Categ,Adverse_Counts_Log,Completion_Gap_Categ,...,Outcomes_List_OTHER_PRE_SPECIFIED,Outcomes_List_POST_HOC,Outcomes_List_PRIMARY,Outcomes_List_SECONDARY,Continents_List_Asia,Continents_List_Europe,Continents_List_None,Continents_List_North America,Continents_List_Oceania,Continents_List_South America
0,COMPLETED,No,No,No,No,No,3.850148,,0.0,"(20.0, 128.0]",...,0,0,0,0,0,1,0,0,0,0
1,COMPLETED,No,No,No,Yes,No,5.288267,,0.0,"(20.0, 128.0]",...,0,0,0,0,0,0,0,1,0,0
2,COMPLETED,No,No,No,Yes,No,2.944439,,0.0,"(20.0, 128.0]",...,0,0,0,0,0,1,0,0,0,0
3,TERMINATED,No,Yes,No,Yes,No,1.386294,,0.0,"(0.0, 20.0]",...,0,0,1,1,0,0,0,1,0,0
4,TERMINATED,No,No,No,No,No,0.0,,0.0,"(0.0, 20.0]",...,0,0,0,0,0,0,1,0,0,0


## Multi Label Binarizer

In [None]:
def mlb_enc(dfis, cols):
    """Multi-label binarize every column in ``cols`` for every frame in ``dfis``.

    Each listed column is replaced by one 0/1 indicator column per label,
    named ``<col>_<label>`` so that labels shared by several source columns
    do not produce duplicate column names.

    Fixes over the previous version: it referenced an undefined name ``df``,
    paired the i-th frame with the i-th column via ``zip`` instead of
    encoding every column in every frame, and returned only the last frame.

    Args:
        dfis: Iterable of DataFrames to encode; the inputs are not modified.
        cols: Iterable of column names. Each cell is expected to hold an
            iterable of labels.
            NOTE(review): a missing value (NaN) is not iterable and would
            make ``fit_transform`` fail; confirm the columns have none.

    Returns:
        A list with one encoded DataFrame per input frame, in input order.
    """
    encoded = []
    for dfi in dfis:
        for col in cols:
            # Fit a fresh binarizer per column so classes are not shared.
            mlb = MultiLabelBinarizer()
            indicators = mlb.fit_transform(dfi[col])
            dummies = pd.DataFrame(
                indicators,
                columns=[f"{col}_{label}" for label in mlb.classes_],
                index=dfi.index,
            )
            dfi = pd.concat([dfi.drop(columns=[col]), dummies], axis=1)
        encoded.append(dfi)
    return encoded


## Ordinal

In [51]:
def ord_encod(dfis, cols):
    """Ordinal-encode every column in ``cols`` for every frame in ``dfis``.

    The category order of a column is the sorted order of its distinct
    non-missing values, so the smallest value receives code 0. Each frame is
    modified in place.

    Fixes over the previous version: it paired the i-th frame with the i-th
    column via ``zip`` instead of encoding every column in every frame, and
    returned only the last frame.

    Args:
        dfis: Iterable of DataFrames; their columns are overwritten in place.
        cols: Iterable of column names to encode.
            NOTE(review): missing values are left out of the category list;
            confirm how ``OrdinalEncoder`` with ``dtype=int`` treats them
            before using this on columns that contain NaN.

    Returns:
        A list of the (mutated) input frames, in input order.
    """
    encoded = []
    for dfi in dfis:
        for col in cols:
            # One inner list because a single feature is encoded at a time.
            categories_list = [sorted(dfi[col].dropna().unique())]
            encoder = OrdinalEncoder(categories=categories_list, dtype=int)
            dfi[col] = encoder.fit_transform(dfi[[col]])
        encoded.append(dfi)
    return encoded


In [37]:
# Count the missing values in each column of df0.
df0.isnull().sum()

Study Status                     0
Document_Counts_Bin              0
Placebo_Bin                      0
Standard_Care_Bin                0
Healthy_Bin                      0
                                ..
Continents_List_Europe           0
Continents_List_None             0
Continents_List_North America    0
Continents_List_Oceania          0
Continents_List_South America    0
Length: 117, dtype: int64