# Libraries

In [8]:
from functools import reduce

import pandas as pd
import seaborn as sns


# Load Data
- Only on train data. 
- Test data are 'unseen' --> no sparsity check. 

In [9]:
# Read the four `*_dummies` pickles into df1..df4 (paths are relative, Windows-style).
df1, df2, df3, df4 = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        r".\df_dummies\df1_dummies.pkl",
        r".\df_dummies\df2_dummies.pkl",
        r".\df_dummies\df3_dummies.pkl",
        r".\df_dummies\df4_dummies.pkl",
    )
)


# Resample

In [10]:
def fun_res(dfi):
    """Return the under-sampled training part of ``dfi``.

    The frame is split into features and the ``Study_Status_Bin`` target,
    an 80/20 stratified train/test split is taken (seed 42), and the
    training part is passed through ``RandomUnderSampler`` with
    ``sampling_strategy='auto'`` (seed 42). The test part is discarded.

    NOTE(review): ``train_test_split`` and ``RandomUnderSampler`` are not
    imported in the visible cells - confirm they are imported elsewhere.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Frame containing a ``Study_Status_Bin`` column.

    Returns
    -------
    tuple
        ``(X_train_res, y_train_res)``: resampled features wrapped in a
        DataFrame, and the resampled target as returned by the sampler.
    """
    target = "Study_Status_Bin"
    features = dfi.drop(target, axis=1)
    labels = dfi[target]

    # Only the training side of the split is needed here.
    X_part, _, y_part, _ = train_test_split(
        features, labels, test_size=0.2, stratify=labels, random_state=42
    )

    sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_train_res, y_train_res = sampler.fit_resample(X_part, y_part)

    # Wrap in a DataFrame in case the sampler handed back a plain array.
    return pd.DataFrame(X_train_res), y_train_res

# Resampled training features/target for each of the four frames.
(
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
) = map(fun_res, (df1, df2, df3, df4))

# Create X_y dfs

In [11]:
# Put the target back next to the resampled features, column-wise.
# (The original note states that X_train and y_train share the same index.)
df1_train, df2_train, df3_train, df4_train = (
    pd.concat([X_part, y_part], axis=1)
    for X_part, y_part in (
        (X_train1, y_train1),
        (X_train2, y_train2),
        (X_train3, y_train3),
        (X_train4, y_train4),
    )
)

# Function

In [12]:
# There are no categ_cols: all were dropped due to the obvious high sparsity risk seen in the distribution plots (see the vizual file, distributions).
# All dfis have the same columns.
def fun_sparse(i, dfi, categ_cols):
    """Count rows per level of each column, split by ``Study_Status_Bin``.

    For every column in ``categ_cols`` a level-by-target count table is
    built with ``pd.pivot_table(aggfunc="size")``; missing combinations are
    filled with 0. The per-column tables are stacked into one frame whose
    ``Variables`` column holds ``"<column> = <level>"`` labels.

    Parameters
    ----------
    i : int or str
        Tag inserted into the count column names (``'0 - df{i}'``,
        ``'1 - df{i}'``).
    dfi : pandas.DataFrame
        Frame containing ``Study_Status_Bin`` and the columns to tabulate.
    categ_cols : str or iterable of str
        Column name(s) to tabulate. A single name is accepted as well.

    Returns
    -------
    pandas.DataFrame
        ``Variables`` followed by the count columns. Target values 0 and 1
        are renamed to ``'0 - df{i}'`` / ``'1 - df{i}'``. When
        ``categ_cols`` is empty, an empty frame with those three columns is
        returned instead of letting ``pd.concat`` fail on an empty list.

    Raises
    ------
    KeyError
        If a requested column (or ``Study_Status_Bin``) is not in ``dfi``.
    """
    target = "Study_Status_Bin"
    renamed = {0: f'0 - df{i}', 1: f'1 - df{i}'}

    # Accept a bare column name; iterate the names directly instead of
    # slicing (and copying) the frame just to read its column labels.
    if isinstance(categ_cols, str):
        categ_cols = [categ_cols]

    pivot_tables = []
    for col in categ_cols:
        counts = pd.pivot_table(
            data=dfi,
            index=col,
            columns=target,
            aggfunc="size",
            fill_value=0,
            observed=False,
        ).reset_index()
        # Label each row "<column> = <level>" and place the label first.
        counts.insert(0, 'Variables', col + ' = ' + counts[col].astype(str))
        pivot_tables.append(counts.drop(columns=[col]))

    if not pivot_tables:
        # Nothing to tabulate: keep the output schema so later merges on
        # "Variables" still work.
        empty_cols = pd.Index(['Variables', *renamed.values()], name=target)
        return pd.DataFrame(columns=empty_cols)

    pivot_merged = pd.concat(pivot_tables, ignore_index=True)
    return pivot_merged.rename(columns=renamed)


# Categ Pivot

In [13]:
# train sets
# Pick the `_Categ` / `_Bin` predictor columns. The parentheses make the
# `Study_Status_Bin` exclusion apply to both patterns; without them `and`
# binds tighter than `or` and the exclusion only guards the `_Bin` branch.
categ_cols = [
    col for col in df3_train.columns
    if ('_Categ' in col or '_Bin' in col) and 'Study_Status_Bin' not in col
]

# One count table per training frame, tagged with the frame number.
pivot_merged1 = fun_sparse(1, df1_train, categ_cols)
pivot_merged2 = fun_sparse(2, df2_train, categ_cols)
pivot_merged3 = fun_sparse(3, df3_train, categ_cols)
pivot_merged4 = fun_sparse(4, df4_train, categ_cols)

# Outer-merge on "Variables" so a level missing from one frame is kept (as NaN).
all_pivots = [pivot_merged1, pivot_merged2, pivot_merged3, pivot_merged4]
pivot_merged_train_categ = reduce(lambda left, right: pd.merge(left, right, on="Variables", how="outer"), all_pivots)
pivot_merged_train_categ


Study_Status_Bin,Variables,0 - df1,1 - df1,0 - df2,1 - df2,0 - df3,1 - df3,0 - df4,1 - df4
0,Funder_Industry_Bin = 0,694,1104,2367,2619,679,820,1502,1539
1,Funder_Industry_Bin = 1,1852,1442,1972,1720,1248,1107,345,308
2,Healthy_Bin = 0,977,1935,3817,4095,1655,1789,1390,1561
3,Healthy_Bin = 1,1569,611,522,244,272,138,457,286
4,Placebo_Bin = 0,2058,2154,2754,3052,1198,1114,1411,1383
5,Placebo_Bin = 1,488,392,1585,1287,729,813,436,464
6,Standard_Care_Bin = 0,2519,2455,4169,4061,1856,1783,1771,1705
7,Standard_Care_Bin = 1,27,91,170,278,71,144,76,142


# List pivot

In [14]:
# train sets
# `_List` dummy columns, never the target itself.
list_cols = [
    name for name in df3_train.columns
    if 'Study_Status_Bin' not in name and '_List' in name
]

# Count table for each training frame, tagged 1..4.
pivot_merged1, pivot_merged2, pivot_merged3, pivot_merged4 = (
    fun_sparse(tag, frame, list_cols)
    for tag, frame in enumerate((df1_train, df2_train, df3_train, df4_train), start=1)
)

# Fold the four tables together with outer merges on "Variables".
all_pivots = [pivot_merged1, pivot_merged2, pivot_merged3, pivot_merged4]
pivot_merged_train_list = reduce(
    lambda acc, nxt: acc.merge(nxt, on="Variables", how="outer"),
    all_pivots,
)
pivot_merged_train_list



Study_Status_Bin,Variables,0 - df1,1 - df1,0 - df2,1 - df2,0 - df3,1 - df3,0 - df4,1 - df4
0,Adverse_List_Adv_Death = 0,2363,2333,3209,3308,1289,1479,1582,1656
1,Adverse_List_Adv_Death = 1,183,213,1130,1031,638,448,265,191
2,Adverse_List_Adv_Serious = 0,2396,2303,2832,3093,1065,1350,1586,1692
3,Adverse_List_Adv_Serious = 1,150,243,1507,1246,862,577,261,155
4,Adverse_List_Adv_Unspecified = 0,2158,2167,2414,2714,991,1252,1438,1571
...,...,...,...,...,...,...,...,...,...
175,Primary_Purpose_List_TREATMENT = 1,1609,1977,3632,3812,1563,1633,1237,1342
176,Sex_List_FEMALE = 0,2443,2413,3950,4053,1763,1766,1638,1644
177,Sex_List_FEMALE = 1,103,133,389,286,164,161,209,203
178,Sex_List_MALE = 0,2147,2388,4177,4164,1875,1871,1787,1805


# Zeros

In [15]:
def fun_zeros(pivot_merged, count):
    """Return the rows of ``pivot_merged`` that have any count below ``count``.

    Only numeric columns are inspected. A missing value (NaN) is treated as
    a count of 0: after an outer merge, NaN means the level never occurred
    in that frame, and a plain ``NaN < count`` comparison is False, which
    would silently hide exactly the sparsest cells.

    Parameters
    ----------
    pivot_merged : pandas.DataFrame
        Table with a label column and numeric count columns.
    count : int or float
        Threshold; a row is returned when at least one numeric cell is
        strictly smaller than this value.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index and original values
        (NaN cells are not overwritten in the output).
    """
    counts = pivot_merged.select_dtypes(include='number').fillna(0)
    sparse = pivot_merged[(counts < count).any(axis=1)]

    return sparse

In [16]:
# train sets
# Flag every level with a count below 5 in any frame/class cell.
sparse_categ_train = fun_zeros(pivot_merged=pivot_merged_train_categ, count=5)  # Bin columns are all ok
sparse_list_train = fun_zeros(pivot_merged=pivot_merged_train_list, count=5)

# Show the Bin/Categ result first, then the List result.
display(sparse_categ_train)
display(sparse_list_train)


Study_Status_Bin,Variables,0 - df1,1 - df1,0 - df2,1 - df2,0 - df3,1 - df3,0 - df4,1 - df4


Study_Status_Bin,Variables,0 - df1,1 - df1,0 - df2,1 - df2,0 - df3,1 - df3,0 - df4,1 - df4
59,BEHAVIORAL_x_Funder_Industry_List = 1,0,1,14,6,3,1,1,1
113,DIETARY_SUPPLEMENT_x_Funder_Industry_List = 1,5,3,23,12,7,6,5,1
117,INTERV_UNSPES_x_Funder_Industry_List = 1,16,27,24,39,20,22,4,5
165,PROCEDURE_x_Funder_Industry_List = 1,9,8,15,12,12,13,4,7


# Merge/Drop Categories for sparse data. 
- Some categories were merged in the Thesis_data file, on the whole dataset (df), because they were obviously very small compared to the dataset.
- Some categories were merged in the vizual file, because they created sparse data and infinite coefficients in the logistic model output. In the vizual file there was no train/test split, so they were already obviously sparse.
- Some categories were merged after being displayed here as 'risky for sparse' data. They were not merged in this file, in order not to have the merged levels spread across different files.

- Some columns were dropped here. These are interaction dummy columns that had too few samples, e.g. < 5.


In [17]:
def fun_drop_cols(dfi, cols=None):
    """Return ``dfi`` without the sparse interaction dummy columns.

    Parameters
    ----------
    dfi : pandas.DataFrame
        Frame to drop columns from; it is not modified.
    cols : iterable of str, optional
        Columns to drop. Defaults to the four ``*_x_Funder_Industry_List``
        interaction columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame without the requested columns.

    Raises
    ------
    KeyError
        If any requested column is not present in ``dfi``.
    """
    if cols is None:
        cols = [
            'BEHAVIORAL_x_Funder_Industry_List',
            'DIETARY_SUPPLEMENT_x_Funder_Industry_List',
            'INTERV_UNSPES_x_Funder_Industry_List',
            'PROCEDURE_x_Funder_Industry_List',
        ]
    # `columns=` already names the axis, so no separate `axis` argument.
    return dfi.drop(columns=list(cols))

# Drop the sparse interaction columns from the full (unsplit) frames.
df1, df2, df3, df4 = map(fun_drop_cols, (df1, df2, df3, df4))

# Save Dfis

In [18]:
# Write each reduced frame to its `*_sparse` pickle (paths are relative, Windows-style).
for frame, out_path in (
    (df1, r".\df_sparse\df1_sparse.pkl"),
    (df2, r".\df_sparse\df2_sparse.pkl"),
    (df3, r".\df_sparse\df3_sparse.pkl"),
    (df4, r".\df_sparse\df4_sparse.pkl"),
):
    frame.to_pickle(out_path)