In [445]:
import shutup; shutup.please()

In [446]:
import pandas as pd
pd.set_option('display.max_rows', 5000)
from datetime import datetime
import numpy as np
from mlxtend.frequent_patterns import fpgrowth, apriori
from mlxtend.preprocessing import TransactionEncoder
from spmf import Spmf

In [447]:
def basic_preprocessing(df):
    """Clean the raw event table and build a per-admission patient key.

    Steps, in order:
      1. drop every row that contains a missing value;
      2. parse ``charttime`` into datetimes;
      3. cast ``hadm_id`` to ``int``;
      4. rewrite ``subject_id`` as the string ``"<subject_id>_<hadm_id>"``;
      5. drop exact duplicate rows.

    Returns a new DataFrame; the frame passed in is left as it was.
    """
    cleaned = df.dropna().assign(
        charttime=lambda frame: pd.to_datetime(frame['charttime']),
        hadm_id=lambda frame: frame['hadm_id'].astype(int),
        # combine admission with subject_ids (uses the already-integer hadm_id)
        subject_id=lambda frame: (
            frame['subject_id'].astype("str") + "_" + frame['hadm_id'].astype("str")
        ),
    )
    return cleaned.drop_duplicates()
    
def map_mimic_to_loinc(df, loinc_path="./mimic_to_loinc.csv"):
    """Replace the ``col_id`` of lab events with the matching LOINC code.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table with at least ``subject_id``, ``hadm_id``, ``charttime``,
        ``col_id`` and ``tbl_name`` columns.
    loinc_path : str, optional
        CSV file holding the ``itemid`` -> ``loinc_code`` mapping. Defaults to
        the path that used to be hard-coded, so existing callers are unchanged.

    Returns
    -------
    pandas.DataFrame
        All rows whose ``tbl_name`` is not ``'labevents'`` (untouched),
        followed by the lab rows that have a LOINC match, with ``col_id`` set
        to the LOINC code. Lab rows without a match are dropped by the inner
        join, and the lab part keeps only the five columns listed above.
        The index is reset.
    """
    loinc = pd.read_csv(loinc_path)
    lab_rows = df.query("tbl_name=='labevents'")
    other_rows = df.query("tbl_name!='labevents'")

    # inner join: a lab item without a LOINC mapping is discarded
    joined_df = pd.merge(lab_rows, loinc, how='inner', left_on='col_id', right_on='itemid')
    lab_events = (
        joined_df[['subject_id', 'hadm_id', 'charttime', 'loinc_code', 'tbl_name']]
        .dropna()
        .drop_duplicates()
        .rename(columns={"loinc_code": "col_id"})
    )
    return pd.concat([other_rows, lab_events]).reset_index(drop=True)

# Generate a unique, zero-padded 5-digit ID for each distinct pharmacy string
def generate_string_pharm(df):
    """Build a lookup table from pharmacy ``col_id`` values to short codes.

    Every distinct ``col_id`` found on rows whose ``tbl_name`` is
    ``'pharmacy'`` gets a zero-padded five-character running number
    (``"00000"``, ``"00001"``, ...) in order of first appearance.

    Returns a DataFrame with the columns ``col_id`` and ``new_col_id``.
    """
    is_pharmacy = df['tbl_name'] == 'pharmacy'
    drug_names = df.loc[is_pharmacy, 'col_id'].drop_duplicates().tolist()
    code_pairs = [(name, f"{position:05}") for position, name in enumerate(drug_names)]
    return pd.DataFrame(code_pairs, columns=['col_id', 'new_col_id'])

def map_pharmacy_codes(df, pharm_mapping):
    """Swap pharmacy ``col_id`` strings for their short codes.

    ``pharm_mapping`` is the table with ``col_id`` / ``new_col_id`` columns.
    It is left-joined onto ``df``; rows whose ``tbl_name`` is ``'pharmacy'``
    take ``new_col_id`` as their ``col_id``, all other rows keep their
    original ``col_id``. The helper column is removed before returning.
    """
    merged = df.merge(pharm_mapping, how='left', left_on='col_id', right_on='col_id')
    from_pharmacy = merged['tbl_name'] == 'pharmacy'
    # only pharmacy rows are recoded, even if another table shares a col_id
    merged['col_id'] = np.where(from_pharmacy, merged['new_col_id'], merged['col_id'])
    return merged.drop(columns=["new_col_id"])

def add_prefix(df):
    """Prefix every ``col_id`` with the first three letters of its table name.

    For example a ``col_id`` of ``"2160-0"`` on a ``labevents`` row becomes
    ``"lab_2160-0"``. ``col_id`` values are expected to be strings.

    The result is a new DataFrame without the ``hadm_id`` column. Unlike the
    previous implementation, the frame passed in is NOT modified: the old code
    wrote a ``prefix`` column and the prefixed ``col_id`` back onto the
    caller's object, so calling it twice on the same frame double-prefixed.
    """
    # create a prefix based on 3 characters of each tbl_name
    table_prefix = df['tbl_name'].str[:3] + '_'
    # add the prefix to the col_id (assign works on a copy, keeping column order)
    prefixed = df.assign(col_id=table_prefix + df['col_id'])
    return prefixed.drop(columns=["hadm_id"])

def group_into_list(df):
    """Collect the ``col_id`` values recorded at the same time into one list.

    Rows are ordered by ``subject_id`` and ``charttime`` and then grouped on
    that pair. The result has one row per (``subject_id``, ``charttime``)
    with the grouped items in a ``col_id_list`` column.
    """
    ordered = df.sort_values(['subject_id', 'charttime'], ascending=[True, True])
    # add all items to a list
    items_per_timestamp = ordered.groupby(['subject_id', 'charttime'])['col_id'].apply(list)
    return items_per_timestamp.reset_index(name='col_id_list')

def closed_patterns(frequent):
    """Return the closed itemsets among a table of frequent itemsets.

    An itemset is closed when no proper superset in ``frequent`` has exactly
    the same support.

    Parameters
    ----------
    frequent : pandas.DataFrame
        Must provide a ``support`` column and an ``itemsets`` column holding
        set-like values (frozensets in this notebook).

    Returns
    -------
    list
        The closed itemsets, in the row order of ``frequent``.
    """
    supports = frequent['support'].tolist()
    itemsets = frequent['itemsets'].tolist()

    # Only itemsets sharing the exact same support can "absorb" one another,
    # so bucket them by support once instead of re-scanning the frame.
    same_support = {}
    for itemset, support in zip(itemsets, supports):
        same_support.setdefault(support, []).append(itemset)

    closed = []
    for itemset, support in zip(itemsets, supports):
        # `<` is the proper-subset test: different set AND fully contained
        absorbed = any(itemset < candidate for candidate in same_support[support])
        if not absorbed:
            closed.append(itemset)
    return closed
    
def generate_frequent_itemsets(df, min_support):
    """Mine closed frequent itemsets from the ``col_id_list`` transactions.

    The lists in ``df['col_id_list']`` are one-hot encoded with
    ``TransactionEncoder``, mined with ``fpgrowth`` at ``min_support`` and
    then reduced to the itemsets that ``closed_patterns`` reports as closed.

    Returns the filtered ``fpgrowth`` result (``support`` and ``itemsets``
    columns, with ``itemsets`` stored as frozensets).
    """
    transactions = df["col_id_list"].to_list()

    encoder = TransactionEncoder()
    encoded = encoder.fit(transactions).transform(transactions)
    onehot = pd.DataFrame(encoded, columns=encoder.columns_)

    frequent = fpgrowth(onehot, min_support=min_support, use_colnames=True)
    frequent['itemsets'] = frequent['itemsets'].apply(frozenset)

    closed = closed_patterns(frequent)
    is_closed = frequent['itemsets'].isin(closed)
    return frequent[is_closed]

def get_buckets(patterns):
    """Group itemsets by size, largest size first.

    Returns a list of buckets. Each bucket is the list of itemsets of one
    length, ordered by descending support; the buckets themselves are ordered
    by descending itemset length.

    Side effect: a ``length`` column (number of items per itemset) is written
    onto the ``patterns`` frame that was passed in.
    """
    # deliberately assigned on the caller's frame, not on a copy
    patterns['length'] = patterns['itemsets'].apply(len)

    ranked = patterns.sort_values(["length", "support"], ascending=[False, False])
    # groupby keeps the within-group row order, i.e. highest support first
    by_length = ranked.groupby(['length'])['itemsets'].apply(list)
    return by_length.sort_index(ascending=False).to_list()

def basket_mappings(patterns):
    """Give every multi-item itemset a synthetic ``basket_NNNNN`` identifier.

    Parameters
    ----------
    patterns : pandas.DataFrame
        Needs an ``itemsets`` column of hashable set-like values. A ``length``
        column is no longer required: the size filter is computed from
        ``itemsets`` directly, so the function works whether or not
        ``get_buckets`` was called on the frame beforehand.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``id_mapping_basket`` maps each itemset with more than one item to a
        one-element frozenset holding its basket id; ``mapping_basket_df``
        holds the same pairs in the columns ``basket`` and ``basket_id``.
    """
    itemsets = patterns['itemsets']
    has_several_items = itemsets.apply(len) > 1
    long_baskets = itemsets[has_several_items].drop_duplicates().to_list()

    id_mapping_basket = {
        basket: frozenset({f"basket_{index:05}"})
        for index, basket in enumerate(long_baskets)
    }
    mapping_basket_df = pd.DataFrame(list(id_mapping_basket.items()), columns=['basket', 'basket_id'])
    return id_mapping_basket, mapping_basket_df
    
def break_down(pattern, buckets, id_mapping_basket):
    """Greedily re-express a set of items using known frequent itemsets.

    ``buckets`` is walked in the given order (largest itemsets first when it
    comes from ``get_buckets``). Whenever an itemset is fully contained in
    what is left of ``pattern`` it is taken out of the pattern. Matched
    itemsets with more than one item are replaced by their basket id from
    ``id_mapping_basket``; matched single items and any unmatched leftovers
    are kept as they are.

    Parameters
    ----------
    pattern : frozenset
        Items to decompose.
    buckets : list of list of frozenset
        Candidate itemsets grouped by size; all itemsets in one bucket are
        assumed to have the same size (only the first one is measured).
    id_mapping_basket : dict
        Maps every multi-item itemset to a frozenset containing its basket id.

    Returns
    -------
    frozenset
        Basket ids and plain items that together stand for ``pattern``.
        Always a frozenset, including for an empty or fully consumed pattern.

    Fixes over the previous version: a fully consumed pattern no longer
    returns a raw list of unmapped itemsets, matched single items are no
    longer dropped from the result, and an empty pattern no longer raises
    ``TypeError``.
    """
    remaining = frozenset(pattern)
    matched = []

    for bucket in buckets:
        if not remaining:
            break
        # NOTE(review): the comparison is strict, so a bucket whose itemsets
        # are as large as what is left is skipped and an exact match is never
        # turned into its basket id. Kept as-is; confirm whether `<=` is meant.
        if not bucket or len(bucket[0]) >= len(remaining):
            continue
        for itemset in bucket:
            if itemset.issubset(remaining):
                matched.append(itemset)
                remaining = remaining.difference(itemset)
                if not remaining:
                    break

    # multi-item matches become basket ids, single items stay themselves
    parts = [id_mapping_basket[itemset] if len(itemset) > 1 else itemset for itemset in matched]
    if remaining:
        parts.append(remaining)

    # bound method on an empty frozenset: also valid when `parts` is empty
    return frozenset().union(*parts)
    
def get_final_mapping(df):
    """Number every distinct item that occurs in ``col_id_list``.

    Rows are ordered by ``subject_id`` and ``charttime``; items receive
    consecutive integers starting at 0 in order of first appearance.

    Returns
    -------
    (dict, dict)
        ``final_mapping`` maps item -> integer code and
        ``reverse_final_mapping`` maps integer code -> item.

    The previous version exploded and de-duplicated the column twice and kept
    one of the two identical results in an unused variable; it is now computed
    once.
    """
    ordered = df.sort_values(['subject_id', 'charttime'], ascending=[True, True])
    distinct_items = (
        ordered['col_id_list']
        .apply(list)          # cells may be frozensets; explode needs list-likes
        .explode()
        .drop_duplicates()
        .reset_index(drop=True)
    )
    reverse_final_mapping = distinct_items.to_dict()
    final_mapping = {item: code for code, item in reverse_final_mapping.items()}
    return final_mapping, reverse_final_mapping

def apply_mapping(item_list, final_mapping):
    """Translate each item of ``item_list`` through ``final_mapping``.

    Returns a list in the iteration order of ``item_list``; an item missing
    from the mapping raises ``KeyError``.
    """
    translated = []
    for item in item_list:
        translated.append(final_mapping[item])
    return translated

def get_sequences(df):
    """Build one sequence per ``subject_id``.

    Each sequence is the list of that subject's ``col_id_list`` entries in
    row order. Sequences are returned as a plain list, ordered by the sorted
    ``subject_id`` group keys.
    """
    per_subject = df.groupby(['subject_id'])['col_id_list'].apply(list)
    return per_subject.to_list()

def convert_string_list(string_list):
    """Parse a list of itemset strings into lists of integers.

    A string that contains a space is split on whitespace and every token is
    converted with ``int``; any other string is converted as a single
    integer. Example: ``["1 2", "3"]`` -> ``[[1, 2], [3]]``.
    """
    parsed = []
    for itemset_text in string_list:
        if ' ' in itemset_text:
            parsed.append([int(token) for token in itemset_text.split()])
        else:
            parsed.append([int(itemset_text)])
    return parsed

def reverse_apply_final_mapping(pattern):
    """Turn integer codes in a pattern back into their original items.

    ``pattern`` is a list of itemsets (lists of integer codes). Each code is
    looked up in the module-level ``reverse_final_mapping`` dict, which must
    exist before this function is called.
    """
    return [
        [reverse_final_mapping[code] for code in itemset]
        for itemset in pattern
    ]


In [452]:
# MIMIC -> LOINC table, kept at top level for ad-hoc code look-ups
lonic = pd.read_csv("./mimic_to_loinc.csv")
path = "./mimic_raw_data2.csv"
df = pd.read_csv(path)

# --- event-level clean-up and recoding ---
df = basic_preprocessing(df)
df = map_mimic_to_loinc(df)
pharm_mapping = generate_string_pharm(df)
df = map_pharmacy_codes(df, pharm_mapping)
df = add_prefix(df)
df = group_into_list(df)

# --- frequent itemsets -> baskets ---
# `patterns` and `buckets` used to exist only as leftover kernel state (the
# mining call was commented out and get_buckets was never called here), so the
# cell raised NameError after Restart & Run All.
patterns = generate_frequent_itemsets(df, 0.01)
# get_buckets also writes a `length` column onto `patterns`, so call it before
# anything that reads that column.
buckets = get_buckets(patterns)
id_mapping_basket, mapping_basket_df = basket_mappings(patterns)
df['col_id_list'] = df["col_id_list"].apply(lambda x: break_down(frozenset(x), buckets, id_mapping_basket))

# --- integer encoding and per-subject sequences ---
final_mapping, reverse_final_mapping = get_final_mapping(df)
df['col_id_list'] = df['col_id_list'].apply(lambda x: apply_mapping(x, final_mapping))
sequences = get_sequences(df)

In [503]:
# Run the "CloSpan" algorithm through the Spmf wrapper on `sequences`.
# NOTE(review): 0.05 is presumably the minimum support and the two remaining
# positional arguments optional CloSpan settings — confirm against the SPMF docs.
spmf = Spmf("CloSpan", input_direct=sequences,
             arguments=[0.05,"", True])
spmf.run()
# print(spmf.parse_output())
# Load the result into a DataFrame; the next cell reads its `pattern` column.
df_spmf = spmf.to_pandas_dataframe(pickle=True)

In [464]:
# Decode the mined patterns: itemset strings -> integer codes -> original items,
# then record how many itemsets each pattern contains.
df_spmf['pattern'] = (
    df_spmf['pattern']
    .apply(convert_string_list)
    .apply(reverse_apply_final_mapping)
)
df_spmf['len'] = df_spmf['pattern'].apply(len)

In [502]:
#df_spmf.query("len>1").sort_values("sup", ascending=False)

In [495]:
#mapping_basket_df[mapping_basket_df["basket_id"] == frozenset({"basket_01639"})]['basket'].tolist()

In [496]:
#lonic[lonic['loinc_code']=='1959-6']

In [501]:
#pharm_mapping.query("new_col_id == '00002'")