# Functions documentation

## Use these functions in succession to create a custom dataset. If you would like, clone this repository and adjust the functions to include the features of your liking. 

## For example, in `file_merge()`, adjust the list of kept columns (selected at the end of the function) to reflect whichever columns from the original dataset you want to include in your own.

### *sorter(drug_file_path, top_indices=None)

* Convert all values in 'prod_ai' column to string values, and then separate non-nan values for class mapping.

* The 'prod_ai' (product active ingredient) column is used for this and downstream functions because unlike the brand name, an active ingredient/generic name may have a shared suffix with other medications, which makes the mapping functions computationally efficient.

* Append each sorted dataframe as a list to allow mapping function iterations to run separately and maintain data integrity.
* The top_indices argument slices the value_counts() output of the prod_ai column, so that only rows belonging to the most frequent prod_ai values are kept.

In [None]:
# Global accumulators filled by sorter(); each call appends one single-element
# list to class_dfs, missing_dfs, positives and indices.
class_dfs = []
missing_dfs = []
positives = []
indices = []
grouped_list = []


def sorter(drug_file_path, top_indices=None):
    """Split a '$'-delimited drug file into rows with and without a prod_ai.

    The prod_ai column is converted to strings with every '.' removed. Rows
    whose prod_ai is the string 'nan' are treated as missing; all others are
    kept for class mapping, grouped by prod_ai in descending order of
    frequency.

    Args:
        drug_file_path: Path of the '$'-delimited file; it must contain the
            columns 'primaryid' and 'prod_ai'.
        top_indices: Number of most frequent prod_ai values to keep. ``None``
            keeps every value.

    Returns:
        None. Results are appended, each wrapped in a one-element list, to the
        module-level lists ``class_dfs``, ``missing_dfs``, ``positives`` and
        ``indices``.
    """
    columns = ['primaryid', 'prod_ai', 'class_id', 'class', 'indication']
    drug_file = pd.read_csv(drug_file_path, delimiter='$')

    # Missing names become the literal string 'nan'. The fillna is a no-op
    # when astype(str) already produced 'nan' and covers pandas versions that
    # keep missing values through the cast.
    names = drug_file['prod_ai'].astype(str).fillna('nan')
    drug_file['prod_ai'] = names.str.replace('.', '', regex=False)

    has_name = drug_file['prod_ai'] != 'nan'
    index = drug_file.index[has_name]
    nan_index = drug_file.index[~has_name]

    present = drug_file.prod_ai.loc[index]
    absent = drug_file.prod_ai.loc[nan_index]

    # Start from an empty, object-typed frame so that class_id, class and
    # indication exist as empty columns that the mapping functions fill later.
    class_df = pd.DataFrame(columns=columns)
    class_df['primaryid'] = drug_file.primaryid.loc[index]
    class_df['prod_ai'] = present

    # The frame must be filled column by column; rebinding the name to the
    # primaryid Series would discard every other column.
    missing_df = pd.DataFrame(columns=columns)
    missing_df['primaryid'] = drug_file.primaryid.loc[nan_index]
    missing_df['prod_ai'] = absent

    # A stop of None slices the full value_counts() output (a stop of -1 would
    # drop the least frequent name).
    top_names = class_df['prod_ai'].value_counts().iloc[:top_indices].index

    # Group rows by name in frequency order with one stable sort, keeping the
    # original row order inside each group.
    rank = {name: position for position, name in enumerate(top_names)}
    kept = class_df[class_df['prod_ai'].isin(top_names)]
    order = kept['prod_ai'].map(rank).to_numpy().argsort(kind='stable')
    class_df = kept.iloc[order]

    class_dfs.append([class_df])
    missing_dfs.append([missing_df])
    positives.append([present])
    indices.append([index])

    print('Check "class_dfs", "missing_dfs", "positives" and "indices" for output')

### *map_1(class_df,array_split)

* First round of mapping logic.
* The array_split parameter is passed through each round of logic and is used within the final round of mapping (map_5). This integer value tells a final if/else statement whether the split dataframe has its full parts within a list, in which case it will concatenate the parts into the size of the original dataframe (minus values that did not meet mapping logic conditions). See 'for loop' example at bottom.
* At completion of iteration, separate mapped drug names and indices from drug names and indices where no class was mapped, then send the unmapped entries into the next mapping function.
     * Instead of sending each original dataframe through the full mapping logic, which is extremely computationally expensive, split the dataframe with the numpy function np.array_split(), send each section through the logic, and separate out the entries that returned nan. That smaller dataframe is then sent through the next mapping function, which has the same .loc separator steps; an even smaller dataframe is then sent through the third round of logic, and so on.
     * This cascade-style mapping proves to be very efficient, especially when handling 1.5+ million observations per dataframe.

* Create local variable for mapped entries, and send that to next function to merge with the next round of mapped entries.

In [None]:
# First round of the mapping cascade.
#
# Parameters:
#   class_df    - dataframe whose prod_ai values and index labels are iterated
#                 by the mapping loop and whose class_id column is inspected
#                 afterwards.
#   array_split - passed through unchanged to map_2.
#
# Returns whatever map_2 returns.
def map_1(class_df, array_split):

    # x is a prod_ai value and y the index label of the same row.
    for x,y in zip(class_df.prod_ai,class_df.index):
    
        (mapping logic)...
        
    
    # Cast class_id to str so rows can be split by comparing with the string 'nan'.
    class_df.class_id = class_df.class_id.astype(str)
    # lead_df: rows whose class_id is not 'nan'; df_2: rows still 'nan'.
    lead_df = class_df[class_df.class_id != 'nan']
    df_2 = class_df[class_df.class_id == 'nan']
    
    idx = df_2.index
    drugs = df_2.prod_ai
   
    # Only the 'nan' rows go through the next round; lead_df is carried along.
    return map_2(df_2,drugs,idx,lead_df,array_split)

### *map_2(class_df,drugs,idx,lead_df,array_split)

* Second round of mapping logic.
* See map_1 for explanation...
* Create local variable of concatenated dataframes, and send that to next function to merge with the next round of mapped entries.

In [None]:
# Second round of the mapping cascade.
#
# Parameters:
#   class_df    - dataframe of rows to run through this round.
#   drugs       - values iterated together with idx by the mapping loop
#                 (map_1 passes the prod_ai column of class_df).
#   idx         - index labels iterated together with drugs
#                 (map_1 passes the index of class_df).
#   lead_df     - rows handed over by the previous round; concatenated with
#                 this round's non-'nan' rows.
#   array_split - passed through unchanged to map_3.
#
# Returns whatever map_3 returns.
def map_2(class_df,drugs,idx,lead_df,array_split):

    for x,y in zip(drugs,idx):
    
        (mapping logic)...
        
    # Cast class_id to str so rows can be split by comparing with the string 'nan'.
    class_df.class_id = class_df.class_id.astype(str)
            
    # df_2: rows whose class_id is not 'nan'; df_3: rows still 'nan'.
    df_2 = class_df[class_df.class_id != 'nan']
    df_3 = class_df[class_df.class_id == 'nan']
    final_df = pd.concat([lead_df, df_2])
    
    idx = df_3.index
    drugs = df_3.prod_ai
    
    # Only the 'nan' rows go through the next round; final_df is carried along.
    return map_3(df_3,drugs,idx, final_df,array_split)

### *map_3(class_df,drugs,idx, final_df,array_split)

* Third round of mapping logic.
* see map_1 for explanation...
* Create local variable of concatenated dataframes, and send that to next function to merge with the next round of mapped entries.

In [None]:
# Global result lists; map_5 appends the concatenated, sorted dataframes to them.
final_dfs = []
miss_dfs = []

# Third round of the mapping cascade.
#
# Parameters:
#   class_df    - dataframe of rows to run through this round.
#   drugs       - values iterated together with idx by the mapping loop
#                 (map_2 passes the prod_ai column of class_df).
#   idx         - index labels iterated together with drugs
#                 (map_2 passes the index of class_df).
#   final_df    - rows accumulated by the previous rounds; this round's
#                 non-'nan' rows are concatenated onto it.
#   array_split - passed through unchanged to map_4.
#
# Returns whatever map_4 returns.
def map_3(class_df,drugs,idx,final_df,array_split):
    
    for x,y in zip(drugs,idx):
        
        (mapping logic)...
        
    # Cast class_id to str so rows can be split by comparing with the string 'nan'.
    class_df.class_id = class_df.class_id.astype(str)
            
    # df_3: rows whose class_id is not 'nan'; df_4: rows still 'nan'.
    df_3 = class_df[class_df.class_id != 'nan']
    df_4 = class_df[class_df.class_id == 'nan']
    final_df = pd.concat([final_df, df_3])
    
    idx = df_4.index
    drugs = df_4.prod_ai
    
    # Only the 'nan' rows go through the next round; final_df is carried along.
    return map_4(df_4,drugs,idx,final_df, array_split)

### *map_4(class_df,drugs,idx,final_df,array_split)
* Fourth round of mapping logic.
* See map_1 for explanation...
* Create local variable of concatenated dataframes, and send that to next function to merge with the next round of mapped entries.

In [None]:
def map_4(class_df,drugs,idx,final_df,array_split):
    """Run the fourth round of the mapping cascade.

    Args:
        class_df: Dataframe of rows to run through this round.
        drugs: Values iterated together with ``idx`` by the mapping loop
            (map_3 passes the prod_ai column of ``class_df``).
        idx: Index labels iterated together with ``drugs`` (map_3 passes the
            index of ``class_df``).
        final_df: Rows accumulated by the previous rounds; this round's
            non-'nan' rows are concatenated onto it.
        array_split: Passed through unchanged to map_5.

    Returns:
        Whatever map_5 returns.
    """
    for x, y in zip(drugs, idx):
        ...  # (mapping logic) -- elided in this documentation

    # Cast class_id to str so rows can be split by comparing with 'nan'.
    class_df.class_id = class_df.class_id.astype(str)

    unmapped = class_df.class_id == 'nan'
    df_4 = class_df[~unmapped]
    df_5 = class_df[unmapped]
    final_df = pd.concat([final_df, df_4])

    # Only the 'nan' rows go through the last round; final_df is carried along.
    return map_5(df_5, df_5.prod_ai, df_5.index, final_df, array_split)
    

### *map_5(class_df,drugs,idx,final_df,array_split)
* Create local variable of concatenated dataframes and a dataframe of all entries that did not meet any of the mapping logic, then append each into their respective global list to examine once functions are completed.
* The final if/else statement checks if each section of the original dataframe is present in a list, and then concatenates the list to recreate the original dataframe (minus values that did not meet mapping logic).

In [None]:
def map_5(class_df,drugs,idx, final_df,array_split):
    """Run the last mapping round and collect the results of one chunk.

    The rows with a class_id are concatenated onto ``final_df``; the rows that
    are still 'nan' form the chunk's missing dataframe. Both are appended to
    the global lists ``final_storage_list`` and ``final_missing_storage_list``.
    Once ``final_storage_list`` holds ``array_split`` entries, both lists are
    concatenated, sorted by primaryid, appended to the global ``final_dfs`` and
    ``miss_dfs`` lists, and then reset for the next dataframe.

    Args:
        class_df: Dataframe of rows to run through this round.
        drugs: Values iterated together with ``idx`` by the mapping loop.
        idx: Index labels iterated together with ``drugs``.
        final_df: Rows accumulated by the previous rounds.
        array_split: Number of chunks the original dataframe was split into.

    Returns:
        None.

    Note:
        ``final_storage_list`` and ``final_missing_storage_list`` must exist as
        module-level lists before the first call; their initialisation is not
        part of this section.
    """
    global final_storage_list
    global final_missing_storage_list

    for x, y in zip(drugs, idx):
        ...  # (mapping logic) -- elided in this documentation

    # Cast class_id to str so rows can be split by comparing with 'nan'.
    class_df.class_id = class_df.class_id.astype(str)
    miss_df = class_df[class_df.class_id == 'nan']
    class_df = class_df[class_df.class_id != 'nan']
    final_df = pd.concat([final_df, class_df])

    final_storage_list.append(final_df)
    final_missing_storage_list.append(miss_df)

    if len(final_storage_list) == array_split:
        final_df = pd.concat(final_storage_list)
        final_df = final_df.sort_values(by='primaryid').reset_index(drop=True)
        final_dfs.append(final_df)

        miss_df = pd.concat(final_missing_storage_list)
        miss_df = miss_df.sort_values(by='primaryid').reset_index(drop=True)
        miss_dfs.append(miss_df)

        # Reset BOTH accumulators; otherwise the missing rows of this
        # dataframe would be concatenated into the next one as well.
        final_storage_list = []
        final_missing_storage_list = []

### Example for loop usage

In [None]:
# Number of chunks each dataframe is split into. map_5 compares the number of
# collected chunks with the value handed to map_1, so the split count and the
# map_1 argument must be the same number; define it once.
n_chunks = 100

for c in class_dfs:
    for class_df in c:
        for df in np.array_split(class_df, n_chunks):
            map_1(df, n_chunks)

### *reacs_map(reactions__file_path)
* Create dictionary of primaryids as key, and pt (Preferred Term) reaction as value(s). 
* Iterate through dictionary, join values and create dataframe.

In [None]:
def reacs_map(reactions__file_path):
    """Collect the reactions of every primaryid from a '$'-delimited file.

    The first row of the file is skipped. For every following row, field 2 is
    appended to the list stored under field 0 (the primaryid). The values of
    each primaryid are then joined with ' , ' into the 'pt' column of a
    dataframe indexed and sorted by primaryid.

    Args:
        reactions__file_path: Path of the '$'-delimited reactions file.

    Returns:
        None. The id -> values dictionary is appended to the global
        ``reactions_by_id_list`` and the dataframe to the global
        ``final_reacs``.
    """
    reactions_by_id = {}
    # newline='' is the csv module's recommended way to open files for reading.
    with open(reactions__file_path, newline='') as csvfile:
        reacreader = csv.reader(csvfile, delimiter='$')
        next(reacreader, None)  # skip the first row; tolerate an empty file

        for row in reacreader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            reactions_by_id.setdefault(row[0], []).append(row[2])

    reactions_by_id_list.append(reactions_by_id)

    # Build both columns in one pass instead of assigning row by row.
    reac_df = pd.DataFrame({
        'primaryid': list(reactions_by_id),
        'pt': [' , '.join(terms) for terms in reactions_by_id.values()],
    })
    reac_df = reac_df.sort_values(by='primaryid').set_index('primaryid')
    final_reacs.append(reac_df)

    print('completed')

### *outs_map(outcomes_file_path)
* Create dictionary of primaryids as key, and out_code (Outcome Code) as value(s).
* Iterate through dictionary, join values, and create dataframe.

In [None]:
def outs_map(outcomes_file_path):
    """Collect the outcome codes of every primaryid from a '$'-delimited file.

    The first row of the file is skipped. For every following row, field 2 is
    appended to the list stored under field 0 (the primaryid). The values of
    each primaryid are then joined with ' , ' into the 'out_code' column of a
    dataframe indexed and sorted by primaryid.

    Args:
        outcomes_file_path: Path of the '$'-delimited outcomes file.

    Returns:
        None. The id -> values dictionary is appended to the global
        ``outcomes_by_id_list`` and the dataframe to the global
        ``final_outs``.
    """
    outcomes_by_id = {}
    # newline='' is the csv module's recommended way to open files for reading.
    with open(outcomes_file_path, newline='') as csvfile:
        outcreader = csv.reader(csvfile, delimiter='$')
        next(outcreader, None)  # skip the first row; tolerate an empty file

        for row in outcreader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            outcomes_by_id.setdefault(row[0], []).append(row[2])

    outcomes_by_id_list.append(outcomes_by_id)

    # Build both columns in one pass instead of assigning row by row.
    out_df = pd.DataFrame({
        'primaryid': list(outcomes_by_id),
        'out_code': [' , '.join(codes) for codes in outcomes_by_id.values()],
    })
    out_df = out_df.sort_values(by='primaryid').set_index('primaryid')
    final_outs.append(out_df)

    print('done')

### *file_merge(final_class_df_path, final_reac_path, final_out_path)
* First check whether each of the three arguments is a string value; if it is, read the file at that path. Otherwise the argument must be a dataframe object.
* Then set index to primaryid for the dataframes, iterate through them, and cast values on matching indices from the reactions and outcomes dataframes to the dataframe retrieved from the class mapping logic.

In [None]:
def _as_frame(source):
    """Return *source* as a dataframe, reading it with pd.read_csv if it is a path string."""
    if isinstance(source, str):
        return pd.read_csv(source)
    return source


def _index_by_primaryid(frame):
    """Return a new frame indexed by primaryid, whether it is a column or already the index."""
    if 'primaryid' in frame.columns:
        return frame.set_index('primaryid')
    # Already indexed; copy so the caller's dataframe is never modified.
    return frame.copy()


def _values_for(target_index, source, column):
    """Look up source[column] for every label of target_index; unmatched labels get 'nan'."""
    # Source labels are converted with int() before matching.
    lookup = pd.Series(source[column].to_numpy(), index=source.index.map(int))
    # reindex needs unique labels; keep the first entry of a repeated id.
    # NOTE(review): repeated ids are not expected in the reaction/outcome
    # frames -- confirm against the producers.
    lookup = lookup[~lookup.index.duplicated(keep='first')]
    matched = lookup.reindex(target_index)
    return matched.where(matched.notna(), 'nan').to_numpy()


def file_merge(final_class_df_path, final_reac_path, final_out_path):
    """Attach reactions and outcome codes to the class-mapped dataframe.

    Each argument may be either a CSV path (read with ``pd.read_csv``) or a
    dataframe; the three are handled independently. Rows are matched on
    primaryid. Rows of the class dataframe without a match keep the string
    'nan' in 'pt' / 'out_code'.

    Args:
        final_class_df_path: Path or dataframe of the class-mapped drugs.
        final_reac_path: Path or dataframe holding a 'pt' column.
        final_out_path: Path or dataframe holding an 'out_code' column.

    Returns:
        None. The merged dataframe, reduced to the ``kept`` columns, is
        appended to the global ``custom_dfs`` list.

    Raises:
        KeyError: If one of the ``kept`` columns is not present.
    """
    sd = _index_by_primaryid(_as_frame(final_class_df_path))
    fr = _index_by_primaryid(_as_frame(final_reac_path))
    fo = _index_by_primaryid(_as_frame(final_out_path))

    # One aligned lookup per column replaces comparing every pair of ids.
    sd['pt'] = _values_for(sd.index, fr, 'pt')
    print('finished with reactions')

    sd['out_code'] = _values_for(sd.index, fo, 'out_code')
    print('finished with outcomes')

    # Columns of the final dataset; adjust this list to keep other columns.
    # NOTE(review): 'drugname' is not among the columns sorter() creates --
    # confirm that the class dataframe passed in carries it.
    kept = ['primaryid', 'drugname', 'class', 'class_id', 'indication', 'pt', 'out_code']
    # primaryid is the index here, so it has to become a column again before
    # it can be selected.
    sd = sd.rename_axis('primaryid').reset_index()[kept]
    print('completed')
    custom_dfs.append(sd)