# This notebook is for the drug/prescription information.

In [None]:
import pandas as pd
import numpy as np
import time

In [None]:
# Load the '$'-delimited drug/prescription table into a DataFrame.
# NOTE(review): the file name suggests a quarterly "DRUG" extract (2020 Q3) — confirm the source.
drug_file = pd.read_csv('DRUG20Q3.txt', delimiter='$')

In [None]:
# Quantify missing values per column, most incomplete columns first.
#
# The labels are assigned as a flat list on purpose: a nested list such as
# [['column', 'missing_values']] makes pandas build a one-level MultiIndex,
# which breaks plain selection like null_df['missing_values'].
null_df = (
    drug_file.isnull()
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
null_df.columns = ['column', 'missing_values']

null_df

In [None]:
# Print a summary of the table: each column's dtype and non-null count.

drug_file.info()

# Handling Missing Values

In [None]:
# Cast prod_ai to str so that missing entries become the literal 'nan', then
# split the row labels into those with and those without a product active
# ingredient and pull the matching prod_ai values into two separate Series.
drug_file.prod_ai = drug_file.prod_ai.astype(str)

is_missing = drug_file.prod_ai == 'nan'
indicies = drug_file.index[~is_missing]
indicies2 = drug_file.index[is_missing]

present = drug_file.loc[indicies, 'prod_ai']
needs_work = drug_file.loc[indicies2, 'prod_ai']



In [None]:
# Build a frame from the rows whose prod_ai is the 'nan' placeholder and attach
# the drugname recorded on those same rows, then count how often each drugname
# is missing its active ingredient. Needs work...
needs_work_df = needs_work.to_frame()
needs_work_df['drugname'] = drug_file.loc[indicies2, 'drugname']

needs_work_df['drugname'].value_counts()


# Custom Table Creation

In [None]:
# Columns kept for the per-case analyses that follow.
drug_columns = [
    'primaryid',
    'caseid',
    'drug_seq',
    'role_cod',
    'drugname',
    'prod_ai',
    'val_vbm',
    'route',
    'dose_vbm',
    'cum_dose_chr',
]
new_drug_df = drug_file[drug_columns]

# Empty frame that the per-case role tally fills in.
drugs = pd.DataFrame()

In [None]:
# Tally, for every unique caseid, how many drug rows are 'PS' (primary
# suspect), 'SS' (secondary suspect) or 'C' (concomitant). Every other
# role_cod value (including 'I', interacting) lands in 'Interacting', exactly
# like the final `else` of the loop this replaces.
#
# Fixes relative to the original row-by-row loop:
#   * the concomitant test compared against lowercase 'c', so 'C' rows were
#     all counted as interacting;
#   * filtering the frame once per caseid and writing scalars through .loc
#     took ~6.5 hours; one cross-tabulation produces the same table in a
#     single pass.

start_time = time.time()

role_to_column = {'PS': 'Primary', 'SS': 'Secondary', 'C': 'Concamitant'}
count_columns = ['Primary', 'Secondary', 'Concamitant', 'Interacting']

# Codes outside the mapping (and missing role_cod values) fall into the
# catch-all 'Interacting' bucket.
role_bucket = new_drug_df.role_cod.map(role_to_column).fillna('Interacting')

role_counts = pd.crosstab(new_drug_df.caseid, role_bucket)

# Keep caseids in order of first appearance (as caseid.unique() did in the
# loop) and guarantee all four columns exist even if a bucket never occurs.
role_counts = role_counts.reindex(
    index=new_drug_df.caseid.unique(),
    columns=count_columns,
    fill_value=0,
)

# The original loop stored the counts as floats; keep that so drugs.csv stays
# compatible with earlier exports.
role_counts = role_counts.astype(float)
role_counts.index.name = 'CaseID'
role_counts.columns.name = None

# One row per case: CaseID, Primary, Secondary, Concamitant, Interacting.
drugs = role_counts.reset_index()

end_time = time.time()

# Elapsed time in minutes, then displayed in hours.
total_time = (end_time - start_time) / 60
total_time / 60

In [None]:
# CaseID comes out of the tally as float; store it as an integer instead.
drugs['CaseID'] = drugs['CaseID'].astype(int)

# Write the per-case tally to disk (row index omitted) for later manipulation.
drugs.to_csv('drugs.csv', index=False)

# Mapping drugnames into numerical classes

In [None]:
# Empty frame that will hold, per row, the name being classified plus its class id, class label and indication.
class_df = pd.DataFrame(columns=['drugname', 'class_id', 'class', 'indication'])


In [None]:
# Re-cast prod_ai to str, strip every '.' from it, and keep the entries that
# are not the 'nan' placeholder for class mapping.
# NOTE: class_df.drugname is populated from prod_ai here, not from
# drug_file.drugname.
drug_file['prod_ai'] = drug_file['prod_ai'].astype(str).str.replace('.', '', regex=False)

has_ingredient = drug_file['prod_ai'] != 'nan'
indicies = drug_file.index[has_ingredient]
present = drug_file.loc[indicies, 'prod_ai']
class_df['drugname'] = present

In [None]:
# Row count of the full drug table.
len(drug_file)

In [None]:
# Inspect the rows whose drugname is exactly 'EMTRIVA'.
drug_file[drug_file.drugname=='EMTRIVA']

In [None]:

# Map each product active ingredient to a drug class using name prefixes and
# suffixes.
#
# CLASS_RULES is scanned top to bottom and the FIRST matching rule wins, so the
# order below is significant (for example 'ETINE' must be tested before
# 'TINE', and 'IDE' shadows later rules for names ending in 'IDE').
#
# Each rule is (suffixes, prefixes, class_id, class, indication). A name
# matches when it ends with any of the suffixes or starts with any of the
# prefixes; an empty tuple never matches. Combination products contain a
# literal backslash, so those names are written as raw strings rather than
# relying on Python leaving unknown escapes such as '\T' untouched.

LABEL_COLUMNS = ['class_id', 'class', 'indication']

CLASS_RULES = [
    (('MAB',), (), 1, 'monoclonal_antibody', 'autoimmune diseases'),
    (('PRIL',), (), 2, 'ACE_inhibitor', 'hypertenstion'),
    (('TIDINE', 'ZANTAC'), (), 3, '2nd_gen_antihistamine', 'allergy'),
    # Class 4 has one extra positional test, see classify_ingredient().
    (('STATIN',), (), 4, 'HMG-CoA reductase inhibitor', 'hyperlipidemia'),
    (('AZEPAM', 'ZOLAM'), (), 5, 'benzodiazepine', 'anxiety'),
    (('AFIL',), (), 6, 'phosphodiesterase inhibitor', 'erectile dysfunction, hypertension'),
    (('ANE',), (), 7, 'inhaled anestetics', 'anesthesia'),
    (('ARTAN',), (), 8, 'angiotension receptor blocker', 'hypertension'),
    (('AZINE',), (), 9, 'phenothiazines', 'antipsychotic'),
    (('AZOLE',), (), 10, 'azole-antifungal', 'antifungal'),
    (('BARBITAL',), (), 11, 'barbituates', 'anxiety'),
    (('CAINE',), (), 12, 'local anesthetics', 'anesthesia'),
    (('CILLIN',), (), 13, 'penecillin antibiotics', 'antibiotic'),
    (('CYCLINE',), (), 14, 'tetracyclines', 'antibiotic'),
    (('ETINE',), (), 15, 'selective serotonia reuptake inhibitors', 'depression'),
    (('FEB', 'FENE'), (), 16, 'selective estrogen response modifiers', 'osteoprosis, cancer treatment'),
    (('FLOXACIN',), (), 17, 'fluoroquinolones', 'antibiotics'),
    (('FUNGIN',), (), 18, 'echinocandins', 'antifungal'),
    (('GRASTIM', 'GRAMOSTIM'), (), 19, 'granulocyte colony stimulating factors', 'blood dyscrasias'),
    (('IDE',), (), 20, 'loop diuretics', 'hypertension'),
    (('IPINE',), (), 21, 'dihydropyridine calcium channel blockers', 'hypertension'),
    (('IPRAMINE',), (), 22, 'tricyclic antidepressants', 'depression'),
    (('IUM', 'URONIUM'), (), 23, 'nondepolarizing paralytics', 'anesthesia'),
    (('LUKAST',), (), 24, 'LTD receptor antagonist', 'asthma'),
    (('NAVIR',), (), 25, 'protease inhibitor', 'antiviral'),
    (('OLOL',), (), 26, 'beta blocker', 'hypertension'),
    (('OXIN',), (), 27, 'cardiac glycoside', 'arrhythmias'),
    # Was 'PYHLLINE', a transposition that no ingredient name can end with.
    (('PHYLLINE',), (), 28, 'methlxanthine', 'bronchodilator'),
    (('QUINE',), (), 29, 'quinolone derivatives', 'antimalarial'),
    (('TECAN',), (), 30, 'topoisomerase-1 inhibitor', 'chemotherapy'),
    (('TEROL',), (), 31, 'Beta-2 agonist', 'bronchodilator'),
    (('TINE',), (), 32, 'allylamine antifungals', 'antifungal'),
    (('TOPOSIDE',), (), 33, 'topoisomerase-2 inhibitor', 'chemotherapy'),
    (('TRIPTAN',), (), 34, '5-HT1B/1D agonist', 'migraines'),
    (('TROPIN',), (), 35, 'pituitary hormone', 'hormone deficiency'),
    (('VAPTAN',), (), 36, 'vasopressin receptor antagonist', 'hypertension'),
    (('ZOSIN',), (), 37, 'alpha-1 antagonist', 'hypertension, BPH'),
    ((), ('PREDNISONE', 'PREDNISOLONE'), 38, 'corticosteroid', 'immunosupressant'),
    ((), ('METHOTREXATE',), 39, 'antimetabolites', 'cancer treatment'),
    ((), ('XARELTO', 'WARFARIN', 'RIVAROXABAN'), 40, 'anticoagulant', 'blood clots'),
    ((), ('INFLECTRA', 'INFLIXIMAB-DYYB'), 41, 'TNF blocking agent', 'autoimmune diseases'),
    ((), ('ENBREL', 'ETANERCEPT'), 42, 'TNF inhibitor', 'autoimmune diseases'),
    ((), ('DEXAMETHASONE',), 43, 'glucocorticoid', 'immunosupressant'),
    ((), ('AVONEX', 'INTERFERON BETA-1A'), 44, 'interferon', 'multiple sclerosis'),
    ((), ('GABAPENTIN', 'LYRICA', 'PREGABALIN'), 45, 'GABA analogue', 'anticonvulsant, fibromyalgia, nerve pain'),
    ((), ('AMLODIPINE',), 46, 'calcium channel blocker', 'hypertension, chest pain'),
    (('TINIB',), ('XELJANZ', 'TOFACITINIB CITRATE'), 47, 'tyrosine kinase inhibitors', 'autoimmune diseases'),
    (('ANIB',), (), 48, 'angiogenesis inhibitor, tyrosine kinase inhibitor', 'cancer treatment'),
    (('RAFENIB',), (), 49, 'rapidly accelerated fibrosarcoma kinase inhibitor', 'cancer treatment'),
    ((), ('ORENCIA', 'ABATACEPT'), 50, 'immunomodulator', 'autoimmune diseases'),
    ((), ('TRUVADA', 'DESCOVY', r'EMTRICITABINE\TENOFOVIR DISOPROXIL FUMARATE'), 51, 'reverse transcriptase inhibitor', 'antiviral'),
    ((), ('ACETAMINOPHEN', 'TYLENOL'), 52, 'analgesic', 'fever reducer'),
    ((), ('FOLIC ACID',), 53, 'vitamin', 'anemia, prenatal'),
    ((), ('OTEZLA', 'APREMILAST'), 54, 'phosphodiesterase', 'autoimmune diseases'),
    ((), ('ASPIRIN',), 55, 'nonsteroidal anti-inflammatory drug, blood thinners', 'fever reducer, inflammation'),
    ((), ('TECFIDERA', 'DIMETHYL FUMARATE'), 56, 'dimethyl fumarate, fumaric acid ester', 'multiple sclerosis'),
    ((), ('VIREAD', 'TENOFOVIR DISOPROXIL FUMARATE'), 57, 'reverse transcriptase inhibitor', 'antiviral'),
    ((), ('EMTRIVA', 'EMTRICITABINE'), 58, 'nucleotide reverse transcriptase inhibitor', 'antiviral'),
    ((), ('REVLIMID', 'LENALIDOMIDE'), 59, 'immunomodulator', 'cancer treatment'),
    ((), ('HUMIRA',), 60, 'TNF blocking agent', 'plaque psoriasis'),
    ((), ('COSENTYX',), 61, 'monoclonal antibody, interleukin-17A blocker', 'plaque psoriasis, psoratic arthritis'),
    ((), ('METFORMIN',), 62, 'biguanides', 'diabetic management'),
    ((), ('NEULASTA',), 63, 'granulocyte colony stimulating factor', 'febrile neutropenia'),
    (('DUPIXENT',), ('DUPIXENT',), 64, 'monoclonal antibody, IL-4 and IL-13 blocker', 'atopic dermatitis, add-on asthma treatment'),
    ((), ('XOLAIR',), 65, 'monoclonal antibody, anti-IgE antibody', 'asthma treatment, chronic idiopathic urticaria'),
    ((), ('ACTEMRA',), 66, 'monoclonal antibody, IL-6 receptor antagonist', 'rheumatoid arthritis, giant cell arteritis, cytokine release syndrome'),
    ((), ('OXYCONTIN', 'OXYCODONE'), 67, 'opioid agonist', 'pain management'),
    ((), ('ELIQUIS', 'APIXABAN'), 68, 'factor Xa inhibitor anticoagulant', 'nonvalvular atrial fibrilation'),
    ((), ('STELARA',), 69, 'monoclonal antibody, IL-12 and IL-23 antagonist', 'plaque psoriasis, psoriatic arthritis, chrons disease'),
    ((), ('SYNTHROID', 'LEVOTHYROXINE'), 70, 'hormone', 'hyperthyroidism, pituitary tsh suppression'),
    ((), ('IBUPROFEN',), 71, 'analgesic, non-steriodal anti-inflammatory drug', 'pain management'),
    ((), ('TRAMADOL',), 72, 'opiate analgesic/agonist', 'pain management'),
    ((), ('IBRANCE', 'PALBOCICLIB'), 73, 'kinase inhibitor', 'cancer treatment'),
    ((), ('VITAMIN', 'VITAMINS', 'CHOLECALCIFEROL', 'CYANOCOBALAMIN', 'ASCORBIC ACID'), 74, 'vitamin', 'dietary supplement'),
    (('METHYLPREDNISOLONE',), ('METHYLPREDNISOLONE',), 75, 'glucocorticoid', 'endocrine disorders, rheumatic disorders'),
    ((), ('TRULICITY',), 76, 'glp-1 receptor agonist', 'glycemic management'),
    ((), ('ROXANOL', 'MORPHINE SULFATE'), 77, 'opioid analgesic', 'pain management'),
    (('INFLIXIMAB',), ('REMICADE',), 78, 'TNF blocking agent', 'autoimmune diseases'),
    ((), ('PROAIR HFA', 'ALBUTEROL SULFATE'), 79, 'beta-2 adrenergic agonist', 'asthma'),
    ((), ('PLAQUENIL', 'HYDROXYCHLOROQUINE SULFATE'), 80, 'antimalarial', 'malaria, autoimmune diseases'),
    ((), ('PROGRAF', 'TACROLIMUS'), 81, 'immunosuppressant', 'prophylaxis of organ rejection'),
    ((), ('LANTUS', 'INSULIN GLARGINE'), 82, 'human insulin analog', 'glycemic management, T1 diabetes, T2 diabetes'),
    ((), ('REMODULIN', 'TREPROSTINIL'), 83, 'prostacyclin vasodialator', 'pulmonary arterial hypertension, transition from Flolan'),
    ((), ('SINEMET', r'CARBIDOPA\LEVODOPA'), 84, 'decarboxylase inhibitor, CNS agent', 'parkinsons disease'),
    ((), ('DILANTIN', 'PHENYTOIN'), 85, 'anticonvulsants', 'epilepsy'),
    ((), ('ZITHROMAX', 'AZITHROMYCIN'), 86, 'antibacterial', 'bacterial infection'),
    ((), ('BACTRIM', r'SULFAMETHOXAZOLE\TRIMETHOPRIM'), 87, 'antimicrobial, antibacterial', 'bacterial infection'),
    ((), ('IMIQUIMOD', 'ALDARA'), 88, 'immune response modifier', 'actinic keratosis, genital warts'),
    ((), ('ZYLOPRIM', 'ALOPRIM', 'ALLOPURINOL'), 89, 'xanthine oxidase inhibitor', 'gout prevention'),
    ((), ('HUMAN IMMUNOGLOBULIN G',), 90, 'immune system supplement', 'immunodeficiency, Kawasaki syndrome, GvH disease'),
    ((), ('XALATAN', 'LATANOPROST'), 91, 'prostanoid selective FP receptor agonist', 'open-angle glaucoma, ocular hypertension'),
    ((), ('HUMALOG', 'INSULIN LISPRO'), 92, 'human insulin analog', 'glycemic management'),
    ((), ('ACYCLOVIR', 'ZOVIRAX'), 93, 'synthetic nucleoside analogue', 'herpes'),
    ((), ('PLAVIX', 'CLOPIDOGREL'), 94, 'P2Y-12 platelet inhibitor', 'myocardial infarction, stroke, extablished peripheral arterial disease'),
    ((), ('SUBLIMAZE', 'FENTANYL'), 95, 'opiod agonist, narcotic analgesic', 'anesthetic'),
    ((), ('ZOFRAN', 'ONDANSETRON'), 96, '5-HT receptor antagonist', 'nausea prevention'),
    ((), ('ATRIPLA', r'EFAVIRENZ\EMTRICITABINE\TENOFOVIR DISOPROXIL FUMARATE'), 97, 'reverse transcriptase inhibitor', 'antiviral'),
    ((), ('CLOZARIL', 'CLOZAPINE'), 98, 'antipsychotic', 'schizophrenia'),
    ((), ('PACLITAXEL', 'TAXOL'), 99, 'antimicrotubule agent', 'cancer treatment'),
    ((), ('UPTRAVI', 'SELEXIPAG'), 100, 'prostacyclin receptor agonist', 'pulmonary arterial hypertension'),
    ((), ('XYREM', 'SODIUM OXYBATE'), 101, 'CNS depressant', 'cataplexy, excessive daytime sleepiness'),
    ((), ('TESTOSTERONE CYPIONATE', 'DEPO-TESTOSTERONE'), 102, 'androgen', 'hypogonadism, constitutional delay of growth and puberty, cancer treatment'),
    ((), ('MYCOPHENOLATE MOFETIL', 'CELLCEPT'), 103, 'antimetabolite immunosuppressant', 'prophylaxis of organ rejection'),
    ((), ('OCTREOTIDE ACETATE', 'SANDOSTATIN'), 104, 'somatostatin analogue', 'acromegaly, diarrhea'),
    ((), ('ESCITALOPRAM OXALATE', 'LEXAPRO'), 105, 'selective serotonia reuptake inhibitor', 'antidepressant'),
    ((), ('LEVETIRACETAM', 'KEPPRA'), 106, 'anticonvulsants', 'seizures'),
    ((), ('CERTOLIZUMAB PEGOL', 'CIMZIA'), 107, 'TNF blocker', 'rheumatoid arthritis, plaque psoriasis, psoriatic arthritis, chrons disease'),
    ((), ('CARBOPLATIN', 'PARAPLATIN'), 108, 'alkylating agent', 'cancer treatment'),
    ((), ('CELECOXIB', 'CELEBREX'), 109, 'non-steroidal anti-inflammatory drug', 'pain management, osteoarthritis, rheumatoid arthritis, dysmenorrhea'),
    ((), ('CYCLOSPORINE', 'SANDIMMUNE'), 110, 'nonribosomal peptide', 'prophylaxis of organ rejection'),
    ((), ('SPIRONOLACTONE', 'ALDACTONE'), 111, 'aldonsterone antagonist', 'heart failure, edema management, hypertension'),
    ((), ('MACITENTAN', 'OPSUMIT'), 112, 'endothelin receptor antagonist', 'pulmonary aterial hypertension'),
    ((), ('DIPHENHYDRAMINE', 'BENADRYL'), 113, 'antihistamines', 'allergy'),
    ((), ('AZATHIOPRINE', 'IMURAN'), 114, 'immunosuppressant, antimetabolite', 'prevention of renal homotransplantation rejection, rheumatoid arthritis'),
    ((), ('RISPERIDONE', 'RISPERDAL'), 115, 'antipsychotic', 'schizophrenia, bipolar disorder, autism'),
    ((), ('VENETOCLAX', 'VENCLEXTA'), 116, 'BCL-2 inhibitor', 'cancer treatment'),
    ((), ('DOCETAXEL', 'TAXOTERE'), 117, 'microtubule inhibitor', 'cancer treatment'),
    ((), ('PIMAVANSERIN TARTRATE', 'NUPLAZID'), 118, 'antipsychotic', 'parkinsons disease'),
    ((), (r'FLUTICASONE PROPIONATE\SALMETEROL XINAFOATE', 'ADVAIR DISKUS'), 119, 'corticosteroid, long-acting beta agonist', 'asthma'),
    ((), ('CARVEDILOL', 'COREG'), 120, 'beta blocker', 'heart failure, hypertension, myocaridal infarction'),
    ((), ('NAPROXEN', 'NAPROSYN'), 121, 'non-steroidal anti-inflammatory drug', 'pain management, rheumatoid arthritis, osteoarthritis, tendonitis, dysmenorrhea'),
    ((), ('ZOLPIDEM TARTRATE', 'AMBIEN'), 122, 'sedative-hypnotics', 'insomnia'),
    ((), ('LORATADINE', 'CLARITIN'), 123, 'antihistamine', 'allergy'),
    ((), ('FLUTICASONE PROPIONATE', 'FLONASE'), 124, 'corticosteroid', 'allergy'),
    ((), (r'BUDESONIDE\FORMOTEROL FUMARATE DIHYDRATE', 'SYMBICORT'), 125, 'corticosteroid, long-acting beta agonist', 'chronic obstructive pulmonary disease'),
    ((), ('LAMOTRIGINE',), 126, 'phenyltriazine', 'epilepsy, bipolar disorder'),
    ((), ('LEUPROLIDE ACETATE', 'LUPRON DEPOT'), 127, 'gonadotropin-releasing hormone agonist', 'cancer treatment'),
    ((), ('QUETIAPINE',), 128, 'antipsychotic', 'schizophrenia, major depressive disorder'),
    ((), ('BORTEZOMIB', 'VELCADE'), 129, 'antineoplastic agent, proteasome inhibitor', 'cancer treatment'),
    ((), ('MIRTAZAPINE', 'REMERON'), 130, 'antidepressant', 'major depressive disorder, post-traumatic stress disorder'),
]

# Labels for a name that no rule matches: the three cells stay missing.
UNCLASSIFIED = (np.nan, np.nan, np.nan)

# class_id of the rule that carries the extra positional 'STATIN' test.
STATIN_CLASS_ID = 4


def classify_ingredient(name):
    """Return (class_id, class, indication) for the first rule matching *name*.

    Rules are tried in CLASS_RULES order. UNCLASSIFIED is returned when no
    rule matches.
    """
    for suffixes, prefixes, class_id, class_name, indication in CLASS_RULES:
        matched = name.endswith(suffixes) or name.startswith(prefixes)
        if not matched and class_id == STATIN_CLASS_ID:
            # Kept from the original chain: also accept 'STATIN' ending at
            # character 12 of a longer name, e.g. 'ATORVASTATIN CALCIUM'.
            matched = name.endswith('STATIN', 0, 12)
        if matched:
            return class_id, class_name, indication
    return UNCLASSIFIED


# Classify each distinct name once, then broadcast the result to every row.
# The original performed three scalar .loc writes per row, which is far slower
# on a table of this size.
labels_by_name = {name: classify_ingredient(name) for name in present.unique()}
row_labels = [labels_by_name[name] for name in present]

for position, column in enumerate(LABEL_COLUMNS):
    # dtype=object keeps class ids as plain ints alongside the NaN placeholders.
    class_df.loc[indicies, column] = pd.Series(
        [labels[position] for labels in row_labels],
        index=indicies,
        dtype=object,
    )
        

In [None]:
# Number of rows assigned to each class_id (value_counts leaves missing values out by default).
class_df.class_id.value_counts()

In [None]:
# Look up the rows stored under the name 'EMTRICITABINE'.
# NOTE(review): class_df.drugname was filled from prod_ai, so this compares against active-ingredient strings.
class_df[class_df.drugname=='EMTRICITABINE']

In [None]:
# Rows that received no class. Unmapped entries are real missing values (NaN)
# until the column is cast to str, after which they read 'nan'; test for both
# so the cell gives the same answer whether or not that cast has run yet.
class_df[class_df['class'].isna() | (class_df['class'] == 'nan')]

In [None]:
# Look for rows stored under the brand name 'EMTRIVA'.
# NOTE(review): class_df.drugname was filled from prod_ai, so a brand name may never appear here — confirm.
class_df[class_df['drugname']=='EMTRIVA']

In [None]:
# Stringify the class column so unmapped rows read 'nan', then keep those rows.
class_df['class'] = class_df['class'].astype(str)

unmapped = class_df['class'] == 'nan'
ind = class_df.loc[unmapped]

In [None]:
# TODO: some of the names reappear here even though they were mapped previously. Why?
# NOTE(review): class_df.drugname holds prod_ai strings and the mapping only tests
# how a string starts or ends, so a string that merely contains a mapped name
# would fall through — presumably the cause; confirm against the data.
# Shows the 30 most frequent unmapped names.

ind['drugname'].value_counts()[:30]

In [None]:
# Per-row subset: the caseid, whether the reaction stopped when drug therapy
# was stopped (dechal), and whether it began again after drug therapy was
# restarted (rechal).
challenge_columns = ['caseid', 'dechal', 'rechal']
subside_relapse = drug_file.loc[:, challenge_columns]

In [None]:
# Distribution of dechal values. In the recorded run most observations were
# 'Unknown', followed by 'Does Not Apply'; 866404 values were null (nulls are
# not included in value_counts output).


subside_relapse.dechal.value_counts()

In [None]:
# Number of rows with no dechal value.
subside_relapse.dechal.isnull().sum()

In [None]:
# Distribution of rechal values. In the recorded run most observations were
# 'Unknown', followed at a distance by 'Negative' and 'Does Not Apply';
# 1600785 values were null (nulls are not included in value_counts output).


subside_relapse.rechal.value_counts()

In [None]:
# Number of rows with no rechal value.
subside_relapse.rechal.isnull().sum()

# Examining Primary and Secondary suspect frequency

In [None]:
# Count how many drug rows carry each role_cod value.

new_drug_df.role_cod.value_counts()

In [None]:
# Split out the primary-suspect ('PS') and secondary-suspect ('SS') rows for
# further investigation.
is_primary = new_drug_df['role_cod'] == 'PS'
is_secondary = new_drug_df['role_cod'] == 'SS'

primary = new_drug_df[is_primary]
secondary = new_drug_df[is_secondary]

In [None]:
# Display the primary-suspect rows.
primary

In [None]:
# Narrow each frame to the features used to examine trends within the primary
# and secondary suspect drugs.
primary_columns = ['caseid', 'drugname', 'prod_ai', 'val_vbm', 'dose_vbm', 'cum_dose_chr']
secondary_columns = ['caseid', 'drugname', 'dose_vbm']

primary = primary[primary_columns]
secondary = secondary[secondary_columns]

In [None]:
# Renumber both frames from 0, discarding the row labels inherited from the
# full table.
primary, secondary = (
    frame.reset_index(drop=True) for frame in (primary, secondary)
)

In [None]:
# ZANTAC and its generic formulation, RANITIDINE, take the top two spots on
# the primary-suspect list. They may interact strongly with other medications,
# or, given a patient's medication history, ZANTAC (RANITIDINE) may be likely
# to interact with that patient's other medications.
# NOTE(review): report frequency alone does not establish either explanation — treat as a hypothesis.

primary.drugname.value_counts()

In [None]:
# ZANTAC is also the most frequent secondary-suspect medication, which
# suggests that taking ZANTAC is associated with a higher likelihood of
# experiencing an SAE.
# NOTE(review): this is an association in report counts, not evidence of causation — confirm before relying on it.

secondary.drugname.value_counts()

In [None]:
# Show each primary-suspect drug next to its dose_vbm and cum_dose_chr values.
primary[['drugname', 'dose_vbm', 'cum_dose_chr']]