# What are the various drugs prescribed for?

We need some kind of knowledge of which drug is prescribed for which condition. Thankfully, CMS seems to have this information.

In [1]:
import csv # quoting constants for parsing the pipe-delimited RxNorm files
import os # rename file to something more type-able
import shutil # to write the dataset to file
import zipfile # to extract from archive

import numpy as np
import requests # to download the dataset

data_dir = "../data/"

# Create the data directory if it is missing; exist_ok keeps the cell re-runnable.
os.makedirs(data_dir, exist_ok=True)

url = "https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/BSAPUFS/Downloads/2010_PD_Profiles_PUF.zip"
archive_path = data_dir + 'drug_profiles_dataset.zip'

# Stream the archive to disk. raise_for_status() stops here on an HTTP error
# instead of saving an error page that would only fail later as a corrupt zip.
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open(archive_path, 'wb') as ds_zipout:
        shutil.copyfileobj(response.raw, ds_zipout)

# Extract the first member of the archive. The archive is opened in a ``with``
# block so its handle is closed, and it is not bound to the name ``zip`` so the
# builtin of that name stays usable in the rest of the notebook.
with zipfile.ZipFile(archive_path, 'r') as archive:
    ds_filename = archive.namelist()[0]
    extracted_path = archive.extract(ds_filename, path=data_dir)

# Show the path of the extracted file as the cell output.
extracted_path

'../data/2010_PD_Profiles_PUF.csv'

Let's load the data and have a look at what's in it:

In [2]:
import pandas as pd

In [3]:
puf = pd.read_csv("../data/2010_PD_Profiles_PUF.csv")

Let's drop the columns we don't need right now:

In [4]:
# Remove, in one in-place call, every column that is not needed for matching
# drugs to their classes.
unused_columns = [
    "BENE_SEX_IDENT_CD", "BENE_AGE_CAT_CD", "PDE_DRUG_TYPE_CD", "PLAN_TYPE",
    "COVERAGE_TYPE", "benefit_phase", "DRUG_BENEFIT_TYPE", "PRESCRIBER_TYPE",
    "GAP_COVERAGE", "TIER_ID", "MEAN_RXHCC_SCORE", "AVE_DAYS_SUPPLY",
    "AVE_TOT_DRUG_COST", "AVE_PTNT_PAY_AMT", "PDE_CNT", "BENE_CNT_CAT",
]
puf.drop(columns=unused_columns, inplace=True)

In [5]:
len(puf["RXNORM_RXCUI"].unique())

1229

## RXNorm Identifiers

I need the identifiers for the `RXNORM_RXCUI` column and the `DRUG_MAJOR_CLASS` column.

The former comes from the NIH in the form of a downloadable data table:

In [6]:
data_dir = '../data/'
url = "https://download.nlm.nih.gov/rxnorm/RxNorm_full_prescribe_01032017.zip"
archive_path = data_dir + 'rxnorm_dataset.zip'
rxnorm_dir = data_dir + "rxnorm/"

# Stream the archive to disk; fail immediately on an HTTP error rather than
# writing the error response into the .zip file.
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open(archive_path, 'wb') as ds_zipout:
        shutil.copyfileobj(response.raw, ds_zipout)

# Create the extraction directory if needed (safe to re-run).
os.makedirs(rxnorm_dir, exist_ok=True)

# Extract every member. The handle is closed by the ``with`` block and is not
# named ``zip``, so the builtin of that name is not shadowed.
with zipfile.ZipFile(archive_path, 'r') as archive:
    ds_filenames = archive.namelist()
    archive.extractall(path=rxnorm_dir)


Columns:

* RXCUI	RxNorm Unique identifier for concept (concept ID)
* LAT	Language of Term
* TS	Term status (no value provided)
* LUI	Unique identifier for term (no value provided)
* STT	String type (no value provided)
* SUI	Unique identifier for string (no value provided)
* ISPREF	Atom status - preferred (Y) or not (N) for this string within this concept (no value provided)
* RXAUI	Unique identifier for atom (RxNorm Atom ID)
* SAUI	Source asserted atom identifier [optional]
* SCUI	Source asserted concept identifier [optional]
* SDUI	Source asserted descriptor identifier [optional]
* SAB	Source abbreviation
* TTY	Term type in source
* CODE	"Most useful" source asserted identifier (if the source vocabulary has more than one identifier), or a RxNorm-generated source entry identifier (if the source vocabulary has none.)
* STR	String
* SRL	Source Restriction Level (no value provided)
* SUPPRESS	Suppressible flag. Values = N, O, Y, or E. N - not suppressible. O - Specific individual names (atoms) set as Obsolete because the name is no longer provided by the original source. Y - Suppressed by RxNorm editor. E - unquantified, non-prescribable drug with related quantified, prescribable drugs. NLM strongly recommends that users not alter editor-assigned suppressibility.
* CVF

In [7]:
# Column names of RXNCONSO.RRF in file order.
names = ["RXCUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "RXAUI",
         "SAUI", "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS", "CVF"]

# Only the concept identifier (column 0, RXCUI) and the name string (column 14, STR)
# are loaded. index_col=False stops pandas from using the first column as the index.
# RRF fields are separated by "|" and are not quoted, so quote handling is switched
# off: with the default, a name that begins with a double quote would be read as a
# quoted field and could swallow the delimiters that follow it.
rxnorm = pd.read_csv("../data/rxnorm/rrf/RXNCONSO.RRF", sep="|", names=names, index_col=False,
                     usecols=[0, 14], quoting=csv.QUOTE_NONE)

In [8]:
rxnorm.head()

Unnamed: 0,RXCUI,STR
0,38,Parlodel
1,44,mesna
2,44,MESNA
3,44,Mesna
4,73,Docosahexaenoate


In [9]:
rxnorm[rxnorm["STR"] == "GATIFLOXACIN"]

Unnamed: 0,RXCUI,STR
30703,228476,GATIFLOXACIN


We need all the names to be lowercase:

In [10]:
# Lowercase every RxNorm name string.
rxnorm = rxnorm.assign(STR=rxnorm["STR"].str.lower())

We are going to load the generic and brand names from the Part D table so we can match them to RXCUI identifiers:

In [11]:
import feather
# Load the table of brand and generic drug names.
# NOTE(review): this file is not created anywhere in this notebook -- presumably an
# earlier Part D notebook writes it; confirm it exists before running.
drugnames = feather.read_dataframe('../data/drugnames.feather')


In [12]:
drugnames.head()

Unnamed: 0,drugname_brand,drugname_generic
0,10 WASH,SULFACETAMIDE SODIUM
1,1ST TIER UNIFINE PENTIPS,"PEN NEEDLE, DIABETIC"
2,1ST TIER UNIFINE PENTIPS PLUS,"PEN NEEDLE, DIABETIC"
3,60PSE-400GFN-20DM,GUAIFENESIN/DM/PSEUDOEPHEDRINE
4,8-MOP,METHOXSALEN


### Messing with the Drug Names

Some of the drug names have multiple generic names. We need to pull those apart into their own categories. Also, we're going to transform them all to lowercase letters. 

In [13]:
# Lowercase both name columns.
for name_column in ("drugname_generic", "drugname_brand"):
    drugnames[name_column] = drugnames[name_column].str.lower()

In [14]:
drugnames.head()

Unnamed: 0,drugname_brand,drugname_generic
0,10 wash,sulfacetamide sodium
1,1st tier unifine pentips,"pen needle, diabetic"
2,1st tier unifine pentips plus,"pen needle, diabetic"
3,60pse-400gfn-20dm,guaifenesin/dm/pseudoephedrine
4,8-mop,methoxsalen


We need a placeholder for the `RXCUI` values:

In [15]:
# Placeholder column; the matching cell below overwrites it for every row.
# NOTE(review): the placeholder is the string "0.0", while the matching cell stores the
# float 0.0 for unmatched drugs and the following cells compare against the float.
drugnames["RXCUI"] = "0.0"

Now we can try to associate the drug names with their `RXCUI` codes:

In [16]:
# Map every RxNorm name string to the unique RXCUIs that carry it (in order of
# appearance). Building this once replaces a scan of the whole RxNorm table for
# every single name part with a dictionary lookup.
rxcui_by_name = rxnorm.groupby("STR")["RXCUI"].unique().to_dict()

rxcui_column = []
name_pairs = drugnames[["drugname_generic", "drugname_brand"]].itertuples(index=False, name=None)

for generic_name, brand_name in name_pairs:

    rxcui = []

    # The generic name is searched first, then the brand name.
    for full_name in (generic_name, brand_name):

        # Combination products list their components separated by "/".
        for component in full_name.split("/"):
            # Only the first word of each component is matched against RxNorm.
            first_word = component.split(" ")[0]
            rxcui.extend(rxcui_by_name.get(first_word, ()))

    if rxcui:
        # Several matches are stored as one "|"-separated string.
        rxcui_column.append("|".join(str(code) for code in rxcui))
    else:
        # Float 0.0 marks "no match"; the cells below test for exactly this value.
        rxcui_column.append(0.0)

drugnames["RXCUI"] = rxcui_column



How many am I missing?

In [17]:
# Number of drugs that still carry the float 0.0 "no match" marker.
is_unmatched = drugnames["RXCUI"] == 0.0
int(is_unmatched.sum())

634

What fraction is that of all drug names?

In [18]:
# Share of all drug names that could not be matched to an RXCUI.
is_unmatched = drugnames["RXCUI"] == 0.0
int(is_unmatched.sum()) / len(drugnames)

0.14095153401511784

14% isn't so bad as a first turn-out.

In [19]:
# Preview the first unmatched rows.
unmatched_rows = drugnames[drugnames["RXCUI"].eq(0.0)]
unmatched_rows.head()

Unnamed: 0,drugname_brand,drugname_generic,RXCUI
1,1st tier unifine pentips,"pen needle, diabetic",0
2,1st tier unifine pentips plus,"pen needle, diabetic",0
22,accusure,"syring w-ndl,disp,insul,0.5 ml",0
23,accusure,"syringe and needle,insulin,1ml",0
26,acetaminoph-caff-dihydrocodein,dhcodeine bt/acetaminophn/caff,0


This seems to include things like prenatal vitamins, devices (like needles and syringes) and abbreviated names I can't automatically include. 
**I am going to ignore this for the moment, but we should fix this in the long run!**

## Associating Drug Classes

Drug classes come originally from the Veterans Health Administration [National Drug File](http://www.pbm.va.gov/PBM/nationalformulary/NDF_January_2016.xlsx). It turns out, they're also listed in the PUF SAS manual. Go figure:


In [20]:
data_dir = '../data/'
url = "https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/BSAPUFS/Downloads/2010_PD_Profiles_PUF_DUG.zip"
archive_path = data_dir + 'drug_classes_dataset.zip'

# Stream the archive to disk; fail immediately on an HTTP error rather than
# writing the error response into the .zip file.
with requests.get(url, stream=True) as response:
    response.raise_for_status()
    with open(archive_path, 'wb') as ds_zipout:
        shutil.copyfileobj(response.raw, ds_zipout)

# Extract every member into the data directory. The handle is closed by the
# ``with`` block and is not named ``zip``, so the builtin is not shadowed.
with zipfile.ZipFile(archive_path, 'r') as archive:
    ds_filenames = archive.namelist()
    archive.extractall(path=data_dir)

In [21]:
drug_major_class = pd.read_csv(data_dir+"DRUG_MAJOR_CLASS_TABLE.csv")

In [22]:
drug_major_class.head()

Unnamed: 0,drug_major_class,drug_major_class_desc
0,0,UNKNOWN/MISSING
1,AD000,"ANTIDOTES,DETERRENTS AND POISON CONTROL"
2,AH000,ANTIHISTAMINES
3,AM000,ANTIMICROBIALS
4,AN000,ANTINEOPLASTICS


In [23]:
drug_class = pd.read_csv(data_dir+"DRUG_CLASS_TABLE.csv")

In [24]:
drug_class.head()

Unnamed: 0,drug_class,drug_class_desc
0,0,UNKNOWN/MISSING
1,AD100,ALCOHOL DETERRENTS
2,AD200,CYANIDE ANTIDOTES
3,AD300,HEAVY METAL ANTAGONISTS
4,AD400,"ANTIDOTES,DETERRENTS,AND POISON CONTROL EXCHAN..."


In [25]:
# Put the literal text "N/A" wherever the class table holds a missing value.
drug_class = drug_class.replace(to_replace=np.nan, value="N/A")

Okay, cool, I think that's all the information I need!

## Adding Drug classes to the Drug information Table

We can now use the available information to add a column `drug_major_class` and `drug_class` to the `drugnames` table for use in identification and visualization.

To recap, I have four tables:

In [26]:
drugnames.head()

Unnamed: 0,drugname_brand,drugname_generic,RXCUI
0,10 wash,sulfacetamide sodium,10169
1,1st tier unifine pentips,"pen needle, diabetic",0
2,1st tier unifine pentips plus,"pen needle, diabetic",0
3,60pse-400gfn-20dm,guaifenesin/dm/pseudoephedrine,5032|8896
4,8-mop,methoxsalen,6854|227713


With the brand names, the generic names and the RxNorm identifier of the drug. 

In [27]:
puf.head()

Unnamed: 0,RXNORM_RXCUI,DRUG_MAJOR_CLASS,DRUG_CLASS
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


with the RxNorm values as well as major/minor classes of the drug. 

And finally, 

In [28]:
drug_major_class.head()

Unnamed: 0,drug_major_class,drug_major_class_desc
0,0,UNKNOWN/MISSING
1,AD000,"ANTIDOTES,DETERRENTS AND POISON CONTROL"
2,AH000,ANTIHISTAMINES
3,AM000,ANTIMICROBIALS
4,AN000,ANTINEOPLASTICS


and 

In [29]:
drug_class.head()

Unnamed: 0,drug_class,drug_class_desc
0,0,UNKNOWN/MISSING
1,AD100,ALCOHOL DETERRENTS
2,AD200,CYANIDE ANTIDOTES
3,AD300,HEAVY METAL ANTAGONISTS
4,AD400,"ANTIDOTES,DETERRENTS,AND POISON CONTROL EXCHAN..."


have the identifiers for the major and minor drug classes.

I'm going to add four columns to `drugnames`, one for each major and minor drug class, and one for the string representations of those classes, which will make the table bigger, but easier to use in the long term.

In [30]:
# Add empty string columns for the class codes and their text descriptions.
# NOTE(review): the fill loop further down writes the descriptions to "dmc_name" and
# "dc_name", so "dmc_string" and "dc_string" stay empty -- confirm which names are intended.
for new_column in ("drug_major_class", "dmc_string", "drug_class", "dc_string"):
    drugnames[new_column] = ""

It looks like there are a bunch of drug classes that don't match between the drug classes in the key and the drug classes in the table. Because of course there are. 
**For now, any drug class we don't have an explanation for will be considered missing!**

How many are those?

In [31]:
# Unmatched drugs hold the float 0.0 while matched ones hold strings; cast the whole
# column to str so every value can be split on "|" in the next cell.
drugnames["RXCUI"] = drugnames["RXCUI"].astype(str)

In [32]:
def _joined_descriptions(codes, key_table, code_column, desc_column):
    """Return the "|"-joined descriptions of ``codes`` looked up in ``key_table``.

    Codes that have no row in ``key_table`` contribute nothing to the result.
    ``codes`` must not be empty.
    """
    descriptions = [key_table.loc[key_table[code_column] == code, desc_column].values
                    for code in codes]
    return "|".join(np.hstack(descriptions))


# Unique class codes per RXCUI, computed once so that the loop below does not
# have to filter the whole PUF table for every single identifier.
major_by_rxcui = puf.groupby("RXNORM_RXCUI")["DRUG_MAJOR_CLASS"].unique().to_dict()
minor_by_rxcui = puf.groupby("RXNORM_RXCUI")["DRUG_CLASS"].unique().to_dict()

for idx in drugnames.index:

    # A drug can carry several "|"-separated RXCUIs; gather the classes of all of them.
    dmc, dc = [], []
    for rxcui in drugnames.loc[idx, "RXCUI"].split("|"):
        # RXCUI is stored as text ("0.0" marks an unmatched drug), so it is converted
        # to a number for the lookup. The builtin float replaces np.float, an alias
        # that was removed from NumPy and now raises AttributeError.
        rxcui_number = float(rxcui)
        dmc.extend(major_by_rxcui.get(rxcui_number, ()))
        dc.extend(minor_by_rxcui.get(rxcui_number, ()))

    # np.unique de-duplicates and sorts the collected codes.
    dmc = np.unique(dmc)
    dc = np.unique(dc)

    # NOTE(review): descriptions go to "dmc_name"/"dc_name"; the "dmc_string" and
    # "dc_string" columns created earlier are not written here -- confirm intent.
    if len(dmc) != 0:
        drugnames.loc[idx, "drug_major_class"] = "|".join(dmc)
        drugnames.loc[idx, "dmc_name"] = _joined_descriptions(
            dmc, drug_major_class, "drug_major_class", "drug_major_class_desc")
    else:
        # No class found for any RXCUI of this drug: use the "0" code.
        drugnames.loc[idx, "drug_major_class"] = "0"
        drugnames.loc[idx, "dmc_name"] = "0"

    if len(dc) != 0:
        drugnames.loc[idx, "drug_class"] = "|".join(dc)
        drugnames.loc[idx, "dc_name"] = _joined_descriptions(
            dc, drug_class, "drug_class", "drug_class_desc")
    else:
        drugnames.loc[idx, "drug_class"] = "0"
        drugnames.loc[idx, "dc_name"] = "0"


In [33]:
drugnames.head()

Unnamed: 0,drugname_brand,drugname_generic,RXCUI,drug_major_class,dmc_string,drug_class,dc_string,dmc_name,dc_name
0,10 wash,sulfacetamide sodium,10169,0|DE000|OP000,,0|DE101|OP210,,UNKNOWN/MISSING|DERMATOLOGICAL AGENTS|OPHTHALM...,"UNKNOWN/MISSING|ANTI-INFECTIVE,TOPICAL|ANTI-IN..."
1,1st tier unifine pentips,"pen needle, diabetic",0.0,0,,0,,UNKNOWN/MISSING,UNKNOWN/MISSING
2,1st tier unifine pentips plus,"pen needle, diabetic",0.0,0,,0,,UNKNOWN/MISSING,UNKNOWN/MISSING
3,60pse-400gfn-20dm,guaifenesin/dm/pseudoephedrine,5032|8896,RE000,,RE200|RE302,,RESPIRATORY TRACT MEDICATIONS,"DECONGESTANTS,SYSTEMIC|ANTITUSSIVES/EXPECTORANTS"
4,8-mop,methoxsalen,6854|227713,DE000,,DE810,,DERMATOLOGICAL AGENTS,ANTIPSORIATIC


In [34]:
# Serialize drug names to feather file for use in both Python and R
import feather
feather.write_dataframe(drugnames, data_dir + 'drugnames_withclasses.feather')