# __Clean Price Data__

In [1]:
# Import price data here
import pandas as pd
import dill
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
prices = pd.read_csv('data/NADAC_Current_Data.csv')
# prices.head()

### __Datetime conversions__
* Reduce space & allows for calculations
* Takes a long time.  Determine faster way to get this done.

In [4]:
# Convert Effective_Date column to datetime format (takes about 32 minutes 25 seconds)
prices['effective_date'] = pd.to_datetime(prices['effective_date'], infer_datetime_format = True) #, format='%Y/%m/%d')
prices['as_of_date'] = pd.to_datetime(prices['as_of_date'], infer_datetime_format = True) #, format='%Y/%m/%d')

Wall time: 32min 25s


In [11]:
# Check the spread of the 'effective_date' column data
prices['effective_date'].dt.year.value_counts()

2018    1323204
2017    1273904
2016    1257914
2015    1222646
2014    1199249
2019     856327
2013     214199
Name: effective_date, dtype: int64

In [16]:
# Check the spread of the 'as_of_date' column data
prices['as_of_date'].dt.year.value_counts()

2018    1296863
2017    1281151
2016    1250949
2015    1222702
2014    1213307
2019     972080
2013     110391
Name: as_of_date, dtype: int64

### __Cleaning Up Drug Names__
The drug names in both datasets are not standardized, and as we'll be matching observations up based on the drug names, it's imperative that they be cleaned well.

In [43]:
#Checking for time improvement here over next cell below
import re
replacements = {r'\sCAP*?\Z|\sCP*?\Z' : ' CAPSULE',
                r'\sTAB\sCHW\s*?\Z|\sTAB\sCHEW\s*?\Z': ' CHEWABLE TABLET', 
                r'\sTAB\Z|\sTAB\s|\sTB' : ' TABLET', 
                r'\sSYR*?\Z' : ' SYRINGE', 
                r'\sCRM*?\Z' : ' CREAM', 
                r'\sSL*?\Z' : ' SUB-LINGUAL', 
                r'\sFOAM*?\Z' : ' FOAM', 
                r'\sAUTO\-INJ*?\Z' : ' INJECTION', 
                r'\sEFF*?\Z' : ' EFFERVESCENT', 
                r'\sSOLN*?\Z' : ' SOLUTION', 
                r'\sINH*?\Z' : ' INHALATION', 
                r'\sHCL\s*?\Z' : ' HYDROCHLORIDE',
                r'\sCPLT*?\Z' : ' CAPLET',
                r'\sGASTR\s*?\Z' : ' GASTRIC',
                r'\sOSM\s*?\Z' : ' OSMOTIC',
                r'\sLIQ*?\Z' : ' LIQUID',
                '\s\*\*.*\*\*\s' : '',
                ' MG' : 'MG',
                ' ML' : 'ML',
                ' MCG' : 'MCG',
                ' +': ' '
               }

for pat, repl in replacements.items():
    pat = re.compile(pat)
    prices['ndc_description'] = prices['ndc_description'].str.replace(pat, repl)

In [None]:
# Visualize frequency of nulls in price data
sns.heatmap(Cleaned_Price_Data.isnull(), cbar = False)

In [45]:
# Export to file
prices.to_csv('data/Cleaned_Price_Data.csv')