### __Pharma File Merger & Basic Cleaner__
This file aggregates and does a bit of cleaning on the data from the following files:
* products.txt
* patent.txt
* exclusivity.txt

Additionally, the national_average_drug_acquisition_cost.csv file is introduced later in the notebook and cleaned a bit as well.

This file is meant to be a shortcut to some of the processes embedded in the 'Pharma_Exploratory' notebook.

In [1]:
import pandas as pd
import re

In [2]:
# Load the source tables.
# The first three files belong together as a set and are '~'-delimited;
# only the products file is parsed with the Python engine.
drugs, patents, exclusivity = (
    pd.read_csv('products.txt', sep='~', engine='python'),
    pd.read_csv('patent.txt', sep='~'),
    pd.read_csv('exclusivity.txt', sep='~'),
)

# The acquisition-cost file is read with pandas' default (comma) separator.
prices = pd.read_csv('national_average_drug_acquisition_cost.csv')

In [4]:
# Left-join the patent rows, then the exclusivity rows, onto the product list.
# Both joins use the same (application number, product number) key, and 'left'
# keeps every product row even when no patent/exclusivity record matches.
merge_keys = ['Appl_No', 'Product_No']
all_data = (
    drugs
    .merge(patents, on=merge_keys, how='left')
    .merge(exclusivity, on=merge_keys, how='left')
)
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58567 entries, 0 to 58566
Data columns (total 25 columns):
Ingredient                 58567 non-null object
DF;Route                   58567 non-null object
Trade_Name                 58567 non-null object
Applicant                  58567 non-null object
Strength                   58500 non-null object
Appl_Type_x                58567 non-null object
Appl_No                    58567 non-null int64
Product_No                 58567 non-null int64
TE_Code                    20466 non-null object
Approval_Date              58567 non-null object
RLD                        58567 non-null object
RS                         58567 non-null object
Type                       58567 non-null object
Applicant_Full_Name        58567 non-null object
Appl_Type_y                23361 non-null object
Patent_No                  23361 non-null object
Patent_Expire_Date_Text    23361 non-null object
Drug_Substance_Flag        4875 non-null object
Drug_Product

In [5]:
# 'DF;Route' holds two values in one ';'-separated string.
# Split it once, then keep the first piece as the dosage form and the last piece as the route.
df_route_parts = all_data['DF;Route'].str.split(';')
all_data['dosage_form'] = df_route_parts.str[0]
all_data['route'] = df_route_parts.str[-1]

In [6]:
# Put the columns in a fixed order: the derived 'dosage_form' and 'route' columns sit
# right after 'Ingredient', and the original 'DF;Route' column is moved to the end.
cols = [
    'Ingredient', 'dosage_form', 'route', 'Trade_Name', 'Strength',
    'Applicant', 'Appl_Type_x', 'Appl_No', 'Product_No', 'TE_Code',
    'Approval_Date', 'RLD', 'RS', 'Type', 'Applicant_Full_Name',
    'Appl_Type_y', 'Patent_No', 'Patent_Expire_Date_Text',
    'Drug_Substance_Flag', 'Drug_Product_Flag', 'Patent_Use_Code',
    'Delist_Flag', 'Submission_Date', 'Appl_Type', 'Exclusivity_Code',
    'Exclusivity_Date', 'DF;Route',
]
all_data = all_data[cols]

# Preview the frame without 'DF;Route'. drop() returns a new frame that is only
# displayed here; all_data itself still contains the 'DF;Route' column afterwards.
all_data.drop(columns='DF;Route').head()

Unnamed: 0,Ingredient,dosage_form,route,Trade_Name,Strength,Applicant,Appl_Type_x,Appl_No,Product_No,TE_Code,...,Patent_No,Patent_Expire_Date_Text,Drug_Substance_Flag,Drug_Product_Flag,Patent_Use_Code,Delist_Flag,Submission_Date,Appl_Type,Exclusivity_Code,Exclusivity_Date
0,BUDESONIDE,"AEROSOL, FOAM",RECTAL,UCERIS,2MG/ACTUATION,VALEANT PHARMS INTL,N,205613,1,,...,,,,,,,,,,
1,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,PERRIGO UK FINCO,A,78337,1,AB,...,,,,,,,,,,
2,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,RICONPHARMA LLC,A,207144,1,AB,...,,,,,,,,,,
3,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,TARO PHARM,A,208204,1,AB,...,,,,,,,,,,
4,CLINDAMYCIN PHOSPHATE,"AEROSOL, FOAM",TOPICAL,CLINDAMYCIN PHOSPHATE,1%,PERRIGO UK FINCO,A,90785,1,AT,...,,,,,,,,,,


In [9]:
# Correct inconsistencies in 'NDC Description' (i.e. CAP = CAPSULE, etc.)
#
# Each entry is (regex, replacement) and the entries are applied in order.
# Notes on how the patterns are written:
#   * Alternatives use a non-capturing group, e.g. (?:CAP|CP). The patterns are not
#     compiled in verbose mode, so a space typed around '|' would be a literal space
#     and the alternative could never match at the end of the string.
#   * Abbreviations are spelled out in full. A lazy quantifier on the last letter
#     (e.g. 'CRM*?') makes that letter optional, so ' CR' would also turn into ' CREAM'.
#   * Chewable is matched as the whole token CHW or CHEW, not as a character class;
#     a class such as [\sCHEW] matches any mix of those letters (' EC', ' C', ...).
#   * TAB is matched at the end of the string or before whitespace. The lookahead
#     leaves the following whitespace in place so the next word is not glued on.
#   * Every other abbreviation is only expanded when it is the last token (\Z).
#   * ' FOAM' needs no rule: it is already the spelled-out form.
ABBREVIATION_FIXES = [
    (r'\s(?:CAP|CP)\Z', ' CAPSULE'),
    (r'\s(?:CHW|CHEW)\Z', ' CHEWABLE'),
    (r'\sTAB(?=\s|\Z)', ' TABLET'),
    (r'\sSYR\Z', ' SYRINGE'),
    (r'\sCRM\Z', ' CREAM'),
    (r'\sSL\Z', ' SUB-LINGUAL'),
    (r'\sAUTO-INJ\Z', ' INJECTION'),
    (r'\sEFF\Z', ' EFFERVESCENT'),
    (r'\sSOLN\Z', ' SOLUTION'),
    (r'\sINH\Z', ' INHALATION'),
    (r'\sHCL\s*\Z', ' HYDROCHLORIDE'),
]

descriptions = prices['NDC Description']

# regex=True is passed explicitly: the default of Series.str.replace differs between
# pandas versions, and these patterns must always be treated as regular expressions.
for pattern, replacement in ABBREVIATION_FIXES:
    descriptions = descriptions.str.replace(pattern, replacement, regex=True)

# Remove the space in between the dosage and dosage units (plain text replacement)
for unit in ('MG', 'ML', 'MCG'):
    descriptions = descriptions.str.replace(' ' + unit, unit, regex=False)

# Collapse any run of spaces into a single space
descriptions = descriptions.str.replace(' +', ' ', regex=True)

prices['NDC Description'] = descriptions

# Show the cleaned descriptions and how often each occurs
prices['NDC Description'].value_counts()

METFORMIN HCL 500MG TABLET                22455
METFORMIN HCL 1,000MG TABLET              18756
LISINOPRIL 20MG TABLET                    17532
AMLODIPINE BESYLATE 5MG TABLET            17420
MONTELUKAST SOD 10MG TABLET               17061
METFORMIN HCL 850MG TABLET                16958
AMLODIPINE BESYLATE 10MG TABLET           16789
LISINOPRIL 10MG TABLET                    16727
LOSARTAN POTASSIUM 50MG TABLET            16238
LISINOPRIL 40MG TABLET                    15932
LOSARTAN POTASSIUM 100MG TABLET           15730
LISINOPRIL 5MG TABLET                     15105
GABAPENTIN 300MG CAPSULE                  14016
DONEPEZIL HCL 5MG TABLET                  13696
IBUPROFEN 200MG TABLET                    13575
AMLODIPINE BESYLATE 2.5MG TABLET          13157
FINASTERIDE 5MG TABLET                    12918
DONEPEZIL HCL 10MG TABLET                 12793
CITALOPRAM HBR 40MG TABLET                12645
CLOPIDOGREL 75MG TABLET                   12539
OMEPRAZOLE DR 20MG CAPSULE              

In [10]:
# Give four price columns descriptive snake_case names, then drop two columns
prices = (
    prices
    .rename(columns={
        'NDC Description': 'ndc_description',
        'NADAC_Per_Unit': 'cost_per_unit_usd',
        'OTC': 'over_the_counter',
        'As of Date': 'as_of_date',
    })
    .drop(columns=['Pharmacy_Type_Indicator', 'Explanation_Code'])
)

# Lowercase every column header in both frames
prices.columns = [header.lower() for header in prices.columns]
all_data.columns = [header.lower() for header in all_data.columns]

In [11]:
# Build one space-separated description per row (trade name, strength, route) to match
# against the prices 'ndc_description' column with fuzzywuzzy's Levenshtein distance.
# A row where any of the three parts is missing gets NaN as its description.
all_data['ndc_description_agg'] = all_data['trade_name'].str.cat(
    [all_data['strength'], all_data['route']],
    sep=' ',
)

In [12]:
# Drop the listed columns from each frame; any column not listed here is kept.
price_columns_to_drop = [
    'ndc', 'cost_per_unit_usd', 'effective_date',
    'pricing_unit', 'over_the_counter', 'classification_for_rate_setting',
    'corresponding_generic_drug_nadac_per_unit',
    'corresponding_generic_drug_effective_date', 'as_of_date',
]
new_prices = prices.drop(columns=price_columns_to_drop)

all_data_columns_to_drop = [
    'ingredient', 'dosage_form', 'route', 'trade_name', 'strength',
    'applicant', 'appl_type_x', 'appl_no', 'product_no', 'te_code',
    'approval_date', 'rld', 'rs', 'type', 'applicant_full_name',
    'appl_type_y', 'patent_no', 'patent_expire_date_text',
    'drug_substance_flag', 'drug_product_flag', 'patent_use_code',
    'delist_flag', 'submission_date', 'appl_type', 'exclusivity_code',
    'exclusivity_date', 'df;route',
]
new_all_data = all_data.drop(columns=all_data_columns_to_drop)

In [13]:
# Write both trimmed frames to CSV, using to_csv's default settings
for csv_name, frame in (
    ('prices_cleaned.csv', new_prices),
    ('all_data_cleaned.csv', new_all_data),
):
    frame.to_csv(csv_name)