### __Pharma Data Cleaner & Aggregator (part 2)__

This file is meant to be a followup to the later portion of the Pharma_Exporatory notebook.  Introduced are the following files:

* fuzzy_prices - containing the drug name comparison information, and price information
* all_data - containing product, patent, and exclusivity data

This file will merge these two datasets, and further clean/prep them for analysis. 

The processes in this notebook, unlike those in the 'Cleaner-Agg part 1' notebook are not available in the Pharma_Exploratory notebook.

In [4]:
import pandas as pd
# Reading in the CSVs with the python engine will take longer, but will hopefully produce better dtype matching (thus reducing the memory usage)
all_data = pd.read_csv('all_data.csv', engine = 'python')
all_data = all_data.set_index('ndc_description_agg')
all_data.head(3)

Unnamed: 0_level_0,Unnamed: 0,ingredient,dosage_form,route,trade_name,strength,applicant,appl_type_x,appl_no,product_no,...,patent_no,patent_expire_date_text,drug_substance_flag,drug_product_flag,patent_use_code,delist_flag,submission_date,appl_type,exclusivity_code,exclusivity_date
ndc_description_agg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UCERIS 2MG/ACTUATION RECTAL,0,BUDESONIDE,"AEROSOL, FOAM",RECTAL,UCERIS,2MG/ACTUATION,VALEANT PHARMS INTL,N,205613,1,...,,,,,,,,,,
BETAMETHASONE VALERATE 0.12% TOPICAL,1,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,PERRIGO UK FINCO,A,78337,1,...,,,,,,,,,,
BETAMETHASONE VALERATE 0.12% TOPICAL,2,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,RICONPHARMA LLC,A,207144,1,...,,,,,,,,,,


In [None]:
# Delete more columns to reduce the dataset size (a vast majority of these columns were NaN values)
fuzzy_prices.drop(['Unnamed: 0', 
                   'ndc', 
                   'corresponding_generic_drug_nadac_per_unit',
                   'corresponding_generic_drug_effective_date',
                   'Unnamed: 0.1'], axis = 1, inplace = True)
fuzzy_prices.head(3)

In [None]:
# Convert to datetime and see the distribution of dates in the 'effective_date' column
pd.to_datetime(fuzzy_prices['effective_date'])
fuzzy_prices['effective_date'].value_counts(dropna = False).sort_values(ascending = False)

In [None]:
# Attempting to lighten up the dataset further by dropping duplicates
fuzzy_prices.drop_duplicates(keep='first')
fuzzy_prices.info()

In [2]:
# No real duplicates found (both good and bad news) - compare entry number to info statement at beginning
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58567 entries, 0 to 58566
Data columns (total 28 columns):
Unnamed: 0                 58567 non-null int64
ingredient                 58567 non-null object
dosage_form                58563 non-null object
route                      58549 non-null object
trade_name                 58567 non-null object
strength                   58500 non-null object
applicant                  58567 non-null object
appl_type_x                58567 non-null object
appl_no                    58567 non-null int64
product_no                 58567 non-null int64
te_code                    20466 non-null object
approval_date              58567 non-null object
rld                        58567 non-null object
rs                         58567 non-null object
type                       58567 non-null object
applicant_full_name        58567 non-null object
appl_type_y                23361 non-null object
patent_no                  23361 non-null object
patent_expir

In [3]:
all_data.head()

Unnamed: 0.1,Unnamed: 0,ingredient,dosage_form,route,trade_name,strength,applicant,appl_type_x,appl_no,product_no,...,patent_expire_date_text,drug_substance_flag,drug_product_flag,patent_use_code,delist_flag,submission_date,appl_type,exclusivity_code,exclusivity_date,ndc_description_agg
0,0,BUDESONIDE,"AEROSOL, FOAM",RECTAL,UCERIS,2MG/ACTUATION,VALEANT PHARMS INTL,N,205613,1,...,,,,,,,,,,UCERIS 2MG/ACTUATION RECTAL
1,1,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,PERRIGO UK FINCO,A,78337,1,...,,,,,,,,,,BETAMETHASONE VALERATE 0.12% TOPICAL
2,2,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,RICONPHARMA LLC,A,207144,1,...,,,,,,,,,,BETAMETHASONE VALERATE 0.12% TOPICAL
3,3,BETAMETHASONE VALERATE,"AEROSOL, FOAM",TOPICAL,BETAMETHASONE VALERATE,0.12%,TARO PHARM,A,208204,1,...,,,,,,,,,,BETAMETHASONE VALERATE 0.12% TOPICAL
4,4,CLINDAMYCIN PHOSPHATE,"AEROSOL, FOAM",TOPICAL,CLINDAMYCIN PHOSPHATE,1%,PERRIGO UK FINCO,A,90785,1,...,,,,,,,,,,CLINDAMYCIN PHOSPHATE 1% TOPICAL


In [5]:
# Change the following columns to datetime dtype
pd.to_datetime(all_data['patent_expire_date_text'])

#From here on, this modification means that these drugs were approved prior to Jan 1, 1982
all_data['approval_date'].replace('Approved Prior to Jan 1, 1982', 'Dec 31, 1981')

ndc_description_agg
UCERIS 2MG/ACTUATION RECTAL                     Oct 7, 2014
BETAMETHASONE VALERATE 0.12% TOPICAL           Nov 26, 2012
BETAMETHASONE VALERATE 0.12% TOPICAL           May 24, 2017
BETAMETHASONE VALERATE 0.12% TOPICAL           May 24, 2017
CLINDAMYCIN PHOSPHATE 1% TOPICAL               Mar 31, 2010
CLOBETASOL PROPIONATE 0.05% TOPICAL            Feb 15, 2019
CLOBETASOL PROPIONATE 0.05% TOPICAL            Jul 31, 2017
CLOBETASOL PROPIONATE 0.05% TOPICAL            Mar 10, 2008
CLOBETASOL PROPIONATE 0.05% TOPICAL            Aug 14, 2012
CLOBETASOL PROPIONATE 0.05% TOPICAL             Oct 4, 2018
ECOZA 1% TOPICAL                               Oct 24, 2013
ENSTILAR 0.064%;0.005% TOPICAL                 Oct 16, 2015
ENSTILAR 0.064%;0.005% TOPICAL                 Oct 16, 2015
ENSTILAR 0.064%;0.005% TOPICAL                 Oct 16, 2015
EVOCLIN 1% TOPICAL                             Oct 22, 2004
EVOCLIN 1% TOPICAL                             Oct 22, 2004
EXTINA 2% TOPICAL   

In [6]:
# Several columns to drop
all_data.drop(['Unnamed: 0', 
               'appl_no',
               'drug_substance_flag',
               'applicant'], axis = 1, inplace = True)

In [6]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58567 entries, 0 to 58566
Data columns (total 24 columns):
ingredient                 58567 non-null object
dosage_form                58563 non-null object
route                      58549 non-null object
trade_name                 58567 non-null object
strength                   58500 non-null object
appl_type_x                58567 non-null object
product_no                 58567 non-null int64
te_code                    20466 non-null object
approval_date              58567 non-null object
rld                        58567 non-null object
rs                         58567 non-null object
type                       58567 non-null object
applicant_full_name        58567 non-null object
appl_type_y                23361 non-null object
patent_no                  23361 non-null object
patent_expire_date_text    23361 non-null object
drug_product_flag          11250 non-null object
patent_use_code            13382 non-null object
delist_fla

#### __Merging time:__
Seeing as I was able to reduce the price dataset by almost half, I'll try and merge the two datasets again.  I'll merge 'right' this time because I want to keep as much of the patent data as possible. 

<b>Note to self:</b> you may want to come back and merge by 'outer' so that you can retain as much price info as possible and extrapolate any missing patent data for drugs that have prices but no patent dates.

Originally, I opened the fuzzy_prices file at the top cleaned it, and then tried to priocess it here at the bottom of the notebook. I've found, however, that chunking and processing the CSV as it is read in leads to far fewer errors with my machine's limited memory.

In [14]:
# Chunk, prepare, and merge the fuzzy_prices file with the all_data file
for chunk in pd.read_csv('fuzzy_prices.csv', chunksize = 25e6, engine = 'python'):
    chunk.set_index('ndc_description_agg')
    chunk.drop(['Unnamed: 0', 
                       'ndc', 
                       'corresponding_generic_drug_nadac_per_unit',
                       'corresponding_generic_drug_effective_date',
                       'Unnamed: 0.1'], axis = 1, inplace = True)
    chunk.head(3)

    # Convert to datetime and see the distribution of dates in the 'effective_date' column
    pd.to_datetime(chunk['effective_date'])
    chunk['effective_date'].value_counts(dropna = False).sort_values(ascending = False)

    # Attempting to lighten up the dataset further by dropping duplicates
    chunk.drop_duplicates(keep='first')
    chunk.info()
    merged_all = chunk.join(all_data, how = 'outer')
    merged_all.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9978725 entries, 0 to 9978724
Data columns (total 9 columns):
ndc_description                    object
cost_per_unit_usd                  float64
effective_date                     object
pricing_unit                       object
over_the_counter                   object
classification_for_rate_setting    object
as_of_date                         object
ndc_description_agg                object
score                              int64
dtypes: float64(1), int64(1), object(7)
memory usage: 685.2+ MB


In [15]:
merged_all.head()

Unnamed: 0,ndc_description,cost_per_unit_usd,effective_date,pricing_unit,over_the_counter,classification_for_rate_setting,as_of_date,ndc_description_agg,score,ingredient,...,appl_type_y,patent_no,patent_expire_date_text,drug_product_flag,patent_use_code,delist_flag,submission_date,appl_type,exclusivity_code,exclusivity_date
0,IBUPROFEN 200MG TABLET,0.02991,2018-11-21,EA,Y,G,2018-12-12,IBUPROFEN 200MG ORAL,86.0,,...,,,,,,,,,,
1,IBUPROFEN 200MG TABLET,0.02991,2018-11-21,EA,Y,G,2018-12-12,IBUPROFEN 200MG ORAL,86.0,,...,,,,,,,,,,
2,IBUPROFEN 200MG TABLET,0.02991,2018-11-21,EA,Y,G,2018-12-12,IBUPROFEN 200MG ORAL,86.0,,...,,,,,,,,,,
3,IBUPROFEN 200MG TABLET,0.02991,2018-11-21,EA,Y,G,2018-12-12,IBUPROFEN 200MG ORAL,86.0,,...,,,,,,,,,,
4,IBUPROFEN 200MG TABLET,0.02991,2018-11-21,EA,Y,G,2018-12-12,IBUPROFEN 200MG ORAL,86.0,,...,,,,,,,,,,


In [18]:
# Export the merged file
merged_all = merged_all.to_csv('merged_all.csv')

### __Yet another Dask attempt__


In [None]:
# Attempting dask again!
from dask import dataframe as dd 
from dask.distributed import Client, LocalCluster

# Initiate the client!
client = Client(n_workers = 1, 
                threads_per_worker = 4, 
                processes = False, 
               memory_limit = '14GB', 
               scheduler_port = 0, 
               silence_logs = True, 
               diagnostics_port = 0)
client

In [None]:
# Start the merger
merged_all_ddf = client.submit(pd.merge(fuzzy_prices, all_data, on=['ndc_description_agg'], how = 'right').compute())

In [None]:
merged_all_ddf.describe()