# __Feature Engineering & Encoding__

* Delete all date & name columns (and any other unnecessary columns)
* Create 'days_to_patent_expiry' column
* Combine data in several categorical columns (i.e. te_code)
* One-hot encode all categorical data

In [2]:
import pandas as pd
import numpy as np

In [155]:
# Unpickle Top 100 drugs file
import dill
Price_Patent_Data = dill.load(open('data/top_100_drugs.pkd', 'rb'))

In [156]:
Price_Patent_Data.drop('submission_date', axis = 1, inplace = True) # Submission_date isn't diverse enough to be useful

In [157]:
#Convert all to datetimes - not needed if unpickling
Price_Patent_Data['effective_date'] = pd.to_datetime(Price_Patent_Data['effective_date'])
Price_Patent_Data['corresponding_generic_drug_effective_date'] = pd.to_datetime(Price_Patent_Data['corresponding_generic_drug_effective_date'])
Price_Patent_Data['approval_date'] = pd.to_datetime(Price_Patent_Data['approval_date'])
Price_Patent_Data['patent_expire_date_text'] = pd.to_datetime(Price_Patent_Data['patent_expire_date_text'])
# Price_Patent_Data['submission_date'] = pd.to_datetime(Price_Patent_Data['submission_date'])
# Price_Patent_Data['exclusivity_date'] = pd.to_datetime(Price_Patent_Data['exclusivity_date'])

### __Drop ndc_description__
Duplicate information found in drug_names, strength, route, and other columns

In [158]:
Price_Patent_Data.drop_duplicates(keep = 'first', inplace = True)
Price_Patent_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214082 entries, 68472 to 6411864
Data columns (total 17 columns):
nadac_per_unit                               214082 non-null float64
drug_names                                   214082 non-null object
ndc                                          214082 non-null float64
effective_date                               214082 non-null datetime64[ns]
classification_for_rate_setting              214082 non-null object
corresponding_generic_drug_effective_date    214082 non-null datetime64[ns]
corresponding_generic_drug_nadac_per_unit    214082 non-null float64
otc                                          214082 non-null object
approval_date                                132995 non-null datetime64[ns]
patent_expire_date_text                      2195 non-null datetime64[ns]
pricing_unit                                 214082 non-null object
ingredient                                   132995 non-null object
applicant                          

### __Create days_since_generic_on_market feature__
If a drug has a generic equivalent (i.e. a classification_for_rate_setting value of 'G', and a te_code of 'A')...see notes

In [159]:
# Function combining all feature engineerings

def create_features(df):
    import re
    # Create days_before_patent_expires feature
    df['days_before_patent_expires'] = (df['patent_expire_date_text'] - df['approval_date']).astype('timedelta64[D]')

    # Create drug_age feature
    Price_Patent_Data['drug_age'] = Price_Patent_Data.groupby(['ndc'])['approval_date'].transform(lambda x: dt.date.today() - x.min().date())/np.timedelta64(1, 'D')

    # Drop two datetime columns now that we're done with them
    df.drop(columns = ['approval_date', 'patent_expire_date_text'], inplace = True)

    # Combine te_code categories
    df['te_code'] = df['te_code'].str.replace(r"(^A.*)", "A", regex = True)
    df['te_code'] = df['te_code'].str.replace(r"(^B.*)", "B", regex = True)
    df.te_code.value_counts(dropna = False)

    # Combine classifications listed in classification_for_rate_setting feature
    df['classification_for_rate_setting'] = df['classification_for_rate_setting'].str.replace(r"(^B.*)", "B", regex = True)
    df['classification_for_rate_setting'].value_counts(dropna = False)

    # Simplify exclusivity_code - Delete everything after the dash
#     df['exclusivity_code'] = df['exclusivity_code'].str.replace(r"(.*)-.*", '\\1', regex = True)
#     df.exclusivity_code.value_counts(dropna = False)

    # Aggregate values in dosage_form feature with less than 1000 counts
    df.loc[df.groupby('dosage_form').dosage_form.transform('count').lt(1000), 'dosage_form'] = 'OTHER'
    df['dosage_form'].value_counts(dropna = False)

    # Aggregate values in route feature with less than 100 counts
    df.loc[df.groupby('route').dosage_form.transform('count').lt(100), 'route'] = 'OTHER'
    df['route'].value_counts(dropna = False)
    
    # Correct the corresponding_generic_drug_effective_date and corresponding_generic_drug_nadac_per_unit data (generics can't have data in these columns)
    Price_Patent_Data.loc[Price_Patent_Data.classification_for_rate_setting == 'G', ['corresponding_generic_drug_effective_date', 'corresponding_generic_drug_nadac_per_unit']] = pd.NaT, np.NaN

    # Create a new generic_exists column
    Price_Patent_Data['generic_exists'] = np.where((Price_Patent_Data['classification_for_rate_setting'] == 'B') & (Price_Patent_Data['corresponding_generic_drug_nadac_per_unit'].notnull()), 1, 0)  
    df['generic_exists'].value_counts(dropna = False)

In [160]:
create_features(Price_Patent_Data)

Unnamed: 0,nadac_per_unit,drug_names,ndc,effective_date,classification_for_rate_setting,corresponding_generic_drug_effective_date,corresponding_generic_drug_nadac_per_unit,otc,pricing_unit,ingredient,applicant,te_code,type,dosage_form,route,days_before_patent_expires,drug_age,generic_exists
68472,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,2013-11-21,G,NaT,,N,EA,,,,,,,,1551.0,0
68473,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,2013-11-22,G,NaT,,N,EA,,,,,,,,1551.0,0
68474,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,2013-11-23,G,NaT,,N,EA,,,,,,,,1551.0,0
68475,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,2013-11-24,G,NaT,,N,EA,,,,,,,,1551.0,0
68476,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,2013-11-25,G,NaT,,N,EA,,,,,,,,1551.0,0


### __Convert dates to day, month, year columns__

(for one-hot encoding later)

In [161]:
# Convert effective_date column values
Price_Patent_Data['effective_date_year'] = (Price_Patent_Data['effective_date'].dt.year).astype('float16')
Price_Patent_Data['effective_date_month'] = (Price_Patent_Data['effective_date'].dt.month).astype('float16')
Price_Patent_Data['effective_date_day'] = (Price_Patent_Data['effective_date'].dt.day).astype('float16')

# Convert corresponding_generic_drug_effective_date column values
Price_Patent_Data['corresponding_generic_drug_effective_year'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.year).astype('float16')
Price_Patent_Data['corresponding_generic_drug_effective_month'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.month).astype('float16')
Price_Patent_Data['corresponding_generic_drug_effective_day'] = (Price_Patent_Data['corresponding_generic_drug_effective_date'].dt.day).astype('float16')

# Drop both original columns
Price_Patent_Data.drop(['effective_date', 'corresponding_generic_drug_effective_date'], axis = 1, inplace = True)

Unnamed: 0,nadac_per_unit,drug_names,ndc,classification_for_rate_setting,corresponding_generic_drug_nadac_per_unit,otc,pricing_unit,ingredient,applicant,te_code,...,route,days_before_patent_expires,drug_age,generic_exists,effective_date_year,effective_date_month,effective_date_day,corresponding_generic_drug_effective_year,corresponding_generic_drug_effective_month,corresponding_generic_drug_effective_day
68472,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,21.0,,,
68473,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,22.0,,,
68474,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,23.0,,,
68475,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,24.0,,,
68476,5.92035,TINIDAZOLE 500MG ORAL,54034807.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,25.0,,,


### __Fix column data types__

In [162]:
Price_Patent_Data[[
                   'nadac_per_unit', 
                   'corresponding_generic_drug_nadac_per_unit', 
                   'days_before_patent_expires',
                   'drug_age',
                   'ndc', 
                   ]] = Price_Patent_Data[['nadac_per_unit', 'corresponding_generic_drug_nadac_per_unit', 'days_before_patent_expires', 'drug_age', 'ndc']].astype('float32')

Unnamed: 0,nadac_per_unit,drug_names,ndc,classification_for_rate_setting,corresponding_generic_drug_nadac_per_unit,otc,pricing_unit,ingredient,applicant,te_code,...,route,days_before_patent_expires,drug_age,generic_exists,effective_date_year,effective_date_month,effective_date_day,corresponding_generic_drug_effective_year,corresponding_generic_drug_effective_month,corresponding_generic_drug_effective_day
68472,5.92035,TINIDAZOLE 500MG ORAL,54034808.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,21.0,,,
68473,5.92035,TINIDAZOLE 500MG ORAL,54034808.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,22.0,,,
68474,5.92035,TINIDAZOLE 500MG ORAL,54034808.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,23.0,,,
68475,5.92035,TINIDAZOLE 500MG ORAL,54034808.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,24.0,,,
68476,5.92035,TINIDAZOLE 500MG ORAL,54034808.0,G,,N,EA,,,,...,,,1551.0,0,2013.0,11.0,25.0,,,


In [163]:
# Drop drug_names (carry on with 'ndc')
Price_Patent_Data.drop('drug_names', axis = 1, inplace = True)

In [164]:
# Drop rows without a nadac_per_unit (price)
Price_Patent_Data = Price_Patent_Data[Price_Patent_Data.nadac_per_unit > 0]
Price_Patent_Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214082 entries, 68472 to 6411864
Data columns (total 21 columns):
nadac_per_unit                                214082 non-null float32
ndc                                           214082 non-null float32
classification_for_rate_setting               214082 non-null object
corresponding_generic_drug_nadac_per_unit     2402 non-null float32
otc                                           214082 non-null object
pricing_unit                                  214082 non-null object
ingredient                                    132995 non-null object
applicant                                     132995 non-null object
te_code                                       125918 non-null object
type                                          132995 non-null object
dosage_form                                   132995 non-null object
route                                         132995 non-null object
days_before_patent_expires                    2195 non-

### __Create dummies for the following columns:__

* One-hot encode in the sklearn pipline later (instead of doing this now)

In [6]:
# Replace columns above with binary versions (including ndc_description)
Price_Patent_Reg = pd.get_dummies(data = Price_Patent_Reg, columns = ['drug_names',
                                                                      'classification_for_rate_setting',
                                                                      'otc',
                                                                      'pricing_unit', 
                                                                      'ingredient',
                                                                      'applicant',
                                                                      'te_code',
                                                                      'type',
                                                                      #'exclusivity_code', 
                                                                      'dosage_form', 
                                                                      'route'
                                                                     ])
Price_Patent_Reg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214063 entries, 68472 to 6411864
Columns: 137 entries, nadac_per_unit to route_ORAL
dtypes: datetime64[ns](2), float16(2), float64(2), int32(1), object(1), uint8(129)
memory usage: 37.8+ MB


### __Fill NaNs, update dtypes?__

In [165]:
# Fill NaNs with zeros (research better alternative)
Price_Patent_Data.fillna(0, inplace = True) 

### __Pickle__

In [166]:
# Pickle data
dill.dump(Price_Patent_Data, open('data/features_created.pkd', 'wb'))

In [33]:
# Save as CSV - create dummy variables and sparse matrix in Regressions notebook
Price_Patent_Data.to_csv('data/Price_Patent_Reg.csv')

### __Add date lag column__

If performing time series?

### __Create dummy variables__
This is now being done in the Regressions notebook


In [None]:
# Alternatively - to avoid the creation of so many columns and the need for a sparse matrix - we can do the following to create category ids for each value in a column
# We may need, though, to preserve the order
# Price_Patent_Data['column_name'].astype('category').cat.codes 

In [None]:
# Price_Patent_Reg.set_index(['effective_date', 'ndc_description'], inplace = True)
# Price_Patent_Reg.sort_index(inplace = True, ascending = False)
# Price_Patent_Reg.drop_duplicates(inplace = True)
# Price_Patent_Reg.head(100)