In [4]:
import time
import os
import pandas as pd
import nltk
import re
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
import os

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# define the documents
# implement tfidf
from nltk.corpus import stopwords

from sklearn.decomposition import TruncatedSVD

DATA_PATH = "../data/raw"

In [5]:
# nltk.download('punkt')

### Read data

In [6]:
# Load the raw train/test tables and the two auxiliary lookup tables from DATA_PATH.
train, test, active_ing, drug_label_feature_eng = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        'drugs_train.csv',
        'drugs_test.csv',
        'active_ingredients.csv',
        'drug_label_feature_eng.csv',
    )
)

### Let's concatenate the train and test datasets
Please note that concatenating the train and test datasets before applying feature-engineering techniques can cause **leakage** problems. Example: if you use an imputation technique that computes some statistic of one attribute (such as the
average) or of several attributes (by solving a regression problem), leakage happens if you
use the whole dataset to compute this statistic. By using all available examples, you contaminate
the training data with information obtained from the validation and test examples.

So we will be careful about that.

In [7]:
# Stack train on top of test; the two are separated again later via the missing 'price'.
df = pd.concat(objs=(train, test), axis=0)

In [8]:
# Attach the active-ingredient columns; the inner join keeps only drug_ids present in both tables.
df = df.merge(active_ing, how='inner', on='drug_id')

In [9]:
# Attach the features engineered from the text description, joined on the description string itself.
df = df.merge(drug_label_feature_eng, how='inner', on='description')

In [10]:
# Show the last five rows, transposed so that each column is printed on its own line.
df.tail(5).transpose()

Unnamed: 0,18629,18630,18631,18632,18633
drug_id,3664_test,3664_test,3667_test,3670_test,3670_test
description,1 flacon(s) en verre brun pulvérisateur(s) ave...,1 flacon(s) en verre brun pulvérisateur(s) ave...,1 seringue(s) préremplie(s) en verre de 20 ml,1 flacon(s) polystyrène de 30 capsule(s),1 flacon(s) polystyrène de 30 capsule(s)
administrative_status,Présentation active,Présentation active,Présentation active,Présentation active,Présentation active
marketing_status,Déclaration de commercialisation,Déclaration de commercialisation,Déclaration de commercialisation,Déclaration de commercialisation,Déclaration de commercialisation
approved_for_hospital_use,oui,oui,oui,oui,oui
reimbursement_rate,65%,65%,65%,65%,65%
dosage_form,solution,solution,solution injectable,capsule molle,capsule molle
route_of_administration,nasale,nasale,intra-articulaire,orale,orale
marketing_authorization_status,Autorisation active,Autorisation active,Autorisation active,Autorisation active,Autorisation active
marketing_declaration_date,19890101,19890101,20030101,19710101,19710101


### Datetime features

In [11]:
# Parse the yyyymmdd-encoded date columns into proper datetimes.
for date_column in ('marketing_declaration_date', 'marketing_authorization_date'):
    df[date_column] = pd.to_datetime(df[date_column], format="%Y%m%d")

In [12]:
# All dates are of the form yyyy-01-01, so only the year is worth keeping.
for year_column, date_column in (
    ('declaration_year', 'marketing_declaration_date'),
    ('authorization_year', 'marketing_authorization_date'),
):
    df[year_column] = df[date_column].dt.year

In [13]:
# An obvious feature is the difference, in days, between the declaration date and the authorization date.
def get_days_diff(x):
    """Return the ``days`` attribute of ``x`` (e.g. of a timedelta)."""
    elapsed_days = x.days
    return elapsed_days

# Declaration date minus authorization date, reduced to a number of days.
df['delta_decralation_autorization'] = (
    df['marketing_declaration_date'] - df['marketing_authorization_date']
).apply(get_days_diff)

In [14]:
# Keep the gap in days as-is and derive a whole-year gap from it.
day_gap = df['delta_decralation_autorization']
df['delta_days_decralation_autorization'] = day_gap
# Truncate toward zero instead of flooring: `//` rounds toward -inf, so a gap
# of -366 days (exactly one year spanning a leap day) would become -2 years.
# Dividing the absolute value and restoring the sign keeps positive and
# negative gaps symmetric.
df['delta_years_decralation_autorization'] = np.sign(day_gap) * (day_gap.abs() // 365)
# The temporary column is no longer needed once both features exist.
df = df.drop('delta_decralation_autorization', axis=1)

In [15]:
# Share of rows whose declaration date precedes the authorization date.
df['delta_days_decralation_autorization'].lt(0).mean()

0.039229365675646666

Drugs whose declaration date is earlier than their authorization date (negative delta, about 4% of the rows) were most likely registered with wrong dates.

### Binary vars
Let's pick the categorical features with only two modalities and make them binary.

In [16]:
# Collect the object-typed columns that take exactly two distinct values.
binary_vars = []
for column_name in df.columns:
    column = df[column_name]
    if column.dtype == 'object' and column.nunique() == 2:
        binary_vars.append(column_name)
binary_vars

['administrative_status', 'approved_for_hospital_use']

In [17]:
# Distinct values of the column before it is encoded.
administrative_status_values = df['administrative_status'].unique()
administrative_status_values

array(['Présentation active', 'Présentation abrogée'], dtype=object)

=> Let's set entries with 'Présentation active' to 1 and those with 'Présentation abrogée' to 0.

In [18]:

# Binary-encode the status: 'Présentation active' -> 1, 'Présentation abrogée' -> 0.
administrative_status_codes = {'Présentation active': 1, 'Présentation abrogée': 0}
df['administrative_status'] = df['administrative_status'].map(administrative_status_codes)

In [19]:
# Distinct values of the column before it is encoded.
hospital_use_values = df['approved_for_hospital_use'].unique()
hospital_use_values

array(['oui', 'non'], dtype=object)

=> Let's set entries with 'oui' to 1 and those with 'non' to 0 ('oui'/'non' are French for yes/no).

In [20]:
# Binary-encode the flag: 'oui' -> 1, 'non' -> 0.
hospital_use_codes = {'oui': 1, 'non': 0}
df['approved_for_hospital_use'] = df['approved_for_hospital_use'].map(hospital_use_codes)

### ordinal features
From the analysis step we can notice that some features have a natural ordinal gradation, such as **marketing_status**, **marketing_authorization_status** and **reimbursement_rate**.

In [21]:
# Distinct marketing statuses seen in the training set.
marketing_status_values = train['marketing_status'].unique()
marketing_status_values

array(['Déclaration de commercialisation',
       "Déclaration d'arrêt de commercialisation",
       "Arrêt de commercialisation (le médicament n'a plus d'autorisation)",
       'Déclaration de suspension de commercialisation'], dtype=object)

In [22]:
# Statuses listed from least to most severe; the position in the list is the encoded value.
severity_degree = {
    status: rank
    for rank, status in enumerate((
        'Déclaration de commercialisation',
        'Déclaration de suspension de commercialisation',
        "Déclaration d'arrêt de commercialisation",
        "Arrêt de commercialisation (le médicament n'a plus d'autorisation)",
    ))
}
# Map each status to its degree of severity.
df['marketing_status'] = df['marketing_status'].map(severity_degree)

In [23]:
# Distinct authorization statuses seen in the training set.
authorization_status_values = train['marketing_authorization_status'].unique()
authorization_status_values

array(['Autorisation active', 'Autorisation abrogée',
       'Autorisation archivée', 'Autorisation retirée'], dtype=object)

In [24]:
# Authorization statuses in increasing order of severity; the position is the encoded value.
severity_degree = {
    status: rank
    for rank, status in enumerate((
        'Autorisation active',
        'Autorisation abrogée',
        'Autorisation archivée',
        'Autorisation retirée',
    ))
}

# Map each status to its degree of severity.
df['marketing_authorization_status'] = df['marketing_authorization_status'].map(severity_degree)

In [25]:
# Distinct reimbursement rates seen in the training set.
reimbursement_rate_values = train['reimbursement_rate'].unique()
reimbursement_rate_values

array(['65%', '100%', '15%', '30%', '0%'], dtype=object)

In [26]:
# Rates in increasing order; each one is encoded by its rank (0 for '0%' up to 4 for '100%').
reimbursement_rate_map = {
    rate: rank
    for rank, rate in enumerate(('0%', '15%', '30%', '65%', '100%'))
}

# Replace each rate label by its ordinal rank.
df['reimbursement_rate'] = df['reimbursement_rate'].map(reimbursement_rate_map)

In [27]:
# Summary statistics of the encoded rate, as a quick sanity check of the mapping.
reimbursement_rate_summary = df['reimbursement_rate'].describe()
reimbursement_rate_summary

count    18634.000000
mean         2.873618
std          0.514449
min          1.000000
25%          3.000000
50%          3.000000
75%          3.000000
max          4.000000
Name: reimbursement_rate, dtype: float64

In [28]:
# Number of distinct dosage forms that occur more than once in the training set.
train['dosage_form'].value_counts().gt(1).sum()

139

In [29]:
# Fetch the NLTK 'stopwords' package; the vectorizer defined below calls stopwords.words('french').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/zoona/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text: unigrams and bigrams.
NGRAM_RANGE = (1, 2)

# Limit on the number of features (10K).
# NOTE(review): TOP_K is not referenced in the visible part of this notebook;
# CustomNgramVectorize passes a literal max_features of 1000 instead — confirm
# which limit is intended.
TOP_K = 10000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Number of components requested from TruncatedSVD; also used to name the
# generated '<var>_svd_tfidf_component_<i>' columns.
SVD_N_COMPONENTS = 25

class CustomNgramVectorize(BaseEstimator, TransformerMixin):
    """TF-IDF n-gram vectorizer with an optional TruncatedSVD reduction.

    Each text becomes one TF-IDF vector over word unigrams and bigrams
    (French stop words removed, accents stripped, at most 1000 features).
    When ``reduce`` is true, the vectors are projected onto
    ``SVD_N_COMPONENTS`` components by a TruncatedSVD that is learned in
    ``fit`` and only applied in ``transform``.

    # Arguments
        text: ignored; accepted only so that existing calls of the form
            ``CustomNgramVectorize(train[col], reduce=True)`` keep working.
        reduce: bool, whether to apply the SVD reduction.
    """
    def __init__(self, text=None, reduce=True):
        # BaseEstimator.get_params()/repr read one attribute per constructor
        # argument, so `text` must exist. The corpus itself is deliberately
        # not retained, which keeps it out of the pickled model.
        self.text = None
        self.reduce = reduce
        # Keyword arguments passed to the 'tf-idf' vectorizer.
        # NOTE(review): with strip_accents='unicode', accented entries of the
        # NLTK stop-word list no longer match the accent-stripped tokens —
        # confirm whether the stop words should be accent-stripped as well.
        kwargs = {
                'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
                'dtype': np.float64,  # TF-IDF weights are floats; an integer dtype is not valid here.
                'strip_accents': 'unicode',
                'decode_error': 'replace',
                'max_features' : 1000, #limit number of words
                'sublinear_tf': True, # Apply sublinear tf scaling
                'stop_words' : stopwords.words('french'),# drop french stopwords
                'analyzer': TOKEN_MODE,  # Split text into word tokens.
                'min_df': MIN_DOCUMENT_FREQUENCY,
        }
        self.tfidf_vectorizer = TfidfVectorizer(**kwargs)
        if self.reduce:
            self.svd = TruncatedSVD(n_components=SVD_N_COMPONENTS, n_iter=25, random_state=12)

    def fit(self, X, y=None):
        """Learn the TF-IDF vocabulary/weights and, if enabled, the SVD basis.

        # Arguments
            X: iterable of text strings to learn from.
            y: ignored, present for scikit-learn API compatibility.

        # Returns
            self, so that the inherited ``fit_transform`` and chained calls work.
        """
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(X)
        if self.reduce:
            # The SVD basis is learned here, from the fitting texts only.
            self.svd.fit(tfidf_matrix)
        return self

    def transform(self, X, y=None):
        """Vectorize ``X`` with the fitted TF-IDF (and SVD) models.

        # Arguments
            X: iterable of text strings to encode.
            y: ignored, present for scikit-learn API compatibility.

        # Returns
            When ``reduce`` is true, the array returned by the fitted SVD's
            ``transform``; otherwise a DataFrame of TF-IDF weights with one
            column per vocabulary term.
        """
        tfidf_matrix = self.tfidf_vectorizer.transform(X)
        if self.reduce:
            # Only project onto the basis learned in fit(). Refitting the SVD
            # here would make the output depend on the rows being transformed
            # and would leak test data into the features.
            return self.svd.transform(tfidf_matrix)
        # convert to dataframe
        return pd.DataFrame(tfidf_matrix.toarray(), columns=sorted(self.tfidf_vectorizer.vocabulary_))
    
    


In [31]:
var = 'dosage_form'
# Learn the vectorizer from the training rows only, then encode every row of df.
vectorizer = CustomNgramVectorize(train[var], reduce=True)
vectorizer.fit(train[var])
X_df = vectorizer.transform(df[var])

# Append the SVD-reduced vectors to df as numbered component columns.
component_names = [f'{var}_svd_tfidf_component_{i}' for i in range(SVD_N_COMPONENTS)]
component_frame = pd.DataFrame(X_df, columns=component_names)
df = pd.concat([df, component_frame], axis=1)



In [32]:
# Save the dosage_form vectorizer to disk.
import joblib
dosage_form_model_path = '../models/dosage_form_tfidf_vectorizer.joblib'
joblib.dump(vectorizer, dosage_form_model_path)


['../models/dosage_form_tfidf_vectorizer.joblib']

In [33]:
var = 'route_of_administration'
# Build a fresh vectorizer for this column, as is done for the other text
# columns, instead of unpickling the dosage_form model only to refit it.
v = CustomNgramVectorize(train[var], reduce=True)
# Learn from the training rows only.
v.fit(train[var])

# Encode every row of df.
X_df = v.transform(df[var])



In [34]:
import joblib
# Persist `v`, the vectorizer fitted on this column. `vectorizer` still refers
# to the dosage_form model, so dumping it here would store the wrong model
# under this column's file name.
joblib.dump(v, f'../models/{var}_tfidf_vectorizer.joblib')

['../models/route_of_administration_tfidf_vectorizer.joblib']

In [35]:
# Append this column's SVD components to df as numbered component columns.
component_names = [f'{var}_svd_tfidf_component_{i}' for i in range(SVD_N_COMPONENTS)]
df = pd.concat([df, pd.DataFrame(X_df, columns=component_names)], axis=1)

We found that most 'pharmaceutical_companies' values contain the related country name.

Let's extract the countries from pharmaceutical_companies.

In [38]:
def extract_country(x):
    """Return the country written in parentheses inside a company name.

    # Arguments
        x: company name, e.g. ``'ACME (ALLEMAGNE)'``. Non-string values
            (such as NaN) are converted with ``str`` first, as the companion
            cleaning function does.

    # Returns
        The first parenthesised single word, with the 'LUXMEBOURG' typo
        corrected, or 'FRANCE' when no such annotation is present.
    """
    # Raw string: '\(' and '\w' are regex escapes, not Python string escapes.
    # NOTE(review): \w+ matches a single word only, so an annotation such as
    # '(ROYAUME UNI)' falls through to the 'FRANCE' default — confirm intended.
    m = re.search(r'\((\w+)\)', str(x))
    if m:
        return m.group(1).replace('LUXMEBOURG', 'LUXEMBOURG')
    return 'FRANCE'

# Derive each row's country from the '(COUNTRY)' annotation of its company name.
company_names = df['pharmaceutical_companies']
df['country'] = company_names.apply(extract_country)

Let's now remove the country annotations from the pharmaceutical_companies feature.

In [39]:
def clean_pharmaceutical_companies(x):
    """Strip the country annotation and ' /' separators from a company name.

    The value is converted with ``str``; everything from the first ' ('
    onwards is discarded, every ' /' is removed, and surrounding
    whitespace is trimmed.
    """
    name_part, _, _ = str(x).strip().partition(' (')
    return name_part.replace(' /', '').strip()

# Overwrite the column with the cleaned company names.
raw_company_names = df['pharmaceutical_companies']
df['pharmaceutical_companies'] = raw_company_names.apply(clean_pharmaceutical_companies)

In [40]:
var = 'pharmaceutical_companies'
# df[var] has already been passed through clean_pharmaceutical_companies, but
# train[var] still holds the raw strings. Apply the same cleaning to the
# training texts so the vocabulary is learned on the same kind of text that
# will be transformed.
train_companies = train[var].apply(clean_pharmaceutical_companies)
vectorizer = CustomNgramVectorize(train_companies, reduce=True)
# Learn from the training rows only.
vectorizer.fit(train_companies)

# persist model
import joblib
joblib.dump(vectorizer, f'../models/{var}_tfidf_vectorizer.joblib')






['../models/pharmaceutical_companies_tfidf_vectorizer.joblib']

In [41]:

# Encode every row of df and append the SVD components as numbered columns.
X_df = vectorizer.transform(df[var])
component_names = [f'{var}_svd_tfidf_component_{i}' for i in range(SVD_N_COMPONENTS)]
component_frame = pd.DataFrame(X_df, columns=component_names)
df = pd.concat([df, component_frame], axis=1)

In [42]:
# Recover the train/test split: training rows have a price, test rows do not.
has_price = df['price'].notna()
train = df[has_price]
test = df[~has_price]

In [43]:
# Save both splits with a fresh 0..n-1 index; the price column (missing for every test row) is dropped from test.
train_out = train.reset_index(drop=True)
train_out.to_pickle('../data/transformed/train.pkl')
test_out = test.drop(columns='price').reset_index(drop=True)
test_out.to_pickle('../data/transformed/test.pkl')

### Later, let's add one-hot encoding
