In [1]:
# Load Pkgs
import pandas as pd
import numpy as np

In [3]:
# Load NLP Pkgs
import spacy
from wordcloud import WordCloud, STOPWORDS
from spacy.util import minibatch, compounding
import matplotlib.pyplot as plt
import re
import random
from spacy.training.example import Example

In [5]:
# Load Dataset
df = pd.read_csv("drug_review_dataset_with_sentiment.csv")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,drug_class,sentiment,sentiment_label
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,arb blocker,0.0,neutral
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,,0.168333,positive
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,,0.06721,positive
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,,0.179545,positive
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,,0.194444,positive


In [9]:
# NER
nlp0 = spacy.load('en_core_web_sm')

In [15]:
# Get All Components of this NLP Object
nlp0.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [16]:
ner0 = nlp0.get_pipe('ner')

TypeError: add_label() takes exactly one argument (0 given)

In [18]:
# Example
ex1 = "James went to London to buy Ibuprofen last year 2019"

In [19]:
docx = nlp0(ex1)

In [20]:
type(docx)

spacy.tokens.doc.Doc

In [22]:
# Check for entities
for entity in docx.ents:
    print(entity,entity.label_)

London GPE
Ibuprofen ORG
last year 2019 DATE


### Preparing the data 

Training data must be a tuple

TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(START, STOP, "LABEL")]}) ]


TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), ]

In [23]:
def process_review(review):
    processed_token = []
    for token in review.split():
        token = ''.join(e.lower() for e in token if e.isalnum())
        processed_token.append(token)
    return ' '.join(processed_token)

In [24]:
# Drugs Names
all_drugs = df['drugName'].unique().tolist()

In [123]:
all_drugs = [x.lower() for x in all_drugs]
all_drugs=="testim"

False

In [28]:
df['review']

0         "It has no side effect, I take it in combinati...
1         "My son is halfway through his fourth week of ...
2         "I used to take another oral contraceptive, wh...
3         "This is my first time using any form of birth...
4         "Suboxone has completely turned my life around...
                                ...                        
161292    "I wrote my first report in Mid-October of 201...
161293    "I was given this in IV before surgey. I immed...
161294    "Limited improvement after 4 months, developed...
161295    "I&#039;ve been on thyroid medication 49 years...
161296    "I&#039;ve had chronic constipation all my adu...
Name: review, Length: 161297, dtype: object

In [29]:
count = 0
TRAIN_DATA = []
for _, item in df.iterrows():
    ent_dict = {}
    if count < 1000:
        review = process_review(item['review'])
        #Locate drugs and their positions once and add to the visited items.
        visited_items = []
        entities = []
        for token in review.split():
            if token in all_drugs:
                for i in re.finditer(token, review):
                    if token not in visited_items:
                        entity = (i.span()[0], i.span()[1], 'DRUG')
                        visited_items.append(token)
                        entities.append(entity)
        if len(entities) > 0:
            ent_dict['entities'] = entities
            train_item = (review, ent_dict)
            TRAIN_DATA.append(train_item)
            count+=1

In [30]:
TRAIN_DATA[1:3]

[('my son is halfway through his fourth week of intuniv we became concerned when he began this last week when he started taking the highest dose he will be on for two days he could hardly get out of bed was very cranky and slept for nearly 8 hours on a drive home from school vacation very unusual for him i called his doctor on monday morning and she said to stick it out a few days see how he did at school and with getting up in the morning the last two days have been problem free he is much more agreeable than ever he is less emotional a good thing less cranky he is remembering all the things he should overall his behavior is better we have tried many different medications and so far this is the most effective',
  {'entities': [(45, 52, 'DRUG')]}),
 ('i used to take another oral contraceptive which had 21 pill cycle and was very happy very light periods max 5 days no other side effects but it contained hormone gestodene which is not available in us so i switched to lybrel because the i

# Training the NER Model

# Model 1

In [116]:
n_iter = 20
def train_ner(training_data):
    """Steps
    Create a Blank NLP  model object
    Create and add NER to the NLP model
    Add Labels from your training data
    Train  
    """
    TRAIN_DATA = training_data
    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")
    
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner",last=True)
        #nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
        
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            n_samples = len(texts)
            for sample, label in zip(texts[:n_samples],annotations[:n_samples]):

                    example = Example.from_dict(nlp.make_doc(sample), label)
                    nlp.update([example],
                        drop=0.35,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
        print("Losses", losses)
    return nlp

In [117]:
# Let training
nlp2 = train_ner(TRAIN_DATA[1:100])

Created blank 'en' model
Losses {'ner': 794.9302599248563}
Losses {'ner': 207.44614014390416}
Losses {'ner': 145.95195985568284}
Losses {'ner': 110.01973029744478}
Losses {'ner': 74.99761658604783}
Losses {'ner': 56.762903658675555}
Losses {'ner': 71.14993604346088}
Losses {'ner': 45.598185051746725}
Losses {'ner': 32.22910505117763}
Losses {'ner': 39.30490059482556}
Losses {'ner': 39.21623854133004}
Losses {'ner': 36.34183681086979}
Losses {'ner': 29.561587040307394}
Losses {'ner': 18.04229876391385}
Losses {'ner': 29.246280809232616}
Losses {'ner': 22.877774512348118}
Losses {'ner': 23.812194081489988}
Losses {'ner': 33.40802169101459}
Losses {'ner': 17.7297256453721}
Losses {'ner': 8.99061670867373}


In [118]:
for text,_ in TRAIN_DATA[:1]:
    doc = nlp2(text)
    result = [(ent,ent.label_) for ent in doc.ents]
    print(result)

[(androgel, 'DRUG')]


In [119]:
TRAIN_DATA[:1]

[('when i first started using axiron it burned for a few minutes i was afriad this would be the rest of my life nope two weeks later the burning stopped i have tried testim androgel and hated the messy applications and feared spreading it on my nieces this goes under my arm and i love it it is worth the extra 30 dollars from my insurance i do recommend',
  {'entities': [(27, 33, 'DRUG'), (163, 169, 'DRUG'), (170, 178, 'DRUG')]})]

# Model 2

In [120]:
optimizer = nlp.begin_training()
to_train_ents = TRAIN_DATA[:100]
nlp = spacy.load('en_core_web_sm')

other_pipes = [pipe for pipe in nlp.pipe_names if pipe !='ner']

with nlp.disable_pipes(*other_pipes): # only train NER
    for itn in range(n_iter): # we are going to go throught the training data 20 times
        losses = {}
        random.shuffle(to_train_ents) # we shuffle the data 
        for item in to_train_ents:
            example = Example.from_dict(nlp.make_doc(item[0]), item[1])
            nlp.update([example],
            sgd=optimizer,
            drop = 0.35,
            losses =  losses)
        print("Ner",losses["ner"])
            # this update function Takes 
                       # the model we updated at the begininig at update it with the new information
                       #about the tokens, the position of the tokens and the string.
            

In [121]:
for text,_ in TRAIN_DATA[:1]:
    doc = nlp(text)
    result = [(ent,ent.label_) for ent in doc.ents]
    print(result)

[(axiron, 'DRUG'), (testim, 'DRUG'), (androgel, 'DRUG')]


In [122]:
TRAIN_DATA[:1]

[('when i first started using axiron it burned for a few minutes i was afriad this would be the rest of my life nope two weeks later the burning stopped i have tried testim androgel and hated the messy applications and feared spreading it on my nieces this goes under my arm and i love it it is worth the extra 30 dollars from my insurance i do recommend',
  {'entities': [(27, 33, 'DRUG'), (163, 169, 'DRUG'), (170, 178, 'DRUG')]})]

# Extract drog entity 

## Model 1

In [124]:
def extract_drug_entity(text):
    docx =  nlp2(text)
    result = [(ent,ent.label_) for ent in docx.ents]
    return result

In [125]:
df['review'][0:10]

0    "It has no side effect, I take it in combinati...
1    "My son is halfway through his fourth week of ...
2    "I used to take another oral contraceptive, wh...
3    "This is my first time using any form of birth...
4    "Suboxone has completely turned my life around...
5    "2nd day on 5mg started to work with rock hard...
6    "He pulled out, but he cummed a bit in me. I t...
7    "Abilify changed my life. There is hope. I was...
8    " I Ve had  nothing but problems with the Kepp...
9    "I had been on the pill for many years. When m...
Name: review, dtype: object

In [126]:
df['review'][0:10].apply(extract_drug_entity)

0                                                   []
1                     [((Intuniv), DRUG), ((.), DRUG)]
2                                                   []
3                                                   []
4    [((Suboxone), DRUG), ((oxycontin), DRUG), ((ox...
5              [((!), DRUG), ((!), DRUG), ((!), DRUG)]
6                                                   []
7    [((Abilify), DRUG), ((Zoloft), DRUG), ((Abilif...
8                                                   []
9                                    [((older), DRUG)]
Name: review, dtype: object

## Model 2

In [127]:
def extract_drug_entity_1(text):
    docx =  nlp(text)
    result = [(ent,ent.label_) for ent in docx.ents]
    return result

In [128]:
df['review'][0:10].apply(extract_drug_entity_1)

0                                 [((Bystolic), DRUG)]
1                                  [((Intuniv), DRUG)]
2    [((max), DRUG), ((Lybrel), DRUG), ((Lybrel), D...
3                                      [((max), DRUG)]
4    [((Suboxone), DRUG), ((Suboxone), DRUG), ((oxy...
5                                                   []
6                                                   []
7    [((Abilify), DRUG), ((Zoloft), DRUG), ((Abilif...
8                                                   []
9                                                   []
Name: review, dtype: object

In [129]:
df['review'][2]

'"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas."'