In [2]:
# NLP Lecture @ Strive School - 21st July 2021
# NER update

'''
Since today we are exploring the world of natural language processing, we’ll deepen in the Named Entity Recognition technique: this is just one of the mechanisms that NLP embodies. The recognition of named entities as the process of automatic identification of the entities present in a text and consequent classification into predefined categories such as "person", "organization", "position" is a quite common activity and expect for English, trained models with spaCy offer few labels that could be improved through training.

Following the case study of this morning, try to emulate it in order to label all the brands present in the provided datasets, choosing the one you prefer OR trying to label all them and to train the model to recognize new different entities. The result should be twofold: the final model should be able to recognize brands that it has already seen, but already new ones.
The brands proposed in the dataset concern fashion, cars and food.
In order to test the accuracy of the model, test it with sentences and brands the model has never seen.

Sample of the dataset
---------------------
- Cate Blanchett in Armani Privé. Rating: 8. Concludes as a rare butterfly, or from Rorschach's Test, or from computerized axial tomography.
- I liked everything, recommend it! Another quality Xiaomi product...
- What is the price of that Fiat 500XL?

Info:
- Feel free to change or arrange a new dataset
- Try experimenting and tuning with the hyperparameters
- Feel free to use or change the code you've seen during the morning session
- TBD = To be done (from you!) :)

'''

'\nSince today we are exploring the world of natural language processing, we’ll deepen in the Named Entity Recognition technique: this is just one of the mechanisms that NLP embodies. The recognition of named entities as the process of automatic identification of the entities present in a text and consequent classification into predefined categories such as "person", "organization", "position" is a quite common activity and expect for English, trained models with spaCy offer few labels that could be improved through training.\n\nFollowing the case study of this morning, try to emulate it in order to label all the brands present in the provided datasets, choosing the one you prefer OR trying to label all them and to train the model to recognize new different entities. The result should be twofold: the final model should be able to recognize brands that it has already seen, but already new ones.\nThe brands proposed in the dataset concern fashion, cars and food.\nIn order to test the acc

In [3]:
# STEP 0 - PRE REQUISITES

# python -m spacy download en_core_web_lg

# TBD: Import libraries
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example

# TBD: Load preferred model
nlp = spacy.load("en_core_web_sm")

with open("fashion brands.txt") as file:
    dataset = file.read()

doc = nlp(dataset)
#print("entities:",[(ent.text,ent.label_) for ent in doc.ents])
print([ent.text for ent in doc.ents])

['Bella Hadid', 'Schiaparelli', 'Meng Li', 'Balenciaga', 'Jodie Turner-Smith', 'Lou Doillon', 'Gucci', 'Gucci', 'Loewe', 'Jodie Turner-Smith', 'Gucci', 'Vanessa Paradis', 'Chanel', 'Charlotte Gainsbourg', 'Saint Laurent', 'Atelier Versace', 'Diane Kruger', 'Alberta', 'Andie MacDowell', 'Prada', 'Bella Hadid', 'Jean Paul Gaultier', 'Jodie Foster', 'Spike Lee', 'Louis Vuitton', 'Helen Mirren', 'Dolce', 'Adam Driver', 'Burberry', 'H&M', 'Cate Blanchett', 'Armani PrivÃ©', '8', 'Rorschach', 'Anna Foglietta', 'Armani PrivÃ©', '7/8', 'Nolan', 'Diodato', 'Emporio Armani', '7', 'Venetian', 'Biennale', 'Giorgio Armani', '7', 'The Golden Lion', 'Emporio Armani', '6', 'two', 'Valeria Golino', 'Christian Dior', '5/6', 'Joan', 'Arc', 'Betta Guerrieri', '6/7', 'Cate Blanchett', 'Armani PrivÃ©', '8', 'the Red Carpet', 'Gessica Notaro', 'Elisabetta Franchi', '5', "Wonder Woman's", 'Paradise Island', 'Osvaldo Supino', '4', 'Donatella Finocchiaro', 'Giorgio Armani', '6', 'Miu Miu', '7', 'Cate Blanchett',

In [4]:
# STEP 1 - TRAIN DATA

# Prepare training data

# TBD: define all the entities by extracting the words and their indexes from the dataset
# expected format is the following:  ("sentence", {"entities": [0,10, "FOOD"]})

words = ["Gucci","Schiaparelli","Chanel","Prada","Dolce & Gabbana ","Armani","Versace","Saint Laurent","Burberry","H&M","Alexander McQueen","Calvin Klein","Louis Vuitton"]

train_data = []
with open("fashion brands.txt") as file:
     dataset = [line.strip() for line in file if line.strip()]

     for sentence in dataset:
         print("######")
         print("sentence: ", sentence)
         print("######")
         sentence = sentence.lower()
         entities = []
         for word in words:
             word = word.lower()
             if word in sentence:
                 start_index = sentence.index(word)
                 end_index = len(word) + start_index
                 print("word: ", word)
                 print("----------------")
                 print("start index:", start_index)
                 print("end index:", end_index)
                 pos = (start_index, end_index, "fashion_brand")
                 entities.append(pos)
         element = (sentence.rstrip('\n'), {"entities": entities})

         train_data.append(element)
         print('----------------')
         print("element:", element)

######
sentence:  Bella Hadid in Schiaparelli couture
######
word:  schiaparelli
----------------
start index: 15
end index: 27
----------------
element: ('bella hadid in schiaparelli couture', {'entities': [(15, 27, 'fashion_brand')]})
######
sentence:  Meng Li in Balenciaga
######
----------------
element: ('meng li in balenciaga', {'entities': []})
######
sentence:  Jodie Turner-Smith in Gucci
######
word:  gucci
----------------
start index: 22
end index: 27
----------------
element: ('jodie turner-smith in gucci', {'entities': [(22, 27, 'fashion_brand')]})
######
sentence:  Lou Doillon in Gucci
######
word:  gucci
----------------
start index: 15
end index: 20
----------------
element: ('lou doillon in gucci', {'entities': [(15, 20, 'fashion_brand')]})
######
sentence:  Salma Hayek in Gucci
######
word:  gucci
----------------
start index: 15
end index: 20
----------------
element: ('salma hayek in gucci', {'entities': [(15, 20, 'fashion_brand')]})
######
sentence:  Josh Oâ€™Conno

In [5]:
# STEP 2 - UPDATE MODEL
ner = nlp.get_pipe("ner")

for _, annotations in train_data:
    for ent in annotations.get("entities"):
        print(annotations.get("entities"))
        ner.add_label(ent[2])


[(15, 27, 'fashion_brand')]
[(22, 27, 'fashion_brand')]
[(15, 20, 'fashion_brand')]
[(15, 20, 'fashion_brand')]
[(22, 27, 'fashion_brand')]
[(19, 25, 'fashion_brand')]
[(24, 37, 'fashion_brand')]
[(27, 34, 'fashion_brand')]
[(16, 22, 'fashion_brand')]
[(19, 24, 'fashion_brand')]
[(13, 26, 'fashion_brand')]
[(15, 23, 'fashion_brand')]
[(27, 30, 'fashion_brand')]
[(18, 24, 'fashion_brand')]
[(18, 24, 'fashion_brand')]
[(19, 25, 'fashion_brand')]
[(27, 33, 'fashion_brand')]
[(24, 30, 'fashion_brand')]
[(18, 24, 'fashion_brand')]
[(33, 39, 'fashion_brand')]
[(18, 35, 'fashion_brand')]
[(18, 23, 'fashion_brand')]
[(19, 25, 'fashion_brand')]
[(15, 21, 'fashion_brand')]
[(57, 64, 'fashion_brand')]
[(53, 70, 'fashion_brand')]
[(57, 69, 'fashion_brand')]
[(57, 63, 'fashion_brand')]
[(55, 60, 'fashion_brand')]
[(59, 72, 'fashion_brand')]
[(58, 64, 'fashion_brand')]
[(58, 70, 'fashion_brand')]
[(20, 27, 'fashion_brand')]


In [6]:
# TBD: load the needed pipeline
ner = nlp.get_pipe("ner")

In [7]:
# TBD: define the annotations
for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [8]:
train_data

[('bella hadid in schiaparelli couture',
  {'entities': [(15, 27, 'fashion_brand')]}),
 ('meng li in balenciaga', {'entities': []}),
 ('jodie turner-smith in gucci', {'entities': [(22, 27, 'fashion_brand')]}),
 ('lou doillon in gucci', {'entities': [(15, 20, 'fashion_brand')]}),
 ('salma hayek in gucci', {'entities': [(15, 20, 'fashion_brand')]}),
 ('josh oâ€™connor in loewe', {'entities': []}),
 ('jodie turner-smith in gucci', {'entities': [(22, 27, 'fashion_brand')]}),
 ('vanessa paradis in chanel', {'entities': [(19, 25, 'fashion_brand')]}),
 ('charlotte gainsbourg in saint laurent',
  {'entities': [(24, 37, 'fashion_brand')]}),
 ('andie macdowell in atelier versace',
  {'entities': [(27, 34, 'fashion_brand')]}),
 ('diane kruger in armani privã©', {'entities': [(16, 22, 'fashion_brand')]}),
 ('bella hadid in lanvin', {'entities': []}),
 ('eva herzigova in alberta ferretti', {'entities': []}),
 ('andie macdowell in prada', {'entities': [(19, 24, 'fashion_brand')]}),
 ('bella hadid in

In [12]:
# TBD: train the model
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


# TBD: define the number of iterations, the batch size and the drop according to your experience or using an empirical value
# Train model
with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(15):
        print("Iteration #" + str(iteration))

        # Data shuffle for each iteration
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=3)
        for batch in batches:
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], losses=losses, drop=0.1)
        print("Losses:", losses)

# pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
# unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
#
# with nlp.disable_pipes(*unaffected_pipes):
#     for iteration in range(30):
#         print("Iteration #", iteration)
#
#         random.shuffle(train_data)
#         losses = {}
#
#         batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
#         for batch in batches:
#             for text, annotations in batch:
#                 doc = nlp.make_doc(text)
#                 example = Example.from_dict(doc, annotations)
#                 nlp.update([example], losses=losses, drop=0.1)
#         print("Losses:", losses)


Iteration #0
Losses: {'ner': 0.003666355932702765}
Iteration #1
Losses: {'ner': 0.007433847854167711}
Iteration #2
Losses: {'ner': 7.585273791173344e-05}
Iteration #3
Losses: {'ner': 0.0027987576906086382}
Iteration #4
Losses: {'ner': 3.0889450006415342e-06}
Iteration #5
Losses: {'ner': 7.090398144406186e-07}
Iteration #6
Losses: {'ner': 1.406383078305751e-05}
Iteration #7
Losses: {'ner': 3.5580294209718476e-06}
Iteration #8
Losses: {'ner': 5.237475810436323e-07}
Iteration #9
Losses: {'ner': 1.112022303043886e-07}
Iteration #10
Losses: {'ner': 5.605590571300143e-08}
Iteration #11
Losses: {'ner': 4.5852454569104826e-08}
Iteration #12
Losses: {'ner': 6.364262592299195e-08}
Iteration #13
Losses: {'ner': 1.0061858355359587e-07}
Iteration #14
Losses: {'ner': 6.859893588735122e-08}
Iteration #15
Losses: {'ner': 8.729407593197126e-08}
Iteration #16
Losses: {'ner': 1.634974846604804e-06}
Iteration #17
Losses: {'ner': 1.438708785298242e-07}
Iteration #18
Losses: {'ner': 8.279747637441971e-08}
I

In [26]:
# Save the model
output_dir = Path("model")
nlp.to_disk(output_dir)
print("Saved correctly!")


Saved correctly!


In [35]:
# STEP 3 - TEST THE UPDATED MODEL

# Load updated model
nlp_updated = spacy.load(output_dir)

# TBD: test with a old sentence
doc = nlp_updated("Cynthia Erivo attends the 92nd Academy Awards, Designer: Versace, Year: 2020")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])
# TBD: test with a new sentence and an old brand
doc = nlp_updated("some Dude rolls up in Gucci and steals the show")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])
# TBD: test with a new sentence and a new brand
doc = nlp_updated("Christiano Ronaldo is attending in Nike")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

# new sentence, no word
doc = nlp_updated("megan wore a tshirt while she was drinking coffee")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])



doc = nlp_updated("Hugo Boss used to design NS uniforms")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

entities: [('Versace', 'fashion_brand')]
entities: [('Gucci', 'fashion_brand')]
entities: [('Nike', 'fashion_brand')]
entities: []
entities: []
