# Name-Entity Recognition of Descriptoins of Various Products

## Part II: Training the Model

## 1. Dataset

In [1]:
import pandas as pd
import numpy as np
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 
from spacy.util import minibatch, compounding
from spacy import displacy
import string
import pickle
from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [2]:
df = pd.read_csv("data.csv")

In [3]:
df.shape

(671621, 8)

In [4]:
df.head()

Unnamed: 0,title,description,summary,brand,price,meta,provider_category,provider
0,"adidas Originals - Superstar - Valkoinen - US 5,5",,,adidas Originals,66.5,"{""SIZE"": [""us 5,5""], ""COLOR"": [""valkoinen""], ""...",17-muoti-ja-vaatetus,Caliroots
1,Sc-Erna Polvipituinen Hame Sininen Soyaconcept,"SOYACONCEPT on tanskalainen brändi, joka luo e...",,Soyaconcept,49.99,"{""SIZE"": [""36""], ""COLOR"": [""cristal blue""], ""G...",17-muoti-ja-vaatetus,Boozt
2,Dana Buchman Silmälasit Taren CARAMEL TORTOISE,Dana Buchman Taren Silmälasit. Collection:Men....,,Dana Buchman,146.0,"{""SIZE"": [""54""], ""COLOR"": [""tortoise""], ""GENDE...",13-silmalasit-ja-piilolinssit,Smartbuy Glasses
3,Active Sports Woven Shorts B Shortsit Musta PUMA,PUMA Active Sports Woven Shorts B,,PUMA,27.0,"{""SIZE"": [""164"", ""128"", ""110"", ""116"", ""104"", ""...",17-muoti-ja-vaatetus,Boozt
4,Renata Polvipituinen Hame Musta Fall Winter Sp...,Fall Winter Spring Summer. A-linjainen.,,Fall Winter Spring Summer,199.0,"{""SIZE"": [""xs""], ""COLOR"": [""jet black""], ""GEND...",17-muoti-ja-vaatetus,Boozt


In [5]:
i=0
print(df.loc[i])
print(df['meta'].loc[i])

title                adidas Originals - Superstar - Valkoinen - US 5,5
description                                                        NaN
summary                                                            NaN
brand                                                 adidas Originals
price                                                             66.5
meta                 {"SIZE": ["us 5,5"], "COLOR": ["valkoinen"], "...
provider_category                                 17-muoti-ja-vaatetus
provider                                                     Caliroots
Name: 0, dtype: object
{"SIZE": ["us 5,5"], "COLOR": ["valkoinen"], "GENDER": ["unisex"]}


In [6]:
i=1
print(df.loc[i])
print(df['meta'].loc[i])

title                   Sc-Erna Polvipituinen Hame Sininen Soyaconcept
description          SOYACONCEPT on tanskalainen brändi, joka luo e...
summary                                                            NaN
brand                                                      Soyaconcept
price                                                            49.99
meta                 {"SIZE": ["36"], "COLOR": ["cristal blue"], "G...
provider_category                                 17-muoti-ja-vaatetus
provider                                                         Boozt
Name: 1, dtype: object
{"SIZE": ["36"], "COLOR": ["cristal blue"], "GENDER": ["women"]}


## 2. Entity and Entity Types

One can obtain entity and entity types from the *meta* and *brand* columns. However, there are many words incorrectly written entities. These are deleted, and many entities obtained from the *title* and *descriptoin* columns are added by hand. The saved model are uploaded below: 

In [7]:
with open("entity.txt", "rb") as fp:   # Unpickling
    entity = pickle.load(fp)

In [8]:
with open("entity_types.txt", "rb") as fp:   # Unpickling
    entity_types = pickle.load(fp)

In [9]:
with open("titles_unique_brand.txt", "rb") as fp:   # Unpickling
    titles_unique_brand = pickle.load(fp)

In [10]:
with open("description_unique_brand.txt", "rb") as fp:   # Unpickling
    description_unique_brand = pickle.load(fp)

In [11]:
entity[0:10]

['US 5,5',
 'Valkoinen',
 '36',
 'Cristal Blue',
 '54',
 'Tortoise',
 '164',
 '128',
 '110',
 '116']

In [12]:
entity_types[0:10]

['SIZE',
 'COLOR',
 'SIZE',
 'COLOR',
 'SIZE',
 'COLOR',
 'SIZE',
 'SIZE',
 'SIZE',
 'SIZE']

## 3. Obtaining Training Data

- Loop through the title column, and the description column, make a list extracted from title/description column which have unique brands.
    - For the description column, we can drop rows include `&` and `<b>` signs, in order to increase the accuracy.
- In each loop, use the entity and entity types lists:
    - Construct annotation, but this annotation will be highly overlapped in the charactors' ranges. This is not allowed in the training process. 
    - Get rid of overlapped charactors' ranges, always choose the phrases with wider range
    - Repeating the same process above, in case there are still overlapped charactors' ranges. This is due to the fact that, the wirtten code, only compares two phrases at once, and this may lead to some left over overlapping. 

In [13]:
def obtain_annotation(title):
    
    # Initial annotation, charactors' ranges are highly overlapped
    entity_set_list=[]
    entity_dict={}
    entity_set_range_list=[]
    for m in range(0,len(entity)):
        if entity[m] in title:
            index_i=title.find(entity[m])
            index_f=index_i+len(entity[m])
            if (index_i != 0) and (index_f != len(title)):
                if (title[index_i-1] == ' ' and title[index_f] == ' '):
                    #print(entity[m])
                    entity_tuple=(index_i, index_f, entity_types[m])
                    entity_set_list.append(entity_tuple)
                    entity_set_range_list.append(range(index_i, index_f))
            if (index_i == 0) and (index_f != len(title)):
                if (title[index_f] == ' '):
                    #print(entity[m])
                    entity_tuple=(index_i, index_f, entity_types[m])
                    entity_set_list.append(entity_tuple)
                    entity_set_range_list.append(range(index_i, index_f))
            if (index_i != 0) and (index_f == len(title)):
                if (title[index_i-1] == ' '):
                    #print(entity[m], index_i, index_f)
                    entity_tuple=(index_i, index_f, entity_types[m])
                    entity_set_list.append(entity_tuple)
                    entity_set_range_list.append(range(index_i, index_f))

    # Second Step: Get rid of overlapped charactors' ranges
    entity_set_list_2=[]
    for n, entity_set_range_1 in enumerate(entity_set_range_list):
        entity_set_range_test=set(entity_set_range_1)
        inter=0
        entity_set_list_2_temp=[]
        for m, entity_set_range_2 in enumerate(entity_set_range_list):
            if entity_set_range_1 != entity_set_range_2:
                interss=entity_set_range_test.intersection(entity_set_range_2)
                #entity_set_list_2_temp=[]
                if interss==set():
                    inter += 1
                else:
                    if set(entity_set_range_1)>set(entity_set_range_2):
                        if entity_set_list[n] not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(entity_set_list[n])
                    elif set(entity_set_range_1)<set(entity_set_range_2):
                        if entity_set_list[m] not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(entity_set_list[m])
            else:
                if entity_set_list[n] not in entity_set_list_2_temp:
                    entity_set_list_2_temp.append(entity_set_list[n])
        if m == inter:
            if entity_set_list[n] not in entity_set_list_2:
                entity_set_list_2.append(entity_set_list[n])
        else:
            for entity_set_list_2_temp_item in entity_set_list_2_temp:
                if entity_set_list_2_temp_item not in entity_set_list_2:
                    entity_set_list_2.append(entity_set_list_2_temp_item)
    
    # Third Step: Get rid of overlapped charactors' ranges further if any
    entity_set_list_3=[]
    for n, item_1 in enumerate(entity_set_list_2):
        item_1_range=range(item_1[0],item_1[1])
        inter=0
        entity_set_list_2_temp=[]
        for m, item_2 in enumerate(entity_set_list_2):
            item_2_range=range(item_2[0],item_2[1])
            if item_1_range != item_2_range:
                interss=set(item_1_range).intersection(item_2_range)
                if interss == set():
                    inter += 1
                else:
                    if set(item_1_range)>set(item_2_range):
                        if item_1 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_1)
                    elif set(item_1_range)<set(item_2_range):
                        if item_2 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_2)
        if m == inter:
            entity_set_list_3.append(item_1)
        else:
            for entity_set_list_2_temp_item in entity_set_list_2_temp:
                if entity_set_list_2_temp_item not in entity_set_list_3:
                    entity_set_list_3.append(entity_set_list_2_temp_item)
    
    # Fourth Step: Get rid of overlapped charactors' ranges further if any
    entity_set_list_4=[]
    for n, item_1 in enumerate(entity_set_list_3):
        item_1_range=range(item_1[0],item_1[1])
        inter=0
        entity_set_list_2_temp=[]
        for m, item_2 in enumerate(entity_set_list_3):
            item_2_range=range(item_2[0],item_2[1])
            if item_1_range != item_2_range:
                interss=set(item_1_range).intersection(item_2_range)
                if interss == set():
                    inter += 1
                else:
                    if set(item_1_range)>set(item_2_range):
                        if item_1 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_1)
                    elif set(item_1_range)<set(item_2_range):
                        if item_2 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_2)
        if m == inter:
            entity_set_list_4.append(item_1)
        else:
            for entity_set_list_2_temp_item in entity_set_list_2_temp:
                if entity_set_list_2_temp_item not in entity_set_list_4:
                    entity_set_list_4.append(entity_set_list_2_temp_item)
    
    # Fifth Step: Get rid of overlapped charactors' ranges further if any
    entity_set_list_5=[]
    for n, item_1 in enumerate(entity_set_list_4):
        item_1_range=range(item_1[0],item_1[1])
        inter=0
        entity_set_list_2_temp=[]
        for m, item_2 in enumerate(entity_set_list_4):
            item_2_range=range(item_2[0],item_2[1])
            if item_1_range != item_2_range:
                interss=set(item_1_range).intersection(item_2_range)
                if interss == set():
                    inter += 1
                else:
                    if set(item_1_range)>set(item_2_range):
                        if item_1 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_1)
                    elif set(item_1_range)<set(item_2_range):
                        if item_2 not in entity_set_list_2_temp:
                            entity_set_list_2_temp.append(item_2)
        if m == inter:
            entity_set_list_5.append(item_1)
        else:
            for entity_set_list_2_temp_item in entity_set_list_2_temp:
                if entity_set_list_2_temp_item not in entity_set_list_5:
                    entity_set_list_5.append(entity_set_list_2_temp_item)
    
    # Construct annotation
    entity_dict['entities']=entity_set_list_5
    annotation_n=(title, entity_dict)
    return annotation_n

In [14]:
# Loop through lists of titles with unique Brand names
# One can also loop through the whole title column or the description column.
TRAIN_DATA_0=[]
for n in range(0,len(titles_unique_brand)):
    annotation=obtain_annotation(titles_unique_brand[n])
    if annotation[1]['entities']!=[]:
        TRAIN_DATA_0.append(annotation)

In [15]:
# Loop through lists of titles with unique Brand names
# One can also loop through the whole title column or the description column.
TRAIN_DATA_1=[]
for n in range(0,len(description_unique_brand)):
    if '<' not in description_unique_brand[n]:
        if '&' not in description_unique_brand[n]:
            annotation=obtain_annotation(description_unique_brand[n])
            if annotation[1]['entities']!=[]:
                TRAIN_DATA_1.append(annotation)

In [16]:
TRAIN_DATA_0 = TRAIN_DATA_0 + TRAIN_DATA_1

In [17]:
random.shuffle(TRAIN_DATA_0)

In [18]:
len(TRAIN_DATA_0)

9280

In [19]:
TRAIN_DATA_0

[('Custom Vkp80Ii Paper Holder Kit', {'entities': [(0, 6, 'BRAND')]}),
 ('Svartzonker McMio huppari', {'entities': [(0, 11, 'BRAND')]}),
 ('OLIVETTI inkroll IR 40 (2) black',
  {'entities': [(20, 22, 'SIZE'), (27, 32, 'COLOR')]}),
 ('Mercury Mesh pants, harmaa/musta',
  {'entities': [(0, 7, 'BRAND'), (20, 32, 'COLOR')]}),
 ('Grunt Njord Tee', {'entities': [(0, 5, 'BRAND')]}),
 ('SNÖ of Sweden Connected Pendant Neck 80 Rosé/Clear',
  {'entities': [(37, 39, 'SIZE'), (0, 13, 'BRAND'), (40, 50, 'COLOR')]}),
 ('Rakenna esteitä, joiden taakse sotakersantti, Star Coloner, Recon-sotilas ja KREON -hahmo voivat suojautua. Sisältää noin 81 osaa ja 4 KREON -hahmoa.',
  {'entities': [(133, 134, 'SIZE'), (122, 124, 'SIZE'), (46, 50, 'BRAND')]}),
 ('DD Hammocks Frontline riippumatto', {'entities': [(0, 11, 'BRAND')]}),
 ('GOOGLE Pixel 3 XL 64GB - Just Black',
  {'entities': [(30, 35, 'COLOR'), (15, 17, 'SIZE'), (18, 22, 'SIZE')]}),
 ('Tree is an incredibly durable doormat in 100% natural rubber from 

In [20]:
# Obtain a portion of the training data. For the final step, choose all the data.
TRAIN_DATA=TRAIN_DATA_0[0:round(1*len(TRAIN_DATA_0))]

In [21]:
with open("TRAIN_DATA.txt", "wb") as fp:   #Pickling
    pickle.dump(TRAIN_DATA, fp)

## 4. Model

In [22]:
def my_model(train_type, language, batch_type, n_iter, TRAIN_DATA):
    if train_type == 'Transfer':
        if language == 'English':
            
            nlp=spacy.load("en_core_web_sm")   
            ner=nlp.get_pipe('ner')
            ner.add_label('COLOR')
            ner.add_label('SIZE')
            ner.add_label('BRAND')
            optimizer = nlp.resume_training()
        
        elif language == 'Multi':
            nlp=spacy.load("xx_ent_wiki_sm") 
            ner=nlp.get_pipe('ner')
            
            ner.add_label('COLOR')
            ner.add_label('SIZE')
            ner.add_label('BRAND')
            optimizer = nlp.resume_training()
        
        
    elif train_type == 'Scratch':
        model = None
        if model is not None:
            nlp = spacy.load(model)  
            print("Loaded model '%s'" % model)
        else:
            nlp = spacy.blank('en', entity = False)  
            print("Created blank 'en' model")
            
        if 'ner' not in nlp.pipe_names:
            ner = nlp.create_pipe('ner')
            nlp.add_pipe(ner, last=True)
        else:
            ner = nlp.get_pipe('ner')
            
        # Add labels
        for _, annotations in TRAIN_DATA:
            for ent in annotations.get('entities'):
                ner.add_label(ent[2])

        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.entity.create_optimizer()
     
    if batch_type == 'Full':
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)
                losses = {}
                for text, annotations in tqdm(TRAIN_DATA):
                    nlp.update([text], [annotations], drop=0.5, sgd=optimizer, losses=losses)
                print(itn, losses)
    elif batch_type == 'mini':
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
        with nlp.disable_pipes(*other_pipes) :
            sizes = compounding(1.0, 4.0, 1.001)  
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)
                batches = minibatch(TRAIN_DATA, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
                print("Losses", itn, losses)
    
    return nlp

In [23]:
%%time
# Train type = 'Scratch' (learn from scratch), 'Transform' (transform learning)
# Langauge = 'English', 'Multi' (Only matters if train_type is 'Transform')
# batch_type = 'Full' (slower), 'mini' (faster)
# n_iter = number of iteration
# TRAIN_DATA = The training data
nlp=my_model('Scratch', 'English', 'mini', 200, TRAIN_DATA)

Created blank 'en' model


  proc.begin_training(


Losses 0 {'ner': 19263.178851629622}
Losses 1 {'ner': 13300.812962129527}
Losses 2 {'ner': 10944.055812869623}
Losses 3 {'ner': 9459.817742023553}
Losses 4 {'ner': 8131.568627235093}
Losses 5 {'ner': 7108.936878625545}
Losses 6 {'ner': 6459.339424418825}
Losses 7 {'ner': 5769.787352778118}
Losses 8 {'ner': 5372.226541645742}
Losses 9 {'ner': 4982.057260696371}
Losses 10 {'ner': 4659.743092604878}
Losses 11 {'ner': 4131.137023523863}
Losses 12 {'ner': 3999.000146408548}
Losses 13 {'ner': 3791.222860864624}
Losses 14 {'ner': 3707.8717362341345}
Losses 15 {'ner': 3251.2509417421516}
Losses 16 {'ner': 3218.705701188739}
Losses 17 {'ner': 3039.2962927155345}
Losses 18 {'ner': 2824.964496715341}
Losses 19 {'ner': 2921.7679877474084}
Losses 20 {'ner': 2728.8152269092534}
Losses 21 {'ner': 2431.521742974751}
Losses 22 {'ner': 2478.7166856568742}
Losses 23 {'ner': 2420.3028078512993}
Losses 24 {'ner': 2284.965763246124}
Losses 25 {'ner': 2144.8365373116267}
Losses 26 {'ner': 2173.935644970498}


## 5. Saving The Model

In [24]:
# save model to output directory
output_dir=Path("/Users/farukakbar/Desktop/ner_products/model")
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    #print("Loading from", output_dir)
    #nlp2 = spacy.load(output_dir)
    #for text, _ in TRAIN_DATA:
    #    doc = nlp2(text)
    #    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    #    #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Saved model to /Users/farukakbar/Desktop/ner_products/model


## 6. Results

In [25]:
# Entity and Entity Types output
def ent_output(nlp_doc):
    ent_dict={}
    entity_list=list(doc.ents) 
    for item in entity_list:
        ent_dict[item]=item.label_
        #print(item, item.label_)
    print(ent_dict)

### 1) Results on the trained data

In [26]:
for n in range(0,100):
    doc = nlp(TRAIN_DATA[n][0])
    spacy.displacy.render(doc, style="ent")
    ent_output(doc)

{237 ml: 'SIZE'}


{30 ml: 'SIZE'}


{Vac-U-Lock: 'BRAND'}


{Terra plana: 'BRAND'}


{Feather: 'BRAND'}


{Gold: 'COLOR', Kulta: 'COLOR', Syster P: 'BRAND'}


{Fairtex: 'BRAND', Black: 'COLOR'}


{(harmaa): 'COLOR', 170x250 cm: 'SIZE'}


{Yin: 'BRAND'}


{4-7: 'SIZE'}


{Raid: 'BRAND', Black: 'COLOR'}


{Pitillos: 'BRAND'}


{Spot-Hogg: 'BRAND'}


{Van Well: 'BRAND'}


{Coach: 'BRAND', Dark Honey: 'COLOR', Plastic.: 'MATERIAL', 50.: 'SIZE'}


{Seeberger: 'BRAND', Punainen: 'COLOR'}


{black: 'COLOR', naisten: 'GENDER', yksi koko: 'SIZE'}


{Serengeti: 'BRAND'}


{Tyrex: 'BRAND'}


{Vanilla: 'COLOR', XS: 'SIZE'}


{Eos: 'BRAND', vaaleanruskea: 'COLOR', 75 cm: 'SIZE'}


{Brevi: 'BRAND', 15: 'SIZE', 20 kg: 'WEIGHT'}


{WineQueen: 'BRAND'}


{Petrol: 'BRAND', Vaaleanpunainen: 'COLOR', MbyM: 'BRAND'}


{TOPModel: 'BRAND'}


{Keltainen: 'COLOR', WearColour: 'BRAND'}


{20 cm: 'SIZE', Goebel: 'BRAND'}


{Respiro: 'BRAND', Black/Gold: 'COLOR'}


{Bio- Nurmikkolannoite: 'BRAND', 2,5 kg: 'WEIGHT'}


{FAYT: 'BRAND', Purple: 'COLOR'}


{3G: 'SIZE', Black: 'COLOR'}


{grey: 'COLOR', naisten: 'GENDER', 36,37,38,39,40: 'SIZE'}


{Janod: 'BRAND', 92: 'SIZE'}


{Twix: 'BRAND'}


{Hoss x NA-KD: 'BRAND', Black: 'COLOR'}


{Barnängen Founded in Stockholm: 'BRAND'}


{Lineaeffe: 'BRAND', 120g: 'WEIGHT'}


{Harmaa/Musta: 'COLOR', Lampemesteren: 'BRAND'}


{Viking: 'BRAND'}


{Gant: 'BRAND', Grey: 'COLOR'}


{Woden: 'BRAND'}


{Musta: 'COLOR', XS: 'SIZE'}


{Cooler Master: 'BRAND', 212: 'SIZE'}


{Devá States: 'BRAND', Monivärinen: 'COLOR'}


{Pomellato: 'BRAND', Gold.: 'COLOR', Metal.: 'MATERIAL', 57.: 'SIZE'}


{blue: 'COLOR', 40,31,32,33,34,35: 'SIZE'}


{50m: 'SIZE'}


{Diana: 'BRAND', 4,5mm: 'SIZE'}


{Axis: 'BRAND'}


{naisten: 'GENDER', 36,37,41: 'SIZE'}


{100g: 'WEIGHT'}


{24: 'SIZE'}


{Wolf Tyres: 'BRAND', Nord: 'BRAND'}


{Mercedes: 'BRAND', Brown.: 'COLOR', Acetate.: 'MATERIAL', 55.: 'SIZE'}


{Monivärinen: 'COLOR'}


{Glimmies: 'BRAND'}


{Steel: 'BRAND'}


{125 mm: 'SIZE'}


{Magformers: 'BRAND', 20: 'SIZE'}


{Design Of: 'BRAND', S,: 'SIZE', Musta: 'COLOR'}


{Bolle: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 128.: 'SIZE'}


{Designa Friend: 'BRAND'}


{Motel: 'BRAND'}


{250 gr: 'WEIGHT'}


{Monster Cars: 'BRAND'}


{American College: 'BRAND'}


{Nevercold: 'BRAND'}


{Clavister: 'BRAND'}


{26mm: 'SIZE'}


{Zolo: 'BRAND', Cobra: 'BRAND'}


{Musta: 'COLOR', UGG: 'BRAND'}


{red: 'COLOR', miesten: 'GENDER', yksi koko: 'SIZE'}


{15: 'SIZE', 75: 'SIZE'}


{Oranssi: 'COLOR', Storm & Marie: 'BRAND'}


{Marshal: 'BRAND', 8.5: 'SIZE'}


{Osram: 'BRAND', Classic: 'BRAND', Lämmin Valkoinen: 'COLOR'}


{Coolway: 'BRAND'}


{Yellow: 'COLOR', 400ml: 'SIZE'}


{Decode: 'BRAND', harmaa: 'COLOR'}


{Battatura: 'BRAND'}


{Kookie Cat: 'BRAND', Caramel: 'COLOR', 50g: 'WEIGHT'}


{Freebra: 'BRAND'}


{Red: 'COLOR', River: 'COLOR'}


{Sätila: 'BRAND', Pink: 'COLOR', Unisex: 'GENDER', OneSize: 'SIZE'}


{Delux: 'BRAND', beige: 'COLOR'}


{No Brand: 'BRAND', Krest: 'BRAND', 56: 'SIZE'}


{Bench: 'BRAND'}


{Pulpe De Vie: 'BRAND'}


{Jill Stuart: 'BRAND', 30: 'SIZE'}


{Astrid Olsen x NA-KD: 'BRAND', Black: 'COLOR'}


{Jo: 'BRAND'}


{Whistler: 'BRAND'}


{Montana Goggles by SBG: 'BRAND'}


{Viking: 'BRAND'}


{Dolphi: 'BRAND', XXXXXL: 'SIZE', 12: 'SIZE'}


{Emotions: 'BRAND'}


{The Ohm Collection: 'BRAND'}


{Aqua Monaco: 'BRAND', Cola: 'COLOR', 230ml: 'SIZE'}


{Belladot: 'BRAND', Large: 'SIZE'}


{Baseus: 'BRAND'}


### 2) Result on the unseen data

In [27]:
for n in range(0,100):
    if df['title'].loc[n] not in titles_unique_brand:
        doc = nlp(df['title'].loc[n])
        spacy.displacy.render(doc, style="ent")
        ent_output(doc)

{White: 'COLOR'}




{}


{Black/Br: 'COLOR'}


{Diesel: 'BRAND'}


{Black: 'COLOR'}


{Black: 'COLOR'}


{Valkoinen: 'COLOR'}


{Vredestein: 'BRAND'}


{Diesel: 'BRAND'}


{Gloryfy Gi15 St: 'BRAND'}


{Vaaleanpunainen: 'COLOR', Résumé: 'BRAND'}


{Superdry: 'BRAND', 104: 'SIZE'}


{21 cm: 'SIZE'}


{40ml: 'SIZE'}


{valkoinen: 'COLOR'}


{Red: 'COLOR'}


{Dana Buchman: 'BRAND', BLACK: 'COLOR'}


{Roberto Cavalli: 'BRAND'}


In [28]:
for n in range(0,200):
    if type(df['description'].loc[n])==str:
        if df['description'].loc[n] not in description_unique_brand:
            doc = nlp(df['description'].loc[n])
            spacy.displacy.render(doc, style="ent")
            ent_output(doc)

{}


{New Balance: 'BRAND'}


{Diesel: 'BRAND', Shiny Black.: 'COLOR', Plastic.: 'MATERIAL', 54.: 'SIZE'}


{}


{}


{}


{}


{Diesel: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Résumé: 'BRAND'}


{Superdry: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Dana Buchman: 'BRAND', Black.: 'COLOR', Metal.: 'MATERIAL', 52.: 'SIZE'}


{Roberto Cavalli: 'BRAND', Tortoise.: 'COLOR', Plastic.: 'MATERIAL', 49.: 'SIZE'}


{}


{}


{Farah: 'BRAND', Gold.: 'COLOR', Metal.: 'MATERIAL', 56.: 'SIZE'}


{Diesel: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Clinique All About: 'BRAND', Purple: 'COLOR', Pumps: 'BRAND'}


{2-3.: 'SIZE'}


{GAP: 'BRAND'}


{Diesel: 'BRAND', Shiny Blue.: 'COLOR', Plastic.: 'MATERIAL', 54.: 'SIZE'}


{blue: 'COLOR', miesten: 'GENDER'}


{Puma PJ0029O Kids: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 50.: 'SIZE'}


{Cazal: 'BRAND', Cream/Gold.: 'COLOR', Metal.: 'MATERIAL', 53.: 'SIZE'}


{}


{Aluminum: 'BRAND', Stelton: 'BRAND'}


{pink: 'COLOR', naisten: 'GENDER', s,m,l,xl,xs: 'SIZE'}


{Tommy Hilfiger luo: 'BRAND'}


{naisten: 'GENDER'}


{Police: 'BRAND', Kids: 'AGE_GROUP', Blue.: 'COLOR', Plastic.: 'MATERIAL', 48.: 'SIZE'}


{}


{}


### 3) Arbirary texts

In [29]:
doc = nlp('I have adidas Originals White sneakers.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{adidas: 'BRAND', White: 'COLOR'}


In [30]:
doc = nlp('You have Soyaconcept Blue coat.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{Soyaconcept: 'BRAND', Blue: 'COLOR'}


In [31]:
doc = nlp('I have a Black shirt, which is 3 kg.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{Black: 'COLOR', 3 kg.: 'WEIGHT'}


## 7. Accuracy

In [32]:
def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

The explanations of the scores:
- ents_p: Named entity accuracy (precision).
- ents_r: Named entity accuracy (recall).
- ents_f: Named entity accuracy (F-score).
- ents_per_type: Scores per entity label. Keyed by label, mapped to a dict of p, r and f scores.
- token_acc: Tokenization accuracy.

### 1) Scores on the Trained Data

In [33]:
results = evaluate(nlp, TRAIN_DATA)

In [34]:
results

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 99.86586574911063,
 'ents_r': 99.9474697951322,
 'ents_f': 99.90665110851809,
 'ents_per_type': {'SIZE': {'p': 99.83510011778563,
   'r': 99.95283018867924,
   'f': 99.8939304655274},
  'BRAND': {'p': 99.94297120045623,
   'r': 99.9144811858609,
   'f': 99.92872416250891},
  'COLOR': {'p': 99.91847826086956,
   'r': 99.97281131049483,
   'f': 99.9456374014678},
  'MATERIAL': {'p': 99.1566265060241, 'r': 100.0, 'f': 99.57652752571082},
  'GENDER': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'WEIGHT': {'p': 99.625468164794, 'r': 100.0, 'f': 99.812382739212},
  'AGE_GROUP': {'p': 100.0, 'r': 100.0, 'f': 100.0}},
 'tags_acc': 0.0,
 'token_acc': 100.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {}}

### 2) Scores on the Unseen Data

In [35]:
df_sample=df.sample(n=1000, random_state=1)

In [36]:
test_data=[]
for n in range(0,len(df_sample['title'])):
    if df_sample['title'].iloc[n] not in titles_unique_brand:
        annotation=obtain_annotation(df_sample['title'].iloc[n])
        if annotation[1]['entities']!=[]:
            test_data.append(annotation)

In [37]:
for n in range(0,len(df_sample['description'])):
    if type(df_sample['description'].iloc[n])==str:
        if df_sample['description'].iloc[n] not in description_unique_brand:
            if '<' not in df_sample['description'].iloc[n]:
                if '&' not in df_sample['description'].iloc[n]:
                    annotation=obtain_annotation(df_sample['description'].iloc[n])
                    if annotation[1]['entities']!=[]:
                        test_data.append(annotation)

In [38]:
len(test_data)

1414

In [39]:
with open("test_data.txt", "wb") as fp:   #Pickling
    pickle.dump(test_data, fp)

In [40]:
results = evaluate(nlp, test_data)

In [41]:
results

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 82.85446595877576,
 'ents_r': 88.16882685277501,
 'ents_f': 85.42907744324584,
 'ents_per_type': {'BRAND': {'p': 80.19884009942005,
   'r': 86.19768477292965,
   'f': 83.09012875536482},
  'GENDER': {'p': 97.85714285714285,
   'r': 98.56115107913669,
   'f': 98.2078853046595},
  'SIZE': {'p': 80.65296251511487,
   'r': 85.18518518518519,
   'f': 82.85714285714286},
  'COLOR': {'p': 86.49789029535864,
   'r': 90.04392386530014,
   'f': 88.23529411764706},
  'AGE_GROUP': {'p': 100.0, 'r': 94.11764705882352, 'f': 96.96969696969697},
  'MATERIAL': {'p': 85.17241379310346,
   'r': 94.6360153256705,
   'f': 89.65517241379311},
  'WEIGHT': {'p': 27.27272727272727, 'r': 100.0, 'f': 42.857142857142854}},
 'tags_acc': 0.0,
 'token_acc': 100.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {}}

- The overall accuraries are above 80%.
- The accuracies for each type are:
    - The most accurate entity label is GENDER and AGE_GROUP (>99%) for all three types of accuracies.
    - The second most accurate entity label is MATERIAL (approximately 90%) for all three types of accuracies.
    - The accuraies of COLOR  label is above just below 90%.
    - BRAND, SIZE have similar accuracies, which are above 80%.
    - WEIGHT label gives quite a different values for three types of accuracies.

## 8. Conclusion

The results for the trained data and unseen data are both good. Although the accuracy of the trained data is 100%, the accuracy on the unseen data is about 80%. This means the current model is overfitting. Future works should be focused on increasing the accuracy of the test data.

Other issues are:
- The problem on the accuracy of the WEIGHT label
- More combined entities, Collection:Men 
- Other entity types (labels)? (Products…)