# Named-Entity Recognition of Descriptions of Various Products

## Part III: Testing the Model

## 1. Dataset

In [1]:
import pandas as pd
import numpy as np
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm 
from spacy.util import minibatch, compounding
from spacy import displacy
import string
import pickle
from spacy.gold import GoldParse
from spacy.scorer import Scorer

In [2]:
# Read data.csv (expected in the current working directory) into a DataFrame.
df = pd.read_csv("data.csv")

In [3]:
df.shape

(671621, 8)

In [4]:
df.head()

Unnamed: 0,title,description,summary,brand,price,meta,provider_category,provider
0,"adidas Originals - Superstar - Valkoinen - US 5,5",,,adidas Originals,66.5,"{""SIZE"": [""us 5,5""], ""COLOR"": [""valkoinen""], ""...",17-muoti-ja-vaatetus,Caliroots
1,Sc-Erna Polvipituinen Hame Sininen Soyaconcept,"SOYACONCEPT on tanskalainen brändi, joka luo e...",,Soyaconcept,49.99,"{""SIZE"": [""36""], ""COLOR"": [""cristal blue""], ""G...",17-muoti-ja-vaatetus,Boozt
2,Dana Buchman Silmälasit Taren CARAMEL TORTOISE,Dana Buchman Taren Silmälasit. Collection:Men....,,Dana Buchman,146.0,"{""SIZE"": [""54""], ""COLOR"": [""tortoise""], ""GENDE...",13-silmalasit-ja-piilolinssit,Smartbuy Glasses
3,Active Sports Woven Shorts B Shortsit Musta PUMA,PUMA Active Sports Woven Shorts B,,PUMA,27.0,"{""SIZE"": [""164"", ""128"", ""110"", ""116"", ""104"", ""...",17-muoti-ja-vaatetus,Boozt
4,Renata Polvipituinen Hame Musta Fall Winter Sp...,Fall Winter Spring Summer. A-linjainen.,,Fall Winter Spring Summer,199.0,"{""SIZE"": [""xs""], ""COLOR"": [""jet black""], ""GEND...",17-muoti-ja-vaatetus,Boozt


In [5]:
# Print the whole row with index label i, then its `meta` value on its own
# (the row print truncates long strings, the second print shows it in full).
i=0
print(df.loc[i])
print(df['meta'].loc[i])

title                adidas Originals - Superstar - Valkoinen - US 5,5
description                                                        NaN
summary                                                            NaN
brand                                                 adidas Originals
price                                                             66.5
meta                 {"SIZE": ["us 5,5"], "COLOR": ["valkoinen"], "...
provider_category                                 17-muoti-ja-vaatetus
provider                                                     Caliroots
Name: 0, dtype: object
{"SIZE": ["us 5,5"], "COLOR": ["valkoinen"], "GENDER": ["unisex"]}


In [6]:
# Print the whole row with index label i, then its `meta` value on its own
# (the row print truncates long strings, the second print shows it in full).
i=1
print(df.loc[i])
print(df['meta'].loc[i])

title                   Sc-Erna Polvipituinen Hame Sininen Soyaconcept
description          SOYACONCEPT on tanskalainen brändi, joka luo e...
summary                                                            NaN
brand                                                      Soyaconcept
price                                                            49.99
meta                 {"SIZE": ["36"], "COLOR": ["cristal blue"], "G...
provider_category                                 17-muoti-ja-vaatetus
provider                                                         Boozt
Name: 1, dtype: object
{"SIZE": ["36"], "COLOR": ["cristal blue"], "GENDER": ["women"]}


## 2. Entity and Entity Types

Entities and entity types can be obtained from the *meta* and *brand* columns. However, many of these entities are misspelled. The misspelled entities were deleted, and many entities taken from the *title* and *description* columns were added by hand. The saved entities and entity types are loaded below.

In [7]:
# Restore the saved entity list from its pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("entity.txt", mode="rb") as pickle_file:
    entity = pickle.load(pickle_file)

In [8]:
# Restore the saved entity-type list from its pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("entity_types.txt", mode="rb") as pickle_file:
    entity_types = pickle.load(pickle_file)

In [9]:
# Restore the saved `titles_unique_brand` object from its pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("titles_unique_brand.txt", mode="rb") as pickle_file:
    titles_unique_brand = pickle.load(pickle_file)

In [10]:
# Restore the saved `description_unique_brand` object from its pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("description_unique_brand.txt", mode="rb") as pickle_file:
    description_unique_brand = pickle.load(pickle_file)

In [11]:
entity[0:10]

['US 5,5',
 'Valkoinen',
 '36',
 'Cristal Blue',
 '54',
 'Tortoise',
 '164',
 '128',
 '110',
 '116']

In [12]:
entity_types[0:10]

['SIZE',
 'COLOR',
 'SIZE',
 'COLOR',
 'SIZE',
 'COLOR',
 'SIZE',
 'SIZE',
 'SIZE',
 'SIZE']

## 3. Training Data and Model

- Loop through the title column and the description column, and build a list of the titles/descriptions that have unique brands.
    - For the description column, we can drop rows that include `&` and `<b>` signs, in order to increase the accuracy.
- In each loop, use the entity and entity-type lists:
    - Construct the annotations. At first these annotations have heavily overlapping character ranges, which is not allowed in the training process.
    - Remove the overlapping character ranges, always keeping the phrase with the wider range.
    - Repeat the process above in case overlapping character ranges remain. This can happen because the code only compares two phrases at a time, which may leave some overlaps behind.
- The saved data is loaded below.

In [13]:
# Restore the saved training examples from their pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("TRAIN_DATA.txt", mode="rb") as pickle_file:
    TRAIN_DATA = pickle.load(pickle_file)

In [14]:
# Load the trained spaCy pipeline from the model directory.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
output_dir = Path("/Users/farukakbar/Desktop/ner_products/model")
print("Loading from", output_dir)
nlp = spacy.load(output_dir)

Loading from /Users/farukakbar/Desktop/ner_products/model


## 4. Results

In [15]:
# Entity and Entity Types output
def ent_output(nlp_doc):
    """Print the entities of ``nlp_doc`` as a ``{entity: label}`` dictionary.

    Parameters
    ----------
    nlp_doc
        A processed document exposing an iterable ``ents`` attribute whose
        items are hashable and carry a ``label_`` attribute.

    Returns
    -------
    None
        The dictionary is only printed, so calling this as the last
        statement of a cell produces no extra ``Out[...]`` display.
    """
    # Read the entities from the argument itself, not from whatever
    # notebook-level variable the caller happens to have defined.
    # The entity objects (not their text) are the keys, so two distinct
    # entities with identical text both stay in the dictionary.
    ent_dict = {ent: ent.label_ for ent in nlp_doc.ents}
    print(ent_dict)

### 1) Results on the trained data

In [16]:
# Run the model on the first 100 training examples, showing the highlighted
# text and the entity dictionary for each, and keep those examples in
# `trained_data`.
trained_data = []
for n in range(100):
    example = TRAIN_DATA[n]
    doc = nlp(example[0])  # element 0 of an example is passed to the model
    displacy.render(doc, style="ent")
    ent_output(doc)
    trained_data.append(example)

{Custom: 'BRAND'}


{Svartzonker: 'BRAND'}


{40: 'SIZE', black: 'COLOR'}


{Mercury: 'BRAND', harmaa/musta: 'COLOR'}


{Grunt: 'BRAND'}


{SNÖ of Sweden: 'BRAND', 80: 'SIZE', Rosé/Clear: 'COLOR'}


{Star: 'BRAND', 81: 'SIZE', 4: 'SIZE'}


{DD Hammocks: 'BRAND'}


{XL: 'SIZE', 64GB: 'SIZE', Black: 'COLOR'}


{Dixie: 'BRAND'}


{Peanut: 'COLOR'}


{Titanium: 'COLOR', musta: 'COLOR'}


{Basson: 'BRAND', 59 cm,: 'SIZE', 73 cm,: 'SIZE', 90 cm,: 'SIZE', 100 cm: 'SIZE', 13,6 kg: 'WEIGHT'}


{brown: 'COLOR', naisten: 'GENDER', 39: 'SIZE'}


{4g: 'WEIGHT'}


{Watermelon: 'COLOR', 300 ml: 'SIZE'}


{20g: 'WEIGHT', Metsän Emäntä: 'BRAND'}


{Makari: 'BRAND', Extreme: 'BRAND', 50g: 'WEIGHT'}


{By May: 'BRAND'}


{Sonicwall: 'BRAND'}


{76 cm.: 'SIZE'}


{white: 'COLOR'}


{Miamor: 'BRAND', Royale: 'COLOR', 100g: 'WEIGHT'}


{Heimstone: 'BRAND'}


{Cool shoe: 'BRAND'}


{11 Degrees: 'BRAND', Core: 'BRAND', Mens,: 'GENDER', Musta: 'COLOR'}


{Le comptoir scandinave: 'BRAND'}


{Beauty Sweeties: 'BRAND'}


{Wella: 'BRAND'}


{Biovegan: 'BRAND'}


{Celine: 'BRAND'}


{Musta: 'COLOR', Forét: 'BRAND'}


{Flowee: 'BRAND', 60 g: 'WEIGHT'}


{70g: 'WEIGHT'}


{ReadeREST: 'BRAND', Steel: 'BRAND'}


{CHI: 'BRAND'}


{Phil&Teds: 'BRAND'}


{Keeper: 'BRAND'}


{Victor Vaissier: 'BRAND'}


{Panasonic: 'BRAND', Eneloop: 'BRAND'}


{99 gr: 'WEIGHT'}


{NA-KD Party: 'BRAND'}


{12: 'SIZE'}


{Valkoinen: 'COLOR', CPH Lighting: 'BRAND'}


{Bio- Puutarhalannoite: 'BRAND', 6 kg: 'WEIGHT'}


{Snow Peak: 'BRAND', Musta: 'COLOR', L: 'SIZE'}


{Helena Rubinstein: 'BRAND'}


{Sante: 'BRAND', Jojoba: 'COLOR'}


{Lucepla: 'BRAND'}


{Cheap Monday: 'BRAND', Sininen: 'COLOR', W29: 'SIZE'}


{Sininen: 'COLOR', Oscar Jacobson: 'BRAND'}


{WOS: 'BRAND'}


{E-Stim: 'BRAND', Medium: 'SIZE'}


{Coach Fragrance: 'BRAND', Blue: 'COLOR'}


{212ml: 'SIZE'}


{Boss by Hugo Boss: 'BRAND'}


{Emilie Briting x NA-KD: 'BRAND'}


{brown: 'COLOR', naisten: 'GENDER', 40: 'SIZE'}


{Light My Fire: 'BRAND'}


{Aigle: 'BRAND'}


{Armour: 'BRAND', Sininen: 'COLOR', Under Armour: 'BRAND'}


{Dafi: 'BRAND'}


{Viatti: 'BRAND'}


{26: 'SIZE', Co: 'BRAND', gold or silver.: 'COLOR'}


{Artek: 'BRAND'}


{Falken: 'BRAND'}


{Nortenha: 'BRAND'}


{Mexx: 'BRAND', Brown.: 'COLOR', Plastic.: 'MATERIAL', 53.: 'SIZE'}


{Clearasil: 'BRAND', 65: 'SIZE'}


{puuvillaa.: 'MATERIAL'}


{Musta: 'COLOR', Hunkemöller: 'BRAND'}


{Hanf & Natur: 'BRAND', 500g: 'WEIGHT'}


{Party Popteenies: 'BRAND'}


{Tianli: 'BRAND'}


{black: 'COLOR', naisten: 'GENDER', de 34,de 38: 'SIZE'}


{Hauppauge: 'BRAND', 60: 'SIZE'}


{Oribe: 'BRAND'}


{Cobra: 'BRAND', Midland: 'BRAND', Zodiac: 'BRAND', 68: 'SIZE', Nord: 'BRAND'}


{240mm: 'SIZE'}


{Sininen: 'COLOR', Hilfiger Collection: 'BRAND'}


{Partner Tech: 'BRAND'}


{Adidas: 'BRAND'}


{beige: 'COLOR', naisten: 'GENDER', l,xs: 'SIZE'}


{Sininen: 'COLOR', Icebreaker: 'BRAND'}


{MAX: 'BRAND'}


{6 cm,: 'SIZE', 38 cm.: 'SIZE', 28 kg.: 'WEIGHT'}


{Weather Report: 'BRAND'}


{Genzo: 'BRAND', Kombi: 'BRAND'}


{Next Tread: 'BRAND'}


{600ml: 'SIZE', Purple: 'COLOR'}


{150x210 cm,: 'SIZE', Cobalt: 'COLOR'}


{Sininen: 'COLOR', Calvin Klein Jeans: 'BRAND'}


{100 ml: 'SIZE'}


{Halva: 'BRAND'}


{Hem: 'BRAND'}


{black: 'COLOR', naisten: 'GENDER', 40: 'SIZE'}


{Bulldog: 'BRAND', Bamboo: 'COLOR'}


{Vicomte A.: 'BRAND'}


{Stanton Street Sports: 'BRAND', Sininen: 'COLOR'}


{Silhouette: 'BRAND', Titan: 'BRAND'}


### 2) Results on the unseen data

In [17]:
# Among the rows labelled 0..99, run the model on every title that is not
# contained in `titles_unique_brand`.
for n in range(100):
    title = df['title'].loc[n]
    if title in titles_unique_brand:
        continue
    doc = nlp(title)
    displacy.render(doc, style="ent")
    ent_output(doc)

{White: 'COLOR'}




{}


{Black/Br: 'COLOR'}


{Diesel: 'BRAND'}


{Black: 'COLOR'}


{Black: 'COLOR'}


{Valkoinen: 'COLOR'}


{Vredestein: 'BRAND'}


{Diesel: 'BRAND'}


{Gloryfy Gi15 St: 'BRAND'}


{Vaaleanpunainen: 'COLOR', Résumé: 'BRAND'}


{Superdry: 'BRAND', 104: 'SIZE'}


{21 cm: 'SIZE'}


{40ml: 'SIZE'}


{valkoinen: 'COLOR'}


{Red: 'COLOR'}


{Dana Buchman: 'BRAND', BLACK: 'COLOR'}


{Roberto Cavalli: 'BRAND'}


In [18]:
# Among the rows labelled 0..199, run the model on every description that is
# a string and is not contained in `description_unique_brand`.
for n in range(200):
    description = df['description'].loc[n]
    # Non-string values (e.g. NaN for a missing description) are skipped.
    if type(description) != str or description in description_unique_brand:
        continue
    doc = nlp(description)
    displacy.render(doc, style="ent")
    ent_output(doc)

{}


{New Balance: 'BRAND'}


{Diesel: 'BRAND', Shiny Black.: 'COLOR', Plastic.: 'MATERIAL', 54.: 'SIZE'}


{}


{}


{}


{}


{Diesel: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Résumé: 'BRAND'}


{Superdry: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Dana Buchman: 'BRAND', Black.: 'COLOR', Metal.: 'MATERIAL', 52.: 'SIZE'}


{Roberto Cavalli: 'BRAND', Tortoise.: 'COLOR', Plastic.: 'MATERIAL', 49.: 'SIZE'}


{}


{}


{Farah: 'BRAND', Gold.: 'COLOR', Metal.: 'MATERIAL', 56.: 'SIZE'}


{Diesel: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 52.: 'SIZE'}


{}


{Clinique All About: 'BRAND', Purple: 'COLOR', Pumps: 'BRAND'}


{2-3.: 'SIZE'}


{GAP: 'BRAND'}


{Diesel: 'BRAND', Shiny Blue.: 'COLOR', Plastic.: 'MATERIAL', 54.: 'SIZE'}


{blue: 'COLOR', miesten: 'GENDER'}


{Puma PJ0029O Kids: 'BRAND', Black.: 'COLOR', Plastic.: 'MATERIAL', 50.: 'SIZE'}


{Cazal: 'BRAND', Cream/Gold.: 'COLOR', Metal.: 'MATERIAL', 53.: 'SIZE'}


{}


{Aluminum: 'BRAND', Stelton: 'BRAND'}


{pink: 'COLOR', naisten: 'GENDER', s,m,l,xl,xs: 'SIZE'}


{Tommy Hilfiger luo: 'BRAND'}


{naisten: 'GENDER'}


{Police: 'BRAND', Kids: 'AGE_GROUP', Blue.: 'COLOR', Plastic.: 'MATERIAL', 48.: 'SIZE'}


{}


{}


### 3) Arbitrary texts

In [19]:
# Run the model on a hand-written sentence, render the highlighted entities,
# and print the {entity: label} dictionary.
doc = nlp('I have adidas Originals White sneakers.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{adidas: 'BRAND', White: 'COLOR'}


In [20]:
# Run the model on a hand-written sentence, render the highlighted entities,
# and print the {entity: label} dictionary.
doc = nlp('You have Soyaconcept Blue coat.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{Soyaconcept: 'BRAND', Blue: 'COLOR'}


In [21]:
# Run the model on a hand-written sentence, render the highlighted entities,
# and print the {entity: label} dictionary.
doc = nlp('I have a Black shirt, which is 3 kg.')
spacy.displacy.render(doc, style="ent")
ent_output(doc)

{Black: 'COLOR', 3 kg.: 'WEIGHT'}


## 5. Accuracy

In [22]:
def evaluate(ner_model, examples):
    """Score ``ner_model`` against annotated examples.

    Parameters
    ----------
    ner_model
        Pipeline object; it is called on each text to get the prediction and
        its ``make_doc`` method is used to build the document for the gold
        annotation.
    examples
        Iterable of ``(text, annotations)`` pairs, where
        ``annotations['entities']`` holds the gold entity annotations passed
        to ``GoldParse``.

    Returns
    -------
    The ``scores`` attribute of the ``Scorer`` after all examples have been
    scored.
    """
    scorer = Scorer()
    for text, annotations in examples:
        # Gold annotation is built on a doc created with make_doc.
        gold = GoldParse(ner_model.make_doc(text), entities=annotations['entities'])
        prediction = ner_model(text)
        scorer.score(prediction, gold)
    return scorer.scores

The explanations of the scores:
- ents_p: Named entity accuracy (precision).
- ents_r: Named entity accuracy (recall).
- ents_f: Named entity accuracy (F-score).
- ents_per_type: Scores per entity label. Keyed by label, mapped to a dict of p, r and f scores.
- token_acc: Tokenization accuracy.

### 1) Scores on the Trained Data

In [23]:
# Score the model on the training examples collected in `trained_data`.
results = evaluate(nlp, trained_data)

In [24]:
results

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 100.0,
 'ents_r': 100.0,
 'ents_f': 100.0,
 'ents_per_type': {'BRAND': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'SIZE': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'COLOR': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'WEIGHT': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'GENDER': {'p': 100.0, 'r': 100.0, 'f': 100.0},
  'MATERIAL': {'p': 100.0, 'r': 100.0, 'f': 100.0}},
 'tags_acc': 0.0,
 'token_acc': 100.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {}}

### 2) Scores on the Unseen Data

In [25]:
# Restore the saved test examples from their pickle file.
# NOTE: pickle can execute arbitrary code; only load files you trust.
with open("test_data.txt", mode="rb") as pickle_file:
    test_data = pickle.load(pickle_file)

In [26]:
len(test_data)

1414

In [27]:
# Score the model on the examples loaded from test_data.txt
# (this rebinds `results`, replacing the training-data scores).
results = evaluate(nlp, test_data)

In [28]:
results

{'uas': 0.0,
 'las': 0.0,
 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}},
 'ents_p': 82.85446595877576,
 'ents_r': 88.16882685277501,
 'ents_f': 85.42907744324584,
 'ents_per_type': {'SIZE': {'p': 80.65296251511487,
   'r': 85.18518518518519,
   'f': 82.85714285714286},
  'GENDER': {'p': 97.85714285714285,
   'r': 98.56115107913669,
   'f': 98.2078853046595},
  'BRAND': {'p': 80.19884009942005,
   'r': 86.19768477292965,
   'f': 83.09012875536482},
  'COLOR': {'p': 86.49789029535864,
   'r': 90.04392386530014,
   'f': 88.23529411764706},
  'AGE_GROUP': {'p': 100.0, 'r': 94.11764705882352, 'f': 96.96969696969697},
  'MATERIAL': {'p': 85.17241379310346,
   'r': 94.6360153256705,
   'f': 89.65517241379311},
  'WEIGHT': {'p': 27.27272727272727, 'r': 100.0, 'f': 42.857142857142854}},
 'tags_acc': 0.0,
 'token_acc': 100.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {}}