# Hugging Face Transformers

## 0. Read in Data

In [1]:
import pandas as pd

# modify the column width
pd.set_option('display.max_colwidth', None) # Default is 50, None shows all text

# look at a subset of the reviews
df = pd.read_excel('Data/Popchip_Reviews_Sentiment.xlsx').head(30)
df.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269


In [2]:
df.shape

(30, 7)

## 1. Sentiment Analysis

In [3]:
from transformers import pipeline

In [4]:
sentiment_analyzer = pipeline('sentiment-analysis', 
                              model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                              device=-1 # -1 to use CPU
                             )

Device set to use cpu


In [5]:
text1 = 'When life gives you lemons, make lemonade! ðŸ™‚'
text2 = 'A dozen lemons will make a gallon of lemonade.'
text3 = 'I didn\'t like the taste of that lemonade at all.'

In [6]:
sentiment_analyzer(text1)

[{'label': 'POSITIVE', 'score': 0.996239423751831}]

In [7]:
sentiment_analyzer(text2)

[{'label': 'POSITIVE', 'score': 0.7781572341918945}]

In [8]:
sentiment_analyzer(text3)

[{'label': 'NEGATIVE', 'score': 0.9955589771270752}]

In [9]:
## Practical Example

In [10]:
sentiment_analyzer = pipeline('sentiment-analysis', 
                              model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                              device=-1, # -1 to use CPU
                              truncation=True # Truncates text to make it shorter (Text we want to analyze)
                             )

Device set to use cpu


In [11]:
df.Text.apply(sentiment_analyzer)

0     [{'label': 'POSITIVE', 'score': 0.9935213923454285}]
1      [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2     [{'label': 'NEGATIVE', 'score': 0.6984866261482239}]
3     [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4     [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
5     [{'label': 'POSITIVE', 'score': 0.9994196891784668}]
6     [{'label': 'POSITIVE', 'score': 0.9992188215255737}]
7     [{'label': 'POSITIVE', 'score': 0.9969040751457214}]
8     [{'label': 'POSITIVE', 'score': 0.9894027709960938}]
9     [{'label': 'POSITIVE', 'score': 0.9991832375526428}]
10    [{'label': 'POSITIVE', 'score': 0.9994851350784302}]
11    [{'label': 'NEGATIVE', 'score': 0.7255946397781372}]
12    [{'label': 'POSITIVE', 'score': 0.9966173768043518}]
13    [{'label': 'POSITIVE', 'score': 0.9997195601463318}]
14    [{'label': 'POSITIVE', 'score': 0.8944363594055176}]
15    [{'label': 'POSITIVE', 'score': 0.9989368319511414}]
16    [{'label': 'POSITIVE', 'score': 0.9998534917831421

In [12]:
## Sentiment: round 2

In [13]:
%%time
# ^ Specific to jupyter notebook. It puts a magic function to see how long the cell took to run

from transformers import logging

logging.set_verbosity_error() # removes confusing errors when running pipeline

sentiment_analyzer = pipeline('sentiment-analysis', 
                              model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                              device=-1, # -1 to use CPU
                              truncation=True # Truncates text to make it shorter (Text we want to analyze)
                             )

df.Text.apply(sentiment_analyzer)

CPU times: total: 13.4 s
Wall time: 850 ms


0     [{'label': 'POSITIVE', 'score': 0.9935213923454285}]
1      [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2     [{'label': 'NEGATIVE', 'score': 0.6984866261482239}]
3     [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4     [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
5     [{'label': 'POSITIVE', 'score': 0.9994196891784668}]
6     [{'label': 'POSITIVE', 'score': 0.9992188215255737}]
7     [{'label': 'POSITIVE', 'score': 0.9969040751457214}]
8     [{'label': 'POSITIVE', 'score': 0.9894027709960938}]
9     [{'label': 'POSITIVE', 'score': 0.9991832375526428}]
10    [{'label': 'POSITIVE', 'score': 0.9994851350784302}]
11    [{'label': 'NEGATIVE', 'score': 0.7255946397781372}]
12    [{'label': 'POSITIVE', 'score': 0.9966173768043518}]
13    [{'label': 'POSITIVE', 'score': 0.9997195601463318}]
14    [{'label': 'POSITIVE', 'score': 0.8944363594055176}]
15    [{'label': 'POSITIVE', 'score': 0.9989368319511414}]
16    [{'label': 'POSITIVE', 'score': 0.9998534917831421

In [14]:
%%time
# ^ Specific to jupyter notebook. It puts a magic function to see how long the cell took to run

from transformers import logging

logging.set_verbosity_error() # removes confusing errors when running pipeline

sentiment_analyzer = pipeline('sentiment-analysis', 
                              model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                              device=0, # -1 to use CPU, 'mps' to use apple GPU. Windows NVIDIA GPU use 'cuda' or 'cuda:0' or 0.
                              truncation=True # Truncates text to make it shorter (Text we want to analyze)
                             )

sentiment_scores = df.Text.apply(sentiment_analyzer)
sentiment_scores[:5]

CPU times: total: 13.4 s
Wall time: 856 ms


0    [{'label': 'POSITIVE', 'score': 0.9935213923454285}]
1     [{'label': 'POSITIVE', 'score': 0.999605119228363}]
2    [{'label': 'NEGATIVE', 'score': 0.6984866261482239}]
3    [{'label': 'NEGATIVE', 'score': 0.9996308088302612}]
4    [{'label': 'POSITIVE', 'score': 0.9991814494132996}]
Name: Text, dtype: object

In [15]:
sentiment_scores[0][0]['label']

'POSITIVE'

In [16]:
sentiment_scores[0][0]['score']

0.9935213923454285

In [17]:
df.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269


In [18]:
df['Label_HF'] = sentiment_scores.apply(lambda x: x[0]['label'])
df['Score_HF'] = sentiment_scores.apply(lambda x: x[0]['score'])

In [20]:
df['Sentiment_HF'] = df.apply(lambda row: row['Score_HF'] if row['Label_HF'] == 'POSITIVE' else -row['Score_HF'], axis=1)

In [21]:
df.head()

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER,Label_HF,Score_HF,Sentiment_HF
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244,POSITIVE,0.993521,0.993521
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269,POSITIVE,0.999605,0.999605
2,23691,A30NYUHEDLWI0Y,5,High,Great Alternative to Potato Chips,"I just love these chips! I was always a big fan of potato chips, but haven't had one since I discovered popchips. They are great for dipping or all alone. I am constantly re-ordering them. One note however-if you are on a low salt diet these chips are probably not for you. They are high in sodium. We go through a case every two months. If you love them it pays to join the subscribe and save program through Amazon. You save money and stay supplied!",0.979,NEGATIVE,0.698487,-0.698487
3,23692,A2NU55U9LKTB5J,3,Low,Not somthing I would crave,"These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day. They were the bomb then, not so much now. Won't buy again unless I get them for cheap or free.",0.8689,NEGATIVE,0.999631,-0.999631
4,23693,A225F7QFP5LIW2,5,High,healthy and delicious,"These chips are great! They look almost like a flattened rice cake, but taste so much better, more like a potato chip. The bbq flavor is delicious. They are very low in fat and full of flavor. It is easy to eat an entire bag of these!",0.9613,POSITIVE,0.999181,0.999181


## 2. NER

In [23]:
logging.set_verbosity_warning()

In [28]:
ner_analyzer = pipeline('ner',
                        model='dbmdz/bert-large-cased-finetuned-conll03-english',
                        device=0, # GPU
                        aggregation_strategy='SIMPLE'
                       )

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [29]:
text4 = "I ordered an Arnold Palmer at Applebee's in Springfield."

In [30]:
ner_analyzer(text4)

[{'entity_group': 'MISC',
  'score': 0.9914088,
  'word': 'Arnold Palmer',
  'start': 13,
  'end': 26},
 {'entity_group': 'ORG',
  'score': 0.9436139,
  'word': "Applebee ' s",
  'start': 30,
  'end': 40},
 {'entity_group': 'LOC',
  'score': 0.9780036,
  'word': 'Springfield',
  'start': 44,
  'end': 55}]

In [31]:
ner_analyzer2 = pipeline('ner',
                        model='dslim/bert-base-NER',
                        device=0, # GPU
                        aggregation_strategy='SIMPLE'
                       )

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [32]:
ner_analyzer2(text4)

[{'entity_group': 'PER',
  'score': 0.87662226,
  'word': 'Arnold Palmer',
  'start': 13,
  'end': 26},
 {'entity_group': 'ORG',
  'score': 0.7005143,
  'word': 'Applebee',
  'start': 30,
  'end': 38},
 {'entity_group': 'LOC',
  'score': 0.6289259,
  'word': "' s",
  'start': 38,
  'end': 40},
 {'entity_group': 'LOC',
  'score': 0.99173564,
  'word': 'Springfield',
  'start': 44,
  'end': 55}]

In [33]:
## practical example

In [34]:
df.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER,Label_HF,Score_HF,Sentiment_HF
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244,POSITIVE,0.993521,0.993521
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269,POSITIVE,0.999605,0.999605


In [35]:
ner_analyzer = pipeline('ner',
                        model='dslim/bert-base-NER',
                        device=0, # GPU
                        aggregation_strategy='SIMPLE'
                       )

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [37]:
ner_analyzer(df.Text[1])

[{'entity_group': 'MISC',
  'score': 0.9443704,
  'word': 'Salt and Vinegar',
  'start': 99,
  'end': 115},
 {'entity_group': 'MISC',
  'score': 0.9036544,
  'word': 'Salt and Vinegar',
  'start': 392,
  'end': 408},
 {'entity_group': 'MISC',
  'score': 0.94210386,
  'word': 'S & V',
  'start': 450,
  'end': 453}]

In [38]:
[entity['word'] for entity in ner_analyzer(df.Text[1])]

['Salt and Vinegar', 'Salt and Vinegar', 'S & V']

In [41]:
df['Named_Entities'] = df.Text.apply(lambda x: [entity['word'] for entity in ner_analyzer(x)])
df.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Sentiment_VADER,Label_HF,Score_HF,Sentiment_HF,Named_Entities
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.9244,POSITIVE,0.993521,0.993521,[]
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.7269,POSITIVE,0.999605,0.999605,"[Salt and Vinegar, Salt and Vinegar, S & V]"


In [46]:
 # we use a set to eliminate duplicate values of a list, but turn it to a list again.
named_entities = list(set(df['Named_Entities'].explode().dropna().tolist()))
named_entities[:5]

['##stco', 'T', 'B', 'Stop and Shop', '##y']

In [47]:
[entity for entity in named_entities if '#' not in entity]

['T',
 'B',
 'Stop and Shop',
 'BBQ Pop',
 'Popchi',
 'Amazon. com',
 'Lay',
 'S & V',
 'BBQ Pop Chip',
 'COSTCO',
 'PopChips',
 'Chip',
 'Miami',
 'and Vin',
 'Salt and Vinegar',
 'Co',
 'I',
 'Salt',
 'Amazon',
 'General Mills',
 'A',
 'Cal',
 'P',
 'Pop',
 'L']