<a href="https://colab.research.google.com/github/Fackor/Advanced_Machine_Learning_Course/blob/main/Zero_Shot_Pipeline_for_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==3.1.0



In [2]:
import torch
from transformers import pipeline

In [3]:
# Build the zero-shot classification pipeline once; every later cell reuses it.
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU
# (device=-1) so the notebook still runs on a runtime without a GPU.
DEVICE = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", device=DEVICE)

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def zeroshot_NER(paragraph, candidate_labels, threshold):
  """Tag every alphabetic word of `paragraph` with a zero-shot class label.

  The text is split on whitespace. Punctuation attached to either end of a
  token is removed first, so "Paris," is tagged as "paris" instead of being
  discarded. Tokens that are still not purely alphabetic after that (numbers,
  "x2", "don't", bare punctuation) are skipped. Each remaining word is
  lower-cased and classified on its own by the module-level `classifier`.

  Args:
    paragraph: Text to tag.
    candidate_labels: Class names handed to the classifier for every word.
    threshold: The first score returned by the classifier must be strictly
      greater than this for its label to be accepted.

  Returns:
    A list of `[word, label]` pairs in reading order. `label` is the first
    label returned by the classifier (the pipeline is expected to rank them
    best-first), or '[UNK]' when its score does not exceed `threshold`.
    Empty input yields an empty list.
  """
  import string  # local import: only this function needs it

  NER = []
  for token in paragraph.split():
    # Drop punctuation glued to the word so that words at clause or sentence
    # boundaries are not silently lost by the isalpha() filter below.
    word = token.strip(string.punctuation)
    if not word.isalpha():
      continue
    word = word.lower()

    output = classifier(word, candidate_labels)

    label = output['labels'][0] if output['scores'][0] > threshold else '[UNK]'
    NER.append([word, label])

  return NER

In [10]:
from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm

# Class names offered to the zero-shot classifier. '[UNK]' is also the label
# zeroshot_NER falls back to when the best score is not above the threshold.
candidate_labels = ['Person', 'Organization', 'Location', 'Miscellaneous', '[UNK]']

# Classifier class name -> short tag, paired positionally with candidate_labels.
label_maps = dict(zip(candidate_labels, ('PER', 'ORG', 'LOC', 'MISC', 'O')))

# Tag as written in the data file (with B-/I- prefix) -> the same short tags,
# so that file labels and predictions can be compared directly.
other_label_map = {
    **dict.fromkeys(('B-ORG', 'I-ORG'), 'ORG'),
    **dict.fromkeys(('B-PER', 'I-PER'), 'PER'),
    **dict.fromkeys(('B-LOC', 'I-LOC'), 'LOC'),
    **dict.fromkeys(('B-MISC', 'I-MISC'), 'MISC'),
    'O': 'O',
}

# Tagged file read by the evaluation cells.
filename = '/content/test.txt'

In [None]:
# Word-level evaluation of zeroshot_NER against the tagged file at `filename`.
predictions = []
labels = []

THRESHOLD = 0.5
with open(filename) as f:
  for i, line in tqdm(enumerate(f)):
    if i == 0:
      continue  # the first line of the file is not scored
    fields = line.split()
    # Only rows whose first column is purely alphabetic are scored; blank
    # lines and rows starting with punctuation or digits are skipped.
    if fields and fields[0].isalpha():
      word = fields[0]
      # Column 3 carries the tag; collapse its B-/I- prefix to the bare class.
      label = other_label_map[fields[3]]
      labels.append(label)

      out = zeroshot_NER(word, candidate_labels, THRESHOLD)

      # A single alphabetic word produces exactly one [word, label] pair.
      predictions.append(label_maps[out[0][1]])

f1 = f1_score(labels, predictions, average='weighted')
precision = precision_score(labels, predictions, average='weighted')
recall = recall_score(labels, predictions, average='weighted')
# Each metric is printed under its own name (the two were previously swapped).
print(f"Precision - {precision}\nRecall - {recall}\nf1 - {f1}")

# Output pasted from an earlier run, kept verbatim:
#   Precision - 0.5756579921248187
#   Recall - 0.6500091058577243
#   f1 - 0.597280030256445
# NOTE(review): that run presumably used the old print statement, which put
# recall on the "Precision" line and precision on the "Recall" line, so the
# first two figures are likely swapped. Re-run the cell to confirm.


# result = {}
#'''for label in label_maps.values():
#  pred = [x == label for x in predictions]
#  true = [x == label for x in labels]
#  result[label] = f1_score(true, pred, average='binary')

#print(result)
#''''''
#50350it [22:49, 36.76it/s]

#{'PER': 0.2156064461407973, 'ORG': 0.036968576709796676, 'LOC': 0.3058277462609592, 'MISC': 0.005212858384013901, 'O': 0.7298355194669998}
#'''

In [None]:
# Word-level evaluation of zeroshot_NER against the tagged file at `filename`,
# this time with a stricter acceptance threshold.
predictions = []
labels = []

THRESHOLD = 0.6
with open(filename) as f:
  for i, line in tqdm(enumerate(f)):
    if i == 0:
      continue  # the first line of the file is not scored
    fields = line.split()
    # Only rows whose first column is purely alphabetic are scored; blank
    # lines and rows starting with punctuation or digits are skipped.
    if fields and fields[0].isalpha():
      word = fields[0]
      # Column 3 carries the tag; collapse its B-/I- prefix to the bare class.
      label = other_label_map[fields[3]]
      labels.append(label)

      out = zeroshot_NER(word, candidate_labels, THRESHOLD)

      # A single alphabetic word produces exactly one [word, label] pair.
      predictions.append(label_maps[out[0][1]])

f1 = f1_score(labels, predictions, average='weighted')
precision = precision_score(labels, predictions, average='weighted')
recall = recall_score(labels, predictions, average='weighted')
# Each metric is printed under its own name (the two were previously swapped).
print(f"Precision - {precision}\nRecall - {recall}\nf1 - {f1}")

# Output pasted from an earlier run, kept verbatim:
#   Precision - 0.7299049649169553
#   Recall - 0.6752435637801734
#   f1 - 0.6949574012914023
# NOTE(review): that run presumably used the old print statement, which put
# recall on the "Precision" line and precision on the "Recall" line, so the
# first two figures are likely swapped. Re-run the cell to confirm.


#'''
#for label in label_maps.values():
#  pred = [x == label for x in predictions]
#  true = [x == label for x in labels]
#  result[label] = f1_score(true, pred, average='binary')

#print(result)

#55044it [20:10, 45.48it/s]

#{'PER': 0.21206682313958305, 'ORG': 0.04953560371517028, 'LOC': 0.35424354243542433, 'MISC': 0.0, 'O': 0.7353024911032029}
#'''

In [None]:
# Interactive demo: tag a user-supplied paragraph with user-supplied classes.
# NOTE: this rebinds `candidate_labels`, which the evaluation cells also read;
# re-run the configuration cell before evaluating again.
DEMO_THRESHOLD = 0.4

sequence = input("Enter a paragraph to do NER on.")
raw_labels = input("Enter candidate labels separated by double colon(::).").split("::")

# Trim the whitespace people naturally type around "::" and discard empty
# segments (e.g. from empty input or a trailing "::"), so the classifier is
# never handed "" or " Location" as a class name.
candidate_labels = [entry.strip() for entry in raw_labels if entry.strip()]
# '[UNK]' is always offered, but only once even if the user typed it too.
if '[UNK]' not in candidate_labels:
  candidate_labels.append('[UNK]')

NER = zeroshot_NER(sequence, candidate_labels, DEMO_THRESHOLD)
for word, label in NER:
  print(f"{word}<{label}>", end=" ")