In [18]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from google.colab import drive
from transformers import BertTokenizer, BertForTokenClassification
import pandas as pd
import numpy as np
import torch
import pickle


drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import os

os.listdir('./drive/MyDrive/NLP Project Work/')

['eng_train.txt',
 '.here',
 'news_article_collection.ipynb',
 'news_data.csv',
 'model.sav',
 'test.ipynb',
 'nlp_project.ipynb',
 'model_copy.sav',
 'model3.pt',
 'model_save',
 'doc2vec.ipynb']

In [21]:
# Load a trained model and vocabulary that you have fine-tuned
output_dir = './drive/MyDrive/NLP Project Work/model_save/'
device = 'cuda'

model = BertForTokenClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)
# Copy the model to the GPU.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [22]:
test_sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
reporter for the network, about protests in Minnesota and elsewhere. 
"""

In [23]:
def test_func(test_sentence):
  tokenized_sentence = tokenizer.encode(test_sentence)
  input_ids = torch.tensor([tokenized_sentence]).cuda()
  ner_tokens = []
  ner = []

  with torch.no_grad():
    output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
      new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
      new_labels.append(tag_values[label_idx])
      new_tokens.append(token)
      
  for token, label in zip(new_tokens, new_labels):
    #print("{}\t{}".format(label, token))
    if label in ['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER']:
      ner_tokens.append(label)
      ner.append(token)
      
  return ner_tokens, ner

In [26]:
tag_values = list({'B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'})
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [27]:
test_func(test_sentence)

(['B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-LOC',
  'B-LOC',
  'B-ORG',
  'B-ORG',
  'I-LOC',
  'I-LOC',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG',
  'B-ORG'],
 ['[CLS]',
  'Mr',
  '.',
  'Trump',
  '’',
  's',
  'tweets',
  'began',
  'just',
  'moments',
  'after',
  'a',
  'Fox',
  'News',
  'report',
  'by',
  'Mike',
  'Tobin',
  ',',
  'a',
  'reporter',
  'for',
  'the',
  'network',
  ',',
  'about',
  'protests',
  'in',
  'and',
  'elsewhere',
  '.',
  '[SEP]'])

In [None]:
main_path = './drive/MyDrive/NLP Project Work/'
data = pd.read_csv(f'{main_path}news_data.csv')
data = data[~pd.isnull(data['text'])]

In [None]:
filename = f'{main_path}model.sav'
  
# load the model from disk
model = pickle.load(open(filename, 'rb'))

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [None]:
tag_values = list({'B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O', 'PAD'})
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
input_ids

tensor([  101,  1332,  1103,  4447,  1104,  6314, 10910,  6596,  1157,  1357,
         5448,  2592,  1224,  1113,  5286,  2106,   117,  1122,  1209,  1129,
         1103,  1314,  1558,  2373,  1104,  1103,  4190,  1196,  1103,  2286,
         2083,  1306,  3212,   783,  1105,  1122,  1209,  6707,   170,  1989,
         1104,  1207,  2233, 16085,  1115,  1103,  1653,   118,  2633,  5530,
         2319,  1110,  4000,   795,   102], device='cuda:0')

In [None]:
for j in range(1):
  i = 10
  test_sentence = """
                  Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
                  reporter for the network, about protests in Minnesota and elsewhere. 
                  """
  # encoded_text = tokenizer.encode(data['text'].iloc[i])
  encoded_text = tokenizer.encode(test_sentence)
  input_ids = torch.tensor([encoded_text]).cuda()

  with torch.no_grad():
    output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
      if token.startswith("##"):
          new_tokens[-1] = new_tokens[-1] + token[2:]
      else:
          new_labels.append(tag_values[label_idx])
          new_tokens.append(token)

In [None]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))


I-PER	[CLS]
I-PER	Mr
I-PER	.
PAD	Trump
I-PER	’
I-PER	s
I-PER	tweets
I-PER	began
I-PER	just
I-PER	moments
I-PER	after
I-PER	a
B-MISC	Fox
B-MISC	News
I-PER	report
I-PER	by
PAD	Mike
PAD	Tobin
I-PER	,
I-PER	a
I-PER	reporter
I-PER	for
I-PER	the
I-PER	network
I-PER	,
I-PER	about
I-PER	protests
I-PER	in
I-MISC	Minnesota
I-PER	and
I-PER	elsewhere
I-PER	.
I-PER	[SEP]


In [None]:
  entities = [x for x in tokens if x in tag_values]

In [None]:
entities

[]

In [None]:
# MUST BE REPLACED WITH BERT NER
encoded_text = [tokenizer.encode(x, padding = 'max_length', max_length = 100) for x in data['text']]

In [None]:
input_ids = torch.tensor(encoded_text).cuda()
input_ids

tensor([[  101,  1332,  1103,  ...,     0,     0,     0],
        [  101, 10616,   112,  ...,     0,     0,     0],
        [  101,  4280,  1244,  ...,     0,     0,     0],
        ...,
        [  101, 10616, 18806,  ...,     0,     0,     0],
        [  101,  1109,  1441,  ...,     0,     0,     0],
        [  101,   138,  5973,  ...,     0,     0,     0]], device='cuda:0')

In [None]:
import numpy as np



In [None]:
tokens

'[CLS]'

In [None]:
for i in range(data.shape[0]):
  try:
    encoded_text = tokenizer.encode(data['text'].iloc[i])
  except:
    print(data['text'].iloc[i], data['headline'].iloc[i], i)
    continue
print(i)

nan Live updates: Lionel Messi and Argentina play and other World Cup news and highlights 460
526


In [None]:
len(data['text'].tolist())

527

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data)

In [None]:
test.shape

(132, 2)

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

train['ner'] = [TaggedDocument(x.split(), [i]) for i, x in enumerate(train['text'])]
model = Doc2Vec(vector_size = 2, min_count = 2, epochs = 40)
model.build_vocab(train['ner'].tolist())

In [None]:
model.train(train['ner'], total_examples = model.corpus_count, epochs = model.epochs)

In [None]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
vector

array([ 0.5220812 , -0.25025648], dtype=float32)

In [None]:
pip3 install --upgrade gensim --user

In [None]:
print(f"Word 'penalty' appeared {model.wv.get_vecattr('b', 'count')} times in the training corpus.")

KeyError: "Key 'b' not present"