In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers
import warnings

In [2]:

# Report the CUDA device this kernel can see. The torch.cuda device queries
# raise when no GPU/driver is present, so they are guarded by is_available();
# that keeps a Restart & Run All from dying here on a CPU-only machine.
if torch.cuda.is_available():
    cuda_device_name = torch.cuda.get_device_name(0)
else:
    cuda_device_name = "CUDA not available - running on CPU"

# Last expression of the cell, so the notebook displays it.
cuda_device_name

'Tesla V100-SXM2-32GB'

In [3]:
# SST-2 training split: a tab-separated file without a header row, so the
# columns come back numbered (0 = sentence text, 1 = 0/1 label).
sst2_train_url = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(sst2_train_url, sep='\t', header=None)

In [4]:
# Peek at the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [5]:
# Pull the pretrained checkpoint from the Hugging Face hub: the tokenizer and
# the token-classification model must come from the same checkpoint id.
checkpoint = "dslim/bert-base-NER"
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForTokenClassification.from_pretrained(checkpoint)

In [6]:
list_of_models = ["dslim/bert-base-NER"]
example = "My name is Slim Shady"  # same probe sentence for every checkpoint

# Smoke-test each checkpoint: load it, wrap it in an NER pipeline, and print
# the entities found in the probe sentence. `tokenizer` and `model` are left
# bound to the last checkpoint in the list.
for model_name in list_of_models:
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForTokenClassification.from_pretrained(model_name)
    nlp = transformers.pipeline("ner", model=model, tokenizer=tokenizer)

    ner_results = nlp(example)
    print(ner_results)

[{'word': 'Slim', 'score': 0.9975119829177856, 'entity': 'B-PER', 'index': 4, 'start': 11, 'end': 15}, {'word': 'S', 'score': 0.9924010038375854, 'entity': 'I-PER', 'index': 5, 'start': 16, 'end': 17}, {'word': '##hady', 'score': 0.9969489574432373, 'entity': 'I-PER', 'index': 6, 'start': 17, 'end': 21}]


In [7]:
# Turn each sentence in column 0 into a list of token ids, including the
# tokenizer's special tokens.
tokenized = df[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

In [8]:
# Length of the longest encoded sentence (0 when there are no sentences).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len so the rows stack into a
# single rectangular array.
padded = np.array(
    [token_ids + [0] * (max_len - len(token_ids)) for token_ids in tokenized.values]
)

In [9]:
# Mask is 1 wherever `padded` holds a non-zero id and 0 on the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(6920, 73)

In [10]:
# Copy both NumPy arrays into torch tensors; `attention_mask` is rebound from
# the array to its tensor counterpart.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

In [11]:
# Run inference in mini-batches instead of pushing the whole dataset through
# the model in a single forward pass: the one-shot version materialises the
# activations for every sentence at once and can exhaust memory.
inference_batch_size = 256

# Feed each batch on whatever device the model already lives on, and bring the
# logits back to the CPU so the result matches what a plain CPU run produces.
model_device = next(model.parameters()).device

logit_chunks = []
with torch.no_grad():  # inference only: no autograd graph needed
    for ids_chunk, mask_chunk in zip(
        input_ids.split(inference_batch_size),
        attention_mask.split(inference_batch_size),
    ):
        batch_output = model(
            ids_chunk.to(model_device),
            attention_mask=mask_chunk.to(model_device),
        )
        logit_chunks.append(batch_output.logits.cpu())

# Re-assemble a single output object of the same class the model returns, with
# the per-batch logits stacked back along the sentence axis. Only `logits` is
# carried over; the other fields stay at their None defaults.
# NOTE: despite the variable name, this holds the model's output object
# (logits), not raw hidden states; the name is kept for later cells.
last_hidden_states = type(batch_output)(logits=torch.cat(logit_chunks))

In [12]:
# Display the raw model output. Despite the variable name, this is the
# token-classification output object whose `logits` field holds the scores.
last_hidden_states

TokenClassifierOutput(loss=None, logits=tensor([[[ 8.7263e+00, -2.2096e-01, -9.2287e-01,  ..., -1.5831e+00,
          -1.3599e+00, -1.2300e+00],
         [ 1.0925e+01, -3.4858e-01, -1.7687e+00,  ..., -1.8523e+00,
          -1.2979e+00, -1.5752e+00],
         [ 1.1218e+01, -4.8180e-01, -1.4845e+00,  ..., -1.6833e+00,
          -1.6667e+00, -1.2625e+00],
         ...,
         [ 9.7625e+00, -5.5739e-01, -1.5718e+00,  ..., -1.6915e+00,
          -1.3323e+00, -1.5667e+00],
         [ 9.5080e+00, -5.0637e-01, -1.3887e+00,  ..., -1.5824e+00,
          -1.3212e+00, -1.6749e+00],
         [ 9.1156e+00, -4.1663e-01, -1.4644e+00,  ..., -1.5645e+00,
          -1.3705e+00, -1.6793e+00]],

        [[ 9.0401e+00, -6.3305e-01, -8.1791e-01,  ..., -1.3848e+00,
          -1.5682e+00, -1.3040e+00],
         [ 1.1295e+01, -9.9236e-01, -1.3954e+00,  ..., -1.5412e+00,
          -1.6942e+00, -1.2654e+00],
         [ 1.1355e+01, -1.1130e+00, -1.1480e+00,  ..., -1.3878e+00,
          -1.8952e+00, -1.2091e+00],