In [1]:
import pandas as pd
from transformers import pipeline
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a tokenizer's batched output.

    ``tokenized_texts`` is a mapping from field name (it must include
    ``"input_ids"``) to a per-example sequence; item ``i`` of the dataset is
    the mapping of every field name to that field's ``i``-th entry.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference to the batched encoding; nothing is copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        """Return the number of examples, taken from the ``input_ids`` field."""
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict with the ``idx``-th entry of every field."""
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the tokenizer and the sequence-classification weights of the MBTI BERT
# checkpoint, then wrap both in a text-classification pipeline.
tokenizer, model = (
    AutoTokenizer.from_pretrained("theta/MBTI-ckiplab-bert"),
    AutoModelForSequenceClassification.from_pretrained("theta/MBTI-ckiplab-bert"),
)
test = pipeline(model=model, tokenizer=tokenizer, task="text-classification")

In [27]:
# Load the labelled evaluation sentences.
df = pd.read_csv('test_set.csv')
# Lower-case through the vectorised string accessor: unlike
# `.apply(str.lower)` it propagates missing values as NaN instead of raising
# a TypeError on them.
df['Sentence'] = df['Sentence'].str.lower()
# Keep the reference labels under their own name.
y = df['Label']

In [28]:
# Off-the-shelf RoBERTa-large sentiment pipeline.
rob = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
# Name the DistilBERT checkpoint explicitly rather than relying on the
# library's implicit default for the task: this is the model the default
# resolved to when the notebook was run, and pinning it keeps the comparison
# reproducible and silences the "No model was supplied" warning.
bert = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [29]:
# Instantiate the tokenizer and classification model for the checkpoint named
# below, and wrap the model in a Trainer built with default arguments.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)
trainer = Trainer(model=model)
# Texts to score: the Sentence column without missing rows, as plain strings.
pred_texts = (
    df['Sentence']
    .dropna()
    .astype('str')
    .tolist()
)

In [30]:
# Encode all texts in one call (padded and truncated), then expose the
# encoding as an indexable dataset.
tokenized_texts = tokenizer(pred_texts, padding=True, truncation=True)
pred_dataset = SimpleDataset(tokenized_texts)

Ignored unknown kwarg option direction


In [31]:
# Run predictions
predictions = trainer.predict(pred_dataset)
logits = predictions.predictions
# Transform predictions to labels
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the class axis. Subtracting the row-wise maximum before
# exponentiating leaves the result unchanged mathematically but prevents
# overflow in `np.exp` for large logits.
shifted_logits = logits - logits.max(axis=-1, keepdims=True)
probabilities = np.exp(shifted_logits)
probabilities = probabilities / probabilities.sum(axis=-1, keepdims=True)
# Confidence of each prediction = probability of the winning class.
scores = probabilities.max(axis=1)
# Create DataFrame with texts, predictions, labels, and scores
roberta_pred = pd.DataFrame(list(zip(pred_texts, preds, labels, scores)), columns=['text', 'pred', 'rob_label', 'rob_score'])

***** Running Prediction *****
  Num examples = 44
  Batch size = 8
  0%|          | 0/6 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 6/6 [00:09<00:00,  1.50s/it]

In [32]:
# Score every sentence with the DistilBERT pipeline.
x = df.Sentence.to_list()
# Run the pipeline exactly once per sentence and reuse the result for both the
# label and the score (calling it separately for each doubles inference time).
bert_results = [bert(sent)[0] for sent in x]
predict = [result['label'] for result in bert_results]
score = [result['score'] for result in bert_results]
bert_df = pd.DataFrame({'bert_label': predict, 'bert_score': score})
# Side-by-side table: RoBERTa predictions, DistilBERT predictions, gold labels.
# NOTE(review): the three parts are aligned on their index; confirm that
# `roberta_pred` covers the same rows as `x` and `y`.
df = pd.concat((roberta_pred, bert_df, y), axis=1)

In [33]:
# Total winning-class score per model, BERT first and then RoBERTa
# (Roberta is more confident).
for score_column in ('bert_score', 'rob_score'):
    print(df[score_column].sum())

42.2007577419281
43.249733


In [34]:
def numberize(x):
    """Map the label ``'POSITIVE'`` to 1 and any other value to 0."""
    return 1 if x == 'POSITIVE' else 0
        
# Convert both models' string labels to 0/1 integers.
bert_label, rob_label = (
    df[label_column].apply(numberize)
    for label_column in ('bert_label', 'rob_label')
)

In [15]:
from transformers import RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.nn.functional as F

In [17]:
# Load the sentiment checkpoint into a plain `RobertaModel`. As the warning
# printed by this cell reports, the checkpoint's `classifier.*` weights are
# not used and the `roberta.pooler.*` weights are newly initialised.
# Note that this rebinds the notebook-wide name `model`.
model = RobertaModel.from_pretrained("siebert/sentiment-roberta-large-english")

Some weights of the model checkpoint at siebert/sentiment-roberta-large-english were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [19]:
# Write the model currently bound to `model` to a local directory.
# NOTE(review): the destination is an absolute, user-specific path, so this
# cell will not run unchanged on another machine.
model.save_pretrained('/Users/gabrielnicholson/Desktop/corona/pre_trained_roberta')

In [20]:
# Load a `RobertaModel` from the local directory; the result is not assigned,
# so the cell only displays the module's repr.
# NOTE(review): absolute, user-specific path — not portable across machines.
RobertaModel.from_pretrained('/Users/gabrielnicholson/Desktop/corona/pre_trained_roberta')

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (d

In [36]:
class RobertaClassifier(nn.Module):
    """RoBERTa encoder followed by a two-layer classification head.

    The head takes the encoder's hidden state at sequence position 0, applies
    a 1024 -> 1024 linear layer, ReLU and dropout (p=0.2), and projects to
    2 output logits.
    """

    def __init__(self):
        super().__init__()
        # Attribute names determine the state-dict keys, so they are part of
        # the checkpoint format and must not be renamed.
        self.l1 = RobertaModel.from_pretrained("siebert/sentiment-roberta-large-english")
        self.pre_classifier = nn.Linear(1024, 1024)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(1024, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw (un-normalised) class logits for a batch."""
        encoder_outputs = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, sliced at sequence position 0.
        first_token_state = encoder_outputs[0][:, 0]
        features = torch.relu(self.pre_classifier(first_token_state))
        return self.classifier(self.dropout(features))
        

In [37]:
# Build the classifier architecture, then restore the fine-tuned weights from
# the checkpoint file, mapping every tensor onto the CPU.
finetuned_rob = RobertaClassifier()
finetuned_rob.load_state_dict(
    torch.load(
        '/Users/gabrielnicholson/Downloads/covid_checkpoint.pth',
        map_location=torch.device('cpu'),
    )
)

loading configuration file https://huggingface.co/siebert/sentiment-roberta-large-english/resolve/main/config.json from cache at /Users/gabrielnicholson/.cache/huggingface/transformers/228e83e1ade2247aebc5f0725e330fa58dedee3d9eec36c9249f25084a946130.1aece0680a18a95d51d6e1a5f83631412da37b87db65380c52052161354505ba
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_tok

<All keys matched successfully>

In [38]:
# Choose the torch device for inference: CUDA when available, otherwise CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [21]:
# Tokenizer belonging to the sentiment checkpoint named below.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = RobertaTokenizer.from_pretrained(
    model_name,
    truncation=True,
    do_lower_case=True,
)

In [22]:
# tokenizer.save_pretrained('/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer')

('/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer/tokenizer_config.json',
 '/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer/special_tokens_map.json',
 '/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer/vocab.json',
 '/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer/merges.txt',
 '/Users/gabrielnicholson/Desktop/corona/pre_trained_tokenizer/added_tokens.json')

In [41]:
# Reload the evaluation set from disk, rebinding `df`.
df = pd.read_csv('test_set.csv')
# Lower-case through the vectorised string accessor: unlike
# `.apply(str.lower)` it propagates missing values as NaN instead of raising
# a TypeError on them.
df['Sentence'] = df['Sentence'].str.lower()

In [42]:
def sentiment_prediction(sentence, model, tokenizer, max_length=82):
    """Classify a single sentence and report the winning class with its probability.

    Parameters
    ----------
    sentence : str
        Text to classify.
    model : torch.nn.Module
        Callable as ``model(input_ids, attention_mask, token_type_ids)`` and
        returning class logits with the batch on the first axis.
    tokenizer
        Object exposing ``encode_plus``; it is asked for PyTorch tensors.
    max_length : int, optional
        Fixed token length every input is padded or truncated to (default 82).

    Returns
    -------
    dict
        ``{'prediction': <index of the highest-probability class>,
        'score': <softmax probability of that class>}``.
    """
    # Send the inputs to the device the model's weights are on, instead of
    # depending on a notebook-level `device` variable that may be undefined or
    # may disagree with where the model actually lives.
    try:
        model_device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        model_device = torch.device("cpu")

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        encoded_review = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation, `max_length` only pads: longer inputs would
            # pass through at their full length.
            truncation=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(model_device)
        attention_mask = encoded_review['attention_mask'].to(model_device)
        token_type_ids = encoded_review["token_type_ids"].to(model_device)
        logits = model(input_ids, attention_mask, token_type_ids)
        # Probabilities for the first (only) item of the batch, converted to
        # numpy once and reused below.
        probabilities = F.softmax(logits, dim=-1)[0].cpu().numpy()

    predicted_index = np.argmax(probabilities)
    return {'prediction': predicted_index, 'score': probabilities[predicted_index]}


In [None]:
# Using fine tuned roberta to classify multiple sentences.
# `correct` counts predictions equal to the gold label; `confidence`
# accumulates the winning-class probability over all sentences.
correct = 0
confidence = 0
# Pair each sentence with its label positionally. Looking labels up with
# `df.Label[index]` is label-based and only matches the enumerate counter
# while the frame still has its default 0..n-1 index.
for sentence, label in zip(df['Sentence'], df['Label']):
    prediction = sentiment_prediction(sentence.lower(), finetuned_rob, tokenizer)
    confidence += prediction['score']
    if prediction['prediction'] == label:
        correct += 1

In [2]:
# Accuracy summary for the three models. This cell needs `np`, `df`,
# `rob_label`, `bert_label` and `correct` from the cells above, so it raises
# NameError when run on a fresh kernel.
print('Results ', '-' * 40)
print(f'Roberta got {np.sum(rob_label == df.Label)}/{len(df)} correct')
print(f'BERT got {np.sum(bert_label == df.Label)}/{len(df)} correct')
# Use the actual number of rows rather than a hard-coded 44, so the
# denominator stays right if the test set changes.
print(f'Finetuned Roberta got {correct}/{len(df)} correct')

Results  ----------------------------------------


NameError: name 'np' is not defined