<a href="https://colab.research.google.com/github/Halifaxi/Covid-Reproducible/blob/main/(1)_Roberta_Covid_News_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python --version

Python 3.7.13


In [None]:

!pip install transformers; 

Collecting torch==1.4.0
  Downloading torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4 MB)
[K     |████████████████████████████████| 753.4 MB 6.8 kB/s 
[?25hInstalling collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.
torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.4.0 which is incompatible.[0m
Successfully installed torch-1.4.0
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 

In [None]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from google.colab import drive
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report
%matplotlib inline

In [None]:
torch.__version__

'1.4.0'

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


## Import Data

In [None]:
df = pd.read_csv('/content/drive/MyDrive/covid_annotations.csv')

In [None]:
# Class balance: count each label once and reuse the counts for the ratio.
label_counts = df.Label.value_counts()
print(label_counts)
# `label_counts[0]` is a label lookup (class 0 = negative), not a position.
neg_to_pos = label_counts[0] / label_counts.sum()
print(f'Proportion of negative samples in the dataset = {np.around(neg_to_pos,3)}')

0    556
1    436
Name: Label, dtype: int64
Proportion of negative samples in the dataset = 0.56


In [None]:
len(df)

992

In [None]:
def text_preprocessing(text):
    """Normalise one raw sentence before tokenisation.

    Steps, in order:
      1. remove '@name' mentions (also when the mention ends the text);
      2. turn line breaks into spaces so adjacent words are not glued together;
      3. expand a fixed set of English contractions (case-insensitive; the
         replacement is always lower-case);
      4. map 'coronavirus', 'covid-19' and 'sars-cov-2' to 'covid';
      5. collapse runs of whitespace and strip the ends;
      6. append a '.' when the text does not already end in '.', '!' or '?'.

    Params:
            text (str): raw sentence.

    Returns:
            str: cleaned sentence; an empty input stays empty.
    """
    # Remove '@name'. `(?:\s|$)` also catches a mention at the very end.
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)
    # Line breaks become spaces (deleting them would merge "rose\nsharply"
    # into "rosesharply"); the whitespace collapse below tidies up.
    text = text.replace('\n', ' ').strip()

    # Order matters: "he's" runs before "she's" exactly as before, which still
    # yields "she is" because the match sits inside the longer word.
    # ['’] accepts both the ASCII and the typographic apostrophe.
    contractions = (
        (r"we['’]ll", 'we will'),
        (r"didn['’]t", 'did not'),
        (r"don['’]t", 'do not'),
        (r"aren['’]t", 'are not'),
        (r"doesn['’]t", 'does not'),
        (r"what['’]s", 'what is'),
        (r"it['’]s", 'it is'),
        (r"that['’]s", 'that is'),
        (r"he['’]s", 'he is'),
        (r"she['’]s", 'she is'),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)

    # Use a single token for every spelling of the disease name.
    for alias in (r'coronavirus', r'covid-19', r'sars-cov-2'):
        text = re.sub(alias, 'covid', text, flags=re.IGNORECASE)

    text = re.sub(r'\s+', ' ', text).strip()
    # Guarantee terminal punctuation on non-empty text.
    if text and text[-1] not in ('.', '!', '?'):
        text += '.'
    return text
df.Sentence = df.Sentence.apply(text_preprocessing)

In [None]:
df.head()

Unnamed: 0,Sentence,Label
0,"Countys milestone comes after it hit 4,000 dea...",0
1,Riverside Countys covid death toll surpassed 5...,0
2,Numbers posted on the countys covid website Fr...,0
3,Our county team continues to work diligently t...,1
4,"Orange County , which has about 700,000 more p...",1


# Regular roberta training

In [None]:
# Select the compute device once; every later `.to(device)` call relies on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
Device name: Tesla K80


#### Preprocessing step

In [None]:
from transformers import RobertaTokenizer, RobertaModel
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = RobertaTokenizer.from_pretrained(model_name, truncation=True)

Note: as far as I recall, the pretrained RoBERTa model was first downloaded under a newer PyTorch release and saved with `_use_new_zipfile_serialization=False`, so that the resulting file can be loaded with the older PyTorch version (1.4.0) used in this notebook.

In [None]:
# How the backbone file was produced (run once under a newer PyTorch, kept for reference):
#  model = RobertaModel.from_pretrained(model_name, map_location=device)
# torch.save(model, f = '/content/drive/MyDrive/base_old_torch_rob.pth', _use_new_zipfile_serialization=False)

# Load the pickled backbone. `map_location=device` lets a file saved from a GPU
# session open on a CPU-only runtime as well.
# NOTE(review): torch.load unpickles arbitrary objects - only load files you created yourself.
model = torch.load('/content/drive/MyDrive/base_old_torch_rob.pth', map_location=device)
# torch.save(model.state_dict(),f = '/content/drive/MyDrive/covid_TEST.pth' ,_use_new_zipfile_serialization=False)

In [None]:
class SentimentData(Dataset):
    """Wraps a pandas dataframe as a PyTorch dataset of tokenised sentences.

    Params:
            dataframe: frame with a `Sentence` (text) and a `Label` column.
            tokenizer: object exposing `encode_plus(...)` (e.g. a Hugging Face tokenizer).
            max_len:   every sample is padded / truncated to exactly this many tokens.

    Returns (per item, as a dict of tensors):
            'ids',
            'mask',
            'token_type_ids',
            'targets',
    """
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Sentence
        self.targets = self.data.Label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional access: samplers hand out 0..len-1, which only matches the
        # dataframe's labels when its index happens to have been reset.
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            # Use the length given to this dataset, not a notebook-level global.
            max_length=self.max_len,
            padding='max_length',
            # Without truncation a longer sentence yields a longer tensor and
            # the default batch collation fails on the ragged shapes.
            truncation=True,
            return_token_type_ids=True,
            )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Tokenise every sentence of the full dataset once, only to measure lengths.
all_text = df.Sentence.values

# Token ids (special tokens included) for each sentence
encoded_text = []
for sentence in all_text:
    encoded_text.append(tokenizer.encode(sentence, add_special_tokens=True))

# The longest encoded sentence sets the padding length used by the datasets.
MAX_LEN = max(map(len, encoded_text))
print('Max length: ', MAX_LEN)

Max length:  109


In [None]:
# Hold out 5% of the rows for testing; the fixed seed makes the split repeatable.
train_size = 0.95
sampled_rows = df.sample(frac=train_size, random_state=42)
# Rows that were not sampled form the test set. Both frames get a fresh
# 0..n-1 index.
test_data = df.drop(sampled_rows.index).reset_index(drop=True)
train_data = sampled_rows.reset_index(drop=True)

print(f"FULL Dataset: {df.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (992, 2)
TRAIN Dataset: (942, 2)
TEST Dataset: (50, 2)


In [None]:
# Batch sizes and the optimiser step size used for fine-tuning.
TRAIN_BATCH_SIZE = 5
VALID_BATCH_SIZE = 5
LEARNING_RATE = 1e-05

# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
# Evaluation keeps the dataset order: the aggregate metrics do not depend on
# it, and a fixed order makes the per-batch logs and the collected wrong
# predictions reproducible between runs.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class RobertaClassifier(nn.Module):
    """Pickled RoBERTa backbone followed by a two-layer classification head.

    Params:
            backbone_path: file holding the backbone module saved with `torch.save`.
            map_location:  forwarded to `torch.load`. Defaults to 'cpu' so a file
                           written from a GPU session also opens on a CPU-only
                           runtime; move the model afterwards with `.to(device)`.

    The attribute names (`l1`, `pre_classifier`, `dropout`, `classifier`) are
    the state-dict keys of saved checkpoints and must not be renamed.
    """
    def __init__(self, backbone_path='/content/drive/MyDrive/base_old_torch_rob.pth', map_location='cpu'):
        super(RobertaClassifier, self).__init__()
        # self.l1 = RobertaModel.from_pretrained("/content/drive/MyDrive/pre_trained_roberta_model_base") # Save it locally for faster import time.
        # self.l1 = RobertaModel.from_pretrained("siebert/sentiment-roberta-large-english")
        # NOTE(review): torch.load unpickles arbitrary objects - only load trusted files.
        self.l1 = torch.load(backbone_path, map_location=map_location)
        # 1024 is the width of the backbone output consumed in forward().
        self.pre_classifier = torch.nn.Linear(1024, 1024)
        self.dropout = torch.nn.Dropout(0.18)
        self.classifier = torch.nn.Linear(1024, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw (pre-softmax) class logits, one row per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # The first token's vector is used as the sentence representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        # Functional ReLU: no new module object is built on every forward pass.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
        

In [None]:
model = RobertaClassifier().to(device)

# Resume from an earlier fine-tuning run when a checkpoint exists. On a fresh
# Drive there is none yet, and an unconditional load would abort the notebook
# before the first training run could create it.
CHECKPOINT_PATH = '/content/drive/MyDrive/covid_checkpoint.pth'
if os.path.exists(CHECKPOINT_PATH):
    print(model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device)))
else:
    print(f'No checkpoint found at {CHECKPOINT_PATH}; starting from the pretrained backbone.')



<All keys matched successfully>

### Fine Tuning and Training the Model

In [None]:
def calcuate_accuracy(preds, targets):
    """Count the predictions that equal their targets.

    Despite the name this returns the number of correct predictions as a
    Python int, not a ratio; callers divide by the number of examples.
    """
    matches = torch.eq(preds, targets)
    return matches.sum().item()
# Creating the loss function and optimizer
# CrossEntropyLoss is applied to the raw two-class logits returned by the
# model together with integer class targets.
loss_function = torch.nn.CrossEntropyLoss()
# Adam updates every parameter returned by model.parameters() with the
# learning rate set in LEARNING_RATE.
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """Run one pass over `training_loader`, updating the global `model`.

    Uses the notebook-level `model`, `training_loader`, `device`,
    `loss_function` and `optimizer`. Prints running metrics at step 0 (and
    every 5000th step) and the epoch totals at the end.

    Params:
            epoch: epoch number, used only in the printed summary.

    Returns:
            None
    """
    running_loss = 0
    correct = 0
    steps_done = 0
    examples_seen = 0
    model.train()
    for step, batch in tqdm(enumerate(training_loader, 0)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        # Forward pass and bookkeeping for the running metrics.
        logits = model(ids, mask, token_type_ids)
        loss = loss_function(logits, targets)
        running_loss += loss.item()
        _, predicted = torch.max(logits.data, dim=1)
        correct += calcuate_accuracy(predicted, targets)

        steps_done += 1
        examples_seen += targets.size(0)

        if step % 5000 == 0:
            print(f"Training Loss per 5000 steps: {running_loss/steps_done}")
            print(f"Training Accuracy per 5000 steps: {(correct*100)/examples_seen}")

        # Backward pass: clear stale gradients, backpropagate, clip the global
        # gradient norm to 1, then take the optimiser step.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

    epoch_accu = (correct*100)/examples_seen
    epoch_loss = running_loss/steps_done
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

In [None]:
%%time
EPOCHS = 2
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.5505173206329346
Training Accuracy per 5000 steps: 100.0


189it [01:28,  2.13it/s]


The Total Accuracy for Epoch 0: 86.94267515923566
Training Loss Epoch: 0.43486682048915004
Training Accuracy Epoch: 86.94267515923566


0it [00:00, ?it/s]

Training Loss per 5000 steps: 0.08309106528759003
Training Accuracy per 5000 steps: 100.0


189it [01:28,  2.13it/s]

The Total Accuracy for Epoch 1: 95.01061571125265
Training Loss Epoch: 0.24295600312860516
Training Accuracy Epoch: 95.01061571125265
CPU times: user 2min 24s, sys: 22.7 s, total: 2min 47s
Wall time: 2min 57s





### Model Validation

In [None]:
def valid(model, testing_loader, log_every=5000):
    """Evaluate `model` on `testing_loader` without updating any weights.

    Uses the notebook-level `device`, `loss_function` and `calcuate_accuracy`.

    Params:
            model:          classifier called as model(ids, mask, token_type_ids).
            testing_loader: iterable of batches with 'ids', 'mask',
                            'token_type_ids' and 'targets' entries.
            log_every:      print running metrics every this many batches
                            (batch 0 is always logged).

    Returns:
            Accuracy over all batches, in percent.
    """
    model.eval()
    n_correct = 0
    total_loss = 0
    n_steps = 0
    n_examples = 0
    with torch.no_grad():
        # tqdm wraps the loader itself so the progress bar knows the total.
        for num, data in enumerate(tqdm(testing_loader)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            # The index of the largest logit is the predicted class.
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            n_steps += 1
            n_examples += targets.size(0)

            if num % log_every == 0:
                loss_step = total_loss/n_steps
                accu_step = (n_correct*100)/n_examples
                # The message reports the real interval (it used to say 100
                # while the check used 5000).
                print(f"\n Validation Loss per {log_every} steps: {loss_step}")
                print(f"Validation Accuracy per {log_every} steps: {accu_step}")

    epoch_loss = total_loss/n_steps
    epoch_accu = (n_correct*100)/n_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
%%time
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

2it [00:00,  6.31it/s]


 Validation Loss per 100 steps: 1.1860120296478271
Validation Accuracy per 100 steps: 80.0


10it [00:01,  8.36it/s]

Validation Loss Epoch: 0.12521915442775935
Validation Accuracy Epoch: 98.0
Accuracy on test data = 98.00%
CPU times: user 1.19 s, sys: 7.07 ms, total: 1.2 s
Wall time: 1.21 s





In [None]:
nb_classes = 2
# (decoded sentence, predicted class, probability of the predicted class)
# for every misclassified test sentence.
wrong_preds = []
# Initialize the prediction and label lists(tensors)
predlist=torch.zeros(0,dtype=torch.long, device='cpu')
lbllist=torch.zeros(0,dtype=torch.long, device='cpu')
model.eval()
# One confidence value (probability of the predicted class) per test sentence,
# so that np.mean(how_confident) is the true mean even when the last batch is
# smaller than the others.
how_confident = []
with torch.no_grad():
    for data in testing_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        # Softmax turns the logits into class probabilities.
        outputs = F.softmax(model(ids, mask, token_type_ids), dim=-1)
        scores, preds = torch.max(outputs, 1)

        batch_preds = preds.view(-1).cpu()
        batch_targets = targets.view(-1).cpu()

        # Append batch prediction results
        predlist = torch.cat([predlist, batch_preds])
        lbllist = torch.cat([lbllist, batch_targets])
        how_confident.extend(scores.cpu().numpy().tolist())

        # keep track of the predictions when the model goes wrong.
        idx_wrong_predictions = np.where((batch_preds != batch_targets).numpy())[0]
        if len(idx_wrong_predictions) > 0:
            converted_lst = tokenizer.batch_decode(data['ids'], skip_special_tokens=True)
            # `wrong_idx` is a distinct name so it shadows no outer loop variable.
            for wrong_idx in idx_wrong_predictions:
                wrong_preds.append((converted_lst[wrong_idx], batch_preds[wrong_idx], np.max(outputs[wrong_idx].cpu().numpy())))

# Confusion matrix
conf_mat = confusion_matrix(lbllist.numpy(), predlist.numpy())
# Dict form is reused by later cells; the text form is printed below.
class_report = classification_report(lbllist.numpy(), predlist.numpy(), output_dict=True, digits=2)
print(conf_mat)
print(classification_report(lbllist.numpy(), predlist.numpy(), output_dict=False))
# Per-class accuracy
class_accuracy=100*conf_mat.diagonal()/conf_mat.sum(1)
print(class_accuracy)

[[28  0]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        28
           1       1.00      0.95      0.98        22

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50

[100.          95.45454545]


In [None]:
np.mean(how_confident)

0.9936663

### Save Model

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/covid_checkpoint.pth', _use_new_zipfile_serialization=False)

# Classification Reports

In [None]:
class_report_df = pd.DataFrame(class_report).iloc[:-1, :].T
class_report_df = class_report_df.round(2)
# class_report_df.to_latex('/content/drive/MyDrive/latex/classification_report.tex')

In [None]:
cov_rob_f1 = class_report['weighted avg']['f1-score']
cov_rob_acc = class_report['accuracy']

In [None]:
wrong_preds

[('"Guys, the covid is not a joke or a fiction. It is surprising that in the second year of the pandemic, people still need to be convinced of this," he said, calling on medical colleagues to "start talking in a frank and unadorned way to our compatriots.".',
  tensor(1),
  0.81678647),
 ('Under no circumstances is the covid simply going to disappear this summer.',
  tensor(1),
  0.5245222),
 ('These case declines are very welcome, but are taking place against a backdrop of very high viral transmission.',
  tensor(1),
  0.5646877),
 ('Although the first dose provides "some degree of protection," the second dose multiplies the level of protection by a factor of 10.',
  tensor(0),
  0.829488)]

## Comparing the big models

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

In [None]:
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
class SimpleDataset:
    """Minimal map-style dataset over a tokenizer's batch output.

    `tokenized_texts` is a mapping from field name (it must include
    "input_ids") to a per-example sequence; item `idx` is the mapping of each
    field name to its `idx`-th entry.
    """
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One entry of "input_ids" per example.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

In [None]:
pred_texts = test_data.Sentence.astype('str').tolist()

In [None]:
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# boiler plate syntax. 
trainer = Trainer(model=model)
predictions = trainer.predict(pred_dataset)

***** Running Prediction *****
  Num examples = 90
  Batch size = 8


In [None]:
# Transform predictions to labels
logits = predictions.predictions
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax confidence of the predicted class. Subtracting the row maximum first
# keeps np.exp from overflowing on large logits; the result is unchanged.
shifted = logits - logits.max(-1, keepdims=True)
probs = np.exp(shifted) / np.exp(shifted).sum(-1, keepdims=True)
scores = probs.max(1)

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
t_df = pd.DataFrame(list(zip(pred_texts,preds,scores)), columns=['text','pred','score'])
t_df.head()

Unnamed: 0,text,pred,score
0,"Countys milestone comes after it hit 4,000 dea...",1,0.995588
1,Most projected that COVID-19 cases would conti...,1,0.986337
2,Since COVID-19 vaccines became widely availabl...,0,0.987297
3,Just 2.1 million people had received their fir...,0,0.997878
4,"''In places like the U.S. and the U.K., where ...",0,0.993721


In [None]:
# Put the gold label in front of the predictions. Dropping any existing
# 'Label' column first makes the cell safe to re-run (otherwise every run
# would add another duplicate 'Label' column).
t_df = pd.concat((test_data.Label, t_df.drop(columns='Label', errors='ignore')), axis=1)

In [None]:
conf_mat = confusion_matrix(t_df.Label, t_df.pred)
class_report = classification_report(t_df.Label, t_df.pred, output_dict=True, digits=2)
print(conf_mat)
print(classification_report(t_df.Label, t_df.pred, output_dict=False))

[[32 18]
 [ 7 33]]
              precision    recall  f1-score   support

         0.0       0.82      0.64      0.72        50
         1.0       0.65      0.82      0.73        40

    accuracy                           0.72        90
   macro avg       0.73      0.73      0.72        90
weighted avg       0.74      0.72      0.72        90



In [None]:
rob_f1 = class_report['weighted avg']['f1-score']
rob_acc = class_report['accuracy']

**BERT SENTIMENT**

In [None]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the task default, so
# the baseline stays the same if the library's default ever changes.
# NOTE(review): this should be the 'sentiment-analysis' default of the
# transformers release used here - confirm against the pipeline documentation.
pipe = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Run the sentiment pipeline on every test sentence, one call per sentence,
# keeping the first (only) result of each call.
bert_results = [pipe(text)[0] for text in pred_texts]
bert_score = [result['score'] for result in bert_results]
# Encode the string labels as integers: 'POSITIVE' -> 1, anything else -> 0.
bert_pred = (np.array([result['label'] for result in bert_results]) =='POSITIVE').astype('int')

In [None]:
conf_mat = confusion_matrix(t_df.Label, bert_pred)
class_report = classification_report(t_df.Label, bert_pred, output_dict=True, digits=2)
print(conf_mat)
print(classification_report(t_df.Label, bert_pred, output_dict=False))

[[42  8]
 [25 15]]
              precision    recall  f1-score   support

         0.0       0.63      0.84      0.72        50
         1.0       0.65      0.38      0.48        40

    accuracy                           0.63        90
   macro avg       0.64      0.61      0.60        90
weighted avg       0.64      0.63      0.61        90



In [None]:
bert_f1 = class_report['weighted avg']['f1-score']
bert_acc = class_report['accuracy']

**Vader Sentiment**

In [None]:
pip install vaderSentiment



In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def sentiment_scores(sentence, analyzer=None, rng=None):
    """Classify a sentence with VADER as positive (1) or negative (0).

    Params:
            sentence: text to score.
            analyzer: object with `polarity_scores(text) -> {'compound': float, ...}`.
                      When omitted, one SentimentIntensityAnalyzer is created on
                      first use and reused by later calls, instead of being
                      rebuilt for every sentence.
            rng:      object with `choice(seq)`, e.g. a seeded
                      `np.random.RandomState`. Used only for neutral sentences;
                      defaults to the global `np.random` state.

    Returns:
            int: 1 if the compound score is >= 0.05, 0 if it is <= -0.05,
            otherwise a random pick between 0 and 1.
    """
    if analyzer is None:
        # Cache the default analyzer on the function object.
        analyzer = getattr(sentiment_scores, '_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            sentiment_scores._analyzer = analyzer

    compound = analyzer.polarity_scores(sentence)['compound']
    # decide sentiment as positive, negative and neutral
    if compound >= 0.05:
        return 1
    if compound <= -0.05:
        return 0

    # Neutral: the task is binary, so draw a class at random. Pass a seeded
    # `rng` (or seed np.random beforehand) for reproducible results.
    chooser = np.random if rng is None else rng
    return int(chooser.choice([0, 1]))

In [None]:
# Neutral sentences are assigned a random class by sentiment_scores. Seeding
# the global NumPy generator first makes the VADER baseline metrics
# reproducible from run to run.
np.random.seed(42)
vader_pred = [sentiment_scores(sent) for sent in pred_texts]

In [None]:
conf_mat = confusion_matrix(t_df.Label, vader_pred)
class_report = classification_report(t_df.Label, vader_pred, output_dict=True, digits=2)
print(conf_mat)
print(classification_report(t_df.Label, vader_pred, output_dict=False))

[[27 23]
 [ 9 31]]
              precision    recall  f1-score   support

         0.0       0.75      0.54      0.63        50
         1.0       0.57      0.78      0.66        40

    accuracy                           0.64        90
   macro avg       0.66      0.66      0.64        90
weighted avg       0.67      0.64      0.64        90



In [None]:
vader_f1 = class_report['weighted avg']['f1-score']
vader_acc = class_report['accuracy']

**Combining all classification reports together**

In [None]:
f1_scores = {'Fine-tuned RoBERTa': [cov_rob_f1,cov_rob_acc] , 'Vader': [vader_f1,vader_acc], 'Distil BERT': [bert_f1,bert_acc] ,'RoBERTa Base': [rob_f1, rob_acc]}
# acc_scores = {'Fine-tuned RoBERTa': cov_rob_acc, 'Vader': vader_acc, 'Distil BERT': bert_acc ,'Base RoBERTa': rob_acc}
compare_scores = pd.DataFrame(f1_scores).round(3)
compare_scores.index = ['F1 Score', 'Accuracy Score']
compare_scores = compare_scores.T
compare_scores

In [None]:
compare_scores.to_latex('/content/drive/MyDrive/latex/classification_comparison.tex')

### Testing model on individual cases

In [None]:
state_dict = torch.load('/content/drive/MyDrive/covid_checkpoint.pth', map_location=device)
model = RobertaClassifier().to(device)
model.load_state_dict(state_dict)

In [None]:
# testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

In [None]:
example1 = df['Sentence'][0]
example1

'Countys milestone comes after it hit 4,000 deaths in March.'

In [None]:
def sentiment_prediction(sentence, tokenizer, max_length=250):
    """Predict the class of a single sentence with the notebook-level `model`.

    Params:
            sentence:   text to classify.
            tokenizer:  tokenizer exposing `encode_plus(...)`.
            max_length: the encoded sentence is padded / truncated to this
                        many tokens.

    Returns:
            dict with 'prediction' (index of the most probable class) and
            'score' (softmax probability of that class).

    Note: `model` is read from the notebook namespace at call time and must be
    the fine-tuned RobertaClassifier, i.e. return a plain logits tensor.
    """
    # Inference mode: a freshly built model starts in training mode, where
    # dropout would make repeated predictions for one sentence differ.
    model.eval()
    with torch.no_grad():
        encoded_review = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Cut over-long inputs to max_length instead of passing them on.
            truncation=True,
            return_token_type_ids=True,
            return_tensors='pt'
            )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        token_type_ids = encoded_review["token_type_ids"].to(device)
        outputs = F.softmax(model(input_ids, attention_mask, token_type_ids), dim=-1)

        # Single-sentence batch: row 0 holds the class probabilities.
        probabilities = outputs[0].cpu().numpy()
        predicted_index = int(np.argmax(probabilities))
        predict = {'prediction':predicted_index, 'score': probabilities[predicted_index]}
        return predict

**Compare the predictions of the Hugging Face `pipeline` model with those of the fine-tuned PyTorch model**

In [None]:
test_sentence = 'while hopes are rising that a rapid rollout of covid vaccines is helping to tame the pandemic in the united states, in much of the world the virus is still surging.'
test_sentence = 'the country, which was regularly recording more than 200,000 new cases a day in january, has administered at least one dose of vaccine to more than half of adults and has averaged between 50,000 and 70,000 cases a day since mid-february'
test_sentence = 'the country, which was regularly recording more than 200,000 new cases a day in january, is now averaging less cases between 50,000 and 70,000 cases a day since mid-february'
test_sentence = 'globally, deaths have risen sharply this spring, but less so than cases, partly due to vaccine campaigns targeting those most vulnerable to covid, such as the elderly'
test_sentence = 'Daily vaccination rates have fallen to about 2.5 million doses a day, down from 3 million.'
test_sentence = 'Covid testing rates have increased.'

test_lowpr = 'Larry Dubinski, president and chief executive officer of the institute, said the newest award recipients are being recognized at a moment when rigorous science is as important as ever. “These achievements come at a critical time for us all,” he said.'

In [None]:
test = 'Seis millones de residentes de nuestra ciudad han sido vacunados hasta ahora. Y esa es la razón por la que el covid ha disminuido.'
print(sentiment_prediction(test, tokenizer))
# print(pipe(test_sentence))

{'prediction': 1, 'score': 0.7479823}


In [None]:
test_sentence = 'Vaccine rates are increasing. But delta is already here.'
# test_sentence = 'Vaccine rates are increasing. But more are on the way.'

print(sentiment_prediction(test_sentence, tokenizer))

AttributeError: ignored

In [None]:
print(sentiment_prediction(test_sentence, tokenizer))
print(pipe(test_sentence))