Reference: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import pandas as pd
from scipy.special import softmax

### Model setup

In [4]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
# Example prediction task

text = "It's meh"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
ranking = np.argsort(scores)
ranking = ranking[::-1]
scores

array([0.61255765, 0.32987192, 0.05757042], dtype=float32)

In [62]:
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) negative 0.6126
2) neutral 0.3299
3) positive 0.0576


### Input Data Setup
Load whichever input data of interest

In [63]:
'''
# Predicting sentiment for the full dataset
main_comment_df = pd.read_csv('data_2018_2023/comments.csv')
main_comment_df.body = main_comment_df.body.fillna('')

# Split into 40 batches as checkpoints
batches = np.array_split(main_comment_df, 40)
'''

"\n# Predicting sentiment for the full dataset\nmain_comment_df = pd.read_csv('data_2018_2023/comments.csv')\nmain_comment_df.body = main_comment_df.body.fillna('')\n\n# Split into 40 batches as checkpoints\nbatches = np.array_split(main_comment_df, 40)\n"

In [75]:
# Predicting sentiment for round 2 manual labels
charlie_label_df = pd.read_csv('manual/charlie_comments.csv')
dillan_label_df = pd.read_csv('manual/dillan_comments.csv')
daniel_label_df = pd.read_csv('manual/daniel_comments.csv')

comparison = pd.DataFrame()
comparison['charlie_label'] = charlie_label_df.sentiment_label
comparison['dillan_label'] = dillan_label_df.sentiment_label
comparison['daniel_label'] = daniel_label_df.sentiment_label

In [76]:
comparison.head()

Unnamed: 0,charlie_label,dillan_label,daniel_label
0,1,1,1
1,-1,-1,-1
2,1,-1,1
3,1,1,-1
4,1,1,0


### Prediction Loop

In [67]:
def make_prediction(batch_idx, text_idx, text, tokenizer, model):
    try:
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        pred_idx = np.argmax(scores)
    except RuntimeError:
        print('ERROR at batch %i text %i! Checking text length: %i' % (batch_idx, text_idx, len(text.split())))
        return ('error', 0)
    
    return (config.id2label[pred_idx], scores[pred_idx])

In [68]:
def make_batch_prediction(batch_idx, output_filename, batch, tokenizer, model):
    text_list = list(batch.body)
    prediction_list = [make_prediction(batch_idx, text_idx, text, tokenizer, model) for text_idx, text in enumerate(text_list)]
    
    batch['predicted_label'] = [pair[0] for pair in prediction_list]
    batch['confidence'] = [pair[1] for pair in prediction_list]
    batch.to_csv(output_filename, index=False)
    
    print(batch.predicted_label.value_counts(normalize=True))

#### 1. Manual label validation

In [77]:
validation_texts = charlie_label_df.body.fillna('')
comparison['body'] = validation_texts

output = [make_prediction(0, idx, text, tokenizer, model) for idx, text in enumerate(validation_texts)]
comparison['model_label'] = [pair[0] for pair in output]
comparison['model_label'] = comparison['model_label'].replace({'positive': 1, 'negative': -1, 'neutral': 0, 'error': 0})
comparison['model_confidence'] = [pair[1] for pair in output]

ERROR at batch 0 text 310! Checking text length: 549


In [78]:
comparison

Unnamed: 0,charlie_label,dillan_label,daniel_label,body,model_label,model_confidence
0,1,1,1,congrats on passing!!!!!!! this is something t...,1,0.974098
1,-1,-1,-1,bro…relax,0,0.690845
2,1,-1,1,I had the same problem coming in as a freshman...,1,0.376017
3,1,1,-1,"Well, can’t give ya comfort because I think th...",0,0.558477
4,1,1,0,Grades don’t really matter. I graduated years ...,0,0.478843
...,...,...,...,...,...,...
428,0,1,0,"It wasn’t a hard assignment, I started at 10:3...",1,0.629152
429,1,1,1,Don’t be scared!!! If you can get through 280 ...,1,0.910908
430,-1,0,0,Just bomb that part of hw and it's just less t...,-1,0.693472
431,0,-1,0,"it takes 15 minutes to read the instructions, ...",-1,0.559133


In [90]:
# Counting the number of annotators that the model agrees with for each text
level_of_match = (comparison.charlie_label == comparison.model_label).astype(int) + (comparison.dillan_label == comparison.model_label).astype(int) + (comparison.daniel_label == comparison.model_label.astype(str)).astype(int)
level_of_match

0      3
1      0
2      2
3      0
4      1
      ..
428    1
429    3
430    1
431    1
432    1
Length: 433, dtype: int64

In [91]:
level_of_match.value_counts()

3    221
2     95
1     71
0     46
dtype: int64

======================================================================================================
#### 2. Full dataset prediction

In [None]:
'''
for idx, batch in enumerate(batches):
    output_filename = 'full_data_batch_label/labeled/batch_%i.csv' % idx
    make_batch_prediction(idx, output_filename, batch, tokenizer, model)
'''