Reference: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [38]:
import os
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import pandas as pd
from scipy.special import softmax

In [40]:
RAW_COMMENT_PATH = 'data_2018_2023/comments.csv'
RAW_POST_PATH = 'data_2018_2023/posts.csv'

CHARLIE_LABEL_PATH = 'manual/charlie_comments.csv'
DANIEL_LABEL_PATH = 'manual/daniel_comments.csv'
DILLAN_LABEL_PATH = 'manual/dillan_comments.csv'

COMMENT_OUTPUT_DIR = 'full_data_batch_label/labeled_comments'
POST_OUTPUT_DIR = 'full_data_batch_label/labeled_posts'

In [36]:
def make_prediction(batch_idx, text_idx, text, tokenizer, model):
    '''Acquire sentiment prediction for a single text.'''
    try:
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        pred_idx = np.argmax(scores)
    except:
        print('Failed to predict sentiment for batch %i text %i!' % (batch_idx, text_idx))
        return ('error', 0)
    
    return (config.id2label[pred_idx], scores[pred_idx])


def make_batch_prediction(batch_idx, output_filename, batch, tokenizer, model):
    '''Acquire sentiment prediction for a batch of texts'''
    text_list = list(batch.body)
    prediction_list = [make_prediction(batch_idx, text_idx, text, tokenizer, model) for text_idx, text in enumerate(text_list)]
    
    batch['predicted_label'] = [pair[0] for pair in prediction_list]
    batch['confidence'] = [pair[1] for pair in prediction_list]
    batch.to_csv(output_filename, index=False)
    
    print(batch.predicted_label.value_counts(normalize=True))

### Model setup

In [2]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Example prediction task

text = "It's meh"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
ranking = np.argsort(scores)
ranking = ranking[::-1]
scores

array([0.61255765, 0.32987192, 0.05757042], dtype=float32)

In [4]:
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) negative 0.6126
2) neutral 0.3299
3) positive 0.0576


### Input Data Setup
Load data for the main prediction task

In [30]:
# Predicting sentiment for the full dataset
main_comment_df = pd.read_csv(RAW_COMMENT_PATH)
main_post_df = pd.read_csv(RAW_POST_PATH)

main_comment_df.body = main_comment_df.body.fillna('')
main_post_df.title_text_combined = main_post_df.title_text_combined.fillna('')

# Rename the text field in post df to be consistent with comment df
main_post_df.rename({'title_text_combined': 'body'}, axis=1, inplace=True)

# Split into 40 or 4 batches as checkpoints to recover from interruptions
comment_batches = np.array_split(main_comment_df, 40)
post_batches = np.array_split(main_post_df, 4)

Or load manually annotated subset for model validation

In [14]:
# Predicting sentiment for round 2 manual labels
charlie_label_df = pd.read_csv(CHARLIE_LABEL_PATH)
dillan_label_df = pd.read_csv(DILLAN_LABEL_PATH)
daniel_label_df = pd.read_csv(DANIEL_LABEL_PATH)

comparison = pd.DataFrame()
comparison['charlie_label'] = charlie_label_df.sentiment_label
comparison['dillan_label'] = dillan_label_df.sentiment_label
comparison['daniel_label'] = daniel_label_df.sentiment_label
comparison.head()

Unnamed: 0,charlie_label,dillan_label,daniel_label
0,1,1,1
1,-1,-1,-1
2,1,-1,1
3,1,1,-1
4,1,1,0


### Prediction Loop

#### 1. Manual label validation

In [21]:
validation_texts = charlie_label_df.body.fillna('')
comparison['body'] = validation_texts

output = [make_prediction(0, idx, text, tokenizer, model) for idx, text in enumerate(validation_texts)]
comparison['model_label'] = [pair[0] for pair in output]
comparison['model_label'] = comparison['model_label'].replace({'positive': 1, 'negative': -1, 'neutral': 0, 'error': 0})
comparison['model_confidence'] = [pair[1] for pair in output]
comparison.head()

Failed to predict sentiment for batch 0 text 310!


Unnamed: 0,charlie_label,dillan_label,daniel_label,body,model_label,model_confidence
0,1,1,1,congrats on passing!!!!!!! this is something t...,1,0.974098
1,-1,-1,-1,bro…relax,0,0.690845
2,1,-1,1,I had the same problem coming in as a freshman...,1,0.376017
3,1,1,-1,"Well, can’t give ya comfort because I think th...",0,0.558477
4,1,1,0,Grades don’t really matter. I graduated years ...,0,0.478843


In [28]:
# Counting the number of annotators that the model agrees with for each text
accuracy = (comparison.charlie_label == comparison.model_label).astype(int) + (comparison.dillan_label == comparison.model_label).astype(int) + (comparison.daniel_label == comparison.model_label.astype(str)).astype(int)
accuracy = pd.DataFrame(accuracy.value_counts()).reset_index()
accuracy.columns = ['level_of_match', 'number_of_match']
accuracy

Unnamed: 0,level_of_match,number_of_match
0,3,221
1,2,95
2,1,71
3,0,46


<b>Validation Results
* Total number of observations: 433
* Percentage of model predictions that matches at least two human labels: 0.73
* Percentage of model predictions that matches at least one human label: 0.89

======================================================================================================
#### 2. Full dataset prediction

<b>Comments Sentiment Predictions
* Rough estimation of sentiment distribution: Neutral-50%, Positive-25%, Negative-25%, Error-0.1%

In [None]:
for idx, batch in enumerate(comment_batches):
    output_filename = os.path.join(COMMENT_OUTPUT_DIR, 'batch_%i.csv' % idx)
    make_batch_prediction(idx, output_filename, batch, tokenizer, model)

<b>Posts Sentiment Predictions
* Rough estimation of sentiment distribution: Neutral-73%, Positive-9%, Negative-18%, Error-0.5%

In [None]:
for idx, batch in enumerate(post_batches):
    output_filename = os.path.join(POST_OUTPUT_DIR, 'batch_%i.csv' % idx)
    make_batch_prediction(idx, output_filename, batch, tokenizer, model)