# v6 - BERT, Huggingface, Deep Learning, Sentiment Analysis

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest 

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Preprocess: TwitterTokenizer
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)


In [4]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Covid cases are increasing fast!"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")



Downloading: 100%|██████████| 929/929 [00:00<00:00, 483kB/s]
Downloading: 100%|██████████| 899k/899k [00:02<00:00, 405kB/s] 
Downloading: 100%|██████████| 456k/456k [00:01<00:00, 229kB/s]  
Downloading: 100%|██████████| 239/239 [00:00<00:00, 65.3kB/s]
Downloading: 100%|██████████| 501M/501M [01:10<00:00, 7.08MB/s] 
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

1) negative 0.7236
2) neutral 0.2287
3) positive 0.0477


In [7]:
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'], on_bad_lines='skip')
    df_train_neg['label'] = -1
    df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train   


def load_test_data():
    # Load test data: id, tweet for each row
    data_path = 'data/twitter-datasets/'
    df_test = pd.read_csv(data_path + 'test_data.txt', header=None, names=['line'], sep='\t')
    # Extract id and tweet, limit split by 1 so we don't split the tweet (this is v0, at least we keep it intact)
    df_test['id'] = df_test['line'].apply(lambda x: x.split(',',1)[0]) 
    df_test['tweet'] = df_test['line'].apply(lambda x: x.split(',',1)[1])
    df_test = df_test.drop('line', axis=1)
    return df_test

In [6]:
# we can easily map the highest between negative and positive to -1 and 1 and discard the neutral class
# let's run on a few tweets without even training a model and just check 
import pandas as pd
tweets = load_train_data()

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


In [8]:
tweets.head()

Unnamed: 0,tweet,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1


In [20]:
# let's run on first 1000 tweets and see how it performs vs the actual labels
tweets_1000 = tweets[:1000]
tweets_1000['tweet'] = tweets_1000['tweet'].apply(lambda x: preprocess(x))

encoded_tweets_1000 = tweets_1000['tweet'].apply(lambda x: tokenizer(x, return_tensors='pt'))
output_1000 = encoded_tweets_1000.apply(lambda x: model(**x))
scores_1000 = output_1000.apply(lambda x: x[0][0].detach().numpy())
scores_1000 = scores_1000.apply(lambda x: softmax(x))


# Print labels and scores
ranking = scores_1000.apply(lambda x: np.argsort(x))
ranking = ranking[::-1]
for i in range(scores_1000.shape[0]):
    l = config.id2label[ranking[i][0]]
    s = scores_1000[i][ranking[i][0]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_1000['tweet'] = tweets_1000['tweet'].apply(lambda x: preprocess(x))


1) negative 0.0298
2) positive 0.0052
3) negative 0.005
4) negative 0.0046
5) negative 0.0034
6) negative 0.0014
7) negative 0.0115
8) negative 0.0375
9) negative 0.01
10) negative 0.0132
11) negative 0.0115
12) negative 0.0025
13) negative 0.0167
14) negative 0.0095
15) negative 0.0163
16) positive 0.1604
17) negative 0.003
18) negative 0.0302
19) negative 0.0574
20) negative 0.0617
21) positive 0.0104
22) negative 0.0066
23) positive 0.0314
24) negative 0.016
25) positive 0.0275
26) negative 0.0024
27) positive 0.0793
28) negative 0.0982
29) negative 0.016
30) positive 0.0901
31) negative 0.0038
32) negative 0.0165
33) positive 0.0798
34) negative 0.0024
35) negative 0.0025
36) negative 0.0022
37) negative 0.0034
38) positive 0.0296
39) negative 0.0313
40) negative 0.0017
41) positive 0.1006
42) negative 0.0145
43) negative 0.0045
44) negative 0.0034
45) negative 0.0032
46) negative 0.0038
47) positive 0.0255
48) negative 0.0058
49) positive 0.0558
50) negative 0.0055
51) negative 0.

In [45]:
# let's refine: we argsort, take the best, if it's neutral, we take the second best
def get_best_label(scores, binary=True):
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    if binary:
        if config.id2label[ranking[0]] == 'neutral':
            return config.id2label[ranking[1]]
        else:
            return config.id2label[ranking[0]]
    else:
        return config.id2label[ranking[0]]


    

In [22]:
tweets_1000['pred'] = scores_1000.apply(lambda x: get_best_label(x))
tweets_1000['pred'] = tweets_1000['pred'].apply(lambda x: 1 if x == 'positive' else -1)
tweets_1000['correct'] = tweets_1000['pred'] == tweets_1000['label']
tweets_1000.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_1000['pred'] = scores_1000.apply(lambda x: get_best_label(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_1000['pred'] = tweets_1000['pred'].apply(lambda x: 1 if x == 'positive' else -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_1000['correct'] = tweets_1000['pred'] == 

Unnamed: 0,tweet,label,pred,correct
0,<user> i dunno justin read my mention or not ....,1,1,True
1,"because your logic is so dumb , i won't even c...",1,-1,False
2,<user> just put casper in a box ! looved the...,1,1,True
3,<user> <user> thanks sir > > don't trip lil ma...,1,1,True
4,visiting my brother tmr is the bestest birthda...,1,1,True


In [23]:
# let's see how many we got right
tweets_1000['correct'].value_counts() / tweets_1000.shape[0]

True     0.801
False    0.199
Name: correct, dtype: float64

In [24]:
# lets compare the scores with the actual labels
tweets_1000['label'].value_counts()

1    1000
Name: label, dtype: int64

In [57]:
# This is with absolutely zero training.
# lets randomize the tweets and do this on 1000 tweets again so we also have negative tweets
#tweets_1000 = tweets.sample(1000)

# but first let's add a helper function to do the same thing as above: preprocess, tokenize, predict, get best label
def predict_label(tweets):
    tweets['tweet'] = tweets['tweet'].apply(lambda x: preprocess(x))
    encoded_tweets = tweets['tweet'].apply(lambda x: tokenizer(x, return_tensors='pt'))
    output = encoded_tweets.apply(lambda x: model(**x))
    scores = output.apply(lambda x: x[0][0].detach().numpy())
    scores = scores.apply(lambda x: softmax(x))
    tweets['pred_binary'] = scores.apply(lambda x: get_best_label(x, binary=True))
    tweets['pred_binary'] = tweets['pred_binary'].apply(lambda x: 1 if x == 'positive' else -1)
    # -1, 1, 0 for negative, positive, neutral
    tweets['pred'] = scores.apply(lambda x: get_best_label(x, binary=False))
    tweets['pred'] = tweets['pred'].apply(lambda x: 1 if x == 'positive' else -1 if x == 'negative' else 0)
    return tweets

def pred_test(tweets):
    tweets = predict_label(tweets)
    tweets['correct'] = tweets['pred_binary'] == tweets['label']
    tweets['not_sure'] = tweets['pred'] == 0
    print("Correct:",tweets['correct'].value_counts() / tweets.shape[0])
    print("Out of:", tweets['label'].value_counts())
    print("Not sure:", tweets['not_sure'].value_counts())
    # how many neutral tweets do we have? How many would be classified correctly if we just take the second best?
    print("Not Sure Tweets:", tweets_1000['pred'].value_counts()[0])
    print("Not Sure but correct:", tweets_1000[(tweets_1000['pred'] == 0) & (tweets_1000['correct'] == True)].shape[0])
    print("Not Sure but incorrect:", tweets_1000[(tweets_1000['pred'] == 0) & (tweets_1000['correct'] == False)].shape[0])
    # confusion matrix
    from sklearn.metrics import confusion_matrix
    confusion_matrix(tweets_1000['label'], tweets_1000['pred'])
    return tweets


In [36]:
tweets_1000 = tweets.sample(1000)
tweets_1000 = pred_test(tweets_1000)

True     0.679
False    0.321
Name: correct, dtype: float64
Out of:  1    511
-1    489
Name: label, dtype: int64


In [37]:
# let's save the model 
model.save_pretrained('data/out/trained/roberta-base-sentiment-analysis')

In [38]:
# We can fine tune the model on our data
tweets_1000.head(25)

Unnamed: 0,tweet,label,pred,correct
409198,just reading lots of peoples bios you guys are...,1,1,True
1822788,at the barbeeerrr wish i had a phone,-1,-1,True
380559,<user> haha thats coo and yeah dude ill hit yo...,1,1,True
1985239,<user> your so mean to me i went to give you a...,-1,-1,True
343549,<user> i will text you or something,1,-1,False
663883,<user> <user> a nice nokia tablet ? i would en...,1,1,True
825233,kickin it at greenhouse nyc w jb & <user> 2sup...,1,1,True
1993245,<user> just got the messages asking for my num...,-1,-1,True
1478877,<user> well we should hang out more ~ ! i miss...,-1,1,False
1599953,<user> i haven't seen you since perks ! you're...,-1,-1,True


In [39]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(tweets_1000['label'], tweets_1000['pred'])

array([[417,  72],
       [249, 262]])

In [51]:
# How could we improve this?
# let's save also the neutral tweets and just classify the best, let's see how good the actual model is
# added code for Binary = True/False in prediction of best label:

tweets_1000 = tweets.sample(1000)
tweets_1000 = pred_test(tweets_1000)
tweets_1000.head(25)

Correct: True     0.661
False    0.339
Name: correct, dtype: float64
Out of: -1    506
 1    494
Name: label, dtype: int64
Not sure: False    613
True     387
Name: not_sure, dtype: int64


Unnamed: 0,tweet,label,pred_binary,pred,correct,not_sure
1748396,can't wait to ask the doctor about these cramp...,-1,-1,-1,True,False
993776,civil wars on gma,1,-1,0,False,True
15297,<user> with angie & barbie <3 whoresss,1,-1,0,False,True
336303,<user> thanks,1,1,1,True,False
795914,<user> did you go to playboy the other day ? i...,1,1,0,True,True
155903,<user> just a little lol but those things are ...,1,1,1,True,False
2095262,just want to be home laying in my bed right now,-1,1,1,False,False
333092,<user> she's got to 10k i followed her ageees ...,1,1,1,True,False
1690015,"dsquared 2 men's tennis sn018 v01631 sneaker ,...",-1,1,0,False,True
1037602,<user> <user> not pissed at all ! i want my ba...,1,1,1,True,False


In [55]:
# how many neutral tweets do we have? How many would be classified correctly if we just take the second best?
print("Not Sure Tweets:", tweets_1000['pred'].value_counts()[0])
print("Not Sure but correct:", tweets_1000[(tweets_1000['pred'] == 0) & (tweets_1000['correct'] == True)].shape[0])
print("Not Sure but incorrect:", tweets_1000[(tweets_1000['pred'] == 0) & (tweets_1000['correct'] == False)].shape[0])
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(tweets_1000['label'], tweets_1000['pred'])

Not Sure Tweets: 387
Not Sure but correct: 186
Not Sure but incorrect: 201


array([[201, 219,  86],
       [  0,   0,   0],
       [ 52, 168, 274]])

---------