In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
import pickle as pkl

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
"""
Mounting G-drive
"""

# Colab-only step: mount Google Drive at /content/drive so the dataset stored
# under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
"""
Loading dataset for analysis
"""

# Location of the labelled airline-tweet CSV on the mounted Drive.
DATASET_PATH = "/content/drive/MyDrive/Airline_Sentiment_analysis/dataset/Usecase3_Customer_Sentiment_Dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,airline_sentiment,airline,text
0,neutral,Virgin America,@VirginAmerica What @dhepburn said.
1,positive,Virgin America,@VirginAmerica plus you've added commercials t...
2,neutral,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,negative,Virgin America,@VirginAmerica it's really aggressive to blast...
4,negative,Virgin America,@VirginAmerica and it's a really big bad thing...


In [6]:
def tweet_preprocessor(tweet):
    """Mask user mentions and links in a tweet.

    The tweet is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; any other token that starts
    with 'http' becomes 'http'. Remaining tokens are kept unchanged, and the
    tokens are re-joined with single spaces.

    Args:
        tweet: The tweet text (str).

    Returns:
        The tweet with mentions and links replaced by placeholders.
    """
    def _mask(token):
        # A lone '@' is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return "http"
        return token

    return " ".join(_mask(token) for token in tweet.split(' '))

In [30]:
# Overwrite the text column with the masked tweets. Re-running this cell is
# harmless: '@user' and 'http' map to themselves in tweet_preprocessor.
df['text'] = df['text'].apply(tweet_preprocessor)

In [7]:
# load model and tokenizer
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [8]:
# Class names indexed by position: sent_analyze returns labels[argmax(scores)],
# so this order must line up with the model's output class indices.
# NOTE(review): assumes the checkpoint orders its classes negative/neutral/positive
# - confirm against the model card.
labels = ['negative', 'neutral', 'positive']

In [11]:
def sent_analyze(tweet):
    """Predict the sentiment label of a single tweet.

    The tweet is masked with ``tweet_preprocessor``, tokenized with the
    module-level ``tokenizer`` and scored by the module-level ``model``. The
    entry of ``labels`` at the index of the highest softmax score is returned.

    Args:
        tweet: Tweet text (str).

    Returns:
        The element of ``labels`` whose score is highest.
    """
    tweet = tweet_preprocessor(tweet)
    encoded_tweet = tokenizer(tweet, return_tensors='pt')

    # Inference only: skip autograd bookkeeping rather than building a graph
    # and detaching from it afterwards.
    with torch.no_grad():
        output = model(**encoded_tweet)

    # First element of the model output, first (and only) sequence in the batch.
    # NOTE(review): for sequence-classification outputs without labels this is
    # expected to be the logits - confirm against the transformers docs.
    scores = softmax(output[0][0].numpy())

    return labels[np.argmax(scores)]

In [12]:
# Run the classifier on every tweet and store the predicted label per row.
df["RoBERTa_Label"] = df["text"].apply(sent_analyze)

In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [14]:
# Compute the confusion matrix
cm = confusion_matrix(df['airline_sentiment'], df['RoBERTa_Label'])
print(cm)

[[6526 2162  490]
 [ 383 2144  572]
 [  38  193 2132]]


In [15]:
# Per-class precision / recall / F1 / support, in the order given by `labels`.
precision, recall, f1_score, support = precision_recall_fscore_support(
    df['airline_sentiment'],
    df['RoBERTa_Label'],
    labels=labels,
)

#Results
for _metric_name, _metric_values in (
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 score:', f1_score),
):
    print(_metric_name, _metric_values)

Precision: [0.9393983  0.47655034 0.66750157]
Recall: [0.71104816 0.69183608 0.90224291]
F1 score: [0.80942636 0.56435904 0.7673205 ]


In [31]:
# printing and checking the false cases (Labelled vs pretrained output)
for label in labels:
    false_df = df[(df['airline_sentiment'] == label) & (df['RoBERTa_Label'] != label)]

false_df

Unnamed: 0,airline_sentiment,airline,text,RoBERTa_Label
1,positive,Virgin America,@user plus you've added commercials to the exp...,negative
40,positive,Virgin America,"@user View of downtown Los Angeles, the Hollyw...",neutral
114,positive,Virgin America,@user come back to #PHL already. We need you t...,negative
148,positive,Virgin America,@user #flight home to #dc #sunset #globe in' #...,neutral
183,positive,Virgin America,😎 RT @user You’ve met your match. Got status o...,neutral
...,...,...,...,...
13856,positive,American,@user #AmericanView Sweet Home Chicago http,neutral
13977,positive,American,@user I tried that. They won't book us with an...,negative
14281,positive,American,@user Hmm. Looks like you looked at my tweet f...,neutral
14382,positive,American,"@user I'm flying with your competitor today, s...",negative


In [20]:
# (rows, columns) of the full frame, for comparison with the error subsets.
df.shape

(14640, 4)

In [34]:
# Tweets labelled negative in the dataset that RoBERTa predicted as positive.
labelled_negative = df['airline_sentiment'].eq("negative")
predicted_positive = df['RoBERTa_Label'].eq("positive")
false_positive = df.loc[labelled_negative & predicted_positive]

false_positive

Unnamed: 0,airline_sentiment,airline,text,RoBERTa_Label
55,negative,Virgin America,@user hi! i'm so excited about your $99 LGA-&g...,positive
82,negative,Virgin America,@user you're the best!! Whenever I (begrudging...,positive
106,negative,Virgin America,@user called your service line and was hung up...,positive
217,negative,Virgin America,@user I was really looking forward to my fligh...,positive
288,negative,Virgin America,@user husband and I ordered three drinks via m...,positive
...,...,...,...,...
14519,negative,American,"@user because of you, I am doing the one thing...",positive
14532,negative,American,@user missing a full days of work thanks guys,positive
14563,negative,American,@user All flts to JFK Cancelled Flightled Thx ...,positive
14572,negative,American,"@user When I left Orlando, I was 2nd in line f...",positive


In [33]:
# Tweets labelled positive in the dataset that RoBERTa predicted as negative.
labelled_positive = df['airline_sentiment'].eq("positive")
predicted_negative = df['RoBERTa_Label'].eq("negative")
false_negative = df.loc[labelled_positive & predicted_negative]

false_negative

Unnamed: 0,airline_sentiment,airline,text,RoBERTa_Label
1,positive,Virgin America,@user plus you've added commercials to the exp...,negative
114,positive,Virgin America,@user come back to #PHL already. We need you t...,negative
307,positive,Virgin America,@user I mean. Probably inappropriate while on ...,negative
663,positive,United,@user private jet would have been cool! Do doe...,negative
1187,positive,United,@user The DEN b44 agent (9:30am) was amazing. ...,negative
1613,positive,United,@user can Cancelled Flight my flight anytime i...,negative
1851,positive,United,@user This is probably the least dependable ai...,negative
2517,positive,United,@user thank you.\nIt's my daughters 13th bd pa...,negative
2716,positive,United,@user you suck. @user you're the best.,negative
2866,positive,United,@user On the plane but thanks! Maybe don't let...,negative


In [28]:
# (rows, columns) of the labelled-negative / predicted-positive subset.
false_positive.shape

(490, 4)

In [29]:
# (rows, columns) of the labelled-positive / predicted-negative subset.
false_negative.shape

(38, 4)

In [35]:
# It looks like most of the false negatives are actually negative tweets. The false positives often contain sarcasm, and some tweets combine appreciation and criticism in a single message, which is tricky for a language model.