In [22]:
import re
import torch
import pandas as pd
import numpy as np
from transformers import pipeline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoConfig
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

In [17]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
config = RobertaConfig.from_json_file('/content/drive/MyDrive/mml/models/roberta_finetuned/config.json')
model = RobertaForSequenceClassification.from_pretrained('/content/drive/MyDrive/mml/models/roberta_finetuned/model.safetensors', config=config)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [3]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

def predict_labels(df):
    # Lists to store predictions and probabilities
    predictions = []
    probabilities = []

    # Process data sequentially
    for text in df['combined']:
        # Preprocess the text
        tokens = preprocess_text(text)

        # Tokenize the text
        tokenized_text = tokenizer(tokens, padding=True, truncation=True, return_tensors='pt')

        # Model inference
        with torch.no_grad():
            outputs = model(**tokenized_text)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)
            predictions.append(torch.argmax(probs, dim=1).tolist())
            probabilities.append(probs.squeeze().tolist())

    # Add predictions and probabilities to the DataFrame
    df.loc[:, 'predicted_label'] = [p[0] for p in predictions]
    df.loc[:, 'confidence score'] = [round(p[0], 5) for p in probabilities]

    return df

In [4]:
def predict_irony(df):
    """
    Predict irony labels for each text in the DataFrame.

    Args:
    - df (DataFrame): DataFrame containing the text data in a column named 'combined'.

    Returns:
    - df (DataFrame): DataFrame with an additional column 'irony_predictions' containing the irony labels.
    """
    # Create an empty list to store the predictions
    irony_predictions = []

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-irony")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-irony")

    # Process data sequentially
    for text_list in df['combined']:
        # Join the list of words into a single text string
        text = " ".join(text_list)

        # Tokenize the text
        inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

        # Model inference
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_class_idx = torch.argmax(outputs.logits).item()
            predicted_label = model.config.id2label[predicted_class_idx]

        # Append the predicted label to the list of predictions
        irony_predictions.append(predicted_label)

    # Add the list of predictions as a new column in the DataFrame
    df['irony_predictions'] = irony_predictions
    return df


In [23]:
def add_sentiment(df, text_column):
    # Create an empty list to store the predictions
    sentiments = []

    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    for text_list in df['combined']:
        # Join the list of words into a single text string
        text = " ".join(text_list)

        # Tokenize the text
        inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

        # Model inference
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_class_idx = torch.argmax(outputs.logits).item()
            predicted_label = model.config.id2label[predicted_class_idx]

        # Append the predicted label to the list of predictions
        sentiments.append(predicted_label)

    # Add the list of predictions as a new column in the DataFrame
    df['sentiment'] = sentiments
    return df

In [28]:
# Load and preprocess the dataset
dataset_path = "/content/drive/MyDrive/mml/datasets/cleaned_happy.csv"
df = pd.read_csv(dataset_path)

In [30]:
df_sample = df.sample(frac=1).reset_index(drop=True).head(50)
df_sample

Unnamed: 0,Title,Num_Comments,Body,Subreddit,combined,ID
0,I just opened a stuck door using wits and will...,7,We had a stuck aluminum door that wasn’t locke...,CongratsLikeImFive,"['opened', 'stuck', 'door', 'using', 'wits', '...",f080cfa3dc0c1e171aef06e1c17355eaf94e225205ad3a...
1,Why did so many Late Medieval English peasants...,4,"I have been reading ""The Ties That Bound"" by B...",AskHistorians,"['late', 'medieval', 'english', 'peasants', 'd...",9hk6oa
2,Cheesecake Brought Us Together: A Tale of Conn...,0,&#x200B;\n\nhttps://preview.redd.it/jduqxtn57y...,masteringcheesecake,"['cheesecake', 'brought', 'together', 'tale', ...",1b90hbl
3,celebrating kindness and care!,1,Just wanting to share some nice feels! \n\n\n...,polyamory,"['celebrating', 'kindness', 'care', 'wanting',...",oxusq6
4,What's behind the fluorescent red color of bre...,19,My rainbow shiners (Notropis chrosomus) are br...,AskScience,"[""what's"", 'behind', 'fluorescent', 'red', 'co...",ba27e5412c0786857e2717e501085dc5b7413980c02bdc...
5,Here’s a second sign of Spring that makes me h...,2,I hope it does the same for you!,happy,"['second', 'sign', 'spring', 'makes', 'happy',...",897fa5784a53030ba38638583f62dcba2189594aade1a2...
6,Your nation's intriguing mysteries,26,"Some things can't be explained in our world, a...",worldbuilding,"[""nation's"", 'intriguing', 'mysteries', ""can't...",wb1y6t
7,Old Cartoon,1,"I remember from a couple of decades ago, a car...",NoStupidQuestions,"['old', 'cartoon', 'remember', 'couple', 'deca...",a7c545510f9647e4238809e9d5968b31454bc6fb28e8b1...
8,"Did anything good happen in 2023? Actually, yes!",1,- WHO approved a new and affordable malaria v...,Positive,"['anything', 'happen', 'actually', 'yes', 'app...",909d2c1c29992a4a699aa3151056fec33978046658308c...
9,[N] Ooops... OpenAI CTO Mira Murati on which d...,270,Is it only me or there is a massive lawsuit co...,MachineLearning,"['n', 'ooops', 'openai', 'cto', 'mira', 'murat...",b1e2b514103544519b6ee1d20fd3b030a73518e6d05c8a...


In [31]:
df_with_sentiment = add_sentiment(df_sample, 'combined')
df_with_sentiment

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0,Title,Num_Comments,Body,Subreddit,combined,ID,sentiment
0,I just opened a stuck door using wits and will...,7,We had a stuck aluminum door that wasn’t locke...,CongratsLikeImFive,"['opened', 'stuck', 'door', 'using', 'wits', '...",f080cfa3dc0c1e171aef06e1c17355eaf94e225205ad3a...,neutral
1,Why did so many Late Medieval English peasants...,4,"I have been reading ""The Ties That Bound"" by B...",AskHistorians,"['late', 'medieval', 'english', 'peasants', 'd...",9hk6oa,neutral
2,Cheesecake Brought Us Together: A Tale of Conn...,0,&#x200B;\n\nhttps://preview.redd.it/jduqxtn57y...,masteringcheesecake,"['cheesecake', 'brought', 'together', 'tale', ...",1b90hbl,neutral
3,celebrating kindness and care!,1,Just wanting to share some nice feels! \n\n\n...,polyamory,"['celebrating', 'kindness', 'care', 'wanting',...",oxusq6,neutral
4,What's behind the fluorescent red color of bre...,19,My rainbow shiners (Notropis chrosomus) are br...,AskScience,"[""what's"", 'behind', 'fluorescent', 'red', 'co...",ba27e5412c0786857e2717e501085dc5b7413980c02bdc...,neutral
5,Here’s a second sign of Spring that makes me h...,2,I hope it does the same for you!,happy,"['second', 'sign', 'spring', 'makes', 'happy',...",897fa5784a53030ba38638583f62dcba2189594aade1a2...,neutral
6,Your nation's intriguing mysteries,26,"Some things can't be explained in our world, a...",worldbuilding,"[""nation's"", 'intriguing', 'mysteries', ""can't...",wb1y6t,neutral
7,Old Cartoon,1,"I remember from a couple of decades ago, a car...",NoStupidQuestions,"['old', 'cartoon', 'remember', 'couple', 'deca...",a7c545510f9647e4238809e9d5968b31454bc6fb28e8b1...,neutral
8,"Did anything good happen in 2023? Actually, yes!",1,- WHO approved a new and affordable malaria v...,Positive,"['anything', 'happen', 'actually', 'yes', 'app...",909d2c1c29992a4a699aa3151056fec33978046658308c...,neutral
9,[N] Ooops... OpenAI CTO Mira Murati on which d...,270,Is it only me or there is a massive lawsuit co...,MachineLearning,"['n', 'ooops', 'openai', 'cto', 'mira', 'murat...",b1e2b514103544519b6ee1d20fd3b030a73518e6d05c8a...,neutral


In [26]:
df_irony = predict_irony(df_sample)
df_irony

Unnamed: 0,Title,Num_Comments,Body,Subreddit,combined,ID,sentiment,irony_predictions
0,Have you ever seen someone hug all his/her sur...,0,"Also, have you ever seen someone hug a tree?",Hugs,"['ever', 'seen', 'someone', 'hug', 'surroundin...",ff45f79b89be455ec8e87df4d277413241c13ddfa059f6...,neutral,non_irony
1,Positive experiences with Lexapro?,119,I hear a whole lot about people having negativ...,lexapro,"['positive', 'experiences', 'lexapro', 'hear',...",sq1txv,neutral,non_irony
2,Hit an alligator 🐊,21,"Earlier today on the anti deer post, I had men...",motorcycles,"['hit', 'alligator', '🐊', 'earlier', 'today', ...",wli93w,neutral,non_irony
3,In a world obsessed with productivity,1,\nwhat's the one unconventional habit that has...,Inspiration,"['world', 'obsessed', 'productivity', ""what's""...",a97e7197e2326be2916ca4a76bb6e72abf19c6f5ebd373...,neutral,non_irony
4,I was raised in toxic House and my father died,1,Me and my sister were raised in toxic environm...,mentalhealth,"['raised', 'toxic', 'house', 'father', 'died',...",18jqtme,neutral,non_irony
5,I'm grateful for today,1,I'm grateful for today. I'm grateful for the p...,Gratitude,"['grateful', 'today', 'grateful', 'today', 'gr...",e013c3980a0dbfc62d908a0fffc55a98249765fcfb9f48...,neutral,non_irony
6,Grateful I could afford it alone,5,Grateful for HER choice to eat! \n\nMaaaaaaaaa...,Gratitude,"['grateful', 'afford', 'alone', 'grateful', 'c...",c067ee3c8219cedf0fb035f7b426d164017ed87410cf16...,neutral,non_irony
7,Feeling trapped and a failure,3,"A week ago, I couldn't sleep all night because...",Encouragement,"['feeling', 'trapped', 'failure', 'week', 'ago...",80257c8782893ef6b5199ede49abdb5cfc7a115dbf4901...,neutral,non_irony
8,future litter,2,both mom and dad are AKC registered I am in Am...,Rottweiler,"['future', 'litter', 'mom', 'dad', 'akc', 'reg...",136q4zu,neutral,non_irony
9,Weekly Casual Conversation Thread,386,Welcome to [r/Chicago](https://www.reddit.com/...,chicago,"['weekly', 'casual', 'conversation', 'thread',...",178ya6s,neutral,non_irony


In [27]:
# Call the function to predict labels
df_predicted = predict_labels(df_irony)

df_predicted

Unnamed: 0,Title,Num_Comments,Body,Subreddit,combined,ID,sentiment,irony_predictions,predicted_label,confidence score
0,Have you ever seen someone hug all his/her sur...,0,"Also, have you ever seen someone hug a tree?",Hugs,"['ever', 'seen', 'someone', 'hug', 'surroundin...",ff45f79b89be455ec8e87df4d277413241c13ddfa059f6...,neutral,non_irony,0,0.99998
1,Positive experiences with Lexapro?,119,I hear a whole lot about people having negativ...,lexapro,"['positive', 'experiences', 'lexapro', 'hear',...",sq1txv,neutral,non_irony,0,0.99991
2,Hit an alligator 🐊,21,"Earlier today on the anti deer post, I had men...",motorcycles,"['hit', 'alligator', '🐊', 'earlier', 'today', ...",wli93w,neutral,non_irony,0,0.9999
3,In a world obsessed with productivity,1,\nwhat's the one unconventional habit that has...,Inspiration,"['world', 'obsessed', 'productivity', ""what's""...",a97e7197e2326be2916ca4a76bb6e72abf19c6f5ebd373...,neutral,non_irony,0,0.99996
4,I was raised in toxic House and my father died,1,Me and my sister were raised in toxic environm...,mentalhealth,"['raised', 'toxic', 'house', 'father', 'died',...",18jqtme,neutral,non_irony,0,0.99989
5,I'm grateful for today,1,I'm grateful for today. I'm grateful for the p...,Gratitude,"['grateful', 'today', 'grateful', 'today', 'gr...",e013c3980a0dbfc62d908a0fffc55a98249765fcfb9f48...,neutral,non_irony,0,0.99983
6,Grateful I could afford it alone,5,Grateful for HER choice to eat! \n\nMaaaaaaaaa...,Gratitude,"['grateful', 'afford', 'alone', 'grateful', 'c...",c067ee3c8219cedf0fb035f7b426d164017ed87410cf16...,neutral,non_irony,0,0.9989
7,Feeling trapped and a failure,3,"A week ago, I couldn't sleep all night because...",Encouragement,"['feeling', 'trapped', 'failure', 'week', 'ago...",80257c8782893ef6b5199ede49abdb5cfc7a115dbf4901...,neutral,non_irony,1,0.0041
8,future litter,2,both mom and dad are AKC registered I am in Am...,Rottweiler,"['future', 'litter', 'mom', 'dad', 'akc', 'reg...",136q4zu,neutral,non_irony,0,0.99997
9,Weekly Casual Conversation Thread,386,Welcome to [r/Chicago](https://www.reddit.com/...,chicago,"['weekly', 'casual', 'conversation', 'thread',...",178ya6s,neutral,non_irony,0,0.99998


In [None]:
df.to_csv("/content/drive/MyDrive/mml/datasets/predictions.csv", index=False)