In [1]:
import pandas as pd

In [2]:
file_path = "gold_data.tsv"

In [3]:
df = pd.read_csv(file_path, sep="\t")

print(df.head())
print(df.info())

   split post_id comment_id   parent_id  \
0  train  pmvxko    hcl8hgm  t1_hcl8cj8   
1   test  pu79mu    he18r6i   t3_pu79mu   
2  train  quuhr8    hksqs65   t3_quuhr8   
3  train  xbm6xz    io0kw4p   t3_xbm6xz   
4  train  qtu5tz    hkmv1b0   t3_qtu5tz   

                                      tagged_comment  \
0  [SENT] I ’m so glad that these things can be d...   
1  [SENT] Is it me or is this game fiery and fast...   
2  [SENT] Lol all the 🤡 saying [IN] was only goin...   
3  [IN] lol people writing off the season before ...   
4                      [SENT] I cant stop smiling !    

                 ref_expressions               ref_pos             ref_tags  \
0                             []                    []                   []   
1                             []                    []                   []   
2  ['this team ', 'the Titans ']  [(29, 38), (75, 85)]  ['[IN]', '[OTHER]']   
3                     ['[SENT]']              [(0, 6)]             ['[IN]']   
4        

In [5]:
df['tagged_comment']

0       [SENT] I ’m so glad that these things can be d...
1       [SENT] Is it me or is this game fiery and fast...
2       [SENT] Lol all the 🤡 saying [IN] was only goin...
3       [IN] lol people writing off the season before ...
4                           [SENT] I cant stop smiling ! 
                              ...                        
1494              [SENT] So ... Jordan Addison round 1 ? 
1495    [SENT] What needs to happen for [IN] to make t...
1496    [SENT] I think by toss I meant moreso the inef...
1497    [SENT] It was a new drive , not like [IN] was ...
1498    [SENT] Our [IN] should start holding since app...
Name: tagged_comment, Length: 1499, dtype: object

In [6]:
type(df)

pandas.core.frame.DataFrame

In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

In [8]:


# Load the trained model and tokenizer
model_path = './hyperbole_model'  # Adjust this path to where your model is saved
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Set the device
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Function to predict hyperboles
def predict_hyperboles(texts, model, tokenizer, device, batch_size=32):
    model.eval()
    encoded_texts = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    dataset = TensorDataset(encoded_texts['input_ids'], encoded_texts['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1)
            predictions.extend(pred.cpu().numpy())
    
    return predictions

# Assuming df is your DataFrame with 1500 rows and 'tagged_comment' is the column with text
texts = df['tagged_comment'].tolist()

# Get predictions
predictions = predict_hyperboles(texts, model, tokenizer, device)

# Add predictions to the DataFrame
df['hyperbole_prediction'] = predictions

# If you want to convert numeric predictions to labels
label_map = {0: 'Not Hyperbole', 1: 'Hyperbole'}  # Adjust based on your model's output
df['hyperbole_label'] = df['hyperbole_prediction'].map(label_map)

# Display the first few rows to verify
print(df[['tagged_comment', 'hyperbole_prediction', 'hyperbole_label']].head())

# You can now use df for further analysis or save it
# df.to_csv('classified_comments.csv', index=False)

                                      tagged_comment  hyperbole_prediction  \
0  [SENT] I ’m so glad that these things can be d...                     0   
1  [SENT] Is it me or is this game fiery and fast...                     0   
2  [SENT] Lol all the 🤡 saying [IN] was only goin...                     0   
3  [IN] lol people writing off the season before ...                     0   
4                      [SENT] I cant stop smiling !                      0   

  hyperbole_label  
0   Not Hyperbole  
1   Not Hyperbole  
2   Not Hyperbole  
3   Not Hyperbole  
4   Not Hyperbole  


In [26]:
filtered_df = df[df['hyperbole_prediction'] == 1]
filtered_df

Unnamed: 0,split,post_id,comment_id,parent_id,tagged_comment,ref_expressions,ref_pos,ref_tags,confs,explanation,...,timestamp,team,opp,username_anon,flair,votes,win_prob,gametime,hyperbole_prediction,hyperbole_label
8,train,xtn0z3,iqs2tc0,t3_xtn0z3,[SENT] Soft as butter under the sun,[],[],[],[5],No explicit or implicit references to tag.,...,1.664734e+09,bills,bills,8,,2.0,0.133895,0.365695,1,Hyperbole
41,test,sb0yee,htx9a96,t3_sb0yee,[SENT] Lmao [IN] bouncing off of [OUT] cracked...,"['Weddle', 'gronk']","[(12, 18), (35, 40)]","['[IN]', '[OUT]']","[5, 5]","The comment is about Weddle and Gronk, who are...",...,1.642969e+09,rams,rams,41,White & Yellow #88,13.0,0.616637,0.099277,1,Hyperbole
51,train,rp05qa,hq3nsjg,t3_rp05qa,[SENT] Fuck [OUT] . [SENT] All [OUT] 've been ...,"['Washington', 'they', 'our guys']","[(12, 22), (36, 40), (67, 75)]","['[OUT]', '[OUT]', '[IN]']","[5, 5, 5]","The commenter is a Cowboys supporter, and thei...",...,1.640574e+09,cowboys,cowboys,51,,1.0,0.999686,0.517689,1,Hyperbole
62,train,s5prqn,hszlory,t3_s5prqn,[SENT] All these commentators getting on my mf...,[],[],[],[5],No explicit or implicit references to tag.,...,1.642388e+09,steelers,steelers,62,:Never1::Never2: Never say never but... never,5.0,0.023289,0.516682,1,Hyperbole
91,train,rz96nq,hrtxca6,t3_rz96nq,[SENT] Well any damage on this drive is 100 % ...,['our'],"[(46, 49)]",['[IN]'],[5],"The use of the word 'our' is inclusive, sugges...",...,1.641679e+09,chiefs,chiefs,90,Patrick Mahomes #2,13.0,0.899691,0.149920,1,Hyperbole
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1459,train,qyz6le,hljrbk9,t3_qyz6le,[SENT] bruh up the middle with [IN] on every f...,['gaskin'],"[(31, 37)]",['[IN]'],[5],The commenter mentions a recurring issue with ...,...,1.637524e+09,dolphins,dolphins,1314,,1.0,0.584845,0.594861,1,Hyperbole
1460,train,xzpr1u,irny119,t3_xzpr1u,[SENT] Streams are absolute shit today . [SENT...,[],[],[],[5],No explicit or implicit references to tag.,...,1.665340e+09,jaguars,jaguars,1315,:JAX:,1.0,0.682548,0.481377,1,Hyperbole
1477,train,rjka12,hp4ntrm,t3_rjka12,[SENT] Wow [IN] got fucked on every flag tonight,['we'],"[(11, 13)]",['[IN]'],[5],The word 'we' usually refers to the in-group t...,...,1.639886e+09,patriots,patriots,26,,1.0,0.046078,0.908594,1,Hyperbole
1486,train,106hqsg,j3iuuou,t3_106hqsg,[SENT] [IN] was pissed [OTHER] did n't throw t...,"['McCarthy', 'the refs']","[(7, 15), (27, 35)]","['[IN]', '[OTHER]']","[5, 5]",The comment indicates frustration with the off...,...,1.673215e+09,cowboys,cowboys,1336,,3.0,0.470100,0.132481,1,Hyperbole


In [28]:
for x in filtered_df['tagged_comment']:
    print(x)

[SENT] Soft as butter under the sun 
[SENT] Lmao [IN] bouncing off of [OUT] cracked me up 
[SENT] Fuck [OUT] . [SENT] All [OUT] 've been doing is popping [IN] unnecessarily hard . 
[SENT] All these commentators getting on my mf nerves . 
[SENT] Well any damage on this drive is 100 % [IN] fault . 
[SENT] I miss the old defend every blade of grass mentality . 
[SENT] [IN] is the worst OC in the NFL 
[OUT] That ’s fucking grounding 
[SENT] Madden is the only place I can force [IN] to win . 
[SENT] My fucking heart 
[SENT] [IN] were never going to catch that Absolutely horrendous throw 
[SENT] Done for the day . [IN] Absolutely fucking unacceptable . 
[SENT] it makes sense why [IN] is out there , [IN] runs hard as shit every single time [IN] touches the ball . 
[SENT] This might be my least favorite commentator ever . 
[SENT] That post route throw to [IN] was a thing of beauty 
[SENT] [OUT] damn near killed the whole quarter 
[SENT] God I hate [IN] so much . [IN] This is utterly pathetic .

In [None]:
filtered_df.to_csv('hyperbole.tsv', sep='\t', index=False)