In [1]:
import json
import pandas as pd

# Load JSON file
data = []
with open("data/Sarcasm_Headlines_Dataset_v2.json", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line.strip()))  # Load each line separately

df = pd.DataFrame(data)

In [7]:
import spacy

spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
from collections import Counter

nlp = spacy.load("en_core_web_sm")

def extract_entities(texts):
    person_counter = Counter()
    for text in texts:
        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE"]:
                person_counter[ent.text] += 1
    return person_counter

# Run separately on sarcastic and non-sarcastic subsets
sarcastic_counts = extract_entities(df[df["is_sarcastic"] == 1]["headline"].tolist())
nonsarcastic_counts = extract_entities(df[df["is_sarcastic"] == 0]["headline"].tolist())


In [14]:
print(sarcastic_counts.most_common(10))
print(nonsarcastic_counts.most_common(10))

[('u.s.', 212), ('white house', 100), ('bush', 98), ('clinton', 84), ('congress', 68), ('gop', 61), ('america', 58), ('senate', 47), ('god', 43), ('obama', 42)]
[('donald trump', 285), ('u.s.', 231), ('gop', 209), ('america', 158), ('hillary clinton', 133), ('trump', 102), ("donald trump's", 96), ('senate', 84), ('california', 76), ('congress', 70)]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch

print(torch.cuda.is_available())  # Should print True if CUDA is enabled
print(torch.version.cuda) 

tokenizer = AutoTokenizer.from_pretrained("charaydes/deberta-v3-large-finetuned-sarcasm")
model = AutoModelForSequenceClassification.from_pretrained("charaydes/deberta-v3-large-finetuned-sarcasm").to("cuda") 

bush = "bush makes last-minute push to appeal to whites"
donald_trump = "donald trump makes last-minute push to appeal to whites"
headlines = [bush, donald_trump]

inputs = tokenizer(headlines, padding=True, truncation=True, return_tensors="pt").to("cuda")

batch_size = 16
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
loader = DataLoader(dataset, batch_size=batch_size)

all_preds = []
torch.cuda.empty_cache()
with torch.no_grad():
    for batch in loader:
        input_ids, attention_mask = [x.to("cuda") for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().tolist())

print(all_preds)

True
12.1


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[1, 0]
