# Step 2 - 

In [14]:
%pip install tqdm
%pip install ollama

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Basic NLP
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chrisbutterworth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
# Open CSV file and convert to dataframe
df = pd.read_csv('reddit_corpus.csv')

In [35]:
# AI analysis

prompt_minzero = """You are a helpful research assistant who is analyzing Reddit posts.

DEFINITION:
Rage bait is content deliberately designed to elicit anger or outrage by being frustrating, provocative, or offensive, typically posted in order to increase traffic to or engagement.

TASK:
Read the text below and decide whether it contains any rage bait.

Give only a one digit response, 1 if the text contains rage bait or 0 if it does not - no additional information is needed.
"""
def classify(text, prompt, aimodel):
    # Connect to Ollama
    response: ChatResponse = chat(model=aimodel, messages=[
        {
            'role': 'system',
            'content': prompt
        },
        {
            'role': 'user',
            'content': text,
        },
    ])

    answer = response.message.content

    # If the model doesn't 100% follow the prompt and gives an explanation
    # let's cut it down to last 3 letters 
    if (answer != "1" and answer != "0"):
        answer = answer[:1]

    return answer

def classify_batch(texts, prompt, model="deepseek-r1:8b"):
    predictions = []
    for text in tqdm(texts):
        prediction = classify(text, prompt, model)
        predictions.append(prediction)

    return predictions


# Gemma3
predictions_gemma = classify_batch(df['text'].tolist(), prompt_minzero, "gemma3:4b")
df['is_rage_bait'] = predictions_gemma

# Update CSV

df.to_csv("reddit_reviewed_full.csv", index=False)

"""
This was purely used to create a head start and then manually reviewed to double 
check scores and fix incorrect responses
"""

100%|███████████████████████████████████████| 1200/1200 [10:29<00:00,  1.91it/s]


In [92]:
# Analysis 1 - Question frequency & distribution

df['num_questions'] = df['text'].str.count("\\?")

# Compare rage bait vs. control
rage_bait_q_rate = df[df['is_rage_bait']=="1"]['has_question'].mean();
control_q_rate = df[df['is_rage_bait']=="0"]['has_question'].mean()

print(f"Questions in rage bait: {rage_bait_q_rate:.1%}")
print(f"Questions in control: {control_q_rate:.1%}")

print(df['is_rage_bait'].value_counts().get('1', 0))
print(df['is_rage_bait'].value_counts().get('0', 0))
# df.to_csv("reddit_reviewed_an1.csv", index=False)




Questions in rage bait: 51.8%
Questions in control: 45.7%
820
339
339


  print(df['is_rage_bait'].value_counts().get(1, 0))


In [72]:
# Analysis 2 - Question type classification

def classify_question_type(text):
    text_lower = text.lower()
    if 'why do' in text_lower or 'why does' in text_lower:
        return 'why_do'
    elif 'does anyone' in text_lower or 'is it' in text_lower:
        return 'does_anyone_else'
    elif 'when did' in text_lower or 'when will' in text_lower:
        return 'when_temporal'
    elif 'how is' in text_lower and 'not' in text_lower:
        return 'how_is_not'
    elif "what's wrong with" in text_lower:
        return 'whats_wrong'
    else:
        return 'other_question'

df['question_type'] = df['text'].apply(classify_question_type)

# Analyze distribution
rage_type_counts = df[df['is_rage_bait']=="1"]['question_type'].value_counts()
control_type_counts = df[df['is_rage_bait']=="0"]['question_type'].value_counts()

df.to_csv("reddit_reviewed_an2.csv", index=False)
print('Rage question type counts')
print(rage_type_counts)

print('Control question type counts')
print(control_type_counts)

Rage question type counts
question_type
other_question      761
why_do               33
does_anyone_else     26
Name: count, dtype: int64
Control question type counts
question_type
other_question      303
why_do               28
does_anyone_else      8
Name: count, dtype: int64


In [74]:
# Analysis 3: Presupposition trigger counting

presupposition_triggers = {
    'factive_verbs': ['know', 'realize', 'forget', 'remember', 'regret', 'aware'],
    'change_of_state': ['stop', 'start', 'become', 'cease', 'continue', 'turn'],
    'temporal': ['still', 'always', 'never', 'anymore', 'yet'],
    'universals': ['everyone', 'all', 'every', 'nobody', 'none', 'no one']
}

def count_triggers(text, trigger_list):
    count = 0
    text_lower = text.lower()
    for trigger in trigger_list:
        count += len(re.findall(r'\b' + trigger + r'\b', text_lower))
    return count

for category, triggers in presupposition_triggers.items():
    df[f'{category}_count'] = df['text'].apply(lambda x: count_triggers(x, triggers))

# Compare rage bait vs. control
for category in presupposition_triggers.keys():
    rage_bait_avg = df[df['is_rage_bait']=="1"][f'{category}_count'].mean()
    control_avg = df[df['is_rage_bait']=="0"][f'{category}_count'].mean()
    print(f"{category}: Rage bait avg={rage_bait_avg:.2f}, Control avg={control_avg:.2f}")

df.to_csv("reddit_reviewed_an3.csv", index=False)

factive_verbs: Rage bait avg=0.01, Control avg=0.03
change_of_state: Rage bait avg=0.02, Control avg=0.02
temporal: Rage bait avg=0.04, Control avg=0.02
universals: Rage bait avg=0.05, Control avg=0.03


In [77]:
# Analysis 4: Sentiment analysis

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']  # -1 (very negative) to +1 (very positive)

df['sentiment'] = df['text'].apply(get_sentiment)

# Compare
print(f"Rage bait sentiment: {df[df['is_rage_bait']=="1"]['sentiment'].mean():.3f}")
print(f"Control sentiment: {df[df['is_rage_bait']=="0"]['sentiment'].mean():.3f}")

df.to_csv("reddit_review_an4.csv", index=False)

Rage bait sentiment: -0.036
Control sentiment: 0.062


In [79]:
# Analysis 5 - Engagement correlation

import pandas as pd
import numpy as np
from scipy import stats

# Engagement metrics (use whichever you have)
engagement_metric = 'num_comments'  # or 'upvotes' or combined score

# Features to test
features = {
    'num_questions': 'Question count',
    'has_question': 'Question presence',
    'sentiment': 'Sentiment',
    'universals_count': 'Universal quantifiers',
    'temporal_count': 'Temporal markers',
    'factive_verbs_count': 'Factive verbs',
    'change_of_state_count': 'Change-of-state verbs'
}

print("Correlations with engagement (comments):\n")
results = {}

for feature, label in features.items():
    if feature in df.columns:
        corr = df[feature].corr(df[engagement_metric])
        # Statistical significance
        _, p_value = stats.pearsonr(df[feature].dropna(), 
                                     df[engagement_metric].dropna())
        results[label] = {'r': corr, 'p': p_value}
        sig = "***" if p_value < .001 else "**" if p_value < .01 else "*" if p_value < .05 else "ns"
        print(f"{label:30s} r = {corr:6.3f}  (p = {p_value:.3f}) {sig}")

# Sort by correlation strength
sorted_results = sorted(results.items(), key=lambda x: abs(x[1]['r']), reverse=True)

Correlations with engagement (comments):

Question count                 r = -0.083  (p = 0.004) **
Question presence              r = -0.087  (p = 0.003) **
Sentiment                      r = -0.018  (p = 0.525) ns
Universal quantifiers          r =  0.069  (p = 0.018) *
Temporal markers               r = -0.007  (p = 0.801) ns
Factive verbs                  r = -0.004  (p = 0.897) ns
Change-of-state verbs          r = -0.014  (p = 0.640) ns
