In [1]:
import importlib
import pandas as pd
import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from Project.utils.storage import truth_db as t_db
from Project.utils.storage import bluesky_db as b_db


importlib.reload(t_db)
importlib.reload(b_db)


truth_db = t_db.SQLiteTruthSaver(db_name='../db/truthsocial.db')
bluesky_db = b_db.SQLiteBlueSkySaver(db_name='../db/bluesky.db')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# THE PLAN:

# 1. Label some comments from both socials, with Gemini

# 2. Fine tune a HuggingFace model

# 3. Run the model, don't care about performance honestly


In [2]:
truth_db.cursor.execute("""
SELECT content, gemini_label
FROM Posts NATURAL JOIN CommentAnalysis
""")
truth_data = truth_db.cursor.fetchall()

bluesky_db.cursor.execute("""
SELECT content, gemini_label
FROM Posts NATURAL JOIN CommentAnalysis
""")
bluesky_data = bluesky_db.cursor.fetchall()

data = truth_data + bluesky_data

df = pd.DataFrame(data, columns=['content', 'label'])
df['label'] = [{'Neutral': 0, 'Republican': 1, 'Democratic': 2}.get(l) for l in df['label']]

In [3]:
df

Unnamed: 0,content,label
0,<p>Score!</p>,0
1,<p>The 170 can make maple syrup!</p>,0
2,<p>Exactly and they’re not just in one party !...,0
3,<p>why dont you go away</p>,0
4,<p>Democrats are destroying America. Why? Is i...,1
...,...,...
4395,California led the way on this one. Don’t forg...,0
4396,📌I see Gavin Newsom saying that a man who is r...,2
4397,That's the point. So there's nothing left but ...,2
4398,A stroller does not have to cost a thousand bu...,0


In [4]:
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']  # Ensures balanced splits
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 3520, Test size: 880


In [5]:
# Model
model_name = "matous-volf/political-leaning-politics"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3  # 0, 1, 2
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("launch/POLITICS", use_fast=False)
def tokenize_function(data):
    return tokenizer(
        data['content'],
        truncation=True,
        padding=True,
        max_length=512
    )

In [6]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map: 100%|██████████| 3520/3520 [00:02<00:00, 1365.87 examples/s]
Map: 100%|██████████| 880/880 [00:00<00:00, 1227.19 examples/s]


In [7]:
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_df['label']),
    y=train_df['label']
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

print(f"Class weights: {class_weights}")

Class weights: tensor([0.5090, 2.1295, 1.7671])


In [8]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions, output_dict=True)

    return {
        'accuracy': accuracy,
        'f1_macro': report['macro avg']['f1-score'],
        'f1_weighted': report['weighted avg']['f1-score']
    }

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    warmup_steps=100,                   # Start with a smaller LR
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,                  # Regularization
    eval_steps=200,                     # How often it does evaluation
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,             # Needed if metric is f1 instead of loss
    report_to=None,
)

model.gradient_checkpointing_enable()


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [9]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
200,No log,0.67356,0.718182,0.555087,0.678162
400,No log,0.860501,0.661364,0.617458,0.678215
600,0.738700,0.834527,0.745455,0.687885,0.751763
800,0.738700,0.772857,0.726136,0.682134,0.737337
1000,0.501100,0.971516,0.709091,0.676428,0.721736
1200,0.501100,1.080729,0.692045,0.651945,0.705056


TrainOutput(global_step=1320, training_loss=0.5616306940714518, metrics={'train_runtime': 1225.9534, 'train_samples_per_second': 8.614, 'train_steps_per_second': 1.077, 'total_flos': 2778477691207680.0, 'train_loss': 0.5616306940714518, 'epoch': 3.0})

In [10]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:")
for key, value in results.items():
    print(f"{key}: {value}")

Evaluation Results:
eval_loss: 0.8345268964767456
eval_accuracy: 0.7454545454545455
eval_f1_macro: 0.6878851613264981
eval_f1_weighted: 0.7517625157383435
eval_runtime: 18.5469
eval_samples_per_second: 47.447
eval_steps_per_second: 5.931
epoch: 3.0


In [13]:
# Save model
model.save_pretrained('../data/bluesky_truth_model')
tokenizer.save_pretrained('../data/bluesky_truth_model')

('../data/bluesky_truth_model\\tokenizer_config.json',
 '../data/bluesky_truth_model\\special_tokens_map.json',
 '../data/bluesky_truth_model\\vocab.json',
 '../data/bluesky_truth_model\\merges.txt',
 '../data/bluesky_truth_model\\added_tokens.json')