In [2]:
from datasets import load_dataset

# Load the cleaned AllSides dataset by its dataset id via `datasets.load_dataset`.
dataset = load_dataset("lelouch0204/cleaned_allsides_v2")

# Work with the "train" split as a pandas DataFrame for the rest of the notebook.
df = dataset["train"].to_pandas()

# Preview the first 20 rows to check the available columns and their contents.
display(df.head(20))

Generating train split: 100%|██████████| 6209/6209 [00:00<00:00, 8899.62 examples/s]


Unnamed: 0,Title,Link,Text,Source,Bias,text_length,clean_text,lemmatized,keywords,cluster,__index_level_0__
0,"As Trump throws immigration into uncertainty, ...",https://abcnews.go.com/Politics/trump-throws-i...,As President Donald Trump starts to defend his...,abcnews,lean left,5179,as president donald trump starts to defend his...,president donald trump start defend executive ...,"[child, lawsuit, citizenship, monica, states, ...",2,1
1,RFK Jr. reports up to $1.2M in credit card deb...,https://abcnews.go.com/Politics/rfk-jr-reports...,From a multimillion-dollar law firm payout to ...,abcnews,lean left,5260,from a multimillion dollar law firm payout to ...,multimillion dollar law firm payout figure end...,"[kennedy, disclosure, million, liability, earn...",1,4
2,Trump moves to brand Houthis as a foreign terr...,https://abcnews.go.com/Politics/trump-moves-br...,President Donald Trump issued an executive ord...,abcnews,lean left,5151,president donald trump issued an executive ord...,president donald trump issue executive order i...,"[houthis, designation, group, yemen, terrorist...",1,5
3,Sen. John Fetterman on flurry of Trump executi...,https://abcnews.go.com/Politics/sen-john-fette...,President Donald Trump launched his plan to re...,abcnews,lean left,6384,president donald trump launched his plan to re...,president donald trump launch plan reshape ame...,"[fetterman, abc, agree, pardon, news, lago, lo...",2,7
4,"Heads of Oath Keepers, Proud Boys released fro...",https://abcnews.go.com/Politics/oath-keepers-p...,"Enrique Tarrio, the former head of the Proud B...",abcnews,lean left,3939,enrique tarrio the former head of the proud bo...,enrique tarrio head proud boy stewart rhode he...,"[rhode, keeper, capitol, oath, violent, convic...",2,9
5,Lightning-rod immigration bill set to go to Tr...,https://abcnews.go.com/Politics/house-vote-lak...,The House passed an amended version of the Lak...,abcnews,lean left,6033,the house passed an amended version of the lak...,house pass amend version laken riley act wedne...,"[bill, vote, senate, riley, immigration, pass,...",3,10
6,DC judges slam Trump Jan. 6 pardons as 'revisi...,https://abcnews.go.com/Politics/dc-judges-slam...,In the days since President Donald Trump hande...,abcnews,lean left,5123,in the days since president donald trump hande...,day president donald trump hand pardon commuta...,"[judge, capitol, dismissal, officer, occur, pa...",2,11
7,How Trump might change the world in his 2nd term,https://abcnews.go.com/Politics/trump-wield-am...,Donald Trump returned to office as president o...,abcnews,lean left,4797,donald trump returned to office as president o...,donald trump return office president monday re...,"[hamas, israel, ukraine, america, peace, trump...",4,13
8,"Trump's pardons for rioters 'disturbing,' form...",https://abcnews.go.com/Politics/trumps-pardons...,President Donald Trump's sweeping pardons and ...,abcnews,lean left,7295,president donald trump sweeping pardons and co...,president donald trump sweeping pardon commuta...,"[pardon, rioter, enforcement, officer, law, at...",2,14
9,Bidenâ€™s letter to Trump wished him 'all the ...,https://abcnews.go.com/Politics/trump-calls-bi...,Former President Joe Biden wished President Do...,abcnews,lean left,2022,former president joe biden wished president do...,president joe biden wish president donald trum...,"[letter, trump, biden, oval, leave, office, pr...",2,15


In [4]:
# Class balance: number of articles per "Bias" category (most frequent first).
print(df["Bias"].value_counts())

Bias
left          2967
right         1273
lean left      776
center         644
lean right     549
Name: count, dtype: int64


In [6]:
# Ordinal ids for the "Bias" categories, numbered 1-5 in the order
# left -> lean left -> center -> lean right -> right.
LABEL2ID = {
    bias: rank
    for rank, bias in enumerate(
        ("left", "lean left", "center", "lean right", "right"),
        start=1,
    )
}

In [7]:
import json
import pandas as pd
import random
from tqdm import tqdm
import os
from itertools import combinations

def build_ordinal_simcse_pairs(df, output_path,
                               max_pos_per_label=3000,
                               max_neg_per_label=6000,
                               seed=None):
    """
    Build ordinal supervised SimCSE training pairs and save them as JSON.

    Rows are grouped by the ordinal id obtained by mapping ``df["Bias"]``
    through the module-level ``LABEL2ID``. Two kinds of pairs are emitted:

    * positive pairs: two texts drawn from the same label group;
    * negative pairs: one text from a label group and one text from a
      randomly chosen different label group.

    Each pair is a dict with keys ``sentence1``, ``sentence2``, ``label1``
    and ``label2`` (the labels are the ordinal ids). The shuffled list of
    pairs is written to ``output_path`` as JSON.

    The input DataFrame is not modified.

    Args:
        df: pandas DataFrame with a 'Bias' column (label names that appear
            as keys of ``LABEL2ID``) and a 'clean_text' column (the text
            used for both sentences of a pair). Rows whose 'Bias' value is
            not in ``LABEL2ID`` or whose 'clean_text' is missing are skipped.
        output_path: path of the JSON file to write. Its parent directory
            is created if needed; a bare filename writes to the current
            working directory.
        max_pos_per_label: max positive pairs per label. Groups with 50 or
            more texts get exactly this many randomly sampled pairs
            (repeats possible); smaller groups get every combination,
            capped at this number.
        max_neg_per_label: max negative pairs per label. The actual number
            per label is ``min(max_neg_per_label, size of the label group)``.
        seed: optional seed for reproducible sampling and shuffling. When
            ``None`` (default) the global ``random`` module state is used.

    Returns:
        None. The pairs are written to ``output_path``.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # A private generator makes runs reproducible without touching the global
    # random state; without a seed we fall back to the module-level functions.
    rng = random.Random(seed) if seed is not None else random

    # Convert labels to numeric 1-5 in a local Series so the caller's
    # DataFrame does not silently gain an extra column.
    id_labels = df["Bias"].map(LABEL2ID)
    print(f"ID Label dist: {id_labels.value_counts().to_dict()}")

    n_unmapped = int(id_labels.isna().sum())
    if n_unmapped:
        print(f"Warning: skipping {n_unmapped} rows whose 'Bias' value is not in LABEL2ID")

    # Group texts by label, keeping only non-empty groups. Missing texts are
    # dropped because json.dump would serialise them as the non-standard
    # token NaN.
    groups = {}
    for label_id in LABEL2ID.values():
        in_group = (id_labels == label_id).to_numpy()
        texts = df.loc[in_group, "clean_text"].dropna().tolist()
        if texts:
            groups[label_id] = texts
    valid_labels = list(groups)

    pairs = []
    print(f"Valid labels: {valid_labels}")

    def add_pair(s1, s2, label1, label2):
        pairs.append({
            "sentence1": s1,
            "sentence2": s2,
            "label1": label1,
            "label2": label2
        })

    # -----------------------------
    # Positive pairs
    # -----------------------------
    print("Building positive pairs...")
    pos_cap = max(max_pos_per_label, 0)
    for label, texts in groups.items():
        n_texts = len(texts)
        if n_texts < 2:
            continue  # a pair needs two texts

        if n_texts < 50:
            # Small class: enumerate every combination (at most 1176), but
            # still honour the documented per-label cap.
            candidates = list(combinations(texts, 2))
            if len(candidates) > pos_cap:
                candidates = rng.sample(candidates, pos_cap)
        else:
            # Large class: draw random pairs of two distinct entries.
            candidates = (rng.sample(texts, 2) for _ in range(pos_cap))

        for s1, s2 in candidates:
            add_pair(s1, s2, label, label)

    # -----------------------------
    # Negative pairs
    # -----------------------------
    print("Building negative pairs...")
    for label1, texts1 in tqdm(groups.items()):
        # Every group left in `groups` is non-empty, so the candidate partner
        # labels only depend on label1 and can be computed once per label.
        other_labels = [l for l in valid_labels if l != label1]
        if not other_labels:
            continue  # only one label present: no negatives possible

        # Sample up to max_neg_per_label per label, bounded by the class size
        for _ in range(min(max_neg_per_label, len(texts1))):
            s1 = rng.choice(texts1)
            label2 = rng.choice(other_labels)
            s2 = rng.choice(groups[label2])
            add_pair(s1, s2, label1, label2)

    # Shuffle final dataset so positives and negatives are interleaved
    rng.shuffle(pairs)
    print(f"Total training pairs: {len(pairs)}")

    # Save to JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(pairs, f, indent=2)

    print(f"Saved to {output_path}")


In [25]:
# Build pairs from the full DataFrame loaded from the dataset's "train" split.
# NOTE(review): the pickled train/test splits loaded further down total
# 4967 + 1242 = 6209 rows, the same size as this df. If they were derived from
# it, this file contains test articles; confirm it is not used for training.
build_ordinal_simcse_pairs(df=df, 
                           output_path="./data/allsides/ordinal_simcse_pairs.json")

ID Label dist: {1: 2967, 5: 1273, 2: 776, 3: 644, 4: 549}
Valid labels: [1, 2, 3, 4, 5]
Building positive pairs...
Building negative pairs...


100%|██████████| 5/5 [00:00<00:00, 429.21it/s]

Total training pairs: 21209





Saved to ./data/allsides/ordinal_simcse_pairs.json


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# -----------------------------
# Load your existing PKL files
# -----------------------------
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted source.
train_df = pd.read_pickle("./data/allsides/dataset_train_split.pkl")
test_df = pd.read_pickle("./data/allsides/dataset_test_split.pkl")

# Sanity-check the sizes of the two splits.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Inspect columns
print(train_df.columns)

Train shape: (4967, 11)
Test shape: (1242, 11)
Index(['Title', 'Link', 'Text', 'Source', 'Bias', 'text_length', 'clean_text',
       'lemmatized', 'keywords', 'cluster', '__index_level_0__'],
      dtype='object')


In [2]:
# Column names used by the cells below.
# label_col drives the stratified split; text_col is not referenced by the
# other cells visible here.
label_col = "Bias"   # change if needed
text_col = "clean_text"  # or whatever your column is

In [3]:
# Split the training data 80/20 into an "embed" portion and a "classifier"
# portion. The fixed random_state makes the split repeatable.
embed_train_df, classifier_train_df = train_test_split(
    train_df,
    test_size=0.20,
    random_state=42,
    stratify=train_df[label_col]   # keeps class distribution!!!
)

# Report the size of each resulting split next to the untouched test set.
print("Embedding Train:", embed_train_df.shape)
print("Classifier Train:", classifier_train_df.shape)
print("Test:", test_df.shape)

# Show the label proportions of the embed portion to check the stratification.
embed_train_df[label_col].value_counts(normalize=True).round(3)

Embedding Train: (3973, 11)
Classifier Train: (994, 11)
Test: (1242, 11)


Bias
left          0.478
right         0.205
lean left     0.125
center        0.104
lean right    0.088
Name: proportion, dtype: float64

In [4]:
# Persist both portions of the split next to the original train/test pickles.
embed_train_df.to_pickle("./data/allsides/embed_train.pkl")
classifier_train_df.to_pickle("./data/allsides/classifier_train.pkl")

In [8]:
# Build pairs from the embed portion of the training split only, writing to a
# different file than the run on the full DataFrame.
build_ordinal_simcse_pairs(df=embed_train_df, 
                           output_path="./data/allsides/ordinal_simcse_pairs_embed.json")

ID Label dist: {1: 1899, 5: 814, 2: 497, 3: 412, 4: 351}
Valid labels: [1, 2, 3, 4, 5]
Building positive pairs...
Building negative pairs...


100%|██████████| 5/5 [00:00<00:00, 429.57it/s]

Total training pairs: 18973





Saved to ./data/allsides/ordinal_simcse_pairs_embed.json
