In [72]:
import fasttext

In [73]:
import numpy as np
# Paloma C4-100-domains validation split, read as a flat uint16 array
# (decoded back to text with the tokenizer in a later cell)
data = np.fromfile("/data/paloma/tokenized_paloma_c4_100_domains_validation.bin", dtype=np.uint16)

In [1]:
from transformers import AutoTokenizer
# GPT-2 tokenizer; other cells use it to decode the uint16 token dumps and to
# tokenize filtered Common Crawl text
tokenizer = AutoTokenizer.from_pretrained("gpt2")


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# one tokenized Common Crawl shard, read as a flat uint16 array
tokens = np.fromfile(
    "/data/c-cye/assignment4-data/cc_tokenized/CC-MAIN-20250418151910-20250418181910-00091.bin",
    dtype=np.uint16,
)

In [10]:
# decode the whole shard back to one string, then cut it at the end-of-text marker
testtext = tokenizer.decode(tokens)
# NOTE(review): `docs` is also assigned by the Paloma cleaning cell and `testtext`
# by the filtered-text cell; whichever cell ran last determines their contents
docs = testtext.split("<|endoftext|>")

In [None]:
# Read one filtered Common Crawl text file and run it through `tokenizer`.
testfile = "/data/c-cye/assignment4-data/cc_filtered/CC-MAIN-20250418151910-20250418181910-00091.txt"
# explicit UTF-8 so decoding does not depend on the machine's locale
# (matches the encoding used by the FastText file helpers in this notebook)
with open(testfile, "r", encoding="utf-8") as f:
    testtext = f.read()
tokenized = tokenizer.tokenize(testtext)

In [None]:
# Decode the Paloma token array (`data`) back to text and keep a copy on disk.
decoded_paloma = tokenizer.decode(data)
# save to file; explicit UTF-8 because the decoded text may contain non-ASCII
# characters and the default encoding is locale-dependent
with open("decoded_paloma.txt", "w", encoding="utf-8") as f:
    f.write(decoded_paloma)

In [4]:
# split the decoded Paloma text on the end-of-text marker, trim each document,
# drop the ones that are empty after trimming, and flatten inner newlines to spaces
docs = [
    piece.replace("\n", " ")
    for piece in map(str.strip, decoded_paloma.split("<|endoftext|>"))
    if piece
]
len(docs)

14059

# Train a Paloma Classifier

In [14]:
# write fasttext file: one "__label__paloma <text>" line per document.
# Explicit UTF-8 so the file matches what read_fasttext_file() expects when it
# reads it back, regardless of the machine's locale.
with open("fasttext_paloma.txt", "w", encoding="utf-8") as f:
    for doc in docs:
        f.write(f"__label__paloma {doc}\n")

In [68]:
import random
from collections import Counter
from typing import List, Tuple

def read_fasttext_file(filepath: str) -> List[str]:
    """Load a FastText-formatted file as a list of stripped, non-blank lines."""
    records = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            cleaned = raw.strip()
            # skip lines that are empty or whitespace-only
            if cleaned:
                records.append(cleaned)
    return records

def extract_label(line: str) -> str:
    """Return the first whitespace-delimited token of a line (the __label__xxx tag)."""
    # only the leading token is needed, so stop splitting after it
    first_token = line.split(None, 1)[0]
    return first_token

def balance_data(data: List[str]) -> List[str]:
    """Balance the dataset by undersampling every class to the minority-class size.

    Args:
        data: FastText-formatted lines; each line's label is read with
            extract_label().

    Returns:
        A new list holding the same number of lines for every label. Lines are
        grouped by label in first-seen order (not shuffled). An empty input
        yields an empty list.
    """
    # nothing to balance; min() below would raise ValueError on an empty sequence
    if not data:
        return []

    # group lines by label (dict keeps labels in first-seen order)
    label_groups = {}
    for line in data:
        label_groups.setdefault(extract_label(line), []).append(line)

    # find minimum class size
    min_size = min(len(group) for group in label_groups.values())

    print(f"Original distribution: {[(label, len(group)) for label, group in label_groups.items()]}")
    print(f"Balancing to {min_size} samples per class")

    # sample min_size examples from each class, without replacement; this draws
    # from the module-level `random` state, so seed it beforehand for reproducibility
    balanced_data = []
    for group in label_groups.values():
        balanced_data.extend(random.sample(group, min_size))

    return balanced_data

def shuffle_and_split_fasttext(file1_path: str, file2_path: str,
                              train_split: float = 0.8,
                              balance: bool = True,
                              output_train: str = 'train.txt',
                              output_test: str = 'test.txt',
                              random_seed: int = 42) -> None:
    """
    Process two FastText files: combine, optionally balance, shuffle, and split.

    Args:
        file1_path: Path to first FastText file
        file2_path: Path to second FastText file
        train_split: Fraction for training set (0.0 to 1.0, inclusive)
        balance: Whether to balance classes by undersampling
        output_train: Output path for training set
        output_test: Output path for test set
        random_seed: Random seed for reproducibility

    Raises:
        ValueError: If train_split is outside the range 0.0 to 1.0.
    """

    # reject bad fractions before touching any file or the RNG: a value above 1
    # would silently leave the test set empty, and a negative one would turn the
    # slice index below into a count-from-the-end
    if not 0.0 <= train_split <= 1.0:
        raise ValueError(f"train_split must be between 0.0 and 1.0, got {train_split!r}")

    # set random seed for reproducibility (seeds the module-level `random` state,
    # which balance_data() and the shuffle below both draw from)
    random.seed(random_seed)

    # read both files
    print("reading files...")
    data1 = read_fasttext_file(file1_path)
    data2 = read_fasttext_file(file2_path)

    # combine data
    all_data = data1 + data2
    print(f"combined {len(data1)} + {len(data2)} = {len(all_data)} samples")

    # show original distribution
    labels = [extract_label(line) for line in all_data]
    print(f"label distribution: {Counter(labels)}")

    # rebalance classes
    if balance:
        all_data = balance_data(all_data)
        labels = [extract_label(line) for line in all_data]
        print(f"Balanced distribution: {Counter(labels)}")

    # shuffle the combined data
    random.shuffle(all_data)
    print("data shuffled")

    # split into train/test; the split is a plain cut of the shuffled list, so
    # per-class counts in each half are only approximately equal
    split_idx = int(len(all_data) * train_split)
    train_data = all_data[:split_idx]
    test_data = all_data[split_idx:]

    print(f"split: {len(train_data)} train, {len(test_data)} test")

    # check distribution in splits
    train_labels = [extract_label(line) for line in train_data]
    test_labels = [extract_label(line) for line in test_data]
    print(f"train distribution: {Counter(train_labels)}")
    print(f"test distribution: {Counter(test_labels)}")

    # write output files, one sample per line
    for out_path, rows in ((output_train, train_data), (output_test, test_data)):
        with open(out_path, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in rows)

    print(f"Files saved: {output_train}, {output_test}")

In [70]:
# build the 90/10 train/test files for the high-quality-vs-Paloma classifier
shuffle_and_split_fasttext(
    file1_path='positive_data_cleaner.txt',
    file2_path='fasttext_paloma.txt',
    output_train='paloma.train',
    output_test='paloma.test',
    train_split=0.9,
    balance=True,
    random_seed=42,
)

reading files...
combined 15000 + 14059 = 29059 samples
label distribution: Counter({'__label__high-quality': 15000, '__label__paloma': 14059})
Original distribution: [('__label__high-quality', 15000), ('__label__paloma', 14059)]
Balancing to 14059 samples per class
Balanced distribution: Counter({'__label__high-quality': 14059, '__label__paloma': 14059})
data shuffled
split: 25306 train, 2812 test
train distribution: Counter({'__label__paloma': 12674, '__label__high-quality': 12632})
test distribution: Counter({'__label__high-quality': 1427, '__label__paloma': 1385})
Files saved: paloma.train, paloma.test


In [76]:
# train the classifier on paloma.train with fastText autotune enabled.
# NOTE(review): autotune validates against paloma.test, so that file has been
# used for model selection and should not also be reported as a held-out score.
model = fasttext.train_supervised(input='paloma.train', autotuneValidationFile='paloma.test')

Progress: 100.0% Trials:    9 Best score:  0.982219 ETA:   0h 0m 0s
Training again with best arguments
Read 20M words
Number of words:  897603
Number of labels: 2
Progress: 100.0% words/sec/thread:  519900 lr:  0.000000 avg.loss:  0.022311 ETA:   0h 0m 0s


In [77]:
# persist the trained classifier next to the notebook
model.save_model('paloma.bin')

In [78]:
# load the filtered positive set as a list of stripped, non-blank lines
# NOTE(review): if this file carries __label__ prefixes, they are passed to
# predict() as part of the text — confirm that is intended
positive_test = read_fasttext_file("positive_data_filtered.txt")

In [None]:
# classify every line of the positive set in one batch call
# presumably `vals` holds the per-line prediction scores — confirm against the fastText docs
labels, vals = model.predict(positive_test)

  vals = [float(val) for val in vals]


# Data Aggregation

In [2]:
# directory whose *.json files are aggregated in the next cell
OUT_DIR = '/data/c-cye/assignment4-data/cc_filtered'

In [3]:
# collect all jsons and sum each field across them (the averaging step below is commented out)
import glob
import os
import json

def _load_json(path):
    """Parse one JSON file, closing the file handle as soon as it is read."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


# every JSON file directly inside OUT_DIR
jsons = glob.glob(os.path.join(OUT_DIR, "*.json"))

# NOTE(review): `data` is also the name of the Paloma token array loaded earlier
# in this notebook; this assignment replaces it
data = [_load_json(f) for f in jsons]
# the first file defines the set of fields; every other file is expected to carry them too
fields = data[0].keys()
# despite the name, `average` holds per-field SUMS across all files
average = {key: 0 for key in fields}
for field in fields:
    for d in data:
        average[field] += d[field]

total_records = average['total_records']
# for field in fields:
#     average[field] /= total_records

In [4]:
# show the per-field totals summed over all JSON files
average

{'total_records': 113329836,
 'after_language_filter': 45948917,
 'after_gopher_filter': 31268737,
 'after_nsfw_filter': 31121354,
 'after_toxic_filter': 31041103,
 'after_quality_filter': 3903567,
 'after_dedup': 3882421}