# Text Classifier Data Loading

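This notebook loads the 1968–1969 files of the Dell Research Harvard newswire dataset and prepares them for multi-label text classification: each article is assigned a list of topic IDs (or `[0]` when no topic applies), and the result is split into train, validation, and test sets.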

## 1. Setup and Data Loading

In [1]:
# Import required libraries
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the 1968 and 1969 files of the Dell Research Harvard newswire dataset.
# Only the JSON files named below are requested through `data_files`.
print("Loading dataset from HuggingFace...")
yearly_files = ["1968_data_clean.json", "1969_data_clean.json"]
dataset = load_dataset(
    "dell-research-harvard/newswire",
    data_files=yearly_files,
    # NOTE(review): presumably allows loader code shipped with the dataset
    # repository to run locally — confirm this flag is still required.
    trust_remote_code=True,
)

# Sanity printout: the overall structure, then the row count of the "train" split.
print(f"Dataset structure: {dataset}")
train_split = dataset["train"]
print(f"Number of articles: {train_split.num_rows}")

Loading dataset from HuggingFace...
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['article', 'byline', 'dates', 'newspaper_metadata', 'antitrust', 'civil_rights', 'crime', 'govt_regulation', 'labor_movement', 'politics', 'protests', 'ca_topic', 'ner_words', 'ner_labels', 'wire_city', 'wire_state', 'wire_country', 'wire_coordinates', 'wire_location_notes', 'people_mentioned', 'cleaned_article', 'cluster_size', 'year'],
        num_rows: 65191
    })
})
Number of articles: 65191


## 2. Label Mapping Configuration

In [2]:
# Convert the "train" split to a pandas DataFrame for easier manipulation
df = dataset["train"].to_pandas()

# Topic column name -> integer class ID (1-7).
# ID 0 is reserved for "no class" and has no column of its own, so it is
# added only to the reverse mapping below.
LABEL_MAP = {
    'antitrust': 1,
    'civil_rights': 2,
    'crime': 3,
    'govt_regulation': 4,
    'labor_movement': 5,
    'politics': 6,
    'protests': 7,
}

# Reverse mapping (ID -> name) for reference, including the "no_class" entry
ID_TO_LABEL = {label_id: label_name for label_name, label_id in LABEL_MAP.items()}
ID_TO_LABEL[0] = 'no_class'

# Print the mapping straight from ID_TO_LABEL so the 0 entry cannot drift from
# the dictionary. The loop variable is deliberately not called `id`: at cell
# (module) level that would rebind the builtin `id()` for every later cell.
print("Label mapping:")
for label_id in sorted(ID_TO_LABEL):
    print(f"   {label_id}: {ID_TO_LABEL[label_id]}")

print("\nDataset info:")
print(f"   Shape: {df.shape}")
print(f"   Label columns: {list(LABEL_MAP.keys())}")

Label mapping:
   0: no_class
   1: antitrust
   2: civil_rights
   3: crime
   4: govt_regulation
   5: labor_movement
   6: politics
   7: protests

Dataset info:
   Shape: (65191, 23)
   Label columns: ['antitrust', 'civil_rights', 'crime', 'govt_regulation', 'labor_movement', 'politics', 'protests']


## 3. Label Conversion

In [3]:
def get_multi_class_labels(row):
    """Translate a row's binary topic flags into a sorted list of class IDs.

    Every key of ``LABEL_MAP`` is looked up in ``row``; a value equal to 1
    marks that topic as active.

    Returns:
        The IDs of all active topics in ascending order, or ``[0]``
        (the no_class ID) when no topic is active.
    """
    flagged_ids = [
        class_id
        for column, class_id in LABEL_MAP.items()
        if row[column] == 1
    ]
    # sorted() of an empty list is falsy, so rows without any flag fall back to [0]
    return sorted(flagged_ids) or [0]

# Derive the list of label IDs for every article (axis=1 -> one call per row)
print("Converting labels to multi-class format...")
df['label'] = df.apply(get_multi_class_labels, axis=1)

# Final modelling frame: the cleaned article text plus its list of label IDs
data = pd.DataFrame({
    'text': df['cleaned_article'],
    'label': df['label']
})

print("Label conversion complete!")
print(f"Total samples: {len(data)}")

# Show label distribution.
# Lists are unhashable, so each label list is turned into a tuple before counting.
label_counts = data['label'].apply(tuple).value_counts()
print(f"Unique label combinations: {len(label_counts)}")
print("\nTop 15 most common label combinations:")
for label_tuple, count in label_counts.head(15).items():
    label_list = list(label_tuple)
    # `label_id` rather than `id`, to avoid shadowing the builtin
    label_names = [ID_TO_LABEL[label_id] for label_id in label_list]
    percentage = (count / len(data)) * 100
    print(f"   {label_list} ({label_names}): {count} ({percentage:.1f}%)")

Converting labels to multi-class format...
Label conversion complete!
Total samples: 65191
Unique label combinations: 54

Top 15 most common label combinations:
   [0] (['no_class']): 29339 (45.0%)
   [6] (['politics']): 17646 (27.1%)
   [3] (['crime']): 5107 (7.8%)
   [4, 6] (['govt_regulation', 'politics']): 3177 (4.9%)
   [5] (['labor_movement']): 1697 (2.6%)
   [5, 6] (['labor_movement', 'politics']): 1455 (2.2%)
   [4] (['govt_regulation']): 1051 (1.6%)
   [7] (['protests']): 585 (0.9%)
   [2, 3] (['civil_rights', 'crime']): 548 (0.8%)
   [2] (['civil_rights']): 487 (0.7%)
   [3, 7] (['crime', 'protests']): 461 (0.7%)
   [2, 5] (['civil_rights', 'labor_movement']): 450 (0.7%)
   [2, 6] (['civil_rights', 'politics']): 426 (0.7%)
   [2, 3, 7] (['civil_rights', 'crime', 'protests']): 414 (0.6%)
   [2, 4, 6] (['civil_rights', 'govt_regulation', 'politics']): 387 (0.6%)


## 4. Train/Validation/Test Split

In [5]:
def _print_top_label_combinations(split_name, split_frame, top_n=10):
    """Print the most frequent label combinations of one data split.

    Args:
        split_name: Name used in the printed header (e.g. "training").
        split_frame: DataFrame with a 'label' column holding lists of label IDs.
        top_n: Number of combinations to print.

    Returns:
        The full value counts of label combinations (keyed by tuple) for the split.
    """
    print(f"\nTop {top_n} {split_name} label combinations:")
    # Lists are unhashable, so count tuples instead
    combo_counts = split_frame['label'].apply(tuple).value_counts()
    for label_tuple, count in combo_counts.head(top_n).items():
        label_list = list(label_tuple)
        label_names = [ID_TO_LABEL[label_id] for label_id in label_list]
        percentage = (count / len(split_frame)) * 100
        print(f"   {label_list} ({label_names}): {count} ({percentage:.1f}%)")
    return combo_counts


# Create train/val/test split: first hold out 30% of the data, then cut that
# hold-out in half, giving roughly 70% / 15% / 15%.
print("Creating train/val/test split...")
X_train, X_temp, y_train, y_temp = train_test_split(
    data['text'], data['label'],
    test_size=0.3,
    random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42
)

# Create final train, validation and test datasets with a fresh 0..n-1 index
train_data = pd.DataFrame({'text': X_train, 'label': y_train}).reset_index(drop=True)
val_data = pd.DataFrame({'text': X_val, 'label': y_val}).reset_index(drop=True)
test_data = pd.DataFrame({'text': X_test, 'label': y_test}).reset_index(drop=True)

# Every sample must land in exactly one of the three splits
assert len(train_data) + len(val_data) + len(test_data) == len(data), \
    "train/val/test sizes do not add up to the full dataset"

print("Train/val/test split complete!")
print(f"Training set: {train_data.shape}")
print(f"Validation set: {val_data.shape}")
print(f"Test set: {test_data.shape}")

# Show the label distribution in all three sets
train_label_counts = _print_top_label_combinations("training", train_data)
val_label_counts = _print_top_label_combinations("validation", val_data)
test_label_counts = _print_top_label_combinations("test", test_data)

Creating train/val/test split...
Train/val/test split complete!
Training set: (45633, 2)
Validation set: (9779, 2)
Test set: (9779, 2)

Top 10 training label combinations:
   [0] (['no_class']): 20588 (45.1%)
   [6] (['politics']): 12314 (27.0%)
   [3] (['crime']): 3544 (7.8%)
   [4, 6] (['govt_regulation', 'politics']): 2205 (4.8%)
   [5] (['labor_movement']): 1180 (2.6%)
   [5, 6] (['labor_movement', 'politics']): 1040 (2.3%)
   [4] (['govt_regulation']): 753 (1.7%)
   [7] (['protests']): 424 (0.9%)
   [2, 3] (['civil_rights', 'crime']): 377 (0.8%)
   [3, 7] (['crime', 'protests']): 337 (0.7%)

Top 10 validation label combinations:
   [0] (['no_class']): 4400 (45.0%)
   [6] (['politics']): 2677 (27.4%)
   [3] (['crime']): 763 (7.8%)
   [4, 6] (['govt_regulation', 'politics']): 484 (4.9%)
   [5] (['labor_movement']): 260 (2.7%)
   [5, 6] (['labor_movement', 'politics']): 210 (2.1%)
   [4] (['govt_regulation']): 154 (1.6%)
   [2] (['civil_rights']): 92 (0.9%)
   [2, 3] (['civil_rights'

## 5. Final Dataset Examples and Summary

In [None]:
# Persist the three splits as Parquet files under ../data (resolved relative
# to the notebook's working directory).
DATA_DIR = Path("../data")
# Create the target directory up front so the writes do not fail on a fresh
# checkout where ../data does not exist yet.
DATA_DIR.mkdir(parents=True, exist_ok=True)

for file_name, split_frame in (
    ("train_data.parquet", train_data),
    ("val_data.parquet", val_data),
    ("test_data.parquet", test_data),
):
    # index=False: the splits already carry a plain 0..n-1 index that need not be stored
    split_frame.to_parquet(DATA_DIR / file_name, index=False)