In [2]:
#!/usr/bin/env python3
"""
Build Reuters-21578 train/val/test splits with binary topic columns.
"""

import glob
import os
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.model_selection import train_test_split

# ==============================
# Configuration
# ==============================
DATA_DIR = "/home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters"
SGM_PATTERN = os.path.join(DATA_DIR, "reut2-*.sgm")
VAL_FRACTION = 0.1
RANDOM_STATE = 42

# ==============================
# Parse SGML files
# ==============================
docs = []

for filepath in sorted(glob.glob(SGM_PATTERN)):
    print(f"Parsing {os.path.basename(filepath)} ...")
    with open(filepath, "r", encoding="latin-1") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    for r in soup.find_all("reuters"):
        split = r.get("lewissplit", "NOT-USED")
        if split not in {"TRAIN", "TEST"}:
            continue

        # extract topics
        topics_tag = r.topics
        if topics_tag is None:
            continue
        topics = [d.get_text().strip() for d in topics_tag.find_all("d")]
        if not topics:
            continue

        # combine title + body
        title = r.title.get_text().strip() if r.title else ""
        body = r.body.get_text().strip() if r.body else ""
        text = (title + "\n\n" + body).strip()
        if not text:
            continue

        docs.append({
            "id": int(r["newid"]),
            "split": split,
            "topics": topics,
            "text": text,
        })

df = pd.DataFrame(docs)
print(f"\nParsed {len(df)} labeled documents with TRAIN/TEST split.\n")

# ==============================
# Create binary topic columns
# ==============================
all_topics = sorted({t for ts in df["topics"] for t in ts})
print(f"Found {len(all_topics)} unique topic labels.\n")

for label in all_topics:
    df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))

df_out = df[["text", "split"] + all_topics]

# ==============================
# Create splits
# ==============================
train_df = df_out[df_out["split"] == "TRAIN"].drop(columns=["split"])
test_df = df_out[df_out["split"] == "TEST"].drop(columns=["split"])

print(f"TRAIN docs: {len(train_df)}")
print(f"TEST docs: {len(test_df)}")

train_df, val_df = train_test_split(
    train_df, test_size=VAL_FRACTION, random_state=RANDOM_STATE
)

print(f"→ Final sizes: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}\n")

# ==============================
# Save to CSV
# ==============================
train_path = os.path.join(DATA_DIR, "train.csv")
val_path = os.path.join(DATA_DIR, "val.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"✅ Wrote:\n  {train_path}\n  {val_path}\n  {test_path}\n")


Parsing reut2-000.sgm ...
Parsing reut2-001.sgm ...
Parsing reut2-002.sgm ...
Parsing reut2-003.sgm ...
Parsing reut2-004.sgm ...
Parsing reut2-005.sgm ...
Parsing reut2-006.sgm ...
Parsing reut2-007.sgm ...
Parsing reut2-008.sgm ...
Parsing reut2-009.sgm ...
Parsing reut2-010.sgm ...
Parsing reut2-011.sgm ...
Parsing reut2-012.sgm ...
Parsing reut2-013.sgm ...
Parsing reut2-014.sgm ...
Parsing reut2-015.sgm ...
Parsing reut2-016.sgm ...
Parsing reut2-017.sgm ...
Parsing reut2-018.sgm ...
Parsing reut2-019.sgm ...
Parsing reut2-020.sgm ...
Parsing reut2-021.sgm ...

Parsed 10741 labeled documents with TRAIN/TEST split.

Found 117 unique topic labels.



  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[label] = df["topics"].apply(lambda ts, lab=label: int(lab in ts))
  df[l

TRAIN docs: 7733
TEST docs: 3008
→ Final sizes: train=6959, val=774, test=3008

✅ Wrote:
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/train.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/val.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/test.csv



In [4]:
df_train = pd.read_csv("/home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/train.csv")
df_train

Unnamed: 0,text,acq,alum,austdlr,barley,bop,can,carcass,castor-oil,castorseed,...,tapioca,tea,tin,trade,veg-oil,wheat,wool,wpi,yen,zinc
0,CYCLOPS <CYL> SAYS DIXONS AGREEMENTS BINDING\n...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOME SAVINGS BANK <HMSB> SETS INITIAL QUARTERL...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,KASLER CORP <KASL> 1ST QTR JAN 31 NET\n\nShr p...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BINKS MFG CO <BIN> REGULAR DIVIDEND SET\n\nQtl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NEW HARDING GROUP SETS FIRST PAYOUT SINCE 1978...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6954,"JAPAN ASKS TRADERS, EXPORTERS TO CUT DOLLAR SA...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6955,ALLIED-SIGNAL INC TO SELL LINOTYPE GROUP TO CO...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6956,COMPUTER HORIZONS <CHRZ> IN ACQUISITION\n\nCom...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6957,TELE-COMMUNICATIONS <TCOMA> SELLS CABLE SYSTEM...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
import os
import pandas as pd
from sklearn.datasets import fetch_rcv1
from sklearn.datasets import get_data_home
from sklearn.model_selection import train_test_split
from nltk.corpus import reuters
import nltk

# Iterative stratifier for multilabel splits (installed in the environment)
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np

nltk.download('reuters')
nltk.download('punkt')

# === Configuration ===
output_dir = "/home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters"
os.makedirs(output_dir, exist_ok=True)

# 10 most common topics in the Reuters-21578 ModApte split
modapte10 = [
    "earn", "acq", "money-fx", "grain", "crude",
    "trade", "interest", "ship", "wheat", "corn"
]

# === Collect documents ===
docs = []
for doc_id in reuters.fileids():
    topics = [t for t in reuters.categories(doc_id) if t in modapte10]
    if not topics:
        continue  # skip if doc doesn't belong to the top 10 topics
    text = reuters.raw(doc_id)
    split = "train" if doc_id.startswith("train") else "test"
    docs.append({"id": doc_id, "text": text, "split": split, **{t: int(t in topics) for t in modapte10}})

df = pd.DataFrame(docs)
print(f"Loaded {len(df)} documents across {len(modapte10)} topics.")

# === Train / Validation / Test split ===
train_df = df[df["split"] == "train"].drop(columns=["split"]).reset_index(drop=True)
test_df = df[df["split"] == "test"].drop(columns=["split"]).reset_index(drop=True)

print(f"TRAIN docs: {len(train_df)}")
print(f"TEST docs: {len(test_df)}")

# Use MultilabelStratifiedShuffleSplit but fall back if any topic is too rare
counts = train_df[modapte10].sum().sort_values()
print("\nCounts per topic (train):")
print(counts)
rare = counts[counts < 2]
if len(rare) > 0:
    print("\nWarning: some topics have fewer than 2 examples and prevent a stratified multilabel split:")
    print(rare)
    print("Falling back to a non-stratified random split.")
    train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
else:
    y_multilabel = train_df[modapte10].values
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    orig_train = train_df.copy()
    train_idx, val_idx = next(msss.split(np.zeros(len(orig_train)), y_multilabel))
    train_df = orig_train.iloc[train_idx].reset_index(drop=True)
    val_df = orig_train.iloc[val_idx].reset_index(drop=True)

print(f"→ Final sizes: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}\n")

# === Save to CSV ===
train_path = os.path.join(output_dir, "train.csv")
val_path = os.path.join(output_dir, "val.csv")
test_path = os.path.join(output_dir, "test.csv")

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"✅ Wrote:\n  {train_path}\n  {val_path}\n  {test_path}\n")


[nltk_data] Downloading package reuters to
[nltk_data]     /home/michaelschlee/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/michaelschlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded 9034 documents across 10 topics.
TRAIN docs: 6489
TEST docs: 2545

Counts per topic (train):
corn         181
ship         197
wheat        212
interest     347
trade        368
crude        389
grain        433
money-fx     538
acq         1650
earn        2877
dtype: int64
→ Final sizes: train=5842, val=647, test=2545

✅ Wrote:
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/train.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/val.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/test.csv

✅ Wrote:
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/train.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/val.csv
  /home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/test.csv



In [7]:
df_train = pd.read_csv("/home/michaelschlee/ownCloud/GIT/classifyfusion/data/reuters/train.csv")
df_train

Unnamed: 0,id,text,earn,acq,money-fx,grain,crude,trade,interest,ship,wheat,corn
0,training/10,COMPUTER TERMINAL SYSTEMS &lt;CPML> COMPLETES ...,0,1,0,0,0,0,0,0,0,0
1,training/1000,NATIONAL AMUSEMENTS AGAIN UPS VIACOM &lt;VIA> ...,0,1,0,0,0,0,0,0,0,0
2,training/10000,ROGERS &lt;ROG> SEES 1ST QTR NET UP SIGNIFICAN...,1,0,0,0,0,0,0,0,0,0
3,training/10008,QUESTECH INC &lt;QTEC> YEAR NET\n Shr loss ni...,1,0,0,0,0,0,0,0,0,0
4,training/10011,CANADA OIL EXPORTS RISE 20 PCT IN 1986\n Cana...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5837,training/999,U.K. MONEY MARKET SHORTAGE FORECAST REVISED DO...,0,0,1,0,0,0,1,0,0,0
5838,training/9992,KNIGHT-RIDDER INC &lt;KRN> SETS QUARTERLY\n Q...,1,0,0,0,0,0,0,0,0,0
5839,training/9993,TECHNITROL INC &lt;TNL> SETS QUARTERLY\n Qtly...,1,0,0,0,0,0,0,0,0,0
5840,training/9994,NATIONWIDE CELLULAR SERVICE INC &lt;NCEL> 4TH ...,1,0,0,0,0,0,0,0,0,0


In [8]:
# Count how many labels each article has
df["label_count"] = df[modapte10].sum(axis=1)

# How many have multiple labels?
multi_label_count = (df["label_count"] > 1).sum()
single_label_count = (df["label_count"] == 1).sum()
no_label_count = (df["label_count"] == 0).sum()

print(f"Articles with multiple labels: {multi_label_count}")
print(f"Articles with a single label: {single_label_count}")
print(f"Articles with no label: {no_label_count}")
print(f"Total: {len(df)}")


Articles with multiple labels: 833
Articles with a single label: 8201
Articles with no label: 0
Total: 9034
