In [1]:
import re
import random
from datasets import load_dataset, Dataset
from tqdm import tqdm

In [2]:
def split_into_sentences(text):
    # Very simple sentence splitter
    for s in re.split(r"[.!?]\s+", text):
        s = s.strip()
        if len(s.split()) >= 6:
            yield s

def clean(text):
    text = text.replace("\n", " ").strip()
    return text

In [5]:
def load_gdelt():
    print("Loading GDELT ...")
    ds = load_dataset("gdelt", "events", split="train", streaming=True)

    political_codes = {"POL", "GOV", "ELC", "PRO"}

    sents = []

    for item in tqdm(ds, desc="GDELT"):
        themes = item.get("themes", []) or []
        if any(code in themes for code in political_codes):
            text = clean(item.get("source_text", "") or "")

            for s in split_into_sentences(text):
                sents.append(s)

        if len(sents) > 300_000:
            break

    return sents

In [12]:
def load_ccnews():
    print("Loading CC-News...")
    ds = load_dataset("cc_news", split="train", streaming=True)

    political_domains = [
        "politico.com", "thehill.com",
        "cnn.com", "foxnews.com",
        "nytimes.com", "washingtonpost.com",
        "breitbart.com", "reuters.com"
    ]

    sents = []
    for item in tqdm(ds, desc="cc_news"):
        url = item.get("url", "") or ""
        if any(d in url for d in political_domains):
            text = clean(item["text"])
            for s in split_into_sentences(text):
                sents.append(s)

        if len(sents) > 300_000:
            break

    return sents

In [13]:
def load_news_category():
    print("Loading NewsCategoryDataset...")
    ds = load_dataset("Yale-LILY/news-category-dataset", split="train")

    sents = []
    for item in tqdm(ds, desc="news_category"):
        if item["category"].lower() == "politics":
            text = clean(item["short_description"])
            for s in split_into_sentences(text):
                sents.append(s)

    return sents

In [14]:
def load_ag_news():
    print("Loading AG News...")
    ds = load_dataset("ag_news", split="train")

    sents = []
    for item in tqdm(ds, desc="ag_news"):
        label = item["label"]
        text = clean(item["text"])

        # Class 1 = "World" → political, geopolitics, elections, etc.
        if label == 1:
            for s in split_into_sentences(text):
                sents.append(s)

    return sents

In [15]:
def load_gdelt():
    print("Loading GDELT...")
    ds = load_dataset("gdelt", "events", split="train", streaming=True)

    codes = {"POL", "GOV", "ELC", "PRO"}
    sents = []

    for item in tqdm(ds, desc="gdelt"):
        themes = item.get("themes", []) or []
        if any(c in themes for c in codes):
            text = clean(item.get("source_text", "") or "")
            for s in split_into_sentences(text):
                sents.append(s)

        if len(sents) > 300_000:
            break

    return sents

In [18]:
print("Building political corpus...")

ccnews_sents       = load_ccnews()
# newscat_sents      = load_news_category()
agnews_sents       = load_ag_news()
# gdelt_sents        = load_gdelt()

all_sents = ccnews_sents + agnews_sents
print(f"Collected {len(all_sents)} raw sentences")

all_sents = list(set(all_sents))
print(f"After dedupe: {len(all_sents)}")

random.shuffle(all_sents)

with open("political_corpus.txt", "w", encoding="utf-8") as f:
    for s in all_sents:
        f.write(s + "\n")

print("Done!")

Building political corpus...
Loading CC-News...


cc_news: 556042it [02:01, 4570.84it/s]


Loading AG News...


ag_news: 100%|██████████| 120000/120000 [00:03<00:00, 39393.91it/s]


Collected 343354 raw sentences
After dedupe: 267398
Done!


In [22]:
print(all_sents[:5])

['The driver on pole in Bahrain has been the winner in five of the 12 races to date', '"Recently we have seen Chinese military aircraft operating further south and that is bringing them closer to the main Okinawa island and other parts of the island chain," Japan\'s top military commander, Admiral Katsutoshi Kawano, told a briefing in Tokyo', 'You can follow him on Twitter and on Facebook.', '($1 = 0.9355 euros) (Reporting by Valentina Za)', 'NEW DELHI — India and Pakistan have declared a cease-fire along their disputed border in Kashmir, a move welcomed with uneasiness among the population in the area, where a series of such agreements have failed in the past']


In [25]:
import os

print("Current working directory:", os.getcwd())

with open("test.txt", "w", encoding="utf-8") as f:
    for s in ["This is a political sentence.", "Elections are coming."]:
        f.write(s + "\n")

print("File written to:", os.path.abspath("test.txt"))

Current working directory: /content
File written to: /content/test.txt


In [27]:
from google.colab import drive
drive.mount('/content/drive')

PATH = "/content/drive/MyDrive/political_corpus.txt"

with open(PATH, "w", encoding="utf-8") as f:
    for s in all_sents:
        f.write(s + "\n")

print("Saved to:", PATH)

KeyboardInterrupt: 