## Data Scraper
The dataset produced here is for demonstration only and is scraped with a single keyword.  
For the actual dataset we repeated the process with multiple keywords to reach the desired data size.

### Install and Import Required Libraries

In [None]:
%pip install -r requirements.txt

In [None]:
import csv
import getpass
import glob
import json
import os
import random
import time
# from itertools import product

import pandas as pd
import requests
import trafilatura
from newsapi import NewsApiClient
# from bs4 import BeautifulSoup
# from requests.adapters import HTTPAdapter
# from urllib3.util.retry import Retry


### API Key

In [20]:
# Read the NewsAPI key without echoing it, so the secret is never stored in
# the saved notebook output. Surrounding whitespace from copy/paste is dropped.
key = getpass.getpass("NewsAPI key: ").strip()
if not key:
    raise ValueError("No API key was entered")

# Confirm that a key was received without revealing it.
print(f"API key received ({len(key)} characters)")

API KEY = [REDACTED]


### Search for Unique IDs (Names) of the News Providers (in English)

In [4]:
# Ask NewsAPI for the metadata of all English-language sources.
newsapi = NewsApiClient(api_key=key)
sources = newsapi.get_sources(language="en")

# Provider ids, in the order they appear in the response.
ids = [source['id'] for source in sources['sources']]

# Write out a text file of all source ids, one per line.
# The loop variable is not called `id`: that name would shadow the builtin
# `id()` for every later cell in the kernel session.
with open('source_ids.txt', 'w', encoding='utf-8') as f:
    for source_id in ids:
        f.write(f"{source_id}\n")

# This is just for checking the data structure
with open('sources.txt', 'w', encoding='utf-8') as f:
    f.write(json.dumps(sources, indent=2))


### Scraping Example

In [5]:
# An example of using NewsAPI to search for articles with keyword "trump" in all english providers
# Free account limits the result to 100 articles when there is more
url = 'https://newsapi.org/v2/everything'
response = requests.get(
    url,
    # Let requests build and URL-encode the query string.
    params={'q': 'trump', 'language': 'en'},
    # Send the key as a header so it does not appear in the request URL.
    headers={'X-Api-Key': key},
    # Do not let a stalled connection hang the notebook forever.
    timeout=30,
)
result = response.json()
result

{'status': 'ok',
 'totalResults': 67020,
 'articles': [{'source': {'id': 'wired', 'name': 'Wired'},
   'author': 'David Gilbert',
   'title': 'How Donald Trump Lost Control of the Epstein Spin Cycle',
   'description': "Donald Trump has spent years benefiting from the QAnon's Jeffrey Epstein obsession. That’s all changing.",
   'url': 'https://www.wired.com/story/how-donald-trump-lost-control-of-the-epstein-spin-cycle/',
   'urlToImage': 'https://media.wired.com/photos/691d1d514b7bcaeae916085d/191:100/w_1280,c_limit/politics_epstein_conspiracies_trump.jpg',
   'publishedAt': '2025-11-19T16:00:00Z',
   'content': 'For almost a decade, President Donald Trump has managed to control the conspiracy theory spin around disgraced financier and registered sex offender Jeffrey Epstein. The conspiracy theories benefited… [+2344 chars]'},
  {'source': {'id': 'wired', 'name': 'Wired'},
   'author': 'Mila Fiordalisi',
   'title': 'Europe Is Bending the Knee to the US on Tech Policy',
   'description

### Preparing Lists for Keywords and News Providers
Provider bias reference: https://mediabiasfactcheck.com/  
The text files listing the providers of the three categories were created manually, by looking up each provider id extracted above (see `source_ids.txt`) on that site.

In [21]:
def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file at `path`."""
    with open(path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [entry for entry in stripped_lines if entry]


# One list of news providers per category: left, centre and right
left_prov = _read_nonblank_lines('left_prov.txt')
centre_prov = _read_nonblank_lines('centre_prov.txt')
right_prov = _read_nonblank_lines('right_prov.txt')

### Assign Keyword for search
The NewsAPI free account has a limit of 50 requests per 12 hours.  
Therefore, only one keyword can be used at a time, as we have 13 x 3 = 39 providers and each provider costs one request.  
Please check **key.txt** for all the available keys we generated.

In [22]:
# Read the search keyword. Surrounding whitespace is dropped and an empty
# keyword is rejected up front, so that no API requests are spent on a
# query that cannot return anything useful.
keyword = input("Keyword to search: ").strip()
if not keyword:
    raise ValueError("Keyword must not be empty")
print(f"KEYWORD TO SEARCH = {keyword}")

KEYWORD TO SEARCH = tariff


### Scraper

In [23]:
NEWSAPI_EVERYTHING_URL = 'https://newsapi.org/v2/everything'


def fetch_article_urls(providers, keyword, api_key, timeout=30):
    """Collect the URLs of English articles matching `keyword`, one request per provider.

    Args:
        providers: iterable of NewsAPI source ids to query.
        keyword: search term, passed as the `q` parameter.
        api_key: NewsAPI key, sent in the `X-Api-Key` header.
        timeout: seconds to wait for each response before giving up.

    Returns:
        List of article URLs in the order received (may contain duplicates).

    Raises:
        ValueError: if any response does not report status "ok".
    """
    article_urls = []
    for provider in providers:
        response = requests.get(
            NEWSAPI_EVERYTHING_URL,
            # requests URL-encodes the values, so keywords containing spaces
            # or characters such as '&' are sent correctly.
            params={'q': keyword, 'language': 'en', 'sources': provider},
            # Header authentication keeps the key out of the request URL.
            headers={'X-Api-Key': api_key},
            timeout=timeout,
        )
        result = response.json()

        # check the API response status before storing the result
        if result.get("status") != "ok":
            raise ValueError(f"API error: {result.get('message', 'Unknown error')}")

        article_urls.extend(article["url"] for article in result["articles"])
    return article_urls


result_list_l = fetch_article_urls(left_prov, keyword, key)
result_list_c = fetch_article_urls(centre_prov, keyword, key)
result_list_r = fetch_article_urls(right_prov, keyword, key)

# Number of unique URLs collected per category (left, centre, right)
print(len(set(result_list_l)))
print(len(set(result_list_c)))
print(len(set(result_list_r)))




211
36
386


### Data 
Combine the extracted data (URLs) into a single CSV file and label each entry according to its provider's bias.

In [24]:
# Remove duplicate URLs while keeping the order in which they were collected.
# dict.fromkeys() is used instead of set(): the iteration order of a set of
# strings can change from one kernel session to the next, which would make
# the row order of the saved CSV non-reproducible.
left_articles = list(dict.fromkeys(result_list_l))
centre_articles = list(dict.fromkeys(result_list_c))
right_articles = list(dict.fromkeys(result_list_r))

# One frame per category, each row labelled with the provider bias
df_left = pd.DataFrame({'url': left_articles, 'label': 'left'})
df_centre = pd.DataFrame({'url': centre_articles, 'label': 'centre'})
df_right = pd.DataFrame({'url': right_articles, 'label': 'right'})

df_all = pd.concat([df_left, df_centre, df_right], ignore_index=True)
print(f"Total article count: {len(df_all)}")

# Check the dataset already there and store the current set as a new one:
# pick the first dataset_<n>.csv name that is not taken yet
base_name = "dataset_"
suffix = 0
while os.path.exists(f"{base_name}{suffix}.csv"):
    suffix += 1

filename = f"{base_name}{suffix}.csv"

df_all.to_csv(filename, index=False, encoding='utf-8')
print(f"Saved to {filename}")



Total article count: 633
Saved to dataset_0.csv


### Actual News Content Scraper

In [25]:
# Choose the desired URLs dataset for content scraping
dataset_num = input()
INPUT_FILE = f"dataset_{dataset_num}.csv" 
OUTPUT_FILE = f"content_output/dataset_{dataset_num}_content.csv"
USER_AGENT = "Student-Assignment-Bot/1.5 (educational use)"

df = pd.read_csv(INPUT_FILE)
urls = df['url'].tolist()
labels = df['label'].tolist()

In [26]:
# Make sure the folder of the output file exists; open() does not create
# missing directories and would fail on a fresh checkout otherwise.
os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)

# Open a file for output, write the header
# ('w' mode: any previous content of the file is discarded)
with open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['content', 'label'])

In [27]:
# Download every article, extract its main text with trafilatura and append
# the text together with its label to OUTPUT_FILE. URLs that fail, are not
# text/HTML pages or yield no text are skipped.
request_headers = {"User-Agent": USER_AGENT}

# The output file is opened once in append mode and flushed after every row,
# so rows already written survive an interruption of this long-running cell.
with open(OUTPUT_FILE, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    for i, (url, label) in enumerate(zip(urls, labels)):
        try:
            text = None

            # stream=True defers the body download until it is needed; the
            # context manager releases the connection on every path,
            # including non-200 responses and skipped content types.
            with requests.get(url, headers=request_headers, timeout=30, stream=True) as response:
                # Check if the response status is okay
                if response.status_code == 200:
                    content_type = response.headers.get('Content-Type', '').lower()

                    # Filter out pages that direct to a video or radio link
                    if 'text' in content_type or 'html' in content_type:
                        # Without a declared charset, requests decodes text/*
                        # bodies as ISO-8859-1, which garbles UTF-8 pages.
                        # Fall back to the encoding detected from the body.
                        if 'charset' not in content_type:
                            response.encoding = response.apparent_encoding

                        # Extract texts
                        text = trafilatura.extract(response.text)

            # Clean out non necessary new lines.
            # Store None if no text is extracted
            clean_text = text.replace('\n', ' ').replace('\r', '') if text else None

            # Exclude the situation when no text is extracted
            # Store text into the output file with proper label
            if clean_text:
                writer.writerow([clean_text, label])
                f.flush()

            # Short random pause between requests
            pause = random.uniform(0.5, 1.5)

        # Print out failure URLs for easy debugging; scraping is best-effort,
        # so a single bad URL must not stop the whole run
        except Exception as e:
            print(f"Failed on {url}: {e}")
            pause = 2

        # Print a message every 100 articles are processed, as this can take time.
        # Done outside the try block so skipped or failed URLs do not
        # suppress the progress report.
        if i % 100 == 0:
            print(f"Processed {i}/{len(urls)} articles...")

        time.sleep(pause)

print(f"Content saved to {OUTPUT_FILE}")

Processed 0/633 articles...
Processed 100/633 articles...
Processed 200/633 articles...
Processed 300/633 articles...
Processed 400/633 articles...
Processed 500/633 articles...
Processed 600/633 articles...
Content saved to content_output/dataset_0_content.csv


### Combining Content Output
Remove duplicated entries

In [28]:
output_folder = "final_dataset"
os.makedirs(output_folder, exist_ok=True)

# glob returns paths in arbitrary order. Sorting makes the combined row order
# reproducible, and with it the choice of which copy of a duplicated article
# survives drop_duplicates (the first one seen).
files = sorted(glob.glob("content_output/*.csv"))
if not files:
    raise FileNotFoundError("No CSV files found in content_output/")

# Read each content file, skipping its header row and naming the columns explicitly
dfs = [pd.read_csv(path, skiprows=1, names=["content", "label"]) for path in files]

# Stack all files and keep only the first occurrence of each article text
combined = pd.concat(dfs, ignore_index=True)
combined = combined.drop_duplicates(subset="content")

# Print out total entries of each category and the final total
label_counts = combined["label"].value_counts()
print("Counts per label:")
print(label_counts)
print(f"Total entries: {len(combined)}")

filename = os.path.join(output_folder, "final_dataset.csv")
combined.to_csv(filename, index=False, encoding='utf-8')

Counts per label:
label
right     372
left      189
centre     34
Name: count, dtype: int64
Total entries: 595


### Normalise between all three categories
This cell extracts the first 3200 articles from each category, so that all three categories have the same number of entries and the model is not biased towards one of them during training.

In [30]:
df = pd.read_csv("final_dataset/final_dataset.csv")

labels = ["left", "centre", "right"]

# Use 32 here for demonstration
# We use 3200 during the actual scraping process
n = 32

subsets = {}

for label in labels:
    # Keep at most the first n rows of this category
    subset = df[df["label"] == label].head(n)

    # head(n) silently returns fewer rows when the category is too small,
    # which would leave the "balanced" dataset unbalanced; make that visible.
    if len(subset) < n:
        print(f"Warning: only {len(subset)} '{label}' entries available "
              f"(wanted {n}); the output will not be balanced")

    # Save the per-category subset and remember it for the combined file
    filename = f"final_dataset/balanced_{label}.csv"
    subset.to_csv(filename, index=False)
    subsets[label] = subset

# Stack the three subsets in the order left, centre, right
combined = pd.concat(list(subsets.values()), ignore_index=True)

combined.to_csv("final_dataset/balanced_all.csv", index=False)