In [2]:
import os
import json
import pandas as pd

In [1]:
# Interest slug: selects which data/<interest>/... inputs are read and is used
# to name the CSV files this notebook writes.
interest = "ai"

In [8]:
# Gather every article JSON file for the selected interest into one table
# and persist it as a CSV.
articles = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the resulting CSV reproducible across runs and machines.
for file in sorted(os.listdir(f"data/{interest}/json")):
    if file.endswith(".json"):
        # Decode explicitly as UTF-8 (the CSV below is written as UTF-8 as well)
        # instead of relying on the platform's default text encoding.
        with open(f"data/{interest}/json/{file}", "r", encoding="utf-8") as f:
            data = json.load(f)
            articles.append(data)

# One row per parsed JSON file.
df = pd.DataFrame(articles)
df.to_csv(f"data/{interest}.csv", index=False, encoding="utf-8")

In [9]:
# Count the missing values in each column of the article table.
df.isna().sum()

authors               0
date_download         0
date_modify           0
date_publish          0
description        6094
filename              0
image_url             0
language             43
localpath        100917
maintext              0
source_domain         0
text             100917
title               240
title_page       100917
title_rss        100917
url                   0
dtype: int64

# Load stories and extract per-source bias data


In [3]:
# Parsed story-id JSON for the selected interest.
# Pre-set to None so the name is bound even if reading the file fails.
stories = None

story_ids_path = f"data/sources_by_interest/story_ids_{interest}.json"
with open(story_ids_path, mode="r", encoding="utf-8") as f:
    stories = json.load(f)

In [None]:
def _as_dict(value):
    """Return ``value`` when it is a dict, otherwise an empty dict (covers JSON ``null``)."""
    return value if isinstance(value, dict) else {}


def _join_names(items, sep):
    """Join the ``name`` of each entry in ``items`` with ``sep``; null lists, entries and names count as empty."""
    return sep.join(_as_dict(item).get("name") or "" for item in (items or []))


def extract_bias_data(data, sid, interest):
    """
    Extract only the variables useful for media bias analysis.

    Parameters:
      - data: Dict describing one source of a story (article fields plus an
        optional ``sourceInfo`` dict and an optional ``refId``).
      - sid: Story identifier, stored as ``story_id``.
      - interest: Interest slug, stored as ``interest_slug`` and used to locate
        the article file ``data/{interest}/json/{refId}.json``.

    Returned dict (keys in insertion order):
      - interest_slug, story_id: Copied from the arguments.
      - title: The article title.
      - description: The article description.
      - date: The publication date.
      - url: The article URL.

    From sourceInfo:
      - source_name: Media outlet name.
      - source_bias: The declared bias of the source.
      - source_factuality: Factuality rating.
      - source_originalBias: Original bias label.
      - source_owners: Comma-separated list of owner names.
      - source_biasRatings: Semicolon-separated bias ratings from different reviewers (format: ReviewerName:PoliticalBias).
      - source_lat & source_lon: Location coordinates.
      - source_place: Comma-separated list of place names (e.g., country).

    From the article file (only present when ``refId`` is set and the file exists):
      - article_title, article_description, article_image_url,
        article_text (the file's ``maintext``), article_date_publish,
        article_authors (semicolon-separated).

    Missing keys and explicit JSON ``null`` values are treated alike: scalar
    fields become ``None`` and joined list fields become empty strings.
    """
    result = {}
    result["interest_slug"] = interest
    result["story_id"] = sid
    # Article-level fields
    result["title"] = data.get("title")
    result["description"] = data.get("description")
    result["date"] = data.get("date")
    result["url"] = data.get("url")

    # Source-level fields. ``.get(key, default)`` only covers a missing key, so
    # an explicit null is normalised to an empty dict here.
    source_info = _as_dict(data.get("sourceInfo"))
    result["source_name"] = source_info.get("name")
    result["source_bias"] = source_info.get("bias")
    result["source_factuality"] = source_info.get("factuality")
    result["source_originalBias"] = source_info.get("originalBias")

    # Extract owners (if any)
    result["source_owners"] = _join_names(source_info.get("owners"), ", ")

    # Extract bias ratings from different reviewers
    ratings_list = []
    for br in source_info.get("biasRatings") or []:
        br = _as_dict(br)
        reviewer_name = _as_dict(br.get("reviewer")).get("name") or ""
        # A null bias is written as an empty string rather than the text "None".
        political_bias = br.get("politicalBias") or ""
        ratings_list.append(f"{reviewer_name}:{political_bias}")
    result["source_biasRatings"] = "; ".join(ratings_list)

    # Include source location (if available)
    location = _as_dict(source_info.get("location"))
    result["source_lat"] = location.get("lat")
    result["source_lon"] = location.get("lon")

    # Include place names (e.g., country)
    result["source_place"] = _join_names(source_info.get("place"), ", ")

    # Include article information; without a usable refId or article file the
    # article_* keys are simply left out of the result.
    ref_id = data.get("refId")
    if not ref_id:
        return result

    article_file = f"data/{interest}/json/{ref_id}.json"
    if not os.path.exists(article_file):
        return result

    # Decode explicitly as UTF-8 rather than with the platform default encoding.
    with open(article_file, "r", encoding="utf-8") as f:
        article_data = json.load(f)

    result["article_title"] = article_data.get("title")
    result["article_description"] = article_data.get("description")
    result["article_image_url"] = article_data.get("image_url")
    result["article_text"] = article_data.get("maintext")
    result["article_date_publish"] = article_data.get("date_publish")
    result["article_authors"] = "; ".join(article_data.get("authors") or [])

    return result

In [10]:
# Flatten the stories into one record per (story, source) pair.
data = []

for sid, story in stories.items():
    sources = story["sources"]
    # Only stories that list at least two sources are kept.
    if len(sources) >= 2:
        for source in sources:
            article = extract_bias_data(source, sid, interest)
            data.append(article)

# Tabulate the collected records and save them next to the other outputs.
df = pd.DataFrame(data)
df.to_csv(f"data/{interest}_articles.csv", index=False, encoding="utf-8")


In [11]:
# Display the per-source table; for a frame this large pandas renders only the
# first and last rows and elides the middle.
df

Unnamed: 0,interest_slug,story_id,title,description,date,url,source_name,source_bias,source_factuality,source_originalBias,...,source_biasRatings,source_lat,source_lon,source_place,article_title,article_description,article_image_url,article_text,article_date_publish,article_authors
0,ai,rss_10567_1740461771022_4,Nvidia's H20 chip orders jump as Chinese firms...,The surge in orders underlines Nvidia's domina...,2025-02-25T02:23:42.000Z,https://www.reuters.com/technology/artificial-...,Reuters,center,veryHigh,center,...,Ad Fontes Media:center; All Sides:center; Medi...,52.621203,-2.109763,United Kingdom,Exclusive: Nvidia's H20 chip orders jump as Ch...,The surge in orders underlines Nvidia's domina...,https://www.reuters.com/resizer/v2/EIP5MPMKXJP...,"Summary\nCompanies\nAlibaba, ByteDance, Tencen...",2025-02-25 00:54:57,Fanny Potkin; Che Pan
1,ai,rss_10567_1740461771022_4,DeepSeek’s AI models boost Nvidia chip demand ...,"Tencent, Alibaba and ByteDance are said to hav...",2025-02-25T01:59:54.000Z,https://www.scmp.com/tech/big-tech/article/329...,South China Morning Post,center,high,center,...,Ad Fontes Media:center; All Sides:center; Medi...,22.375863,114.096992,Hong Kong,DeepSeek’s AI models drive surging orders for ...,"Tencent, Alibaba and ByteDance are said to hav...",https://cdn.i-scmp.com/sites/default/files/sty...,"The surge in orders, which is being reported f...",2025-02-25 00:00:00,
2,ai,rss_10567_1740461771022_4,Nvidia Sees Rush For H20 AI Chip Orders As Dee...,"Chinese tech giants Tencent, Alibaba, and Byte...",2025-02-25T02:23:42.000Z,https://www.benzinga.com/25/02/43931074/chines...,Benzinga,center,veryHigh,center,...,Ad Fontes Media:center; Media Bias/Fact Check:...,40.737516,-73.975301,"New York, New York, United States",Nvidia Sees Rush For H20 AI Chip Orders As Dee...,"Chinese tech giants Tencent, Alibaba, and Byte...",https://cdn.benzinga.com/files/images/story/20...,"Chinese tech giants Tencent Holdings TCEHY, Al...",2025-02-24 21:23:42,Ananya Gairola
3,ai,rss_10567_1740461771022_4,Nvidia gets a DeepSeek-inspired boost in China...,Nvidia’s H20 chips are in high demand in China...,2025-02-25T15:59:09.000Z,https://fortune.com/2025/02/25/nvidia-china-de...,Fortune,center,high,center,...,Ad Fontes Media:center; All Sides:center; Medi...,40.705370,-73.980030,"New York, New York, United States",Nvidia gets a DeepSeek-inspired boost in China...,Nvidia’s H20 chips are in high demand in China...,https://fortune.com/img-assets/wp-content/uplo...,© 2025 Fortune Media IP Limited. All Rights Re...,2025-02-25 15:59:09,Beatrice Nolan
4,ai,rss_10567_1740461771022_4,"Alibaba, ByteDance, and Tencent boost orders f...",Nvidia's H20 chip gains traction in China amid...,2025-02-25T07:54:16.000Z,https://seekingalpha.com/news/4412923-alibaba-...,Seeking Alpha,leanRight,high,leanRight,...,Media Bias/Fact Check:leanRight,37.146554,-95.506712,United States,"Alibaba, ByteDance, and Tencent boost orders f...",Nvidia's H20 chip gains traction in China amid...,https://static.seekingalpha.com/cdn/s3/uploads...,( 2min )\nChinese companies are increasingly p...,2025-02-25 07:54:16,Arundhati Sarkar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109639,ai,rss_25139_1737979365756_29,Lambda Labs’ COO has left the AI cloud provide...,"Lambda Labs, a Nvidia partner, has lost its ch...",2025-01-27T11:26:07.000Z,https://dnyuz.com/2025/01/27/lambda-labs-coo-h...,DNyuz,leanRight,mixed,leanRight,...,Media Bias/Fact Check:leanRight,,,,Lambda Labs’ COO has left the AI cloud provide...,"Lambda Labs, a Nvidia partner, has lost its ch...",https://dnyuz.com/wp-content/uploads/2025/01/L...,"Lambda Labs, a Nvidia partner, has lost its ch...",2025-01-27 07:26:07,Business Insider; www.facebook.com
109640,ai,rss_25139_1737979365756_29,Lambda Labs' COO has left the AI cloud provide...,Lambda Labs COO Mitesh Agrawal has left to hea...,2025-01-27T11:16:06.000Z,https://jingletree.com/lambda-labs-coo-has-lef...,Jingletree,unknown,unknown,unknown,...,,,,,404 Page Not Found,New aggregator pulling together the best news ...,https://jingletree.com/site/assets/img/og-imag...,404 That’s an error.\nThe requested URL was no...,,
109641,ai,rss_19797_1737976475535_4,3 best AI altcoins under $1 that could deliver...,If you’re trying to decide on the best AI altc...,2025-01-29T00:11:46.000Z,https://globalnewsit.com/3-best-ai-altcoins-un...,GlobalNewsIt,unknown,unknown,unknown,...,,,,,3 best AI altcoins under $1 that could deliver...,3 best AI altcoins under $1 that could deliver...,https://coinjournal.net/wp-content/uploads/202...,If you’re trying to decide on the best AI altc...,2025-01-29 00:11:46,CryptoExpert
109642,ai,rss_19797_1737976475535_4,3 AI altcoins under $1 that could deliver mass...,"Analyzing PropiChain, ChainGPT, and Chromia — ...",2025-01-27T20:06:00.000Z,https://crypto.news/3-ai-altcoins-under-1-that...,crypto.news,unknown,unknown,unknown,...,,,,,3 AI altcoins under $1 that could deliver mass...,"Analyzing PropiChain, ChainGPT, and Chromia — ...",https://crypto.news/app/uploads/2024/08/crypto...,"Analyzing PropiChain, ChainGPT, and Chromia — ...",2025-01-27 20:06:00,Guest Post
