In [34]:
import os
import json
import pandas as pd

In [35]:
# Interest (topic) slug selecting which dataset to process; it is interpolated
# into the data/ file paths used by the cells below.
interest = "israeli-palestinian-conflict"

In [36]:
# articles = []

# for file in os.listdir(f"data/{interest}/json"):
#     if file.endswith(".json"):
#         with open(f"data/{interest}/json/{file}", "r") as f:
#             data = json.load(f)
#             articles.append(data)

# df = pd.DataFrame(articles)

In [37]:
# df.isna().sum()

# Load data


In [38]:
# Parse the story-id JSON file that belongs to the selected interest.
stories_path = f"data/sources_by_interest/story_ids_{interest}.json"

stories = None
with open(stories_path, mode="r", encoding="utf-8") as f:
    stories = json.load(f)

In [None]:
def _join_names(items, sep=", "):
    """Join the "name" of every dict in ``items``; a None list, entry or name counts as empty."""
    return sep.join((item or {}).get("name") or "" for item in (items or []))


def extract_bias_data(data, sid, interest):
    """
    Flatten one story source record into a single row for media bias analysis.

    Args:
        data: Source record (dict) taken from a story's "sources" list.
        sid: Story identifier, stored under "story_id".
        interest: Interest slug, stored under "interest_slug" and used to
            locate the scraped article at data/{interest}/json/{refId}.json.

    Returns:
        A dict with the following keys.

        Always present:
          - interest_slug, story_id: The ``interest`` and ``sid`` arguments.
          - title: The article title.
          - description: The article description.
          - date: The publication date.
          - url: The article URL.
          - lang, paywall: Copied from the record as-is.

        From sourceInfo (None / "" when sourceInfo or the field is missing or null):
          - source_name: Media outlet name.
          - source_slug: Media outlet slug.
          - source_bias: The declared bias of the source.
          - source_factuality: Factuality rating.
          - source_originalBias: Original bias label.
          - source_storyCount: Copied from sourceInfo["storyCount"].
          - source_owners: Comma-separated list of owner names.
          - source_biasRatings: Semicolon-separated bias ratings from different
            reviewers (format: ReviewerName:PoliticalBias).
          - source_lat & source_lon: Location coordinates.
          - source_place: Comma-separated list of place names (e.g., country).

        Only present when the record has a "refId" and the matching article
        JSON file exists (otherwise the row is returned without these keys):
          - article_title, article_description, article_image_url,
            article_text (from "maintext"), article_date_publish.
          - article_authors: Semicolon-separated list of author names.
    """
    result = {}
    result["interest_slug"] = interest
    result["story_id"] = sid

    # Article-level fields
    result["title"] = data.get("title")
    result["description"] = data.get("description")
    result["date"] = data.get("date")
    result["url"] = data.get("url")
    result["lang"] = data.get("lang")
    result["paywall"] = data.get("paywall")

    # Source-level fields.
    # `or {}` / `or []` (rather than a .get default) also covers keys that are
    # present but explicitly null in the JSON.
    source_info = data.get("sourceInfo") or {}
    result["source_name"] = source_info.get("name")
    result["source_slug"] = source_info.get("slug")
    result["source_bias"] = source_info.get("bias")
    result["source_factuality"] = source_info.get("factuality")
    result["source_originalBias"] = source_info.get("originalBias")
    result["source_storyCount"] = source_info.get("storyCount")

    # Extract owners (if any)
    result["source_owners"] = _join_names(source_info.get("owners"))

    # Extract bias ratings from different reviewers
    ratings_list = []
    for rating in source_info.get("biasRatings") or []:
        reviewer_name = (rating.get("reviewer") or {}).get("name") or ""
        # A null value becomes "" so the literal text "None" never ends up in the output.
        political_bias = rating.get("politicalBias") or ""
        ratings_list.append(f"{reviewer_name}:{political_bias}")
    result["source_biasRatings"] = "; ".join(ratings_list)

    # Include source location (if available)
    location = source_info.get("location") or {}
    result["source_lat"] = location.get("lat")
    result["source_lon"] = location.get("lon")

    # Include place names (e.g., country)
    result["source_place"] = _join_names(source_info.get("place"))

    # Include article information (only when a scraped article file exists)
    ref_id = data.get("refId")
    if not ref_id:
        return result

    article_file = f"data/{interest}/json/{ref_id}.json"
    if not os.path.exists(article_file):
        return result

    # Explicit UTF-8: the platform default encoding may not decode non-ASCII text.
    with open(article_file, "r", encoding="utf-8") as f:
        article_data = json.load(f)

    result["article_title"] = article_data.get("title")
    result["article_description"] = article_data.get("description")
    result["article_image_url"] = article_data.get("image_url")
    result["article_text"] = article_data.get("maintext")
    result["article_date_publish"] = article_data.get("date_publish")
    result["article_authors"] = "; ".join(article_data.get("authors") or [])

    return result

In [40]:
import csv

# Flatten every source of each story that has at least two sources into one
# row per source, then write the rows to a CSV file named after the interest.
data = []

for sid, story in stories.items():
    if len(story["sources"]) >= 2:
        for source in story["sources"]:
            article = extract_bias_data(source, sid, interest)
            data.append(article)

df = pd.DataFrame(data)
output_csv = f"data/{interest}_articles.csv"
df.to_csv(
    output_csv,
    index=False,
    encoding="utf-8",
    quoting=csv.QUOTE_NONNUMERIC,
)


In [41]:
# Show the assembled DataFrame as this cell's output.
df

Unnamed: 0,interest_slug,story_id,title,description,date,url,lang,paywall,source_name,source_slug,...,source_biasRatings,source_lat,source_lon,source_place,article_title,article_description,article_image_url,article_text,article_date_publish,article_authors
0,israeli-palestinian-conflict,ad2c7886-e3d1-4630-bbde-d1965f491ef1,Gaza: Six babies have died from cold in two we...,Five newborns and a two-month-old have reporte...,2025-02-25T21:31:42.000Z,https://www.bbc.com/news/articles/c62k676gk34o,en,no,BBC News,bbc-news_bf95f4,...,Ad Fontes Media:center; All Sides:center; Medi...,52.449128,-1.798570,United Kingdom,"Six babies have died from cold in two weeks, m...",Five newborns and a two-month-old have reporte...,https://ichef.bbci.co.uk/news/1024/branded_new...,At least six babies have died in the past two ...,2025-02-25 21:31:42,David Gritten
1,israeli-palestinian-conflict,ad2c7886-e3d1-4630-bbde-d1965f491ef1,Medics say 6 babies have died from the cold in...,Palestinian medics say at least six infants ha...,2025-02-25T15:36:46.000Z,https://www.independent.co.uk/news/gaza-israel...,en,sometimes,The Independent,the-independent_ff73a6,...,Ad Fontes Media:leanLeft; All Sides:leanLeft; ...,51.510428,-0.106444,"London, Greater London, England, United Kingdom",Six babies died from cold in Gaza over two wee...,Gaza health ministry records 15 deaths from hy...,https://static.independent.co.uk/2025/02/25/11...,At least six infants have died from hypothermi...,2025-02-26 03:11:37,Wafaa Shurafa; Samy Magdy
2,israeli-palestinian-conflict,ad2c7886-e3d1-4630-bbde-d1965f491ef1,Medics say 6 babies have died from the cold in...,At least six infants have died from hypothermi...,2025-02-25T17:06:59.000Z,https://www.ctvnews.ca/world/israel-hamas-war/...,en,no,CTV News,ctv-news,...,Media Bias/Fact Check:center,56.081144,-107.789090,Canada,"6 babies died from cold in Gaza, medics say",At least six infants have died from hypothermi...,https://www.ctvnews.ca/resizer/v2/TSNHSZMVGQV3...,The Shopping Trends team is independent of the...,2025-02-25 20:00:38,The Associated Press
3,israeli-palestinian-conflict,ad2c7886-e3d1-4630-bbde-d1965f491ef1,At least 6 infants dead in Gaza due to cold we...,At least six infants have died from cold-relat...,2025-02-25T22:00:34.000Z,https://www.cbc.ca/news/world/infants-dead-hyp...,en,no,CBC News,cbc-news,...,Ad Fontes Media:center; All Sides:leanLeft; Me...,56.776486,-108.378064,Canada,Medical officials in Gaza sound the alarm afte...,At least six infants have died from cold-relat...,https://i.cbc.ca/1.7467722.1740495952!/fileIma...,WARNING: This story contains an image of an in...,2025-02-25 18:02:36,Sara Jabakhanji; CBC News
4,israeli-palestinian-conflict,ad2c7886-e3d1-4630-bbde-d1965f491ef1,Medics say 6 babies have died from the cold in...,Palestinian medics say at least six infants ha...,2025-02-25T16:53:09.000Z,https://apnews.com/article/israel-palestinians...,en,no,Associated Press News,associated-press-news,...,Ad Fontes Media:center; All Sides:left; Media ...,37.300260,-94.708464,United States,Medics say 6 babies have died from the cold in...,Palestinian medics say at least six infants ha...,https://dims.apnews.com/dims4/default/9997738/...,"DEIR AL-BALAH, Gaza Strip (AP) — At least six ...",2025-02-25 15:36:46,Wafaa Shurafa; Samy Magdy; apnews.com; samy-magdy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118695,israeli-palestinian-conflict,rss_9636_1739016580753_13,"From Sartre to Mélenchon, Michel Onfray traces...","MAINTENANCE. Evidence in support, Michel Onfra...",2025-02-26T14:00:04.000Z,https://www.lejdd.fr/Societe/de-sartre-a-melen...,fr,no,lejdd.fr,lejddfr,...,,,,,"De Sartre à Mélenchon, Michel Onfray retrace l...","ENTRETIEN. Preuves à l’appui, Michel Onfray an...",https://www.lejdd.fr/lmnr/var/jdd/public/media...,Vous faites justice de l’idée reçue selon laqu...,2025-02-26 00:00:00,Propos recueillis par; Éric Naulleau
118696,israeli-palestinian-conflict,931f3782-2de1-4d8f-9e76-c42922b76a97,Those refusing to leave Gaza ‘probably Hamas’ ...,"We spoke to Fleur Hassan-Nahoum, the Special E...",2025-02-05T20:58:16.000Z,https://www.channel4.com/news/those-refusing-t...,en,no,Channel 4,channel-4,...,All Sides:leanLeft; Media Bias/Fact Check:lean...,51.490004,-0.137822,"London, Greater London, England, United Kingdom",Those refusing to leave Gaza ‘probably Hamas’ ...,"We spoke to Fleur Hassan-Nahoum, the Special E...",https://fournews-assets-prod-s3-ew1-nmprod.s3....,"We spoke to Fleur Hassan-Nahoum, the Special E...",2025-02-05 00:00:00,Krishnan Guru-Murthy; facebook.com
118697,israeli-palestinian-conflict,931f3782-2de1-4d8f-9e76-c42922b76a97,"Israeli envoy slams media, leaders for backing...",New Delhi: Israeli Ambassador to India Reuven ...,2025-02-26T13:48:59.000Z,https://www.orissapost.com/israeli-envoy-slams...,en,no,Orissa POST,orissa-post,...,,,,,,,,,,
118698,israeli-palestinian-conflict,rss_7287_1738455800744_9,Fourth non-bullying exchange of hostages and p...,"In contrast to Thursday's chaos and reprisals,...",2025-02-01T23:00:00.000Z,https://www.lavanguardia.com/internacional/202...,es,no,La Vanguardia,la-vanguardia,...,,37.176733,-3.596118,"Granada, Andalusia, Spain",Cuarto intercambio sin sobresaltos de rehenes ...,El cese al fuego entre Israel y Hamas vivió po...,https://www.lavanguardia.com/files/og_thumbnai...,El cese al fuego entre Israel y Hamas vivió po...,2025-02-02 00:00:00,Janira Gómez Muñoz
