### Metadata source

In [2]:
import sys
import os
import pandas as pd
import json
from tqdm import tqdm
from dotenv import load_dotenv
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_core.rate_limiters import InMemoryRateLimiter

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.articles import create_static_metadata, create_keywords_tags_fuzzy, keywords_dict
from src.llm import get_gemini_llm_client
from src.prompts import get_metadata_prompt

if not load_dotenv():
    raise Exception('Error loading .env file. Make sure to place a valid OPEN_AI_KEY in the .env file.')

In [3]:
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.5,  # <-- Gemini Free Tier
    check_every_n_seconds=0.1,
)

llm_client = get_gemini_llm_client(
    max_tokens=1024,
    temperature=0.2,
    rate_limiter=rate_limiter,
)

Setup the paths to data sources

In [4]:
ARTICLES_CLEAN_DIR = os.path.join("..", "data", "articles_clean")
METADATA_PATH = os.path.join("..", "data", "metadata.csv")

Extract metadata

In [5]:
class ArticleTags(BaseModel):
    tags: list[str]

In [6]:
def process_article(filename: str, with_tags: bool = True, tags_type: ('llm', 'keywords') = 'keywords'):
    """
    Process an article and return its metadata.
    :param filename: The name of the article file.
    :param with_tags: Whether to include tags in the metadata.
    :param tags_type: The type of tags to include in the metadata.
    :return: The metadata of the article.
    """    
    article_path = os.path.join(ARTICLES_CLEAN_DIR, filename)
    with open(article_path, "r", encoding="utf-8") as file:
        article = json.load(file)
    
    # Create static metadata for the article
    article_metadata = create_static_metadata(article, filename)
    
    if with_tags:
        if tags_type == 'llm':
            # Prepare the prompt for the LLM using the article's text
            tags_prompt = get_metadata_prompt()
            query = tags_prompt.format(article_text=article["text"])
            
            # Invoke the LLM with structured output to extract tags
            llm = llm_client.with_structured_output(ArticleTags)
            response = llm.invoke([query])
            tags = response.tags
        elif tags_type == 'keywords':
            keywords = keywords_dict()
            tags = create_keywords_tags_fuzzy(article["text"], keywords)
        else:
            raise ValueError(f"Invalid tags_type: {tags_type}. Must be ('llm', 'keywords').")
    else:
        tags = []
    
    article_metadata["tags"] = tags
    return article_metadata

In [7]:
# List all cleaned article files
articles = os.listdir(ARTICLES_CLEAN_DIR)
metadata = []

# Adjust the max_workers based on available resources (None is max)
with ThreadPoolExecutor(max_workers=None) as executor:
    futures = {executor.submit(process_article, filename): filename for filename in articles}
    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            result = future.result()
            metadata.append(result)
        except Exception as e:
            print(f"Error processing file {futures[future]}: {e}")


df_metadata = pd.DataFrame(metadata)
df_metadata.to_csv(METADATA_PATH, index=False)

100%|██████████| 87754/87754 [1:19:56<00:00, 18.30it/s]  


In [8]:
# Articles length statistics
df_metadata["words_count"].describe()

count    87754.000000
mean       511.983670
std        351.851569
min          1.000000
25%        281.000000
50%        422.000000
75%        632.000000
max       5931.000000
Name: words_count, dtype: float64

Categories by WZ

In [9]:
df_metadata["category"].describe()

count       87754
unique         47
top       Politik
freq        31919
Name: category, dtype: object

In [10]:
df_metadata["category"].value_counts()

category
Politik                                  31919
Wirtschaft                               16512
Kommentare                               11408
Gastkommentare                            7253
Wissen                                    5632
Europaarchiv                              5341
Leitartikel                               3292
Analysen                                  2478
Reflexionen                               2308
Recht                                      652
Auf Justitias Spuren                       196
Leserforum                                 170
Klimawandel                                 52
Wiener Zeitung - seit 1703                  47
Sterbehilfe                                 44
1914                                        39
100 Jahre Republik                          38
Stadtentwicklung                            28
Wald                                        27
100 Jahre Verfassung                        26
EU für mich                                 26
Asyl

In [11]:
# Tags statistics (only available if with_tags=True)
df_metadata.explode("tags")["tags"].value_counts()

tags
other               73976
COVID                5415
Fake News            4610
Digitalization       1149
Demographics          990
Innovation            823
Financial Crises      612
Sustainability        138
AI                     37
Local Journalism        4
Name: count, dtype: int64

In [12]:
# Missing authors
df_metadata["author"].isnull().sum()

np.int64(77)

Metadata df

In [13]:
df_metadata.head(5)

Unnamed: 0,id,title,author,published_at,words_count,filename,category,section,tags
0,cbbd50ec-c07b-4fcf-b7bf-dd0b7bacc887,100.000 Plätze mehr in zehn Jahren,Alexandra Grass,2003-10-08 00:00,264,100000-platze-mehr-in-zehn-jahren.json,Politik,Nachrichten,other
1,a0799ad0-f4ed-4989-8c9a-afa6e88677c6,1.700 Lehrlinge ohne Ausbildung,Werner Grotte,2004-10-06 00:00,255,1700-lehrlinge-ohne-ausbildung.json,Wirtschaft,Nachrichten,other
2,9866b0a2-9c97-4a49-b5c3-d6803a3f9eac,14 Gemeinden erheben schwere Vorwürfe gegen Ra...,Kid Möchel,2011-06-17 18:28,530,14-gemeinden-erheben-schwere-vorwurfe-gegen-ra...,Wirtschaft,Nachrichten,other
3,1e9ee2dd-f1cc-42c3-a61b-47cd64dd3fca,10.000 syrische Babys - geboren in einem ander...,Maysoon Mohammad Khalaf Al-Hijazat,2017-08-16 13:23,522,10000-syrische-babys-geboren-in-einem-anderen-...,Gastkommentare,Meinung,other
4,4fe39ec3-426f-4608-9def-225229a4a476,1700 Euro steuerfrei: Was der SPÖ-Plan bringt,Karl Ettinger,2019-08-26 18:14,824,1700-euro-steuerfrei-was-der-spo-plan-bringt.json,Politik,Nachrichten,other
