### Metadata source

In [None]:
import json
import os
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Literal

import pandas as pd
from dotenv import load_dotenv
from langchain_core.rate_limiters import InMemoryRateLimiter
from pydantic import BaseModel
from tqdm import tqdm

# Put the parent of the current working directory on sys.path so the
# `src` package imported below can be resolved from this notebook.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

from src.articles import create_static_metadata, create_keywords_tags_fuzzy, keywords_dict
from src.llm import get_gemini_llm_client
from src.prompts import get_metadata_prompt

# Abort early when load_dotenv() reports failure (falsy return value).
# NOTE(review): the message mentions OPEN_AI_KEY although this notebook builds
# a Gemini client below — confirm which variable the .env file must provide.
dotenv_loaded = load_dotenv()
if not dotenv_loaded:
    raise Exception('Error loading .env file. Make sure to place a valid OPEN_AI_KEY in the .env file.')

In [None]:
# Throttle outgoing LLM requests to 0.5 per second, i.e. one request every
# two seconds (Gemini Free Tier); check_every_n_seconds is set to 0.1.
rate_limiter = InMemoryRateLimiter(check_every_n_seconds=0.1, requests_per_second=0.5)

# Gemini client shared by every LLM-based tagging call in this notebook.
llm_client = get_gemini_llm_client(
    rate_limiter=rate_limiter,
    temperature=0.2,
    max_tokens=1024,
)

Set up the paths to the data sources

In [None]:
# Both locations live under the sibling "data" directory, one level above
# the notebook's working directory.
_DATA_DIR = os.path.join("..", "data")

# Directory holding the cleaned article files (read as JSON by process_article).
ARTICLES_CLEAN_DIR = os.path.join(_DATA_DIR, "articles_clean")
# CSV file the extracted metadata table is written to.
METADATA_PATH = os.path.join(_DATA_DIR, "metadata.csv")

Extract metadata

In [None]:
# Schema for the LLM's structured output: process_article passes this model to
# `llm_client.with_structured_output(...)` and reads `.tags` from the response.
# NOTE: documented with `#` comments rather than a docstring so the model's
# generated schema stays exactly as it is.
class ArticleTags(BaseModel):
    # Tags extracted for a single article.
    tags: list[str]

In [None]:
def process_article(filename: str, with_tags: bool = True, tags_type: Literal['llm', 'keywords'] = 'keywords'):
    """
    Process an article and return its metadata.

    Loads the JSON file ``filename`` from ``ARTICLES_CLEAN_DIR``, builds the static
    metadata with ``create_static_metadata`` and stores the tags under the
    ``"tags"`` key of that metadata.

    :param filename: The name of the article file inside ``ARTICLES_CLEAN_DIR``.
    :param with_tags: Whether to compute tags; when False, ``"tags"`` is an empty list.
    :param tags_type: ``'llm'`` extracts tags through the LLM with structured output
        (``ArticleTags``); ``'keywords'`` matches the article text against
        ``keywords_dict()`` using ``create_keywords_tags_fuzzy``.
    :return: The metadata of the article, including the ``"tags"`` entry.
    :raises ValueError: If ``with_tags`` is True and ``tags_type`` is not supported.
    """
    # Fail fast: reject an unsupported tags_type before any file I/O or LLM call.
    if with_tags and tags_type not in ('llm', 'keywords'):
        raise ValueError(f"Invalid tags_type: {tags_type}. Must be ('llm', 'keywords').")

    article_path = os.path.join(ARTICLES_CLEAN_DIR, filename)
    with open(article_path, "r", encoding="utf-8") as file:
        article = json.load(file)

    # Create static metadata for the article
    article_metadata = create_static_metadata(article, filename)

    if not with_tags:
        tags = []
    elif tags_type == 'llm':
        # Prepare the prompt for the LLM using the article's text
        tags_prompt = get_metadata_prompt()
        query = tags_prompt.format(article_text=article["text"])

        # Invoke the LLM with structured output to extract tags
        llm = llm_client.with_structured_output(ArticleTags)
        response = llm.invoke([query])
        tags = response.tags
    else:
        # tags_type == 'keywords' (validated above)
        keywords = keywords_dict()
        tags = create_keywords_tags_fuzzy(article["text"], keywords)

    article_metadata["tags"] = tags
    return article_metadata

In [None]:
# List all cleaned article files. os.listdir returns entries in arbitrary
# order, so sort them to make the 1000-file selection below reproducible.
articles = sorted(os.listdir(ARTICLES_CLEAN_DIR))
selected = articles[:1000]
results = {}

# Adjust the max_workers based on available resources (None uses the executor's default)
with ThreadPoolExecutor(max_workers=None) as executor:
    futures = {executor.submit(process_article, filename): filename for filename in selected}
    for future in tqdm(as_completed(futures), total=len(futures)):
        filename = futures[future]
        try:
            results[filename] = future.result()
        except Exception as e:
            # Best-effort: report the failing file and keep processing the rest.
            print(f"Error processing file {filename}: {e}")

# Futures finish in arbitrary order; rebuild the rows in submission order so
# the resulting table and CSV are stable between runs. Failed files are skipped.
metadata = [results[filename] for filename in selected if filename in results]

df_metadata = pd.DataFrame(metadata)
df_metadata.to_csv(METADATA_PATH, index=False)

In [None]:
# Articles length statistics: summary statistics of the "words_count" column
# (presumably the per-article word count from create_static_metadata — confirm).
df_metadata["words_count"].describe()

Categories by WZ

In [None]:
# Summary statistics of the "category" column.
df_metadata["category"].describe()

In [None]:
# Number of rows (articles) per distinct "category" value.
df_metadata["category"].value_counts()

In [None]:
# Tags statistics (only available if with_tags=True)
# explode() produces one row per element of each list-like "tags" value, so
# value_counts() reports how often each individual tag occurs.
df_metadata.explode("tags")["tags"].value_counts()

In [None]:
# Missing authors: count of rows whose "author" value is null
df_metadata["author"].isnull().sum()

Metadata df

In [None]:
# Preview the first 5 rows of the metadata table.
df_metadata.head(5)