# Filtering the dataset

Create a small subset of the full dataset.

In [None]:
import sys
import os
import pandas as pd
import json

# Make the parent of the current working directory importable, so modules
# located one level above this notebook can be imported.
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
# Every input and output of this notebook lives in the "data" directory one
# level above the current working directory.
_DATA_DIR = os.path.join("..", "data")

# Inputs: the full metadata table and the directory with the article files.
METADATA_PATH = os.path.join(_DATA_DIR, "metadata.csv")
ARTICLES_CLEAN_DIR = os.path.join(_DATA_DIR, "articles_clean")

# Output: the metadata rows that survive the filter defined in this notebook.
FILTERED_METADATA_PATH = os.path.join(_DATA_DIR, "filtered_metadata.csv")

Available metadata fields

In [None]:
# Load the full metadata table and parse the publication timestamps, so the
# date-range filter further down compares real datetimes.
metadata = pd.read_csv(METADATA_PATH)
metadata["published_at"] = pd.to_datetime(metadata["published_at"])
metadata.head()  # preview the first rows and the available columns

Analysis

In [None]:
# Summary of the publication timestamps (useful for picking a date range).
metadata["published_at"].describe()

In [None]:
# Summary of the category column (for a text column: count, number of
# distinct values and the most frequent value).
metadata["category"].describe()

In [None]:
# Number of articles per category, most frequent category first.
metadata["category"].value_counts()  # 47 different categories

In [None]:
# Number of articles per section, most frequent section first.
metadata["section"].value_counts()

In [None]:
# Summary of the article lengths (helps to choose a word-count threshold).
metadata["word_count"].describe()

In [None]:
# Missing data: number of articles without an author
metadata["author"].isnull().sum()

Filter the metadata df according to your needs

Hint: Keep the dataset small enough to process comfortably (experiment a little to get a feeling for what your machine can handle).

In [None]:
# Build the subset. All active conditions are combined with `&`, so a row must
# satisfy every one of them. Uncomment a line to enable that filter.
filtered_metadata = metadata[
    # Date: published_at in [2007-01-01, 2012-01-01)
    (metadata["published_at"] >= "2007-01-01") &
    (metadata["published_at"] < "2012-01-01") &

    # Authors
    # (metadata["author"].isin(["Christine Zeiner", "Silke Farmer"])) &

    # Category
    # (metadata["category"].isin(["Politik", "Wirtschaft"])) &  # ('Politik', 'Wirtschaft', 'Kommentare', 'Gastkommentare', 'Wissen', see above for all categories)

    # Section
    # (metadata["section"].isin(["Nachrichten", "Meinung"])) &  # ('Meinung', 'Nachrichten', 'Archiv', 'Themen', 'Dossiers')

    # Word count
    # (metadata["word_count"] >= 100) &

    # Tags
    (metadata["valid_indicator"]) &  # an estimate of whether the probabilities assigned to the category are actually valid; recommended, but it reduces the dataset size
    (metadata["financial_crisis"] > 0.8) # keep rows whose financial_crisis value exceeds 0.8; available tag columns: ('financial_crisis', 'sustainability', 'fake_news', 'ai', 'digitalization', 'local_journalism', 'covid', 'demographics', 'innovation')
]
# Number of metadata rows that passed the filter, i.e. how many articles are
# expected to be loaded afterwards.
print(f"Expected number of articles: {filtered_metadata.shape[0]}")

Get filtered articles

In [None]:
def filter_articles(filtered_metadata, articles_dir):
    """
    Load the JSON articles referenced by the filtered metadata.

    Args:
        filtered_metadata: DataFrame with a "filename" column; each value is
            the name of one article file inside ``articles_dir``.
        articles_dir: Directory that contains the article JSON files.

    Returns:
        dict mapping each filename to the parsed JSON content of that file,
        in the order the filenames appear in ``filtered_metadata``. A filename
        that occurs several times yields a single entry.

    Raises:
        KeyError: if ``filtered_metadata`` has no "filename" column.
        FileNotFoundError: if a referenced article file does not exist.
        json.JSONDecodeError: if an article file does not contain valid JSON.
    """
    articles = {}
    # Only the "filename" column is needed, so iterate it directly instead of
    # building a full row Series per article with DataFrame.iterrows().
    for filename in filtered_metadata["filename"]:
        article_path = os.path.join(articles_dir, filename)
        with open(article_path, "r", encoding="utf-8") as file:
            articles[filename] = json.load(file)
    return articles

In [None]:
# Load the JSON content of every article selected by the metadata filter.
filtered_articles = filter_articles(filtered_metadata, ARTICLES_CLEAN_DIR)

In [None]:
# Check some articles from that sample
sample_id = 0  # positional index into filtered_metadata; change it to inspect another article

# Look the article up by the filename of the selected metadata row, so the
# printed metadata and article always belong together. Indexing the dict keys
# by position would drift away from the metadata rows if a filename occurred
# more than once, and it would build a list of all keys just to pick one.
sample_metadata = filtered_metadata.iloc[sample_id]

print(f"Number of articles: {len(filtered_articles)}\n")
print(f"Sample article metadata:\n {sample_metadata}\n")
print(f"Sample article:\n {filtered_articles[sample_metadata['filename']]}")

In [None]:
# Store the filtered metadata to csv
# Ask for confirmation first, because an existing file at
# FILTERED_METADATA_PATH is overwritten. The answer is normalised so that
# inputs such as "Y" or " y " are accepted as confirmation as well.
if input("Do you want to store (overwrite) the filtered metadata to csv? (y/n): ").strip().lower() == "y":
    filtered_metadata.to_csv(FILTERED_METADATA_PATH, index=False)