# Raw Data

In [1]:
# Export switch: the full Swissdox dataset cannot be shared, so cells that would
# print article text check this flag before producing output.
EXPORT = True

# Name and directory of the raw tab-separated query export.
raw_file_name = "cd558cca-53cb-4ff9-a3f5-89f70e139051__2024_10_13T18_23_54.tsv"
raw_data_path = "./data/20min-test-query-2020-jan-jun/raw-data/"

In [2]:
import pandas as pd
import os
from pathlib import Path

import seaborn as sns
from collections import Counter
import lxml.html
from bs4 import BeautifulSoup



In [None]:
# Peek at the raw file: print the header line and the first record with tabs made visible.
# The encoding is passed explicitly (UTF-8 is also the default pd.read_csv uses on
# this same file) so the result does not depend on the platform's locale.
with open(os.path.join(raw_data_path, raw_file_name), encoding="utf-8") as file:
    raw_file = file.readlines()

print(raw_file[0])
first_record = raw_file[1].replace("\t", " <tab> ")
# When exporting, only a 60-character prefix of the record is shown.
print(first_record[:60] if EXPORT else first_record)

In [None]:
# Load the tab-separated export, then show its dimensions and the first
# record laid out vertically (one field per row).
df = pd.read_csv(os.path.join(raw_data_path, raw_file_name), delimiter="\t")
print(df.shape)
print(df.head(1).T)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Columns to discard (named as NaN columns; confirm against the df.info() output).
drop_nan_columns = ["rubric", "regional", "subhead"]
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=drop_nan_columns, errors="ignore")
df.shape

In [None]:
# DataFrame.info() prints the summary itself and returns None; no print() needed.
df.info()

In [None]:
# Display the raw article markup column (pandas truncates long cells in the repr).
df.loc[:, "content"]

# Doctype
doctype
doctype_description 

In [None]:
# Non-null value count of every column, per doctype description.
df.groupby("doctype_description").count()

There is no information about which rubric an article is from... Does this have to be scraped?    
It would be interesting, but is it relevant?

# HTML Tag statistics

In [10]:
def accumulate_counters(counter_list):
    """Add up an iterable of Counters into one new Counter.

    The inputs are left untouched. The running total is built with Counter's
    in-place addition, which discards keys whose total is not positive.
    """
    merged = Counter()
    for partial in counter_list:
        merged += partial
    return merged


In [None]:
# Split the corpus by article language and check whether any rows belong to neither subset.
df_de = df.loc[df["language"].eq("de")]
df_fr = df.loc[df["language"].eq("fr")]

print(f"all: {df.shape}, de: {df_de.shape}, fr: {df_fr.shape}")
print(f"difference?: {len(df) - len(df_de) - len(df_fr)}")

In [None]:
# First ten German articles (positional slice).
df_de["content"].iloc[0:10]

In [13]:
# Article bodies must not end up in an exported notebook, so the loop is
# skipped entirely when exporting (instead of evaluating a no-op per row).
if not EXPORT:
    for row in df_de["content"].iloc[0:10]:
        print(row)

In [None]:
def get_html_tag_occurence(content):
    """Return a Counter holding each distinct tag of one article exactly once.

    The tags are collected into a set first, so summing these Counters over
    many articles yields the number of articles that contain each tag, not
    the total number of tag instances.
    """
    return Counter({element.tag for element in lxml.html.fromstring(content).iter()})


tag_occurence = list(df_de["content"].apply(get_html_tag_occurence))
accumulator_occurence = accumulate_counters(tag_occurence)
pd.DataFrame(accumulator_occurence, index=["counts"]).sort_values(by=["counts"], ascending=False, axis=1)

* Every article has the following:
    * tx -> whole text wrapped
    * ld -> lead text (except one article)
    * p -> multiple text elements
        * If there is a last element with brackets this might be one or multiple authors:
            * `<p>(SDA)</p></tx>`


* The other tags should be checked if there is something relevant extractable
    * zt -> zwischentitel(subheading)
    * a -> anchor -> links to 
        * other websites or 
        * other articles `<a href="#showid=305209&amp;index=0">La Cumbre-Ausbruch bedroht seltene Tierarten</a>` -> not sure what this is referencing
        * other articles `<a href="https://www.20min.ch/ausland/news/story/Saemtliche-Fluege-wegen-Vulkanausbruch-anulliert-22651640">Taal</a>`
    * br -> breaks
        * These breaks are also part of twitter messages, as indicated by `<ka>` <br/>
            `<ka><p>¡Hey! ??<br/><br/>Tiempo de jugar... ??<br/><br/>¿Nos`...
    * ka -> external embeddings?
        * also a "darum gehts" box as indicated by:
            * `<ka><p>Darum gehts</p><p>`
            * each bullet point has its own `<p>...</p>`    
    * lg -> lead graphic (text of first picture on the article) -> include/exclude?
    * au -> author full name (not short name as in last p element)



Does it look similar for the french version?

In [None]:
# Same per-article tag occurrence as for the German subset, now on the French
# articles. get_html_tag_occurence is already defined in the German cell above,
# so the identical definition is not repeated here.
tag_occurence = list(df_fr["content"].apply(get_html_tag_occurence))
accumulator_occurence = accumulate_counters(tag_occurence)
pd.DataFrame(accumulator_occurence, index=["counts"]).sort_values(by=["counts"], ascending=False, axis=1)

Yes, it looks similar.

In [None]:
def get_html_tag_counts(content):
    """Return a Counter with the number of instances of every tag in one article."""
    return Counter(element.tag for element in lxml.html.fromstring(content).iter())


tags_counts = list(df_de["content"].apply(get_html_tag_counts))

tags_counts[:4]

In [None]:
# Total tag counts over all German articles as a one-row frame,
# columns ordered from most to least frequent tag.
accumulator_counter = accumulate_counters(tags_counts)
plot_df = (
    pd.DataFrame(accumulator_counter, index=["counts"])
    .sort_values(by=["counts"], ascending=False, axis=1)
)
plot_df


In [None]:
# Label the figure so it can be read on its own; the trailing ";" suppresses
# the repr of the value returned by ax.set.
ax = sns.barplot(data=plot_df)
ax.set(title="HTML tag counts (German articles)", xlabel="tag", ylabel="count");

In [None]:
# Same plot without the dominant <p> tag so the remaining tags become visible.
ax = sns.barplot(data=plot_df.drop(columns=["p"]))
ax.set(title="HTML tag counts without <p> (German articles)", xlabel="tag", ylabel="count");

# Let us also extract the following:

* `<ld>`: lead text
* last ``<p>`` if encased in (): author(s) 
* ``<zt>``: subheadings
* ``<ka>``: differentiate if external website or not

What are `lg` and `au`?

In [None]:
def get_lg(article:str):
    """Return the text of the first ``<lg>`` element of an article, or None.

    Args:
        article: Raw article markup (the ``content`` column).

    Returns:
        A plain ``str`` with the element's text, or ``None`` when the article
        has no ``<lg>`` element or the element holds no text.
    """
    # Name the parser explicitly: otherwise bs4 guesses one (emitting
    # GuessedAtParserWarning) and the parse may differ between environments.
    soup = BeautifulSoup(article, "lxml")
    tag = soup.lg
    if tag is None:
        return None
    if tag.string is not None:
        # str() detaches the value from the parse tree; a NavigableString
        # would keep the whole soup alive for every row stored in the Series.
        return str(tag.string)
    # .string is None when the element has several children (e.g. text mixed
    # with <br/>); fall back to the joined text instead of dropping it.
    return tag.get_text(" ", strip=True) or None
    
series_lg = df_de["content"].apply(get_lg)

# Print the first three non-null results.
for row in series_lg.dropna().iloc[0:3]:
    print(row)

In [None]:
def get_au(article:str):
    """Return the text of the first ``<au>`` element of an article, or None.

    Args:
        article: Raw article markup (the ``content`` column).

    Returns:
        A plain ``str`` with the element's text, or ``None`` when the article
        has no ``<au>`` element or the element holds no text.
    """
    # Name the parser explicitly: otherwise bs4 guesses one (emitting
    # GuessedAtParserWarning) and the parse may differ between environments.
    soup = BeautifulSoup(article, "lxml")
    tag = soup.au
    if tag is None:
        return None
    if tag.string is not None:
        # str() detaches the value from the parse tree; a NavigableString
        # would keep the whole soup alive for every row stored in the Series.
        return str(tag.string)
    # .string is None when the element has several children; fall back to
    # the joined text instead of dropping it.
    return tag.get_text(" ", strip=True) or None
    
series_au = df_de["content"].apply(get_au)

# Print the first three non-null results.
for row in series_au.dropna().iloc[0:3]:
    print(row)

In [None]:
# A comma inside the <au> text is taken as the sign of several authors.
has_author_tag = series_au.notna()
print("articles with multiple authors:", series_au[has_author_tag].str.contains(",").sum())
# Number of articles per distinct <au> value, rarest first.
series_au.groupby(series_au).count().sort_values()

In [23]:
def get_lead(article:str ):
    """Return the lead text (first ``<ld>`` element) of an article, or None.

    Args:
        article: Raw article markup (the ``content`` column).

    Returns:
        A plain ``str`` with the lead text, or ``None`` when the article has
        no ``<ld>`` element or the element holds no text.
    """
    # Name the parser explicitly: otherwise bs4 guesses one (emitting
    # GuessedAtParserWarning) and the parse may differ between environments.
    soup = BeautifulSoup(article, "lxml")
    tag = soup.ld
    if tag is None:
        return None
    if tag.string is not None:
        # str() detaches the value from the parse tree; a NavigableString
        # would keep the whole soup alive for every row stored in the Series.
        return str(tag.string)
    # .string is None when the lead has several children (e.g. text mixed
    # with <br/> or <a>); fall back to the joined text instead of dropping it.
    return tag.get_text(" ", strip=True) or None

In [None]:
# Lead text of every German article (None where nothing was extracted).
df_de.loc[:, "content"].apply(get_lead)

In [25]:
def get_authors(article:str):
    """Extract author names from a trailing ``<p>(...)</p>`` element.

    If the text of the last ``<p>`` element is fully enclosed in parentheses,
    the enclosed text is split on "," or, when it contains no comma, on "/"
    into a list of authors.

    Args:
        article: Raw article markup (the ``content`` column).

    Returns:
        A list of author strings with surrounding whitespace removed, or
        ``None`` when the article has no ``<p>`` element or its last ``<p>``
        is not of the form ``(...)``.
    """
    # Name the parser explicitly: otherwise bs4 guesses one (emitting
    # GuessedAtParserWarning) and the parse may differ between environments.
    soup = BeautifulSoup(article, "lxml")
    paragraphs = soup.find_all("p")
    if not paragraphs:
        return None

    last_p = paragraphs[-1].text.strip()
    # startswith/endswith are safe on an empty paragraph, where indexing
    # last_p[0] raised IndexError.
    if not (last_p.startswith("(") and last_p.endswith(")")):
        return None

    inner_last_p = last_p[1:-1]
    if "," in inner_last_p:
        parts = inner_last_p.split(",")
    elif "/" in inner_last_p:
        parts = inner_last_p.split("/")
    else:
        parts = [inner_last_p]
    # Drop the blanks that follow separators ("SDA, DPA" -> "SDA", "DPA").
    return_value = [part.strip() for part in parts]

    # sanity check, there should hopefully not be a broken up string
    if len(return_value) > 3:
        print(return_value)

    return return_value
    

In [None]:
authors_per_article = df_de["content"].apply(get_authors)
# Articles without an extracted author list (None) count as zero authors.
num_authors = authors_per_article.apply(lambda authors: 0 if not authors else len(authors))
sns.histplot(num_authors)

In [None]:
# Number of articles for which no author list was extracted.
authors_per_article.isnull().sum()

In [None]:
# validate nonstandard authors that are not 2 or 3 chars
for authors in authors_per_article:
    # Nothing extracted (None or empty list): nothing to validate.
    if not authors:
        continue
    # Entries that still contain a "/" are skipped.
    if any("/" in name for name in authors):
        continue
    if len(authors) == 1 and len(authors[0]) not in (2, 3):
        print(authors)


Check overlap of `<au>` and extracted authors

In [None]:
# NOTE(review): the value printed is the number of *missing* entries summed over
# both series, although the label speaks of matches — confirm the intent.
missing_total = sum(series.isna().sum() for series in (authors_per_article, series_au))
print("Total matches are more than the samples in the dataframe: ", missing_total)

In [None]:
# Articles for which both sources yield a value. The vectorised .sum()
# replaces the builtin sum(), which iterated over the Series element by element.
print("number of overlapping: ", (authors_per_article.notna() & series_au.notna()).sum())

In [None]:
# Articles for which neither source yields a value. The vectorised .sum()
# replaces the builtin sum(), which iterated over the Series element by element.
print("number of both empty: ", (authors_per_article.isna() & series_au.isna()).sum())

In [None]:
# Overlap is basically SDA, Reuters and DPA... in series_au, authors it is people
# First ten extracted author lists of articles where both sources are present.
both_present = authors_per_article.notna() & series_au.notna()
authors_per_article[both_present].head(10)

In [None]:
# NOTE(review): the mask selects articles where BOTH author sources are empty,
# which does not match the "overlap example:" label — confirm which was intended.
# The trailing triple-quoted string with disabled code was removed: as the
# cell's last expression it was echoed as cell output.
print("overlap example:")
neither_has_author = authors_per_article.isna() & series_au.isna()
for url in df_de.loc[neither_has_author, "article_link"].iloc[-10:]:
    print(url)

In [34]:
def get_subheadings(article:str):
    """Return the texts of all ``<zt>`` (subheading) elements of an article.

    Args:
        article: Raw article markup (the ``content`` column).

    Returns:
        A list with the text of every ``<zt>`` element in document order, or
        ``None`` when the article contains no ``<zt>`` element.
    """
    # Name the parser explicitly: otherwise bs4 guesses one (emitting
    # GuessedAtParserWarning) and the parse may differ between environments.
    soup = BeautifulSoup(article, "lxml")
    # Search once and reuse the result instead of probing soup.zt first.
    subheading_tags = soup.find_all("zt")
    if not subheading_tags:
        return None
    return [zt.text for zt in subheading_tags]

In [None]:
res = df_fr["content"].apply(get_subheadings)
# Label-based lookup: df_fr keeps the row labels of df, so this raises
# KeyError if the row labelled 4 is not a French article.
res.loc[4]

In [None]:
# First ten results (positional slice).
res.iloc[0:10]

# Conclusion on HTML tags

### Available tags:
* tx -> whole text wrapped
    * just remove the tag itself
* ld -> lead text
    * extract to column
* p -> multiple text elements
    * just remove the tags
* p -> last element iff matches authors
    * remove the authors and extract to column
    
* zt -> zwischentitel(subheading)
    * extract to column
* a -> anchor
    * remove the urls but keep the annotated text within tag
* br -> breaks
    * remove the tags
* ka -> external embeddings?
    * remove the tags
* lg -> lead graphic (text of first picture on the article) -> include/exclude?
    * strip out content of tag, this is in reference to one of many images
* au -> author full name (not short name as in last p element)
    * for now: create a second column, since the overlap between the authors extracted from the last `<p>` and from `<au>` is small, and where they do overlap it is `SDA`, `Reuters`... vs `<human name>`



# New Columns should be:
* id
* pubtime
* language
* char_count
* dateline
* head
* article_link
* content
* lead_text
* subheadings
* author_extracted (extracted from last `<p>`)
* author (extracted from `<au>` tag)
* text (cleaned ``content``)
    * remove just tags:
        * `<tx>`
        * `<br>`
        * `<ka>`
    * remove substring from content
        * last `<p>` if it is a match for author filter
        * urls from `<a>` tag -> check if general html tag remover also does this easily
        * `<lg>` text since it is related to the image not the article.