In [6]:
import spacy
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pipeline.service import FileService
import urllib.request

nlp_de = spacy.load('de_core_news_sm')
lemmatizer = WordNetLemmatizer()

cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned', file_dir='../' + FileService.default_processed_path)

texts = list(cleaned_df[cleaned_df['language'] == 'de']['content'])
custom_german_stopwords:set = {
    " ", "\x96", "the", "to", "of", "20", "minuten",
}

with urllib.request.urlopen('https://raw.githubusercontent.com/solariz/german_stopwords/refs/heads/master/german_stopwords_full.txt') as f:
    german_stopwords_full = f.read().decode('utf-8')

# Test if https://github.com/solariz/german_stopwords/blob/master/german_stopwords_full.txt helps
# with open(os.path.normpath("./german_stopwords_full.txt"), "r") as f:
#    german_stopwords_full = f.readlines()

stop_words = set(stopwords.words("german")) | set(german_stopwords_full) | custom_german_stopwords

processed_texts = []
for idx in range(len(texts)):
    if idx % 100 == 0:
        print(f"At step: {idx} of {len(texts)}")
    doc = texts[idx]
    # Tokenize the document
    doc = nlp_de(str(doc).lower())  # Lowercase and tokenize
    tokenized_articles = [token.text for token in doc if not token.is_stop and not token.is_punct]

    # Lemmatize words and remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokenized_articles if token not in stop_words]

    processed_texts.append(lemmatized_tokens)

processed_texts

At step: 0 of 8044
At step: 100 of 8044
At step: 200 of 8044
At step: 300 of 8044
At step: 400 of 8044
At step: 500 of 8044
At step: 600 of 8044
At step: 700 of 8044
At step: 800 of 8044
At step: 900 of 8044
At step: 1000 of 8044
At step: 1100 of 8044
At step: 1200 of 8044
At step: 1300 of 8044
At step: 1400 of 8044
At step: 1500 of 8044
At step: 1600 of 8044
At step: 1700 of 8044
At step: 1800 of 8044
At step: 1900 of 8044
At step: 2000 of 8044
At step: 2100 of 8044
At step: 2200 of 8044
At step: 2300 of 8044
At step: 2400 of 8044
At step: 2500 of 8044
At step: 2600 of 8044
At step: 2700 of 8044
At step: 2800 of 8044
At step: 2900 of 8044
At step: 3000 of 8044
At step: 3100 of 8044
At step: 3200 of 8044
At step: 3300 of 8044
At step: 3400 of 8044
At step: 3500 of 8044
At step: 3600 of 8044
At step: 3700 of 8044
At step: 3800 of 8044
At step: 3900 of 8044
At step: 4000 of 8044
At step: 4100 of 8044
At step: 4200 of 8044
At step: 4300 of 8044
At step: 4400 of 8044
At step: 4500 of 8044


[['a380',
  'singapore',
  'airline',
  'landeanflug',
  'flughafen',
  'kloten',
  'abbrechen',
  'mal',
  'geklappt',
  'video',
  'leser',
  'reporter',
  'zeigt',
  'a380',
  'gigant',
  'durchstarten',
  'leser',
  'reporter',
  'berichtet',
  'traf',
  'a330',
  '300',
  'swiss',
  'flieger',
  'aviv'],
 ['konkret',
  'polizei',
  'durchsetzung',
  'kontrolle',
  'verbote',
  'bestehende',
  'beauftragten',
  'öffentlichkeit',
  'datenschutz',
  'bewilligte',
  'optisch',
  'elektronische',
  'überwachungsanlagen',
  'öffentlich',
  'zugänglicher',
  'räume',
  'echtzeitüberwachung',
  'einsetzen',
  'steht',
  'sonderverordnung',
  'regierungsrats',
  'gemäss',
  'verordnung',
  'bundesrats',
  'menschenansammlungen',
  'personen',
  'öffentlichen',
  'raum',
  'namentlich',
  'öffentlichen',
  'plätzen',
  'spazierwegen',
  'parkanlagen',
  'verboten',
  'ansammlungen',
  'personen',
  'einzelnen',
  'personen',
  'abstand',
  'mindestens',
  'metern',
  'einzuhalten',
  'poliz

Maybe we can optimize what we actually select... but how?

There are some problematic characters

In [2]:
# %%script echo skipping
import re

strip_chars = "".join(["«", "»"])
replace_empty = "".join(["-", "/", "|", "#", ".", "…"])


de_df = cleaned_df[cleaned_df['language'] == 'de']
article_list = list(de_df['content'])
article_list =  [re.sub(r'[«»]', '', article) for article in article_list]
article_list =  [re.sub(r'[-/|#.…]', ' ', article) for article in article_list]


Couldn't find program: 'echo'


In [13]:
# %%script echo skipping
from pipeline.service import FileService
import spacy
nlp_de = spacy.load('de_core_news_sm')

cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned', file_dir='../data/processed')
de_df = cleaned_df[cleaned_df['language'] == 'de']
article_list = list(de_df['content'])

In [14]:
# %%script echo skipping
article_list = list(de_df['content'])
#article_list =  [re.sub(r'[«»]', '', article) for article in article_list]
#article_list =  [re.sub(r'[-/|#.…]', ' ', article) for article in article_list]


articles = nlp_de.pipe(article_list, disable=["tagger", "ner", "textcat"], n_process=4)

In [15]:
# %%script echo skipping
i = 0
for article in articles:
    i = i + 1
    if i > 3:
        break
    print(article)
    alphas = [(token, token.lemma_.lower()) for token in article if not token.is_alpha and not token.is_punct]
    print(alphas)

 Der A380 der Singapore Airline musste den Landeanflug auf den Flughafen Kloten abbrechen. Beim zweiten Mal hat es geklappt wie ein Video eines Leser Reporters zeigt. Nicht nur der A380 Gigant musste durchstarten. Wie ein Leser Reporter berichtet, traf es auch eine A330 300 der Swiss. Der Flieger kam von Tel Aviv. 
[( , ' '), (A380, 'a380'), (A380, 'a380'), (A330, 'a330'), (300, '300')]
 Konkret kann die Polizei zur Durchsetzung und Kontrolle der Verbote bestehende, von der Beauftragten für Öffentlichkeit und Datenschutz bewilligte optisch elektronische Überwachungsanlagen öffentlich zugänglicher Räume zur Echtzeitüberwachung einsetzen. So steht es in der Sonderverordnung des Regierungsrats. Gemäss Verordnung des Bundesrats sind unter anderem Menschenansammlungen von mehr als fünf Personen im öffentlichen Raum, namentlich auf öffentlichen Plätzen, auf Spazierwegen und in Parkanlagen, verboten. Bei Ansammlungen von bis zu fünf Personen sind zwischen den einzelnen Personen ein Abstand vo

In [16]:
# %%script echo skipping
i = 0
for article in articles:
    i = i + 1
    if i > 3:
        break
    print(article)
    alphas = [(token, token.lemma_.lower()) for token in article if  not token.is_punct and not token.is_space]
    print(alphas)

 Dies war das Beste, was der anscheinend völlig überforderte Restaurator …Screenshot … in Valencia zustande brachte. Die Kunst Katastrophe erinnert an das missglückte Jesus Fresko aus dem Jahr 2012.KEYSTONE Darum gehts Ein Restaurator war mit einem Auftrag völlig überfordert. Das Marienbildnis war kau mwiederzuerkennen. Es ist kein Einzelfall. Eine nach Medienberichten sehr wertvolle Kopie eines der Marienbildnisse des bedeutenden spanischen Barockmalers Bartolomé Esteban Murillo wurde in Valencia von einem mit der Ausbesserung beauftragten Restaurator bis zur Unkenntlichkeit verunstaltet. Die Nachrichtenagentur Europa Press veröffentlichte am Montag Davor und Danach Bilder, die das Ausmass der Pfuscharbeit verdeutlichen. Der Auftraggeber, ein Privatsammler, der für den Job 1200 Euro im Voraus bezahlt habe, sei aus dem Staunen nicht herausgekommen, als er das Ergebnis der völlig missglückten Arbeit gesehen habe. Der Urheber der Kunst Katastrophe erhielt den Berichten zufolge sogar eine

Manual selection of: https://universaldependencies.org/u/pos/

# Dictionaries with some POS removed
Dataset 1 - Throw away those:
* "ADP", adapositon
* "ADV", adverb
* "AUX", auxiliary
* "CCONJ", coordinating conunction
* "DET", determiner
* "INTJ", interjection
* "NUM", numeral
* "PART", particle
* "PRON", pronoun
* "PUNCT", punctuation
* "SCONJ", subordinating conjunction
* "SYM" symbol

I think it might make mostly sense to keep these double words?

In [18]:
import spacy
import os
from gensim import corpora
from pipeline.service import FileService

stopwords = {"#", "*", "--"}

cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned', file_dir='../data/processed')
de_df = cleaned_df[cleaned_df['language'] == 'de']

nlp_de = spacy.load('de_core_news_sm')
nlp_de.Defaults.stop_words |= stopwords


In [20]:
articles = nlp_de.pipe(de_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# Tags to be removed: https://universaldependencies.org/u/pos/
pos_to_remove= ["ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NUM", "PART", "PRON", "PUNCT", "SCONJ", "SYM", ]

tokenized_articles = []
i = 0
for article in articles:
   i += 1
   if i % 1000 == 0:
      print(f"At step: {i} of {len(de_df)}")

   article_tokens = []
   for token in article:
      if (
         token.pos_ not in pos_to_remove # Remove defined parts of speech
         and not token.is_stop # Token is not a stopword
         and not token.is_space
      ):
         article_tokens.append(token.lemma_.lower())

   tokenized_articles.append(article_tokens)

dictionary_german_removed_pos = corpora.Dictionary(tokenized_articles)
dictionary_german_removed_pos.save(fname_or_handle=os.path.normpath("../models/dictionaries/dictionary-german-removed-pos"))
print("Exported dictionary: dictionary_german_removed_pos")

At step: 1000 of 8044
At step: 2000 of 8044
At step: 3000 of 8044
At step: 4000 of 8044
At step: 5000 of 8044
At step: 6000 of 8044
At step: 7000 of 8044
At step: 8000 of 8044
Exported dictionary: dictionary_german_removed_pos


In [None]:
#dictionary_german_removed_pos.most_common(n=300)

# Dictionaries with only nouns
Inspired by https://aclanthology.org/U15-1013

In [21]:
articles = nlp_de.pipe(de_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# The only tag to keep is nouns: https://universaldependencies.org/u/pos/
nouns = ["NOUN", "PROPN"]

tokenized_articles = []
i = 0
for article in articles:
   i += 1
   if i % 1000 == 0:
      print(f"At step: {i} of {len(de_df)}")

   article_tokens = []
   for token in article:
      if (
         token.pos_ in nouns
         and not token.is_stop
         ):
         article_tokens.append(token.lemma_.lower())

   tokenized_articles.append(article_tokens)

dictionary_german_noun_only = corpora.Dictionary(tokenized_articles)
dictionary_german_noun_only.save(fname_or_handle=os.path.normpath("../models/dictionaries/dictionary-german-noun-only"))
print("Exported dictionary: dictionary_german_noun-only")

At step: 1000 of 8044
At step: 2000 of 8044
At step: 3000 of 8044
At step: 4000 of 8044
At step: 5000 of 8044
At step: 6000 of 8044
At step: 7000 of 8044
At step: 8000 of 8044
Exported dictionary: dictionary_german_noun-only


Very few weird words in top 300:

```py
stopwords = ["\x96", "the", "#", "keystone", "*", "--", "a"]
```

In [None]:
#dictionary_german_noun_only.most_common(n=300)

# Dictionaries with some POS removed

In [4]:
import spacy
import os
from gensim import corpora
from pipeline.service import FileService


cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned', file_dir='../data/processed')
fr_df = cleaned_df[cleaned_df['language'] == 'fr']

nlp_fr = spacy.load('fr_core_news_sm')


In [25]:
articles = nlp_fr.pipe(fr_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# Tags to be removed: https://universaldependencies.org/u/pos/
pos_to_remove= ["ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NUM", "PART", "PRON", "PUNCT", "SCONJ", "SYM", ]
stopwords = ["«", "-", " ", "m.", "#", "–", "\x96", "*", "c\x92est", "d\x92un", "-t", "/" ,
             "qu\x92il", "webtv@20minutes.ch", "j.", "d\x92autre", "https://t.co", "c\x9cur", 
             "j\x92ai", "h.", "o", "n\x92er", "n\x92a", "c.", "s.", ".keystone", "n\x92y", 
             "s\x9cur", "l.", "b.", "\x9cuvre", "jusqu\x92à", "n\x92aver", "|", "''", "n\x92est", "…"] # And many more...
tokenized_articles = []
i = 0
len_df = len(fr_df)
for article in articles:
   i += 1
   if i % 1000 == 0:
      print(f"At step: {i} of {len_df}")

   article_tokens = []
   for token in article:
      if (
         token.pos_ not in pos_to_remove # Remove defined parts of speech
         and not token.is_stop # Token is not a stopword
         and not token.is_space
      ):
         article_tokens.append(token.lemma_.lower())

   tokenized_articles.append(article_tokens)

dictionary_french_removed_pos = corpora.Dictionary(tokenized_articles)
dictionary_french_removed_pos.save(fname_or_handle=os.path.normpath("../models/dictionaries/dictionary-french-removed-pos"))
print("Exported dictionary: dictionary_french_removed_pos")

At step: 1000 of 15511
At step: 2000 of 15511
At step: 3000 of 15511
At step: 4000 of 15511
At step: 5000 of 15511
At step: 6000 of 15511
At step: 7000 of 15511
At step: 8000 of 15511
At step: 9000 of 15511
At step: 10000 of 15511
At step: 11000 of 15511
At step: 12000 of 15511
At step: 13000 of 15511
At step: 14000 of 15511
At step: 15000 of 15511
Exported dictionary: dictionary_french_removed_pos


Still contains some 
* special characters
* abbreviations "a.", "l." -> might this be the end of a sentence? 
* a lot of escaped "\x92"... this might be apostrophes -> encoding issue during preprocessing?

```py
stopwords = ["«", "-", " ", "m.", "#", "–", "\x96", "*", "c\x92est", "d\x92un", "-t", "/" ,
"qu\x92il", "webtv@20minutes.ch", "j.", "d\x92autre", "https://t.co", "c\x9cur", 
"j\x92ai", "h.", "o", "n\x92er", "n\x92a", "c.", "s.", ".keystone", "n\x92y", 
"s\x9cur", "l.", "b.", "\x9cuvre", "jusqu\x92à", "n\x92aver", "|", "''", "n\x92est", "…"] # And many more...
```

In [None]:
#dictionary_french_removed_pos.most_common(n=300)

# Dictionaries with only nouns
Inspired by https://aclanthology.org/U15-1013

In [5]:
articles = nlp_fr.pipe(fr_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# The only tag to keep is nouns: https://universaldependencies.org/u/pos/
nouns = ["NOUN", "PROPN"]
stopwords = ["«", "-", "l’", "afp", "m.", "a", "#"] # Top 300
tokenized_articles = []
i = 0
len_df = len(fr_df)
for article in articles:
   i += 1
   if i % 1000 == 0:
      print(f"At step: {i} of {len_df}")

   article_tokens = []
   for token in article:
      if (
         token.pos_ in nouns
         and not token.is_stop
         ):
         article_tokens.append(token.lemma_.lower())

   tokenized_articles.append(article_tokens)

dictionary_french_noun_only = corpora.Dictionary(tokenized_articles)
dictionary_french_noun_only.save(fname_or_handle=os.path.normpath("../models/dictionaries/dictionary-french-noun-only"))
print("Exported dictionary: dictionary_french_noun-only")

At step: 1000 of 15511
At step: 2000 of 15511
At step: 3000 of 15511
At step: 4000 of 15511
At step: 5000 of 15511
At step: 6000 of 15511
At step: 7000 of 15511
At step: 8000 of 15511
At step: 9000 of 15511
At step: 10000 of 15511
At step: 11000 of 15511
At step: 12000 of 15511
At step: 13000 of 15511
At step: 14000 of 15511
At step: 15000 of 15511


KeyboardInterrupt: 

Very few weird words in top 300:
```py
stopwords = ["«", "-", "l’", "afp", "m.", "a", "#"] # Top 300
```

In [None]:
#dictionary_french_noun_only.most_common(n=300)