In [None]:
%%script echo skipping
import spacy
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pipeline.service import FileService

# First experiment: tokenize the German articles with spaCy, then filter them
# against a combined stop word list (NLTK + external file + custom entries).
nlp_de = spacy.load('de_core_news_sm')
# NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it is
# unlikely to change German tokens much; spaCy's token.lemma_ may be the
# better fit here — confirm.
lemmatizer = WordNetLemmatizer()

cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned')

texts = list(cleaned_df[cleaned_df['language'] == 'de']['content'])
custom_german_stopwords: set = {
    " ", "\x96", "the", "to", "of", "20", "minuten",
}

# Test if https://github.com/solariz/german_stopwords/blob/master/german_stopwords_full.txt helps
# Each line is stripped because readlines() keeps the trailing newline, which
# would prevent every entry of the file from ever matching a token.
# The encoding is pinned so umlauts decode the same on every platform
# (assumes the file is UTF-8 — TODO confirm).
with open(os.path.normpath("./german_stopwords_full.txt"), "r", encoding="utf-8") as f:
    german_stopwords_full = [line.strip() for line in f if line.strip()]


stop_words = set(stopwords.words("german")) | set(german_stopwords_full) | custom_german_stopwords


processed_texts = []
for idx, text in enumerate(texts):
    if idx % 100 == 0:
        print(f"At step: {idx} of {len(texts)}")
    # Lowercase and tokenize with the German pipeline loaded at the top of
    # this cell.
    doc = nlp_de(str(text).lower())
    tokens = [token.text for token in doc if not token.is_stop and not token.is_punct]

    # Lemmatize words and remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    processed_texts.append(lemmatized_tokens)

processed_texts

Maybe we can optimize what we actually select... but how?

There are some problematic characters

In [None]:
%%script echo skipping
import re

# Characters that are deleted outright vs. characters replaced by a space.
strip_chars = "".join(["«", "»"])
replace_empty = "".join(["-", "/", "|", "#", ".", "…"])

# Build the character classes from the constants above so the lists and the
# regexes cannot drift apart; re.escape keeps "-", "|", "#" and "." literal
# inside the brackets. Compiling once avoids re-parsing per article.
strip_pattern = re.compile(f"[{re.escape(strip_chars)}]")
space_pattern = re.compile(f"[{re.escape(replace_empty)}]")

de_df = cleaned_df[cleaned_df['language'] == 'de']
# Same order as before: first drop the quote marks, then blank out separators.
article_list = [
    space_pattern.sub(' ', strip_pattern.sub('', article))
    for article in de_df['content']
]


In [None]:
%%script echo skipping
from pipeline.service import FileService
import spacy
# Small German spaCy pipeline used by the exploration cells below.
nlp_de = spacy.load('de_core_news_sm')

# Read the cleaned corpus and narrow it down to the German-language rows.
cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned')
is_german = cleaned_df['language'] == 'de'
de_df = cleaned_df[is_german]

# Plain Python list of the article bodies.
article_list = list(de_df['content'])

In [None]:
%%script echo skipping
# Start again from the untouched article bodies.
article_list = list(de_df['content'])
# Character clean-up is switched off for this run:
#article_list =  [re.sub(r'[«»]', '', article) for article in article_list]
#article_list =  [re.sub(r'[-/|#.…]', ' ', article) for article in article_list]


# Pipeline components skipped while streaming the articles through spaCy.
skipped_components = ["tagger", "ner", "textcat"]
# Documents are produced as the result is iterated, using 4 worker processes.
articles = nlp_de.pipe(
    article_list,
    disable=skipped_components,
    n_process=4,
)

In [None]:
%%script echo skipping
# Preview: print the first three documents together with every token that is
# neither alphabetic nor punctuation, paired with its lowercased lemma.
i = 0
for i, article in enumerate(articles, start=1):
    if i > 3:
        # The fourth document has already been taken from `articles` here.
        break
    print(article)
    alphas = []
    for token in article:
        if token.is_alpha or token.is_punct:
            continue
        alphas.append((token, token.lemma_.lower()))
    print(alphas)

In [None]:
%%script echo skipping
# Preview: print three documents together with every token that is neither
# punctuation nor whitespace, paired with its lowercased lemma.
# NOTE(review): `articles` is not recreated in this cell; if it is still the
# stream from an earlier cell, iteration resumes where that cell stopped
# rather than at the first article — confirm this is intended.
i = 0
for i, article in enumerate(articles, start=1):
    if i > 3:
        # The fourth document has already been taken from `articles` here.
        break
    print(article)
    alphas = []
    for token in article:
        if token.is_punct or token.is_space:
            continue
        alphas.append((token, token.lemma_.lower()))
    print(alphas)

Manual selection of: https://universaldependencies.org/u/pos/

# Dictionaries with some POS removed (German)
Dataset 1 - Throw away those:
* "ADP", adposition
* "ADV", adverb
* "AUX", auxiliary
* "CCONJ", coordinating conjunction
* "DET", determiner
* "INTJ", interjection
* "NUM", numeral
* "PART", particle
* "PRON", pronoun
* "PUNCT", punctuation
* "SCONJ", subordinating conjunction
* "SYM", symbol

I think it mostly makes sense to keep these double words?

In [None]:
import spacy
import os
from gensim import corpora
from pipeline.service import FileService

# Junk tokens that should be treated as stop words in addition to spaCy's
# built-in German list.
stopwords = {"#", "*", "--"}

cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned')
de_df = cleaned_df[cleaned_df['language'] == 'de']

nlp_de = spacy.load('de_core_news_sm')
nlp_de.Defaults.stop_words |= stopwords
# Extending the defaults after loading may not reach lexemes that already
# exist in the vocabulary, so the is_stop flag is also set explicitly on each
# custom entry.
for word in stopwords:
    nlp_de.vocab[word].is_stop = True


In [None]:
# Build and export a gensim dictionary of the German articles after dropping
# the parts of speech listed below, stop words and whitespace tokens.
# NOTE(review): "tagger" is disabled although token.pos_ is used below; this
# assumes pos_ is assigned by another pipeline component — confirm for the
# installed spaCy/model version.
articles = nlp_de.pipe(de_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# Tags to be removed: https://universaldependencies.org/u/pos/
pos_to_remove= ["ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NUM", "PART", "PRON", "PUNCT", "SCONJ", "SYM", ]

tokenized_articles = []
len_df = len(de_df)  # computed once for the progress message
for i, article in enumerate(articles, start=1):
    if i % 1000 == 0:
        print(f"At step: {i} of {len_df}")

    # Keep the lowercased lemma of every token that survives all three filters.
    tokenized_articles.append([
        token.lemma_.lower()
        for token in article
        if token.pos_ not in pos_to_remove  # Remove defined parts of speech
        and not token.is_stop  # Token is not a stopword
        and not token.is_space
    ])

dictionary_german_removed_pos = corpora.Dictionary(tokenized_articles)
dictionary_path = os.path.normpath("./models/dictionaries/dictionary-german-removed-pos")
# Create the target folder first so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)
dictionary_german_removed_pos.save(fname_or_handle=dictionary_path)
print("Exported dictionary: dictionary_german_removed_pos")

In [None]:
#dictionary_german_removed_pos.most_common(n=300)

# Dictionaries with only nouns (German)
Inspired by https://aclanthology.org/U15-1013

In [None]:
# Build and export a gensim dictionary of the German articles that contains
# only noun and proper-noun lemmas which are not stop words.
# NOTE(review): "tagger" is disabled although token.pos_ is used below; this
# assumes pos_ is assigned by another pipeline component — confirm for the
# installed spaCy/model version.
articles = nlp_de.pipe(de_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# The only tag to keep is nouns: https://universaldependencies.org/u/pos/
nouns = ["NOUN", "PROPN"]

tokenized_articles = []
len_df = len(de_df)  # computed once for the progress message
for i, article in enumerate(articles, start=1):
    if i % 1000 == 0:
        print(f"At step: {i} of {len_df}")

    # Keep the lowercased lemma of every non-stop-word noun.
    tokenized_articles.append([
        token.lemma_.lower()
        for token in article
        if token.pos_ in nouns and not token.is_stop
    ])

dictionary_german_noun_only = corpora.Dictionary(tokenized_articles)
dictionary_path = os.path.normpath("./models/dictionaries/dictionary-german-noun-only")
# Create the target folder first so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)
dictionary_german_noun_only.save(fname_or_handle=dictionary_path)
print("Exported dictionary: dictionary_german_noun-only")

Very few weird words in top 300:

```py
stopwords = ["\x96", "the", "#", "keystone", "*", "--", "a"]
```

In [None]:
#dictionary_german_noun_only.most_common(n=300)

# Dictionaries with some POS removed (French)

In [None]:
import spacy
import os
from gensim import corpora
from pipeline.service import FileService


# Read the cleaned corpus and narrow it down to the French-language rows.
cleaned_df = FileService.read_parquet_to_df(file_name='articles_cleaned')
is_french = cleaned_df['language'] == 'fr'
fr_df = cleaned_df[is_french]

# Small French spaCy pipeline used by the cells below.
nlp_fr = spacy.load('fr_core_news_sm')


In [None]:
# Build and export a gensim dictionary of the French articles after dropping
# the parts of speech listed below, stop words and whitespace tokens.
# NOTE(review): "tagger" is disabled although token.pos_ is used below; this
# assumes pos_ is assigned by another pipeline component — confirm for the
# installed spaCy/model version.
articles = nlp_fr.pipe(fr_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# Tags to be removed: https://universaldependencies.org/u/pos/
pos_to_remove= ["ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NUM", "PART", "PRON", "PUNCT", "SCONJ", "SYM", ]
# NOTE(review): this list is only recorded here; the filter below never
# consults it, so these entries still end up in the dictionary. Confirm
# whether they should be applied.
stopwords = ["«", "-", " ", "m.", "#", "–", "\x96", "*", "c\x92est", "d\x92un", "-t", "/" ,
             "qu\x92il", "webtv@20minutes.ch", "j.", "d\x92autre", "https://t.co", "c\x9cur",
             "j\x92ai", "h.", "o", "n\x92er", "n\x92a", "c.", "s.", ".keystone", "n\x92y",
             "s\x9cur", "l.", "b.", "\x9cuvre", "jusqu\x92à", "n\x92aver", "|", "''", "n\x92est", "…"] # And many more...
tokenized_articles = []
len_df = len(fr_df)
for i, article in enumerate(articles, start=1):
    if i % 1000 == 0:
        print(f"At step: {i} of {len_df}")

    # Keep the lowercased lemma of every token that survives all three filters.
    tokenized_articles.append([
        token.lemma_.lower()
        for token in article
        if token.pos_ not in pos_to_remove  # Remove defined parts of speech
        and not token.is_stop  # Token is not a stopword
        and not token.is_space
    ])

dictionary_french_removed_pos = corpora.Dictionary(tokenized_articles)
dictionary_path = os.path.normpath("./models/dictionaries/dictionary-french-removed-pos")
# Create the target folder first so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)
dictionary_french_removed_pos.save(fname_or_handle=dictionary_path)
print("Exported dictionary: dictionary_french_removed_pos")

Still contains some 
* special characters
* abbreviations "a.", "l." -> might this be the end of a sentence? 
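* a lot of escaped "\x92"... these are probably apostrophes: in Windows-1252, byte 0x92 is the right single quote (’), 0x9c is "œ" and 0x96 is an en dash (–) -> likely an encoding issue during preprocessing (cp1252 text decoded as Latin-1)?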

```py
stopwords = ["«", "-", " ", "m.", "#", "–", "\x96", "*", "c\x92est", "d\x92un", "-t", "/" ,
"qu\x92il", "webtv@20minutes.ch", "j.", "d\x92autre", "https://t.co", "c\x9cur", 
"j\x92ai", "h.", "o", "n\x92er", "n\x92a", "c.", "s.", ".keystone", "n\x92y", 
"s\x9cur", "l.", "b.", "\x9cuvre", "jusqu\x92à", "n\x92aver", "|", "''", "n\x92est", "…"] # And many more...
```

In [None]:
#dictionary_french_removed_pos.most_common(n=300)

# Dictionaries with only nouns (French)
Inspired by https://aclanthology.org/U15-1013

In [None]:
# Build and export a gensim dictionary of the French articles that contains
# only noun and proper-noun lemmas which are not stop words.
# NOTE(review): "tagger" is disabled although token.pos_ is used below; this
# assumes pos_ is assigned by another pipeline component — confirm for the
# installed spaCy/model version.
articles = nlp_fr.pipe(fr_df['content'], disable=["tagger", "ner", "textcat"], n_process=4)

# The only tag to keep is nouns: https://universaldependencies.org/u/pos/
nouns = ["NOUN", "PROPN"]
# NOTE(review): this list is only recorded here; the filter below never
# consults it, so these entries still end up in the dictionary. Confirm
# whether they should be applied.
stopwords = ["«", "-", "l’", "afp", "m.", "a", "#"] # Top 300
tokenized_articles = []
len_df = len(fr_df)
for i, article in enumerate(articles, start=1):
    if i % 1000 == 0:
        print(f"At step: {i} of {len_df}")

    # Keep the lowercased lemma of every non-stop-word noun.
    tokenized_articles.append([
        token.lemma_.lower()
        for token in article
        if token.pos_ in nouns and not token.is_stop
    ])

dictionary_french_noun_only = corpora.Dictionary(tokenized_articles)
dictionary_path = os.path.normpath("./models/dictionaries/dictionary-french-noun-only")
# Create the target folder first so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)
dictionary_french_noun_only.save(fname_or_handle=dictionary_path)
print("Exported dictionary: dictionary_french_noun-only")

Very few weird words in top 300:
```py
stopwords = ["«", "-", "l’", "afp", "m.", "a", "#"] # Top 300
```

In [None]:
#dictionary_french_noun_only.most_common(n=300)