# 💻 Einführung in Topic Modeling mit Python - Step 1: Textdaten vorbereiten

## 1. Sortiertes Datenkorpus laden

In [None]:
import pandas as pd

# read the sorted speeches CSV into a dataframe object
corpus = pd.read_csv("../daten/speeches-bundesregierung_sorted.csv", encoding="utf-8")
# shape is (rows, columns): one row per speech, one column per attribute
print(f"Der Datensatz enthält {corpus.shape[0]} Reden mit {corpus.shape[1]} Attributen.")

In [None]:
# preview the first rows of the loaded corpus
corpus.head()

In [None]:
# pull the "text" column out of the dataframe as a plain python list
# (one string per speech)
list_of_speeches = corpus["text"].tolist()
# show the first 500 characters of the third-to-last speech
print(list_of_speeches[-3][:500])

In [None]:
# compute token count
# str.split() without an argument splits on any run of whitespace (spaces,
# tabs, newlines) and never yields empty strings, so the count matches the
# "whitespace-based" tokenization announced in the message below
raw_token_count = sum(len(speech.split()) for speech in list_of_speeches)

print(f"Das einfach tokenisierte Korpus auf Basis von Whitespaces enthält {raw_token_count} Token.")

## 2. Überarbeitung der Textdaten

In [None]:
import spacy
from gensim.utils import tokenize, simple_preprocess

# optional: removing stopwords - for example via nltk
import nltk
nltk.download("stopwords")

# import NLTK stopwords
from nltk.corpus import stopwords

### Tokenisierung

In [None]:
def sent_to_words(speeches):
    """Tokenize every speech and return one list of word tokens per speech."""
    # list() materializes the tokens yielded by gensim's tokenize();
    # by default only alphabetic tokens come back (no digits, no punctuation)
    return [list(tokenize(speech)) for speech in speeches]

# tokenize every speech of the corpus
speeches_tokenized = sent_to_words(list_of_speeches) 

# inspect the tokens of the third-to-last speech
print(speeches_tokenized[-3])

### Lemmatisierung und POS-Tagging

In [None]:
# download the spaCy language model (needed in Binder); the small and large
# models are optional alternatives and stay commented out by default

#!python -m spacy download de_core_news_sm # small language model
#!python -m spacy download de_core_news_lg # large language model

!python -m spacy download de_core_news_md # medium language model

In [None]:
# load the medium spaCy language model; the parser and NER components are
# disabled because the lemmatization step only reads lemmas and POS tags
import de_core_news_md
nlp = de_core_news_md.load(disable=["parser", "ner"])

In [None]:
# lemmatize and filter by pos-tag
# note: this may take some time in Binder

def lemmatization(texts, allowed_postags=("NOUN", "PROPN", "ADJ", "VERB")):
    """Lemmatize tokenized texts, keeping only tokens with an allowed POS tag.

    Relies on the module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of tokenized texts, each an iterable of token strings.
        allowed_postags: Coarse-grained POS tags (compared against spaCy's
            ``token.pos_``) whose tokens are kept. The default is an immutable
            tuple so it cannot be modified across calls.

    Returns:
        A list with one list of lemmas per input text, in input order.
    """
    # a set gives constant-time membership checks for every token
    allowed = frozenset(allowed_postags)
    # spaCy expects a string per document, so re-join the tokens of each text
    joined_texts = (" ".join(text) for text in texts)
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # faster than calling nlp() once per text and preserves the input order
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_texts)
    ]

# lemmatize all tokenized speeches, keeping nouns, adjectives, verbs and proper nouns
data_lemmatized = lemmatization(speeches_tokenized, allowed_postags=['NOUN', 'ADJ', 'VERB', "PROPN"])

# inspect the lemmas of the third-to-last speech
print(data_lemmatized[-3])

### Stoppwörter entfernen (optional) und normalisieren

In [None]:
# get the German stopword list from NLTK
stop_words = stopwords.words("german")

# optional: extend the NLTK stopword list with custom entries
#stop_words.extend([""])

In [None]:
# stopword-removal and normalization
def remove_stopwords(texts):
    """Remove stopwords from tokenized texts and normalize the remaining tokens.

    Each text is run through gensim's ``simple_preprocess`` (lowercasing,
    tokens shorter than 2 or longer than 100 characters are dropped) and every
    token found in the module-level ``stop_words`` list is filtered out.

    Args:
        texts: Iterable of tokenized texts (each an iterable of tokens). A
            plain string is also accepted and preprocessed as-is.

    Returns:
        A list with one list of normalized, stopword-free tokens per text.
    """
    # build the lookup set once per call: membership tests against a set are
    # O(1), against the stop_words list they are O(n) for every single word
    stop_set = set(stop_words)

    preprocessed_texts = []
    for text in texts:
        # simple_preprocess expects one string; join the tokens explicitly
        # instead of feeding it the repr of the list (brackets, quotes, escapes)
        raw_text = text if isinstance(text, str) else " ".join(map(str, text))
        preprocessed_texts.append(
            [
                word
                for word in simple_preprocess(raw_text, min_len=2, max_len=100)
                if word not in stop_set
            ]
        )

    return preprocessed_texts

# filter stopwords from the lemmatized speeches and normalize the tokens
data_words_nostops = remove_stopwords(data_lemmatized)

# inspect the remaining tokens of the third-to-last speech
print(data_words_nostops[-3])

#### 📝 **Jetzt:** Aufgabe - Tokencount nach dem Preprocessing 
Zählen Sie in Anlehnung an den obigen Codeblock die Anzahl der Token nach dem Preprocessing. Wie hat sich die Datengrundlage verändert?

⏳ 5 Minuten

In [None]:
# your code

### Vorverarbeitete Textdaten in den Dataframe speichern

In [None]:
# turn each token list back into one whitespace-separated string per speech
speeches_string = [" ".join(speech) for speech in data_words_nostops]

# store the normalized texts as a new dataframe column and display it
corpus["preprocessed_text"] = speeches_string
corpus["preprocessed_text"]

In [None]:
# save the corpus, including the preprocessed_text column, for topic modeling;
# index=False leaves the dataframe index out of the CSV file
corpus.to_csv("../daten/speeches-bundesregierung_preprocessed.csv", index=False, encoding="utf-8")