# Spacy Pipeline mit pandas

## Importe

In [None]:
! pip install spacy==3.2.1

In [None]:
! python -m spacy download de_core_news_md

In [None]:
! nvcc --version

In [None]:
!!pip install -U spacy[cuda111]

In [None]:
import spacy
import pickle

import pandas as pd
import de_core_news_md

In [None]:
print(spacy.__version__)

In [None]:
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

## Laden der Daten

In [None]:
from google.colab import files

# Open the Colab upload dialog; the result is indexed by file name below.
uploaded = files.upload()

# Report name and size of every uploaded file.
message = 'User uploaded file "{name}" with length {length} bytes'
for fn, content in uploaded.items():
    print(message.format(name=fn, length=len(content)))

In [None]:
df = pd.read_xml('Bundesregierung.xml')

In [None]:
# Change the dtype of 'datum' to datetime.
# Assign the column directly: on pandas >= 2.0, `df.loc[:, 'datum'] = ...`
# writes into the existing column in place, so the dtype would stay `object`.
df['datum'] = pd.to_datetime(df['datum'])

In [None]:
df.info()

## Doc-Objekt im pandas-Dataframe

Mit Hilfe einer Funktion kann für jede Zeile die Spalte 'rohtext' in ein Doc-Objekt umgewandelt und in eine neue Spalte eingefügt werden.

Mit pickle kann der Dataframe gespeichert werden, sodass das Doc-Objekt erhalten bleibt. Allerdings ist die pickle-Datei dann mit fast 3.5 GB sehr groß. Wird der Dataframe wieder geladen, ist der Bedarf an Arbeitsspeicher jedoch nicht sehr hoch.

### Funktion zum Erstellen des Doc-Objekts 

In [None]:
def create_doc_object(text, nlp):
    '''
    Runs an already loaded language pipeline on a text.
    The pipeline is not loaded here; it has to be passed in by the caller.
    INPUT: text (string), nlp (callable pipeline, invoked as nlp(text))
    RETURN: result of nlp(text) (spacy.tokens.doc.Doc for a spaCy pipeline)
    '''
    doc = nlp(text)
    return doc

In [None]:
%%time

nlp = de_core_news_md.load()

In [None]:
%%time

# Create the Doc objects
# may run for a few minutes
# 15 min 18s on CPU via Google Colab
# 8min 13s on GPU via Google Colab

# `apply` forwards the keyword argument `nlp` to create_doc_object for every row.
df.loc[:, 'doc_object'] = df.loc[:, 'rohtext'].apply(create_doc_object, nlp=nlp)

In [None]:
%%time

# save the dataframe (including the Doc objects) as a pickle file
# the pickle file is almost 3.5 GB in size!

df.to_pickle('data/reden-bundesregierung.p')

### Checks

In [None]:
# `df_p` is not defined by any earlier cell, so load the pickled dataframe
# written above; presumably the checks are meant to inspect the reloaded data.
df_p = pd.read_pickle('data/reden-bundesregierung.p')

type(df_p.loc[0, 'doc_object'])

In [None]:
token_test = df_p.loc[0, 'doc_object']

In [None]:
for token in token_test[:10]:
    print(token.text, token.lemma_, token.pos_,token.ent_type_)

## Tokenisierung

In [None]:
def tokenize(doc):
    '''
    Collects the text of every token that is not punctuation.
    INPUT: Doc-Object (iterable of tokens with .text and .is_punct)
    RETURN: list with the token texts, in document order
    '''
    words = []
    for tok in doc:
        if tok.is_punct:
            # punctuation tokens are left out
            continue
        words.append(tok.text)
    return words

In [None]:
%%time

# tokenize every Doc object; the function is passed to `apply` directly
df.loc[:, 'tokens'] = df.loc[:, 'doc_object'].apply(tokenize)

In [None]:
# number of tokens per row
df.loc[:, 'ntokens'] = df.loc[:, 'tokens'].apply(len)

In [None]:
df.loc[:, 'ntokens'].describe()

## Lemmatisierung

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_stopwords(path='../data/stopwords.txt'):
    '''
    Reads the stop word file and caches the result, so the file is opened
    only once per path instead of once per lemmatized document.
    INPUT: path to a UTF-8 text file with one stop word per line
    RETURN: frozenset with lowercased stop words
    '''
    # stop words taken from http://members.unine.ch/jacques.savoy/clef/germanST.txt
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
    return frozenset(line.strip().lower() for line in lines if line.strip())


def lemmatize(doc, stopwords=None):
    '''
    Lemmatizes text using Doc-Object
    INPUT: Doc-Object;
           stopwords: optional iterable of stop words. If omitted, the list in
           '../data/stopwords.txt' is used (read once, then cached).
    RETURN: list with lowercase lemmata with stopwords and punctuation removed
    '''
    if stopwords is None:
        stopwords = _load_stopwords()
    else:
        # normalise a caller-supplied list the same way as the file-based one
        stopwords = frozenset(word.lower() for word in stopwords)

    # Compare the lowercased token text, so capitalised stop words
    # (e.g. at the start of a sentence) are removed as well.
    return [token.lemma_.lower() for token in doc
            if token.is_alpha
            and not token.is_punct
            and token.text.lower() not in stopwords]

In [None]:
%%time

# lemmatize every Doc object; the function is passed to `apply` directly
df.loc[:, 'lemmata'] = df.loc[:, 'doc_object'].apply(lemmatize)

In [None]:
type(df.loc[0, 'lemmata'])

## NER

In [None]:
def extract_named_entities(doc, entity='PER'):
    '''
    Collects the text of all entries in doc.ents that carry the given label.
    INPUT: Doc-Object, entity label to keep (default 'PER')
    RETURN: list with the texts of the matching entities
    '''
    matches = []
    for span in doc.ents:
        if span.label_ == entity:
            matches.append(span.text)
    return matches

In [None]:
%%time

entities = ['PER', 'ORG', 'LOC', 'MISC']

# one new column per entity label, named after the label;
# `apply` forwards the keyword argument to extract_named_entities
for entity in entities:
    df.loc[:, entity] = df.loc[:, 'doc_object'].apply(extract_named_entities, entity=entity)

In [None]:
df.head(3).T

In [None]:
check_dtype = df.loc[0, 'tokens']

In [None]:
type(check_dtype)

In [None]:
type(check_dtype[0])

## POS

In [None]:
def extract_pos(doc, pos_tag='NOUN'):
    '''
    Collects the lowercased text of all tokens with a given Part-of-Speech-Tag.
    INPUT: Doc-Object, Part-of-Speech-Tag to keep (default 'NOUN')
    RETURN: list with the lowercased texts of the matching tokens
    '''
    selected = []
    for tok in doc:
        if tok.pos_ == pos_tag:
            selected.append(tok.text.lower())
    return selected

In [None]:
%%time

pos_tags = ['NOUN', 'VERB', 'ADJ']

# one new column per Part-of-Speech-Tag, named after the tag;
# `apply` forwards the keyword argument to extract_pos
for pos_tag in pos_tags:
    df.loc[:, pos_tag] = df.loc[:, 'doc_object'].apply(extract_pos, pos_tag=pos_tag)

In [None]:
df.head(3).T

## Speichern der Resultate

### als csv

In [None]:
df.to_csv('../data/reden-bundesregierung-preprocessed-with-doc-object.csv', index=False)

In [None]:
df_ohne_doc = df.drop(['doc_object'], axis=1)

In [None]:
df_ohne_doc.to_csv('../data/reden-bundesregierung-preprocessed.csv', index=False)

### als json

In [None]:
df_ohne_doc.to_json('../data/reden-bundesregierung-preprocessed.json')

### als pickle

In [None]:
df_ohne_doc.to_pickle('../data/reden-bundesregierung-preprocessed.p')

## checks

### from csv

In [None]:
df_1 = pd.read_csv('../data/reden-bundesregierung-preprocessed-with-doc-object.csv')

In [None]:
df_1.head(3).T

In [None]:
type(df_1.loc[0, 'doc_object'])

In [None]:
type(df_1.loc[0, 'tokens'])

In [None]:
df_1b = pd.read_csv('../data/reden-bundesregierung-preprocessed.csv')

In [None]:
# inspect the dataframe that was just loaded (df_1b); df_1 was already checked above
type(df_1b.loc[0, 'tokens'])

In [None]:
ORG_Test = df_1b.loc[0, 'LOC']

In [None]:
ORG_Test[0]

In [None]:
type(ORG_Test)

In [None]:
type(ORG_Test[0])

### from pickle

In [None]:
df_2 = pd.read_pickle('../data/reden-bundesregierung-preprocessed.p')

In [None]:
type(df_2.loc[0, 'tokens'])

In [None]:
df_2.head(3).T

In [None]:
ORG_Test = df_2.loc[0, 'LOC']

In [None]:
ORG_Test[0]

In [None]:
type(ORG_Test)

In [None]:
type(ORG_Test[0])