# Visualisierung eines Textkorpus mit spaCy-Embeddings, Pandas und Plotly

## Import

In [None]:
import spacy
import string
import pickle

from umap import UMAP

import plotly.express as px
import pandas as pd



## Daten einlesen

In [None]:
# Load the pickled DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle('../data/reden-bundesregierung.p')

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Inspect the type of the object stored in column 'doc_object' at row label 0.
type(df.loc[0, 'doc_object'])

## Doc-Objekte in eine Liste

In [None]:
# Pull the 'doc_object' column out of the DataFrame as a plain Python list.
docs = df['doc_object'].to_list()

In [None]:
# Check the type of the first element of the extracted list.
type(docs[0])

In [None]:
# Number of entries in the document list.
len(docs)

## Erstellen der gefilterten Matrix

In [None]:
# Read the stop-word file and keep one list entry per line.
with open('../data/stopwords.txt', encoding='utf-8') as stopword_file:
    stopword_text = stopword_file.read()
stopwords = stopword_text.splitlines()

In [None]:
# Accumulators filled by the token loop: `matrix` receives the embedding
# vectors and `words` the matching lowercased token texts.
matrix = []
words = []

In [None]:
%%time

# Collect one embedding per distinct lowercased token text, skipping
# punctuation and stop words.
#
# Fix: the previous check `token.text is not token.is_punct` compared a str
# with a bool by identity, which is always true, so punctuation tokens were
# never filtered out. The boolean flag is now tested directly.
stopword_set = set(stopwords)  # constant-time membership instead of a list scan
seen_words = set(words)  # seeded from `words` so existing entries still count as seen

for doc in docs:
    for token in doc:
        if token.is_punct:
            continue
        word = token.text.lower()
        if word in stopword_set or word in seen_words:
            continue
        # Only the first occurrence of a word contributes its vector.
        matrix.append(token.tensor)
        words.append(word)
        seen_words.add(word)

# Alternative: token.lemma_.lower() could be used instead of token.text.lower() for the word list.
# Also carry along the NER label of each word, possibly as tuples that are split again later.
# Also carry along the POS tag of each word, possibly as tuples that are split again later.

In [None]:
# Number of collected embedding vectors.
len(matrix)

In [None]:
# Show the first collected embedding vector.
matrix[0]

In [None]:
# Number of collected words; the loop appends one word per embedding, so this
# should match len(matrix).
len(words)

## 3D-Scatterplot

In [None]:
# Configure a UMAP reducer with three output components and cosine distance.
reducer_3d = UMAP(
    n_components=3,
    metric='cosine',
    n_neighbors=15,
    min_dist=0.1,
)

In [None]:
%%time
# Fit the reducer on the collected embeddings and return their
# low-dimensional projection.
reduced_matrix_3d = reducer_3d.fit_transform(matrix)

In [None]:
# Shape of the reduced array (presumably one row per word and three columns).
reduced_matrix_3d.shape

In [None]:
# Wrap the reduced coordinates in a DataFrame with one named column per axis.
axis_names = ['x', 'y', 'z']
df_3d = pd.DataFrame.from_records(reduced_matrix_3d, columns=axis_names)

In [None]:
# Attach the word belonging to each coordinate row.
df_3d['word'] = words

In [None]:
# Preview the first rows of the DataFrame.
df_3d.head()

In [None]:
# Save the DataFrame as CSV; index=False omits the row index from the file.
df_3d.to_csv('../data/spacy-alle-worte-in-reden-umap-reduziert-3d-min-dist-01.csv', index=False)

In [None]:
# Hover configuration for the plot: hide every DataFrame column in the
# tooltip and use the word itself as the tooltip title.
hidden_columns = {}
for column_name in df_3d.columns:
    hidden_columns[column_name] = False

params = {
    'hover_data': hidden_columns,
    'hover_name': 'word',
}

In [None]:
# Build the 3D scatter plot; the hover settings come from `params`.
fig = px.scatter_3d(
    df_3d,
    x='x',
    y='y',
    z='z',
    opacity=0.3,
    size_max=3,
    width=1000,
    height=1000,
    **params,
)

# Display the figure, then export it as an HTML file.
fig.show()
fig.write_html('../img/spacy-alle-worte-in-reden-umap-reduziert-3d-min-dist-01.html')

## Erstellen der gefilterten Matrix plus NER und POS

In [None]:
# Read the stop-word file and keep one list entry per line.
with open('../data/stopwords.txt', encoding='utf-8') as stopword_file:
    stopword_text = stopword_file.read()
stopwords = stopword_text.splitlines()

In [None]:
# Parallel accumulators filled by the token loop: embedding vectors, lemmas,
# part-of-speech tags and named-entity labels.
matrix_2 = []
lemmas = []
pos = []
ner = []

In [None]:
%%time

# Collect one embedding per distinct lowercased lemma, together with its
# entity label and part-of-speech tag, skipping punctuation and stop words.
#
# Fix: the previous check `token.text is not token.is_punct` compared a str
# with a bool by identity, which is always true, so punctuation tokens were
# never filtered out. The boolean flag is now tested directly.
stopword_set = set(stopwords)  # constant-time membership instead of a list scan
seen_lemmas = set(lemmas)  # seeded from `lemmas` so existing entries still count as seen
kept_ner_labels = {'PER', 'ORG', 'LOC', 'MISC'}
kept_pos_tags = {'NOUN', 'VERB', 'ADJ'}

for doc in docs:
    for token in doc:
        # The stop-word filter works on the surface form, while
        # de-duplication below works on the lemma.
        if token.is_punct or token.text.lower() in stopword_set:
            continue
        lemma = token.lemma_.lower()
        if lemma in seen_lemmas:
            continue

        # Only the first occurrence of a lemma contributes its vector and labels.
        matrix_2.append(token.tensor)
        lemmas.append(lemma)
        seen_lemmas.add(lemma)

        # Labels outside the kept sets are stored as the literal string 'NaN'.
        ner.append(token.ent_type_ if token.ent_type_ in kept_ner_labels else 'NaN')
        pos.append(token.pos_ if token.pos_ in kept_pos_tags else 'NaN')


In [None]:
# Sanity check: the four parallel lists are appended to together in the loop,
# so their lengths should be equal.
len(matrix_2),len(ner), len(lemmas), len(pos)

In [None]:
# Show the entity label, lemma and POS tag of the first collected entry.
ner[0], lemmas[0], pos[0]

In [None]:
# Show the first collected embedding vector.
matrix_2[0]

## 3D-Scatterplot mit NER und POS

In [None]:
# Configure a UMAP reducer with three output components and cosine distance.
reducer_3d = UMAP(
    n_components=3,
    metric='cosine',
    n_neighbors=15,
    min_dist=0.1,
)

In [None]:
%%time
# Fit the reducer on the lemma embeddings and return their low-dimensional
# projection.
reduced_matrix_2_3d = reducer_3d.fit_transform(matrix_2)

In [None]:
# Shape of the reduced array (presumably one row per lemma and three columns).
reduced_matrix_2_3d.shape

In [None]:
# Wrap the reduced coordinates in a DataFrame with one named column per axis.
axis_names = ['x', 'y', 'z']
df_ner_pos = pd.DataFrame.from_records(reduced_matrix_2_3d, columns=axis_names)

In [None]:
# Attach the lemma belonging to each coordinate row.
df_ner_pos['lemma'] = lemmas

In [None]:
# Attach the named-entity label belonging to each coordinate row.
df_ner_pos['ner'] = ner

In [None]:
# Attach the part-of-speech tag belonging to each coordinate row.
df_ner_pos['pos'] = pos

In [None]:
# Preview the first rows of the DataFrame.
df_ner_pos.head()

In [None]:
# Save the DataFrame as CSV; index=False omits the row index from the file.
# NOTE(review): the placeholder labels are the literal string 'NaN', which
# pandas' CSV reader treats as a missing value by default when reading the
# file back.
df_ner_pos.to_csv('../data/spacy-alle-worte-in-reden-umap-reduziert-3d-min-dist-01-ner-pos.csv', index=False)

## Plot Lemmas mit NER und POS

In [None]:
# Hover configuration for the plot: hide every DataFrame column in the
# tooltip and use the lemma itself as the tooltip title.
hidden_columns = {}
for column_name in df_ner_pos.columns:
    hidden_columns[column_name] = False

params = {
    'hover_data': hidden_columns,
    'hover_name': 'lemma',
}

In [None]:
# Build the 3D scatter plot of the lemmas, coloured by part-of-speech tag;
# the hover settings come from `params`.
# A symbol='pos' encoding is currently left out.
fig = px.scatter_3d(
    df_ner_pos,
    x='x',
    y='y',
    z='z',
    color='pos',
    opacity=0.3,
    size_max=3,
    width=1000,
    height=1000,
    **params,
)

# Display the figure, then export it as an HTML file.
fig.show()
fig.write_html('../img/spacy-alle-worte-in-reden-umap-reduziert-3d-min-dist-01-pos.html')

In [None]:
# Plotly idea (not implemented yet):
# color = 'ner'
# symbol = 'pos'