# Part-of-Speech (POS) Tagging

## Preparation

In [None]:
# import statements

from functions.basic import read_file
from functions.processing import split_sents
from functions.pos import get_pos_tags, count_tag_freqs, find_ngram_index, find_ngrams, ngram_count, get_n_ngrams, get_pos_diversity, list_individual_grams, grams_matrix_prep

from functions.vis import write_vis, vis_subplots, calculate_weights, pos_heatmap
import plotly.graph_objects as go
import plotly.express as px

import pandas as pd

In [None]:
# Load the pickled inputs with read_file (paths use Windows-style separators).

# complete datasets
j_all = read_file("data\\0_extraction_data\\j_all.pkl")
k_all = read_file("data\\0_extraction_data\\k_all.pkl")

# cited / not-cited subsets from the 1_text-stats_data folder
j_cited, j_not_cited, k_cited, k_not_cited = (
    read_file(path)
    for path in (
        "data\\1_text-stats_data\\j_cited_1.pkl",
        "data\\1_text-stats_data\\j_not_cited_1.pkl",
        "data\\1_text-stats_data\\k_cited_1.pkl",
        "data\\1_text-stats_data\\k_not_cited_1.pkl",
    )
)

# keep each subset's index so that derived dataframes can be merged later on
j_cited_index, j_not_cited_index, k_cited_index, k_not_cited_index = (
    subset.index for subset in (j_cited, j_not_cited, k_cited, k_not_cited)
)

# overall length of every subset: the sum of its "char_len" column
j_c_length, k_c_length, j_nc_length, k_nc_length = (
    subset["char_len"].sum()
    for subset in (j_cited, k_cited, j_not_cited, k_not_cited)
)

In [None]:
# text processing
# this might take a while (should be less than 1 min.)

# For every subset: the "text" column as a list, the "frequency" column as a
# list, and the result of split_sents() applied to the texts.

# K cited
k_c_text = list(k_cited["text"])
k_c_citnum = list(k_cited["frequency"])
k_c_sents = split_sents(k_c_text)

# K not cited
k_nc_text = list(k_not_cited["text"])
k_nc_citnum = list(k_not_cited["frequency"])
k_nc_sents = split_sents(k_nc_text)

# J cited
j_c_text = list(j_cited["text"])
j_c_citnum = list(j_cited["frequency"])
j_c_sents = split_sents(j_c_text)

# J not cited
j_nc_text = list(j_not_cited["text"])
j_nc_citnum = list(j_not_cited["frequency"])
j_nc_sents = split_sents(j_nc_text)

## (Relative) Frequencies

In [None]:
# POS-tag every subset with get_pos_tags(); each call returns two values,
# bound here as <subset>_tagged_dict and <subset>_tagged_list.

(
    (j_c_tagged_dict, j_c_tagged_list),
    (j_nc_tagged_dict, j_nc_tagged_list),
    (k_c_tagged_dict, k_c_tagged_list),
    (k_nc_tagged_dict, k_nc_tagged_list),
) = (
    get_pos_tags(sents)
    for sents in (j_c_sents, j_nc_sents, k_c_sents, k_nc_sents)
)

In [None]:
# Relative tag frequencies per subset via count_tag_freqs(); each call returns
# two values, bound here as <subset>_tag_freqs and <subset>_tags_used.
(
    (j_c_tag_freqs, j_c_tags_used),
    (j_nc_tag_freqs, j_nc_tags_used),
    (k_c_tag_freqs, k_c_tags_used),
    (k_nc_tag_freqs, k_nc_tags_used),
) = (
    count_tag_freqs(tagged)
    for tagged in (j_c_tagged_list, j_nc_tagged_list, k_c_tagged_list, k_nc_tagged_list)
)

In [None]:
# visualize the tag frequencies of one subset (here: K not cited) as a heatmap;
# the frequencies themselves were already computed with count_tag_freqs above

heatmap = pos_heatmap(k_nc_tag_freqs)
# uncomment to export the figure:
#write_vis(["pdf", "png", "html"],"k_nc_pos-heatmap_not-weighted", heatmap)
# the bare name as last expression makes the notebook display the figure
heatmap

In [None]:
# for weighted values, the function calculate_weights must be applied as well

weights = calculate_weights(k_c_tag_freqs, k_c_citnum)

# addition [26.02.]: the steps below could be simplified using the function apply_scaling() from summary.py

# Global minimum / maximum over all cells, used for min-max normalization.
# These are plain reductions on purpose: sorting the row-wise extremes and
# taking `[0]` is a *label* lookup on an integer-indexed Series, so it would
# return the entry labelled 0 instead of the smallest / largest value
# (and positional `[0]` on other index types is deprecated in pandas).
min_norm = weights.min(axis=1).min()
max_norm = weights.max(axis=1).max()

# rescale every weight into [0, 1] before plotting
heatmap = pos_heatmap((weights - min_norm) / (max_norm - min_norm))
# uncomment to export the figure:
#write_vis(["pdf", "png", "html"],"k_c_pos-heatmap_weighted", heatmap)
heatmap

## n-grams

In [None]:
# example for counting individual ngram frequencies, e.g. 3-grams:
# find_ngrams() is called on the tagged J-cited passages with n = 3 and its
# result is passed on to ngram_count()

ngrams_found = find_ngrams(j_c_tagged_list, 3)
ngram_counter = ngram_count(ngrams_found)
# the bare name as last expression makes the notebook display the counter
ngram_counter

In [None]:
# get topn ngrams for a set number n of grams, e.g. get top 10 2- to 5-grams
# (here for the J-cited subset)
grams, gram_names = get_n_ngrams(n=list(range(2, 6)), topn=10, pos_tagged=j_c_tagged_list)

# visualize the number of occurrences, one subplot per n on a 2 x 2 grid
subplots = vis_subplots(
    gram_names,
    grams,
    rowcount=2,
    colcount=2,
    showlabels=True,
    rel_yaxis=False,
)
# uncomment to export the figure
# NOTE(review): the file name below says "k_c" although the data plotted here is j_c
#write_vis(["pdf", "png","html"],"k_c_subplots_2-5-grams", subplots)
subplots

In [None]:
# find overlapping ngrams

# first, collect the top 25 2- to 5-grams of every subset

k_c_grams, k_c_gram_names = get_n_ngrams(n=list(range(2, 6)), topn=25, pos_tagged=k_c_tagged_list)
k_nc_grams, k_nc_gram_names = get_n_ngrams(n=list(range(2, 6)), topn=25, pos_tagged=k_nc_tagged_list)
j_c_grams, j_c_gram_names = get_n_ngrams(n=list(range(2, 6)), topn=25, pos_tagged=j_c_tagged_list)
j_nc_grams, j_nc_gram_names = get_n_ngrams(n=list(range(2, 6)), topn=25, pos_tagged=j_nc_tagged_list)

# then, create a list of all individual ngrams that occur in any of our ngram lists
all_grams = list_individual_grams([j_c_grams, j_nc_grams, k_c_grams, k_nc_grams])

# use grams_matrix_prep to look up the ngrams of each dataset in the all_grams list
# and return "binary" or "count" values for their occurrences

j_c_grams_matrix = grams_matrix_prep(j_c_grams, all_grams, "count")
j_nc_grams_matrix = grams_matrix_prep(j_nc_grams, all_grams, "count")
k_c_grams_matrix = grams_matrix_prep(k_c_grams, all_grams, "count")
k_nc_grams_matrix = grams_matrix_prep(k_nc_grams, all_grams, "count")

# Keep every column label next to the vector it describes. Previously the rows
# were stacked as J(C), J(NC), K(C), K(NC) but labelled J(C), K(C), J(NC), K(NC),
# which swapped the labels of the two middle columns.
subset_vectors = {
    "J(C)": j_c_grams_matrix,
    "K(C)": k_c_grams_matrix,
    "J(NC)": j_nc_grams_matrix,
    "K(NC)": k_nc_grams_matrix,
}
matrix = list(subset_vectors.values())

# create a dataframe for easier inspection and visualization:
# one row per ngram, one column per subset
df = pd.DataFrame.from_records(matrix)
df = df.transpose()
df.columns = list(subset_vectors)
# "sum" must be added before the non-numeric "ngram" column
df["sum"] = df.sum(axis=1)
df["ngram"] = all_grams

In [None]:
# visualize the results
# use option "binary" in grams_matrix_prep(): the colour scale below has a
# single hard step at 0.5 and the colour bar only shows the ticks 0 and 1

# the four subset columns, sliced once and reused for x, y and z
subset_values = df.loc[:, "J(C)":"K(NC)"]
two_tone_scale = [[0.0, "#C16152"], [0.5, "#C16152"], [0.5, "#509F98"], [1, "#509F98"]]

fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x=subset_values.columns,
        y=subset_values.index,
        z=subset_values,
        colorscale=two_tone_scale,
        colorbar={"tickvals": [0, 1], "ticktext": [0, 1], "lenmode": "pixels", "len": 100},
    )
)
fig.update_layout(template="plotly_dark", font={"family": "CMU Serif", "size": 20})
fig.show()

# uncomment to export the figure:
#write_vis(["pdf", "png","html"],"all_pos_2-5-grams_comparison", fig)

In [None]:
# divide values by the length of the subsets to make them more comparable

# order the ngrams by their "sum" column first, largest sum on top
df = df.sort_values(by="sum", ascending=False)

def _percent_of(part, whole):
    """Return ``part / whole`` as a percentage, rounded to two decimals."""
    return round((part / whole) * 100, 2)


def devide_byLength(df, cols, lengths):
    """Add a length-normalised ``<col>_norm`` column for every given column.

    Args:
        df: DataFrame holding the columns named in ``cols``. It is modified
            in place.
        cols: Names of the columns to normalise.
        lengths: Divisors, matched to ``cols`` by position. Pairs are formed
            with ``zip``, so surplus entries on either side are ignored.

    Returns:
        The same ``df`` object, with one ``<col>_norm`` column added per pair;
        each value is ``value / length * 100`` rounded to two decimals.
    """
    for name, total in zip(cols, lengths):
        df[name + "_norm"] = [_percent_of(count, total) for count in df[name]]
    return df

# add a "<subset>_norm" column per subset; each subset's counts are divided by
# that subset's summed char_len (column names and lengths are matched by position)
df = devide_byLength(df, ["J(C)", "K(C)", "J(NC)", "K(NC)"], [j_c_length, k_c_length, j_nc_length, k_nc_length])

# top 10 ngrams (df is sorted by "sum" in descending order at this point)
df[:10]

In [None]:
# keep only ngrams whose K(C), J(NC) and K(NC) values are all 0, i.e. ngrams
# that were not counted for any subset other than J(C); show the first ten
absent_elsewhere = df[["K(C)", "J(NC)", "K(NC)"]].eq(0).all(axis=1)
df.loc[absent_elsewhere].head(10)

In [None]:
# to find the index of passages that contain specific POS n-grams, use find_ngram_index()
# the n-gram is passed as one string of comma-separated tags

idx_list = find_ngram_index(j_c_tagged_list, "PUNCT, PRON, VERB, PRON")
print(idx_list)

## Diversity

In [None]:
# calculate the diversity (Shannon entropy) of POS tags for each passage, using get_pos_diversity

# the value range lies between 0 and max_div
#max_div = -numpy.log2(1/len(tag_freqs.index))

j_c_div_scores, j_nc_div_scores, k_c_div_scores, k_nc_div_scores = (
    get_pos_diversity(tag_freqs)
    for tag_freqs in (j_c_tag_freqs, j_nc_tag_freqs, k_c_tag_freqs, k_nc_tag_freqs)
)

In [None]:
# append div_scores to the previous dataframes again

# give every score frame the index of the subset it was computed from, so that
# the column-wise concat below lines the rows up
for scores, subset_index in (
    (j_c_div_scores, j_cited_index),
    (j_nc_div_scores, j_not_cited_index),
    (k_c_div_scores, k_cited_index),
    (k_nc_div_scores, k_not_cited_index),
):
    scores.index = subset_index

# attach the score columns to each subset (J cited, J not cited, K cited, K not cited)
j_cited_2 = pd.concat([j_cited, j_c_div_scores], axis=1)
j_not_cited_2 = pd.concat([j_not_cited, j_nc_div_scores], axis=1)
k_cited_2 = pd.concat([k_cited, k_c_div_scores], axis=1)
k_not_cited_2 = pd.concat([k_not_cited, k_nc_div_scores], axis=1)

In [None]:
# save new dataframes again for the next step

new_dfs = [j_cited_2, j_not_cited_2, k_cited_2, k_not_cited_2]
paths = ["data\\2_pos_data\\j_cited_2.pkl", "data\\2_pos_data\\j_not_cited_2.pkl", "data\\2_pos_data\\k_cited_2.pkl", "data\\2_pos_data\\k_not_cited_2.pkl"]

# The loop variable is deliberately not called `df`: that name holds the n-gram
# table built earlier in this notebook, and a loop variable of the same name
# would leave it pointing at the last saved subset instead.
for frame, path in zip(new_dfs, paths):
    frame.to_pickle(path)

In [None]:
# scatter plot, to show pos_diversity by the passage position within a text

# x values: the index of the score column
passage_positions = j_c_div_scores["pos_diversity"].index

fig = px.scatter(
    j_c_div_scores,
    x=passage_positions,
    y="pos_diversity",
    color_discrete_sequence=["#509F98"],
    opacity=0.6,
)
fig.update_layout(
    template="plotly_dark",
    font={"family": "CMU Serif", "size": 20},
    xaxis_title="position of passage (chronological order)",
)
fig.show()

# uncomment to export the figure:
#write_vis(["pdf", "png","html"],"j_c_pos_diversity", fig)

In [None]:
# analyse relationship between pos_diversity and length of passage (token_count)

# statsmodels needs to be installed to create the trendline

# for the j_nc and k_nc subsets, drop 0 values first to be able to use log_x=True
# as a trendline option, e.g.:
# frame = frame.drop(frame[frame.pos_diversity == 0].index)

fig = px.scatter(
    j_cited_2,
    x="token_count",
    y="pos_diversity",
    color_discrete_sequence=["#509F98"],
    opacity=0.6,
    trendline="ols",
    trendline_options={"log_x": True},
)

# alternative: trendline_options=dict(log_x=True), log_x = True
fig.update_layout(
    template="plotly_dark",
    font={"family": "CMU Serif", "size": 20},
    xaxis_title="token_count (log)",
)
fig.show()

# print the summary of the first fitted trendline
results = px.get_trendline_results(fig)
first_fit = results.px_fit_results.iloc[0]
print(first_fit.summary())

# uncomment to export the figure:
#write_vis(["pdf", "png","html"],"j_c_diversity-tokencount_corr_logx", fig)