# Text Stats

## Preparation

In [None]:
# import statements

from functions.basic import read_file
from functions.processing import split_sents
from functions.stats import get_stats

from functions.vis import write_vis, box_plot, scatter_plot
import plotly.graph_objects as go
import plotly.express as px

from scipy import stats
import pandas as pd

In [None]:
# load the extracted passage tables for J and K

j_all = read_file("data\\0_extraction_data\\j_all.pkl")
k_all = read_file("data\\0_extraction_data\\k_all.pkl")

# split each table by passage type (rows with any other type land in neither subset)
j_cited = j_all[j_all["passage_type"].eq("cited")]
j_not_cited = j_all[j_all["passage_type"].eq("not_cited")]
k_cited = k_all[k_all["passage_type"].eq("cited")]
k_not_cited = k_all[k_all["passage_type"].eq("not_cited")]

# keep the original row indices so the stats can be merged back onto the subsets later
j_cited_index, j_not_cited_index = j_cited.index, j_not_cited.index
k_cited_index, k_not_cited_index = k_cited.index, k_not_cited.index

In [None]:
# text processing: pull the text and frequency columns out of every subset
# as plain lists and split the texts into sentences
# this might take a while (should be less than 1 min.)

# K, cited
k_c_text = list(k_cited["text"])
k_c_citnum = list(k_cited["frequency"])
k_c_sents = split_sents(k_c_text)

# K, not cited
k_nc_text = list(k_not_cited["text"])
k_nc_citnum = list(k_not_cited["frequency"])
k_nc_sents = split_sents(k_nc_text)

# J, cited
j_c_text = list(j_cited["text"])
j_c_citnum = list(j_cited["frequency"])
j_c_sents = split_sents(j_c_text)

# J, not cited
j_nc_text = list(j_not_cited["text"])
j_nc_citnum = list(j_not_cited["frequency"])
j_nc_sents = split_sents(j_nc_text)

## Textual Statistics

In [None]:
# run get_stats on every subset; each call returns a (stats, summary stats) pair
# call order: J cited, J not cited, K cited, K not cited

(
    (j_c_stats, j_c_sumstats),
    (j_nc_stats, j_nc_sumstats),
    (k_c_stats, k_c_sumstats),
    (k_nc_stats, k_nc_sumstats),
) = [
    get_stats(texts, sents, citnums)
    for texts, sents, citnums in (
        (j_c_text, j_c_sents, j_c_citnum),
        (j_nc_text, j_nc_sents, j_nc_citnum),
        (k_c_text, k_c_sents, k_c_citnum),
        (k_nc_text, k_nc_sents, k_nc_citnum),
    )
]

In [None]:
# show the stats table that get_stats returned for the not-cited K subset
k_nc_stats

In [None]:
# attach the stats columns to the passage dataframes of each subset

def _attach_stats(passages, subset_stats, row_index):
    """Give the stats table the passages' row index (in place) and
    return passages and stats joined side by side."""
    subset_stats.index = row_index
    return pd.concat([passages, subset_stats], axis=1)


# J cited
j_cited_1 = _attach_stats(j_cited, j_c_stats, j_cited_index)

# J not cited
j_not_cited_1 = _attach_stats(j_not_cited, j_nc_stats, j_not_cited_index)

# K cited
k_cited_1 = _attach_stats(k_cited, k_c_stats, k_cited_index)

# K not cited
k_not_cited_1 = _attach_stats(k_not_cited, k_nc_stats, k_not_cited_index)

In [None]:
# pickle the enriched dataframes for the next step

new_dfs = [
    j_cited_1,
    j_not_cited_1,
    k_cited_1,
    k_not_cited_1,
]
paths = [
    "data\\1_text-stats_data\\j_cited_1.pkl",
    "data\\1_text-stats_data\\j_not_cited_1.pkl",
    "data\\1_text-stats_data\\k_cited_1.pkl",
    "data\\1_text-stats_data\\k_not_cited_1.pkl",
]

# dataframes and target paths are matched by position
for frame, target in zip(new_dfs, paths):
    frame.to_pickle(target)

## Visualization

In [None]:
# bar plot: total character length of cited vs. not cited passages in J and K

# summed char_len of every subset
j_c_length = j_c_stats["char_len"].sum()
k_c_length = k_c_stats["char_len"].sum()
j_nc_length = j_nc_stats["char_len"].sum()
k_nc_length = k_nc_stats["char_len"].sum()

# one trace per passage type, grouped side by side for J and K
cited_bar = go.Bar(
    name='cited',
    x=["J", "K"],
    y=[j_c_length, k_c_length],
    marker_color="#509F98",
    opacity=.7,
)
not_cited_bar = go.Bar(
    name='not cited',
    x=["J", "K"],
    y=[j_nc_length, k_nc_length],
    marker_color="#C16152",
    opacity=.7,
)

fig = go.Figure(data=[cited_bar, not_cited_bar])
fig.update_layout(
    barmode='group',
    template="plotly_dark",
    font=dict(family="CMU Serif", size=20),
)
fig.show()

#write_vis(["pdf", "png", "html"],"all_charlen_bar", fig)

In [None]:
# draw one box plot per text-stats column, comparing all four subsets

cols_listed = list(j_c_stats.columns)  # column names to loop through
for stat_name in cols_listed:
    if stat_name == "id":
        continue  # no box plot for the id column

    cols = [
        j_c_stats[stat_name],
        j_nc_stats[stat_name],
        k_c_stats[stat_name],
        k_nc_stats[stat_name],
    ]
    names = ["J (C)", "J (NC)", "K (C)", "K (NC)"]
    box = box_plot(cols, names, stat_name)

    #write_vis(["pdf", "png", "html"],"all_" + stat_name + "_box", box)

In [None]:
# scatter plot of all J passages: start position on x, passage type on y,
# coloured by frequency

fig = px.scatter(
    j_all,
    x="startpos",
    y="passage_type",
    color="frequency",
    color_continuous_scale=["#C16152", "#509F98", "#1a3331"],
    category_orders={"passage_type": ["cited", "not_cited"]},
    opacity=.6,
)

# marker size scales with the passage span (endpos - startpos)
marker_sizes = (j_all["endpos"] - j_all["startpos"]) * 0.05
fig.update_traces(marker={"size": marker_sizes})
fig.update_layout(
    template="plotly_dark",
    font=dict(family="CMU Serif", size=20),
)
fig.show()
#write_vis(["pdf", "png", "html"],"j_all_scatter", fig)

In [None]:
# scatter plots, to compare relationships

# Stack the per-subset stats row-wise. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0; the result
# (rows of the first frame followed by rows of the second, indices kept) is the same.
j_all_stats = pd.concat([j_c_stats, j_nc_stats])
k_all_stats = pd.concat([k_c_stats, k_nc_stats])
all_stats = pd.concat([j_all_stats, k_all_stats])
print(all_stats)

# token_len against sent_len over all passages of J and K
scatter = scatter_plot(all_stats, "J & K", "token_len", "sent_len")
#write_vis(["pdf"],"all_scatter_token-len_sent-len", scatter)

In [None]:
# in addition, get the Pearson correlation between the token_len and cit_num
# columns using scipy.stats

corr = stats.pearsonr(all_stats["token_len"], all_stats["cit_num"])

# print every value of the result on its own line, rounded to two decimals
print(*(round(value, 2) for value in corr), sep="\n")