In [151]:
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go
# import numpy as np

In [152]:
# CSV locations, relative to the notebook's working directory.
comments_path = "../data/predicted_comments.csv"
paragraphs_path = "../data/predicted_paragraphs.csv"
articles_path = "../data/predicted_articles.csv"
train_path = "../data/data_mean_0.15_5.csv"

# Read the four files in the same order as they are declared above.
df_com, df_par, df_art, df_train = (
    pd.read_csv(csv_path)
    for csv_path in (comments_path, paragraphs_path, articles_path, train_path)
)

# Display name -> frame; the keys are reused as trace names / titles further down.
dfs = dict(comments=df_com, paragraphs=df_par, articles=df_art, training=df_train)

# df.head(1)

In [153]:
# df.shape #, df.dtypes

In [154]:
def get_label_columns(data):
    """Return the names of the label columns of ``data``.

    The first column is always skipped. When a column named ``"fold"`` exists
    anywhere in the frame, the last two columns are skipped as well.

    NOTE(review): the selection is purely positional — presumably the first
    column holds the text and "fold" sits among the two trailing columns;
    confirm against the CSV layout.

    Args:
        data: A pandas DataFrame.

    Returns:
        list: Column names selected as labels, in their original order.
    """
    names = list(data.columns)
    # A negative stop trims the two trailing columns; None keeps the tail.
    stop = -2 if "fold" in names else None
    return names[1:stop]

def get_data_to_barplot(data):
    """Compute the per-label statistics shown in the bar plot.

    Args:
        data: A pandas DataFrame whose label columns are picked by
            ``get_label_columns``.

    Returns:
        tuple: ``(ones_percent, dimension_average, label_columns)`` where

        * ``ones_percent`` is a Series holding each label column's sum divided
          by the number of rows (the share of ones if the labels are 0/1 —
          assumption, not checked here),
        * ``dimension_average`` is the mean over rows of the per-row label sum,
        * ``label_columns`` is the list of label column names.
    """
    label_columns = get_label_columns(data)
    labels = data[label_columns]
    row_total = len(data)

    ones_percent = labels.sum(axis=0) / row_total
    per_row_sum = labels.sum(axis=1)
    dimension_average = per_row_sum.mean()

    return ones_percent, dimension_average, label_columns

In [155]:
def create_barplot(dfs, output_path="processed_barplot.pdf", width=2000):
    """Draw a grouped bar chart of per-label ratios, one bar group per dataset.

    For every entry of ``dfs`` the statistics from ``get_data_to_barplot`` are
    computed and the per-label ratio is plotted as one ``go.Bar`` trace. The
    figure is shown and then exported as a static image.

    Args:
        dfs: Mapping of dataset name -> DataFrame. The name becomes the trace
            (legend) name.
        output_path: File the figure is written to. Defaults to the previously
            hard-coded ``"processed_barplot.pdf"``. Pass ``None`` to skip the
            export.
        width: Figure width passed to the layout. Defaults to the previously
            hard-coded ``2000``.

    Returns:
        None. Deliberately not returning the figure so that a notebook cell
        ending in this call does not render it a second time.
    """
    stats = {name: get_data_to_barplot(frame) for name, frame in dfs.items()}

    traces = [
        go.Bar(
            name=name,
            x=label_columns,
            y=ones_percent.values,
            # The same values double as bar annotations, formatted by texttemplate.
            text=ones_percent.values,
            texttemplate="%{text:.2}",
            textposition="auto",
        )
        # The middle element (average per-row label sum) is not plotted.
        for name, (ones_percent, _, label_columns) in stats.items()
    ]

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode="group",
        width=width,
    )
    fig.show()
    if output_path is not None:
        fig.write_image(output_path)

In [156]:
# Show the grouped bar chart for every dataset in `dfs` and export it as an image file.
create_barplot(dfs)

In [157]:
def count_vectors(data):
    """Count how many distinct label combinations occur in ``data``.

    Rows are grouped by all label columns (see ``get_label_columns``); each
    group corresponds to one distinct label vector present in the data.

    Args:
        data: A pandas DataFrame with the label columns and a ``"text"`` column.

    Returns:
        tuple: ``(n_distinct, n_rows, values, counted)`` where

        * ``n_distinct`` is the number of groups,
        * ``n_rows`` is the number of rows in ``data``,
        * ``values`` is ``counted.value_counts()``, i.e. how often each group
          size occurs,
        * ``counted`` is a single-column DataFrame (``"text"``) holding the
          non-null ``"text"`` count of every group.
    """
    label_columns = get_label_columns(data)
    n_rows = len(data)

    grouped = data.groupby(by=label_columns).count().reset_index()
    counted = grouped[["text"]]

    n_distinct = len(counted)
    values = counted.value_counts()
    return n_distinct, n_rows, values, counted

In [172]:
# For each dataset: plot how many rows share each distinct label vector and
# print how many distinct vectors exist relative to the dataset size.
for k, df in dfs.items():
    # The distinct-vector count is bound to `n_vectors` rather than `all`:
    # assigning to `all` at notebook scope would replace the builtin all()
    # for every cell executed afterwards in this kernel.
    n_vectors, original, vals, count = count_vectors(df)
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=count.index, y=count.text, name=k
    ))
    fig.update_layout(title_text=k)
    fig.show()
    # print(vals/n_vectors)
    print(k)
    print("\tdata size:\t\t", original)
    print("\tdifferent vectors:\t", n_vectors)
    print("\tpercent:\t\t", round(n_vectors/original,4))


comments
	data size:		 1929571
	different vectors:	 450
	percent:		 0.0002


paragraphs
	data size:		 817497
	different vectors:	 257
	percent:		 0.0003


articles
	data size:		 99168
	different vectors:	 79
	percent:		 0.0008


training
	data size:		 1000
	different vectors:	 579
	percent:		 0.579


In [159]:
# y = tmp.values[0]
# tmp = ones_percent.to_frame().transpose()
# x = tmp.columns
# plot = sns.barplot(x=x, y=y)
# for i, name in enumerate(x):
#     # print(i[0]._height)
#     va = "top" if y[i] > 0.2 else "bottom"
#     plot.text(i, y[i], round(y[i], 2), ha="center", va=va, rotation=90)
# _ = plot.set_xticklabels(plot.get_xticklabels(), rotation=90)