In [None]:
import os

# Run the notebook from the project root so that relative paths such as
# "resources/..." resolve. The location can be overridden per machine through
# the REDDIT_TITLES_DIR environment variable; when it is unset, the original
# checkout path is used, so existing setups keep working unchanged.
PROJECT_DIR = os.environ.get(
    "REDDIT_TITLES_DIR", r"D:\PythonApps\exercise_reddit_titles"
)
os.chdir(PROJECT_DIR)

# Imports

In [None]:
import warnings
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

In [None]:
# Notebook-wide presentation settings.
# Dark theme for every plotly figure created below.
pio.templates.default = "plotly_dark"
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# Data for visualization

In [None]:
# Feature columns kept for the visualizations, grouped by family:
# sentiment label, named-entity counts, coarse POS counts, fine-grained tag counts.
sig_features = (
    ["sentiment"]
    + ["ner_DATE", "ner_MONEY", "ner_ORG", "ner_CARDINAL"]
    + ["pos_VERB", "pos_NOUN", "pos_PUNCT"]
    + ["tag_NN", "tag_VBD", "tag_DT", "tag_IN", "tag_PRP"]
)

# Full column selection: the title text and its score, followed by the features.
sig_cols = ["text", "score", *sig_features]

In [None]:
# Load only the columns this notebook needs.
features_raw = pd.read_parquet("resources/dataframe_features.pq")[sig_cols]

# Remove extreme outliers: keep rows whose score is at most three standard
# deviations above the mean score.
score_cap = features_raw.score.mean() + features_raw.score.std() * 3

# drop=True discards the old row labels instead of inserting them as a stray
# "index" column (consistent with how df_low / df_high are built).
df = features_raw[features_raw.score <= score_cap].reset_index(drop=True)

In [None]:
# Ordinal encoding of the sentiment labels: 0 = most negative ... 4 = most positive.
sentiment_map = {
    label: rank
    for rank, label in enumerate(
        ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
    )
}

# Strict lookup: a label missing from the mapping raises KeyError.
df["sentiment_rank"] = df["sentiment"].apply(sentiment_map.__getitem__)

In [None]:
# Split off the two tails of the score distribution:
# bottom decile -> df_low, top decile -> df_high (both bounds inclusive).
score_p10 = df["score"].quantile(0.1)
score_p90 = df["score"].quantile(0.9)

df_low = df.loc[df["score"] <= score_p10].reset_index(drop=True)
df_high = df.loc[df["score"] >= score_p90].reset_index(drop=True)

In [None]:
# Display the low-scored subset (score at or below the 10th percentile) as the cell output.
df_low

In [None]:
# Compare the mean per-title feature counts of low- and high-scored titles,
# with one horizontal-bar panel per feature family.

# (subplot column, column-name prefix), in the order the panels are drawn.
feature_families = [(1, "ner"), (2, "pos"), (3, "tag")]

# (subset, bar colour, legend group label) for each score group.
score_groups = [
    (df_low, "rgb(1, 58, 99)", "low-scored titles"),
    (df_high, "rgb(128, 0, 22)", "high-scored titles"),
]

fig = make_subplots(
    rows=1,
    cols=len(feature_families),
    subplot_titles=("NER", "Part-of-speech", "Detailed part-of-speech"),
)

for data, color, name in score_groups:
    for panel_col, prefix in feature_families:
        # All feature columns belonging to this family, e.g. "ner_DATE".
        features = [column for column in data.columns if column.startswith(prefix)]
        fig.add_trace(
            go.Bar(
                name=prefix,
                orientation="h",
                y=features,
                x=data[features].mean(),
                marker_color=color,
                # Group the legend entries by score group, not by family.
                legendgroup=name,
                legendgrouptitle_text=name,
            ),
            row=1,
            col=panel_col,
        )

fig.update_layout(
    title="<b>High- and low-scored titles differ in lexical structure</b>",
    width=1400,
    height=500,
    # Clicking a legend entry toggles only that trace, not its whole group.
    legend=dict(groupclick="toggleitem"),
)
# Same x-axis label on every panel.
fig.update_xaxes(title_text="count per text")

fig.show()

In [None]:
# Median score per sentiment class, highest first.
df_graph = (
    df.groupby("sentiment")
    .agg({"score": "median"})
    .reset_index()
    .sort_values("score", ascending=False)
)

fig = px.bar(
    df_graph,
    x="sentiment",
    y="score",
    width=900,
    # The cell previously had a bare `height` token here, which is a
    # SyntaxError. 500 matches the height of the figure above; adjust if a
    # different size was intended.
    height=500,
    title="<b>Negative sentiment attracts attention more than positive one</b>",
    color="sentiment",
    # Red shades for negative classes, green shades for positive, grey for neutral.
    color_discrete_map={
        "Very Negative": "rgb(128, 0, 22)",
        "Negative": "rgb(192, 0, 33)",
        "Neutral": "silver",
        "Positive": "rgb(0, 128, 0)",
        "Very Positive": "rgb(0, 75, 35)",
    },
)
fig.show()