In [None]:
import os

# Every relative path used later in the notebook (e.g. "resources/...") is
# resolved against this directory. The location can be overridden per machine
# with the REDDIT_TITLES_DIR environment variable; the original hard-coded
# path is kept as the default so existing setups behave exactly as before.
PROJECT_DIR = os.environ.get(
    "REDDIT_TITLES_DIR", r"D:\PythonApps\exercise_reddit_titles"
)
os.chdir(PROJECT_DIR)
os.getcwd()

# Imports

In [None]:
import pandas as pd
import json
import pprint
import numpy as np
from IPython.core.display import HTML
from IPython.display import display
from tqdm import tqdm

import spacy
from transformers import pipeline

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.colors import n_colors

In [None]:
import warnings
# Silences every warning category for the rest of the kernel session, so
# library deprecation notices and pandas warnings will not be shown below.
# NOTE(review): consider narrowing this filter to specific categories/modules.
warnings.filterwarnings('ignore')

# DataFrame

In [None]:
# Load the features DataFrame from parquet (path is relative to the working directory).
df = pd.read_parquet("resources/dataframe_features.pq")
# Preview the first rows as the cell output.
df.head()

In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Columns shared by every per-feature analysis frame built below.
base_cols = ["text", "score", "sentiment"]


def _columns_starting_with(prefix: str) -> list:
    """Return the names of the ``df`` columns that begin with ``prefix``."""
    return [name for name in df.columns if name.startswith(prefix)]


# Feature column families, selected by their name prefix.
ner_cols = _columns_starting_with("ner")
pos_cols = _columns_starting_with("pos")
tag_cols = _columns_starting_with("tag")
lemma_cols = _columns_starting_with("lemma")

# Utils

In [None]:
def extreme_groups_df(
    df: pd.DataFrame,
    low_q: float = 0.10,
    mid_q: tuple = (0.45, 0.55),
    high_q: float = 0.90,
):
    """Split ``df`` into low-, middle- and high-score groups by ``score`` quantiles.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``score`` column.
    low_q : float, default 0.10
        Rows with ``score`` at or below this quantile form the low group.
    mid_q : tuple of (float, float), default (0.45, 0.55)
        Inclusive quantile band that forms the middle group.
    high_q : float, default 0.90
        Rows with ``score`` at or above this quantile form the high group.

    Returns
    -------
    tuple of pd.DataFrame
        ``(df_low, df_mean, df_high)``; each is a row subset of ``df`` that
        keeps the original index. All bounds are inclusive.
    """
    score = df["score"]

    # Compute every cut-off once instead of re-evaluating it inside each mask.
    low_cut = score.quantile(low_q)
    mid_lower, mid_upper = (score.quantile(q) for q in mid_q)
    high_cut = score.quantile(high_q)

    df_low = df[score <= low_cut]
    df_mean = df[(score >= mid_lower) & (score <= mid_upper)]
    df_high = df[score >= high_cut]
    return df_low, df_mean, df_high

In [None]:
def _group_means_summary(cols, df_high, df_mean, df_low):
    """Return one row per column in ``cols`` holding the high/mid/low group means.

    Rows are ordered by the absolute high-vs-low difference, largest first.
    """
    summary = pd.DataFrame(
        dict(
            high_means=df_high[cols].mean().to_list(),
            mean_means=df_mean[cols].mean().to_list(),
            low_means=df_low[cols].mean().to_list(),
            feature=list(cols),
        )
    )
    summary["high_low_diff"] = (summary["high_means"] - summary["low_means"]).abs()
    return summary.sort_values("high_low_diff", ascending=False).reset_index(drop=True)


def extreme_groups_diff(cols, df_high, df_mean, df_low):
    """Plot per-column means of the three score groups and their high-low gap.

    Draws a two-row figure: the top row shows, for every column in ``cols``,
    the mean inside the high, middle and low score groups; the bottom row
    shows the absolute difference between the high and low group means.
    Columns are ordered by that difference, largest first.

    Parameters
    ----------
    cols : list of str
        Numeric columns present in all three frames.
    df_high, df_mean, df_low : pd.DataFrame
        Score groups, e.g. as returned by ``extreme_groups_df``.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    graph_df = _group_means_summary(cols, df_high, df_mean, df_low)

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=(
            # The bars are means (``DataFrame.mean``), not medians.
            "Groups means",
            "High - Low Diff",
        ),
    )

    # Bottom panel: absolute gap between the high and low group means.
    fig.add_trace(
        go.Bar(
            name="ABS High - Low",
            x=graph_df.feature,
            y=graph_df.high_low_diff,
            text=[round(x, 2) for x in graph_df.high_low_diff],
            marker_color="silver",
        ),
        row=2,
        col=1,
    )

    # Top panel: one bar series per score group.
    group_series = (
        ("High", "high_means", "red"),
        ("Mean", "mean_means", "green"),
        ("Low", "low_means", "blue"),
    )
    for trace_name, column, color in group_series:
        fig.add_trace(
            go.Bar(
                name=trace_name,
                x=graph_df.feature,
                y=graph_df[column],
                marker_color=color,
            ),
            row=1,
            col=1,
        )

    # The figure has no secondary y-axes, so each axis is addressed by its
    # subplot position; a ``secondary_y`` selector would not target the
    # bottom panel separately.
    fig.update_yaxes(title_text="Group mean", row=1, col=1)
    fig.update_yaxes(title_text="ABS High - Low Diff", row=2, col=1)

    fig.update_layout(
        height=800,
        legend=dict(
            orientation="h",
            yanchor="top",
            y=1.05,
        ),
    )

    fig.show()


# Score
- Records with a score above mean + 3 SD are cut off

In [None]:
# Histogram of title scores with vertical reference lines.
fig = px.histogram(
    df,
    "score"
)
fig.update_traces(
    marker_color="gray"
)

# (legend name, on-plot label, x position, line colour) for each reference line.
# The 0.7 quantile is labelled "q70" so the label matches the plotted value.
score_markers = [
    ("median", "median", df.score.median(), "blue"),
    ("q70", "q70", df.score.quantile(0.7), "orange"),
    ("q90", "q90", df.score.quantile(0.9), "red"),
    ("+3sd", "3sd", df.score.std()*3 + df.score.mean(), "black"),
]

for marker_name, marker_label, v, marker_color in score_markers:
    # Each reference line is a two-point scatter from y=0 to a fixed y=10000.
    fig.add_trace(
        go.Scatter(
            name=marker_name,
            x=[v, v],
            y=[0, 10000],
            text=marker_label,
            mode="lines+text",
            textposition="top right",
            marker_color=marker_color
        )
    )

fig.show()

In [None]:
# Keep only titles whose score does not exceed mean + 3 standard deviations.
# `df` is overwritten here, so re-running this cell recomputes the bound on
# the already trimmed frame and may trim it further.
score_upper_bound = df.score.mean() + 3 * df.score.std()
df = df[df.score <= score_upper_bound]

# Ner entities
- There's no strong correlation between NER entities and title score
- However, titles with high scores (>= q90) differ slightly from titles with low scores (<= q10). High-scored titles have:
    - more MONEY entities
    - more DATE entities
    - fewer PERSON entities
- **This may mean that money-related titles attract attention**

In [None]:
# Work on an explicit copy so the new columns are not assigned onto a slice of `df`.
df_ner = df[base_cols + ner_cols].copy()
df_ner["total_ents"] = df_ner[ner_cols].sum(axis=1)

# Share of each entity type among all entities of a title, in percent.
# Titles without any entity give 0 / 0 = NaN, which `.mean()` later skips.
ner_ratios = (
    df_ner[ner_cols]
    .div(df_ner["total_ents"], axis=0)
    .mul(100)
    .add_suffix("_ratio")
)
df_ner = pd.concat([df_ner, ner_ratios], axis=1)

# Score groups used by the NER plots in the following cells.
df_low, df_mean, df_high = extreme_groups_df(df_ner)

df_ner.info()

In [None]:
# Spearman rank correlation between the score and every raw NER count.
corr_df = (
    df
    [["score"] + ner_cols]
    .corr(method="spearman")
    .round(2)
    # Blank out coefficients equal to 1 (e.g. the diagonal). `where` replaces
    # the deprecated `applymap` and yields the same NaN-masked table.
    .where(lambda corr: corr.ne(1))
)
corr_df

## Raw NER counts

In [None]:
# Compare mean raw NER counts across the score groups built from df_ner.
extreme_groups_diff(ner_cols, df_high, df_mean, df_low)

## NER ratios

In [None]:
# Same comparison as for the raw counts, but on the percentage columns.
cols = [f"{name}_ratio" for name in ner_cols]
extreme_groups_diff(cols, df_high, df_mean, df_low)

# POS labels (part of speech)
- No strong correlations between POS counts and title score
- However, comparing high-scored titles (>= q90) with low-scored titles (<= q10), high-scored titles have more:
    - verbs
    - nouns
- **This may mean that high-scored titles have a more active character**
- **A simple and strong message (verb + noun) may attract attention**

In [None]:
# Work on an explicit copy so the new columns are not assigned onto a slice of `df`.
df_pos = df[base_cols + pos_cols].copy()
df_pos["total"] = df_pos[pos_cols].sum(axis=1)

# Share of each POS label among all counted labels of a title, in percent.
# Rows whose total is 0 give 0 / 0 = NaN, which `.mean()` later skips.
pos_ratios = (
    df_pos[pos_cols]
    .div(df_pos["total"], axis=0)
    .mul(100)
    .add_suffix("_ratio")
)
df_pos = pd.concat([df_pos, pos_ratios], axis=1)

# Score groups used by the POS plots in the following cells.
df_low, df_mean, df_high = extreme_groups_df(df_pos)

df_pos.info()

In [None]:
# Spearman rank correlation between the score and every raw POS count.
corr_df = (
    df
    [["score"] + pos_cols]
    .corr(method="spearman")
    .round(2)
    # Blank out coefficients equal to 1 (e.g. the diagonal). `where` replaces
    # the deprecated `applymap` and yields the same NaN-masked table.
    .where(lambda corr: corr.ne(1))
)
corr_df

In [None]:
# Display the POS frame, including the added total and ratio columns.
df_pos

## POS counts

In [None]:
# Compare mean raw POS counts across the score groups built from df_pos.
extreme_groups_diff(pos_cols, df_high, df_mean, df_low)

## POS ratios

In [None]:
# Same comparison on the per-title POS percentage columns.
extreme_groups_diff([f"{x}_ratio" for x in pos_cols], df_high, df_mean, df_low)

# TAG labels (fine-grained part-of-speech)
- A more detailed POS analysis shows that high-scored titles, in comparison to low-scored titles, have:
    - more past tense verbs
    - more personal pronouns
- **A simple message, e.g. 'someone does something', may attract attention**

In [None]:
# Work on an explicit copy so the new columns are not assigned onto a slice of `df`.
df_tag = df[base_cols + tag_cols].copy()
df_tag["total"] = df_tag[tag_cols].sum(axis=1)

# Share of each fine-grained tag among all counted tags of a title, in percent.
# Rows whose total is 0 give 0 / 0 = NaN, which `.mean()` later skips.
tag_ratios = (
    df_tag[tag_cols]
    .div(df_tag["total"], axis=0)
    .mul(100)
    .add_suffix("_ratio")
)
df_tag = pd.concat([df_tag, tag_ratios], axis=1)

# Score groups used by the TAG plots in the following cells.
df_low, df_mean, df_high = extreme_groups_df(df_tag)

df_tag.info()

In [None]:
# Display the TAG frame, including the added total and ratio columns.
df_tag

In [None]:
# Spearman rank correlation between the score and every raw TAG count.
corr_df = (
    df
    [["score"] + tag_cols]
    .corr(method="spearman")
    .round(2)
    # Blank out coefficients equal to 1 (e.g. the diagonal). `where` replaces
    # the deprecated `applymap` and yields the same NaN-masked table.
    .where(lambda corr: corr.ne(1))
)
corr_df

In [None]:
# Compare mean raw TAG counts across the score groups built from df_tag.
extreme_groups_diff(tag_cols, df_high, df_mean, df_low)

In [None]:
# Same comparison on the per-title TAG percentage columns.
extreme_groups_diff([f"{x}_ratio" for x in tag_cols], df_high, df_mean, df_low)

# Sentiment
- Titles with negative sentiment have 2x higher scores than titles with positive sentiment
- **This suggests that negative sentiment attracts attention more strongly than positive sentiment**

In [None]:
# Work on an explicit copy so the new column is not assigned onto a slice of `df`.
df_s = df[base_cols].copy()

# Ordinal encoding of the sentiment labels, from most negative to most positive.
sentiment_map = {
    "Very Negative": 0,
    "Negative": 1,
    "Neutral": 2,
    "Positive": 3,
    "Very Positive": 4
}

# `Series.map` would silently turn unknown labels into NaN, so keep failing
# loudly (KeyError, as a plain dict lookup would) when a label is not covered.
unmapped_labels = set(df_s["sentiment"].unique()) - set(sentiment_map)
if unmapped_labels:
    raise KeyError(f"Sentiment labels missing from sentiment_map: {unmapped_labels}")

df_s["sentiment_rank"] = df_s["sentiment"].map(sentiment_map)
df_low, df_mean, df_high = extreme_groups_df(df_s)
df_s

In [None]:
# Pearson correlation between the score and the numeric sentiment rank.
# NOTE(review): the other sections use Spearman and sentiment_rank is an
# ordinal 0-4 encoding, so confirm that Pearson is intended here.
df_s[["score", "sentiment_rank"]].corr(method="pearson")

In [None]:
# Median score per sentiment label, highest median first.
df_graph = (
    df_s
    .groupby("sentiment", as_index=False)["score"]
    .median()
    .sort_values("score", ascending=False)
)

# Red shades for negative labels, green shades for positive ones.
sentiment_colors = {
    "Very Negative": "rgb(255, 50, 50)",
    "Negative": "rgba(255, 50, 50, 0.5)",
    "Neutral": "silver",
    "Positive": "rgba(50, 255, 50, 0.5)",
    "Very Positive": "rgb(50, 255, 50)"
}

fig = px.bar(
    df_graph,
    x="sentiment",
    y="score",
    color="sentiment",
    color_discrete_map=sentiment_colors,
    title="Titles scores medians and sentiments",
    width=600,
)
fig.show()

In [None]:
# Score distribution split by sentiment label. The title describes the
# histogram itself rather than repeating the medians chart's title.
fig = px.histogram(
    df_s,
    x="score",
    color="sentiment",
    width=1400,
    height=700,
    title="Titles scores distribution by sentiment",
    color_discrete_map={
        "Very Negative": "rgb(255, 50, 50)",
        "Negative": "rgba(255, 50, 50, 0.5)",
        "Neutral": "silver",
        "Positive": "rgba(50, 255, 50, 0.5)",
        "Very Positive": "rgb(50, 255, 50)"
    }
)
fig.show()