## Plotting Pretty Charts for Presentation

In [1]:
import os

# Move the working directory up one level; the relative paths used further
# down (e.g. "forecasting/data/...", "nlp/topic_models/...") are resolved
# from there, and presumably so is the `utils` package import below.
# NOTE(review): chdir("..") is relative to the *current* directory, so
# re-running this cell without restarting the kernel climbs one level higher
# each time. The printed path lets you confirm where you ended up.
os.chdir("..")
print(os.getcwd())
import json
import isoweek
from datetime import datetime
from pathlib import Path
import pandas as pd
import altair as alt
from utils.serializer import load_fr_pkl

/Users/christopherliew/Desktop/Y4S1/HT/crypto_uncertainty_index


In [2]:
def get_week_start_end_from_date(date, date_format="%Y-%m-%d"):
    """Return the Monday and Sunday of the ISO week that contains ``date``.

    Args:
        date: Date string to parse, e.g. ``"2022-02-26"``.
        date_format: ``strptime`` pattern describing ``date``. Defaults to
            the ``YYYY-MM-DD`` layout this function originally hard-coded.

    Returns:
        A ``(monday, sunday)`` tuple of ``datetime.date`` objects.

    Raises:
        ValueError: If ``date`` does not match ``date_format``.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timedelta

    day = datetime.strptime(date, date_format).date()
    # weekday() is 0 for Monday, so stepping back that many days lands on the
    # Monday of the same ISO week (also across a year boundary); Sunday is
    # always six days after that Monday.
    monday = day - timedelta(days=day.weekday())
    return monday, monday + timedelta(days=6)

In [3]:
# Lucey Index Data
# Original Lucey UCRY Index Data
# The path is relative to the project root, which the first cell makes the
# working directory, so the notebook no longer depends on one user's home folder.
index_data_path = Path("pipelines/crypto_index") / "index_data"

ucry_original = pd.read_csv(index_data_path / "ucry_lucey_original_.csv")


def _tidy_ucry_series(raw, value_col, label):
    """Reshape one UCRY column into the long layout used for plotting.

    Args:
        raw: Frame holding a ``"Date"`` column plus ``value_col``.
        value_col: Name of the index column to extract (e.g. ``"UCRY Policy"``).
        label: Value written to the ``type`` column to identify the series.

    Returns:
        A new frame with columns ``index_value``, ``start_date``,
        ``end_date`` and ``type``; ``raw`` itself is left untouched.
    """
    # Each date is expanded to the (Monday, Sunday) pair of its week.
    week_bounds = pd.DataFrame(
        raw["Date"].apply(get_week_start_end_from_date).tolist(),
        columns=["start_date", "end_date"],
        index=raw.index,
    )
    # rename/assign build a fresh frame instead of writing into a slice of
    # `raw`, which is what used to trigger SettingWithCopyWarning.
    return (
        raw[[value_col]]
        .rename(columns={value_col: "index_value"})
        .assign(
            start_date=week_bounds["start_date"],
            end_date=week_bounds["end_date"],
            type=label,
        )
    )


ucry_original_policy = _tidy_ucry_series(ucry_original, "UCRY Policy", "Lucey-Policy")
ucry_original_price = _tidy_ucry_series(ucry_original, "UCRY Price", "Lucey-Price")

# Stack both series and convert the week bounds to datetimes for plotting
ucry_combined = pd.concat([ucry_original_policy, ucry_original_price], axis=0)
ucry_combined["start_date"] = pd.to_datetime(ucry_combined["start_date"])
ucry_combined["end_date"] = pd.to_datetime(ucry_combined["end_date"])

In [4]:
# Plot
# One smoothed line per index type, distinguished by colour and dash pattern
ucry_line = (
    alt.Chart(ucry_combined)
    .mark_line(interpolate="basis")
    .encode(
        x=alt.X(field="start_date", type="temporal", title="Date"),
        y=alt.Y(
            field="index_value",
            type="quantitative",
            title="UCRY Index Value",
            scale=alt.Scale(domain=[90, 110]),
        ),
        color=alt.Color(
            field="type",
            type="nominal",
            title="Index Name",
            scale=alt.Scale(scheme="plasma"),
        ),
        # The colour legend already identifies each series, so the dash legend is hidden
        strokeDash=alt.StrokeDash(field="type", type="nominal", legend=None),
    )
    .properties(width=600)
)
ucry_line

### 1. Plotting Doc Counts
* Doc count per Week
* Doc count per Week broken down by Subreddit

In [5]:
# Read the weekly document-count aggregation response from disk
with open("forecasting/data/docCountPerWeek.json", mode="r") as f:
    doc_count_data_json = json.loads(f.read())

In [6]:
# One row per weekly bucket: the bucket's key string and its document count.
# The dict keeps one entry per key, with a later duplicate overwriting the count.
doc_count_data = pd.DataFrame(
    list(
        {
            bucket["key_as_string"]: bucket["doc_count"]
            for bucket in doc_count_data_json["aggregations"]["count_per_week"][
                "buckets"
            ]
        }.items()
    ),
    columns=["Date", "Count"],
)

In [7]:
# Weekly bar chart; bar colour also encodes the count
weekly_doc_count = (
    alt.Chart(doc_count_data)
    .mark_bar()
    .encode(
        x=alt.X(field="Date", type="temporal", title="Date"),
        y=alt.Y(
            field="Count",
            type="quantitative",
            title="No. of Submissions & Comments",
        ),
        color=alt.Color(field="Count", type="quantitative"),
    )
    .properties(width=600)
)

In [8]:
# Combined Dual Axis
# `+` layers the charts in the order written; each layer keeps its own y scale
(
    (ucry_line + weekly_doc_count)
    .properties(
        title="Weekly Crypto Reddit Comments & Submissions Count with Lucey Index"
    )
    .resolve_scale(y="independent", size="independent", shape="independent")
)

In [9]:
# Index the weekly counts by timestamp so they can be bucketed by calendar month
doc_count_data.index = pd.to_datetime(doc_count_data["Date"])
doc_count_data_month = (
    doc_count_data.groupby(
        by=[doc_count_data.index.year, doc_count_data.index.month]
    )["Count"]
    .sum()
    .to_frame()
    # Both grouping keys derive from the same index, so name the levels explicitly
    .rename_axis(index=["Year", "Month"])
    .reset_index()
)

In [10]:
# Anchor every month on its first day so Year/Month/Day can be assembled
# into a single datetime column for the x-axis
doc_count_data_month = doc_count_data_month.assign(Day=1).assign(
    Date=lambda monthly: pd.to_datetime(monthly[["Year", "Month", "Day"]])
)

In [11]:
# Monthly bar chart; bar colour also encodes the count
monthly_doc_count = (
    alt.Chart(doc_count_data_month)
    .mark_bar()
    .encode(
        x=alt.X(field="Date", type="temporal", title="Date"),
        y=alt.Y(
            field="Count",
            type="quantitative",
            title="No. of Submissions & Comments",
        ),
        color=alt.Color(field="Count", type="quantitative"),
    )
    .properties(width=600)
)
monthly_doc_count

In [12]:
# Combined Dual Axis
# Bars are listed first so the index lines are drawn on top of them
(
    (monthly_doc_count + ucry_line)
    .properties(
        title="Monthly Crypto Reddit Comments & Submissions Count with Lucey Index"
    )
    .resolve_scale(y="independent", size="independent", shape="independent")
)

### Topic Coherence vs. K

In [23]:
# Get coherence values
# Two pickled result sets; judging by the variable names they cover
# K = 1-10 and K = 11-15 respectively (TODO confirm against the run configs).
lda_umass_1_10 = load_fr_pkl(
    "nlp/topic_models/models/lda/lda_run_2022-02-26 03:58:40.048501/results.pkl"
)
lda_umass_11_15 = load_fr_pkl(
    "nlp/topic_models/models/lda/lda_run_2022-02-26 11:04:18.709381/results.pkl"
)
# The context manager closes the file once it is parsed; the bare open()
# used previously left the handle open for the rest of the kernel session.
with open(
    "nlp/topic_models/models/lda/lda_run_train_test_10_15_20220307/results.json"
) as f:
    lda_perplexity_1_15 = json.load(f)

In [81]:
# Tidy Data
# Both UMass result sets are reshaped the same way and stacked into one frame
umass_df = (
    pd.concat(
        [
            pd.DataFrame(run_results).loc[["u_mass"]].melt(var_name="K")
            for run_results in (lda_umass_1_10, lda_umass_11_15)
        ],
        ignore_index=True,
    )
    .rename(columns={"value": "Coherence Score"})
    .assign(Metric="UMass")
)

pp_df = (
    pd.DataFrame(lda_perplexity_1_15)
    .loc[["log_perplexity"]]
    .melt(var_name="K")
    .reset_index(drop=True)
    .rename(columns={"value": "Coherence Score"})
    .assign(Metric="Log Perplexity")
)
# Cast K to integers so it can be compared numerically below
# (presumably it arrives as strings, since JSON object keys are strings)
pp_df["K"] = pp_df["K"].astype("int64")

# Combine DFs for plotting
combined_df = pd.concat([umass_df, pp_df], ignore_index=True)
# Keep only 2 <= K <= 10 (both bounds inclusive)
combined_df = combined_df[combined_df["K"].between(2, 10)]
combined_df

Unnamed: 0,K,Coherence Score,Metric
1,2,-2.520405,UMass
2,3,-2.895103,UMass
3,4,-2.87886,UMass
4,5,-3.137433,UMass
5,6,-3.04278,UMass
6,7,-3.069376,UMass
7,8,-3.049688,UMass
8,9,-2.916546,UMass
9,10,-2.91791,UMass
16,2,-9.769696708519952,Log Perplexity


In [97]:
def _metric_line(metric, axis_title_color, y_domain):
    """Build the line chart of one metric against K from ``combined_df``.

    Args:
        metric: Value of the ``Metric`` column to plot; also used as the
            y-axis title.
        axis_title_color: Colour applied to the y-axis title.
        y_domain: Two-element list passed to the y scale as its domain.

    Returns:
        A 600px-wide Altair line chart for the selected metric.
    """
    return (
        alt.Chart(combined_df[combined_df.Metric == metric])
        .mark_line()
        .encode(
            x=alt.X("K:Q", title="K (Number of Topics)"),
            y=alt.Y(
                "Coherence Score:Q",
                axis=alt.Axis(title=metric, titleColor=axis_title_color),
                scale=alt.Scale(domain=y_domain),
            ),
            color=alt.Color(
                "Metric", title="Coherence Metric", scale=alt.Scale(scheme="plasma")
            ),
        )
        .properties(width=600)
    )


# Same chart recipe for both metrics; only the filter, the axis title colour
# and the y domain differ. The domains are written high-to-low, as in the
# original specs.
UMASS_line = _metric_line("UMass", "#ff7f0e", [-2.0, -3.5])

PP_line = _metric_line("Log Perplexity", "#9467bd", [-9.5, -10.0])

In [98]:
# Overlay both metrics, giving each its own y-axis
(UMASS_line + PP_line).resolve_scale(y="independent")