In [1]:
import math
import os
import ast

import sqlparse
import numpy as np
import pandas as pd
from sqlalchemy import MetaData, Table, literal_column, Column
from sqlalchemy.sql import select, func, or_
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.dialects.mysql.types import VARCHAR, TINYINT, TEXT

from f3_data_builder import mysql_connection, region_queries, pull_beatdowns, build_users, build_aos, build_beatdowns, build_attendance

In [2]:
from typing import Any, Tuple, Hashable, Iterable
from sqlalchemy.engine import Engine
from pandas._libs.missing import NAType
from _mysql_connector import MySQLInterfaceError

In [3]:
# Database handle obtained from the project helper. Later cells pass it to
# `metadata.reflect(...)` and open connections with `engine.begin()`.
# Presumably a SQLAlchemy Engine -- confirm in f3_data_builder.
engine = mysql_connection()

In [4]:
# Start from an empty MetaData container and load the definitions of the
# existing tables in the `weaselbot` schema from the database.
metadata = MetaData()
metadata.reflect(engine, schema="weaselbot")

# `paxminer.regions` is declared explicitly instead of being reflected; it is
# registered on the same MetaData, so it is reachable as
# metadata.tables["paxminer.regions"].
# NOTE(review): `send_region_stats` and `send_mid_month_charts` are VARCHAR
# columns with an integer default of 0 -- confirm against the real schema.
# The trailing semicolon keeps the notebook from echoing the Table repr.
Table(
    "regions",
    metadata,
    Column("region", VARCHAR(length=45), nullable=False, primary_key=True),
    Column("slack_token", VARCHAR(length=90), nullable=False),
    Column("schema_name", VARCHAR(length=45), nullable=True, default=None),
    Column("active", TINYINT(), default=1),
    Column("firstf_channel", VARCHAR(length=45), nullable=True, default=None),
    Column("contact", VARCHAR(length=45), nullable=True, default=None),
    # Five identical on/off flags, generated in their original order.
    *(
        Column(flag_name, TINYINT(), default=0)
        for flag_name in (
            "send_pax_charts",
            "send_ao_leaderboard",
            "send_q_charts",
            "send_region_leaderboard",
            "send_region_uniquepax_chart",
        )
    ),
    Column("send_region_stats", VARCHAR(length=45), default=0),
    Column("send_mid_month_charts", VARCHAR(length=45), default=0),
    Column("comments", TEXT()),
    schema="paxminer",
);

In [None]:
# Print the repr of every table registered on the metadata, one per line, in
# the order given by `metadata.sorted_tables`.
for table in metadata.sorted_tables:
    print(repr(table))

In [None]:
# Short aliases for the weaselbot "combined_*" tables, bound in one unpacking
# assignment. The order of the names matches the order of the keys.
cr, cu, cud, ca, cb, catt = (
    metadata.tables[table_key]
    for table_key in (
        "weaselbot.combined_regions",
        "weaselbot.combined_users",
        "weaselbot.combined_users_dup",
        "weaselbot.combined_aos",
        "weaselbot.combined_beatdowns",
        "weaselbot.combined_attendance",
    )
)

In [None]:
def insert_statement(table, insert_values, update_cols):
    """Build an insert statement with an on-duplicate-key update clause.

    Parameters
    ----------
    table
        Target handed to the MySQL-dialect ``insert()``.
    insert_values
        Row data handed to ``.values()``.
    update_cols
        Container of column names. Only inserted columns whose name is in
        this container are refreshed when the key already exists.

    Returns
    -------
    The insert construct with the on-duplicate-key update attached.
    """
    stmt = insert(table).values(insert_values)

    # Map each selected column name to its "inserted" counterpart so the
    # update assigns the incoming value.
    refreshed = {}
    for column in stmt.inserted:
        if column.name in update_cols:
            refreshed[column.name] = column

    return stmt.on_duplicate_key_update(refreshed)

In [None]:
def region_subquery(metadata):
    """Build the per-region beatdown summary as a subquery named ``b``.

    ``weaselbot.combined_beatdowns`` is joined to ``weaselbot.combined_aos``
    on ``ao_id`` and the rows are grouped by the AO's ``region_id``.

    Parameters
    ----------
    metadata
        Object whose ``tables`` mapping contains both tables.

    Returns
    -------
    A subquery exposing ``region_id``, ``max_timestamp``, ``max_ts_edited``
    and ``beatdown_count``.
    """
    beatdowns = metadata.tables["weaselbot.combined_beatdowns"]
    aos = metadata.tables["weaselbot.combined_aos"]

    summary_columns = (
        aos.c.region_id,
        func.max(beatdowns.c.timestamp).label("max_timestamp"),
        func.max(beatdowns.c.ts_edited).label("max_ts_edited"),
        func.count().label("beatdown_count"),
    )
    beatdowns_with_ao = beatdowns.join(aos, beatdowns.c.ao_id == aos.c.ao_id)

    return (
        select(*summary_columns)
        .select_from(beatdowns_with_ao)
        .group_by(aos.c.region_id)
        .subquery("b")
    )

In [None]:
def paxminer_region_query(metadata):
    """Build the SELECT listing every paxminer region with its summary.

    Starts from ``paxminer.regions``, left-outer-joins
    ``weaselbot.combined_regions`` on ``schema_name`` and then the
    per-region beatdown subquery on ``region_id``.

    Parameters
    ----------
    metadata
        Object whose ``tables`` mapping contains the tables used here and by
        ``region_subquery``.

    Returns
    -------
    A select of ``schema_name``, ``region_name``, ``max_timestamp``,
    ``max_ts_edited``, ``beatdown_count`` and ``region_id``.
    """
    regions = metadata.tables["paxminer.regions"]
    combined = metadata.tables["weaselbot.combined_regions"]
    summary = region_subquery(metadata)

    # Outer joins keep paxminer regions that have no combined row or summary.
    joined = regions.outerjoin(
        combined, regions.c.schema_name == combined.c.schema_name
    ).outerjoin(summary, combined.c.region_id == summary.c.region_id)

    return select(
        regions.c.schema_name,
        regions.c.region.label("region_name"),
        summary.c.max_timestamp,
        summary.c.max_ts_edited,
        summary.c.beatdown_count,
        combined.c.region_id,
    ).select_from(joined)

In [None]:
def weaselbot_region_query(metadata):
    """Build the SELECT of combined regions with their beatdown count.

    Selects ``weaselbot.combined_regions`` together with ``beatdown_count``
    from the per-region subquery, left-outer-joined on ``region_id``.

    Parameters
    ----------
    metadata
        Object whose ``tables`` mapping contains the tables used here and by
        ``region_subquery``.

    Returns
    -------
    The select construct; nothing is executed.
    """
    combined = metadata.tables["weaselbot.combined_regions"]
    summary = region_subquery(metadata)

    # Outer join keeps regions that have no rows in the summary.
    regions_with_summary = combined.outerjoin(
        summary, combined.c.region_id == summary.c.region_id
    )
    return select(combined, summary.c.beatdown_count).select_from(regions_with_summary)

In [None]:
# Build both region SELECT statements from the reflected metadata.
# The builder functions only construct the statements; nothing is sent to the
# database in this cell.
paxminer_region_sql = paxminer_region_query(metadata) # verified
weaselbot_region_sql = weaselbot_region_query(metadata) # verified

In [None]:
### Sample data to move forward with ###

# Hand-written stand-in for a region query result: five regions, one list per
# column. None marks a region with no timestamp / no edit timestamp.
region_data = {
    "schema_name": "f3alamo f3chicago f3naperville f3omaha f3stcharles".split(),
    "region_name": "Alamo Chicago Naperville Omaha St_Charles".split(),
    "max_timestamp": [1704824582.100129, 1704808955.413849, None, 1697642801.087519, 1704825215.550419],
    "max_ts_edited": [1704824731, 1704724809, None, None, 1704825356],
}
df_regions = pd.DataFrame(data=region_data)

In [None]:
# One dict per region row, ready for the bulk upsert.
# pandas stores the missing sample values as NaN; NaN is not SQL NULL and would
# be sent to MySQL as a float. Casting to object first lets `where` replace
# every missing cell with None, which the driver binds as NULL.
insert_values = (
    df_regions.astype(object)
    .where(df_regions.notna(), None)
    .to_dict("records")
)
# Columns refreshed when a row with the same key already exists.
update_cols = ("region_name", "max_timestamp", "max_ts_edited")
region_insert_sql = insert_statement(cr, insert_values, update_cols)

In [None]:
# print(sqlparse.format(region_insert_sql.compile(engine, compile_kwargs={"literal_binds": True}).__str__(), keyword_case="upper", reindent=True))

In [5]:
from collections import namedtuple
from sqlalchemy import text

In [6]:
# Lightweight record with the four region fields that the `pull_beatdowns`
# call and the hand-written SQL in the next cells read from `row1`.
Row = namedtuple("Row", ["region_id", "schema_name", "max_timestamp", "max_ts_edited"])

# Three variants of the same region: both timestamps present, edit timestamp
# missing, and both missing.
row1 = Row(
    region_id=18,
    schema_name="f3chicago",
    max_timestamp=1671647384.278359,
    max_ts_edited=1671647542,
)
row2 = row1._replace(max_ts_edited=pd.NA)
row3 = row2._replace(max_timestamp=pd.NA)

In [7]:
# Column name -> pandas nullable dtype for the beatdown rows loaded with
# `pd.read_sql(..., dtype=dtypes)` in the following cell.
# NOTE(review): `region_id` and `ts_edited` are read as strings here -- confirm
# that matches what `pull_beatdowns` returns.
dtypes = {
    "slack_channel_id": pd.StringDtype(),
    "slack_q_user_id": pd.StringDtype(),
    "slack_coq_user_id": pd.StringDtype(),
    "pax_count": pd.Int16Dtype(),
    "fng_count": pd.Int16Dtype(),
    "region_id": pd.StringDtype(),
    "timestamp": pd.Float64Dtype(),
    "ts_edited": pd.StringDtype(),
    "backblast": pd.StringDtype(),
    # Only this column uses the pyarrow-backed string storage.
    "json": pd.StringDtype(storage="pyarrow"),
}

In [8]:
# Cross-check of the project helper: load the beatdowns for `row1` through
# `pull_beatdowns` ...
df = pull_beatdowns(row1, engine, metadata)
# ... and through a hand-written query, so the two frames can be compared in
# the next cell.
# NOTE(review): the row values are interpolated directly into the SQL text.
# That works for `row1`, whose fields are all concrete; `row2`/`row3` hold
# pd.NA and would presumably not format into valid SQL -- confirm before
# reusing this cell with those rows.
base_sql = f"SELECT ao_id as slack_channel_id, bd_date, q_user_id as slack_q_user_id, coq_user_id as slack_coq_user_id, pax_count, fng_count, '{row1.region_id}' AS region_id, timestamp, ts_edited, backblast, json FROM {row1.schema_name}.beatdowns WHERE timestamp > {row1.max_timestamp} OR ts_edited > {row1.max_ts_edited};"
# `dtypes` must be the beatdown mapping (keyed by slack_channel_id, ...) when
# this runs; a later cell rebinds `dtypes` to the region mapping.
with engine.begin() as cnxn:
    df_base = pd.read_sql(text(base_sql), cnxn, dtype=dtypes)

In [13]:
# Share of cells on which the two frames agree: compare element-wise, treat
# missing comparison results as matches, then average within each column and
# across columns. 1.0 means every compared cell matched.
(df_base == df).fillna(True).mean().mean()

1.0

In [9]:
# Spot check: `json` value of the row at position 13 in the hand-written-SQL frame.
df_base.iloc[13,:].json

'{"files": ["https://slackblast-images.s3.amazonaws.com/F06E2UD2RQB.jpg"]}'

In [10]:
# Spot check: `json` value of the row at position 13 in the `pull_beatdowns` frame.
df.iloc[13,:].json

'{"files": ["https://slackblast-images.s3.amazonaws.com/F06E2UD2RQB.jpg"]}'

In [None]:
# Column name -> pandas nullable dtype for the weaselbot region query result.
# Note: this rebinds `dtypes`, which an earlier cell used for beatdown columns.
dtypes = {
    "region_id": pd.StringDtype(),  # this is a string everywhere else
    "region_name": pd.StringDtype(),
    "schema_name": pd.StringDtype(),
    "slack_team_id": pd.StringDtype(),
    "max_timestamp": pd.Float64Dtype(),
    "max_ts_edited": pd.Float64Dtype(),
    "beatdown_count": pd.Int16Dtype(),
}

# Run the region SELECT and load the rows with the dtypes above. This replaces
# the sample `df_regions` built in an earlier cell.
with engine.begin() as cnxn:
    df_regions = pd.read_sql(sql=weaselbot_region_sql, con=cnxn, dtype=dtypes)