In [None]:
import os
import sys
from dotenv import load_dotenv

load_dotenv(override=True)

sys.path.append(os.environ["WORKING_DIR"])
from os.path import join
import json
import pandas as pd
import numpy as np

from data_loader.utils import load_public_bi_table_by_cols, get_label_encoder

# Directory under WORKING_DIR that holds the extracted valid-header files.
valid_headers_path = join(
    os.environ["WORKING_DIR"], "data", "extract", "out", "valid_headers"
)

# Label encoder supplied by the project's data_loader utilities.
label_enc = get_label_encoder()

# Semantic types whose columns are treated as numeric; the statistics cell
# below iterates over this list in order.
numeric_types = [
    "X1B", "X2B", "X3B", "TB", "HR",
    "R", "BB", "AB", "GIDP", "HBP",
    "H", "SF", "SH", "SO", "iBB",
    "CS", "SB",
    "latitude", "longitude", "year",
]

In [None]:
## load valid headers
# Same directory expression as in the first cell: <WORKING_DIR>/data/extract/out/valid_headers.
valid_headers_path = join(os.environ["WORKING_DIR"], "data", "extract", "out",
                          "valid_headers")
# Pass the encoding explicitly: without it, open() falls back to the
# platform's locale encoding, which makes JSON decoding machine-dependent.
with open(join(valid_headers_path, "public_bi_num_type_public_bi.json"),
          encoding="utf-8") as f:
    # Nested mapping accessed below as valid_headers[table][column]["semanticType"].
    valid_headers = json.load(f)

def get_all_cols_with_type(semantic_type:str, headers=None) -> pd.DataFrame:
    """Collect every column annotated with the given semantic type.

    Args:
        semantic_type: Value compared against each column's "semanticType" entry.
        headers: Optional nested mapping ``{table: {column: {"semanticType": ...}}}``
            to search. When omitted, the notebook-level ``valid_headers``
            mapping is used, which is the original behaviour.

    Returns:
        A DataFrame with one row per matching column and two columns:
        "dataset_id" (``"<table>+<column>"``) and "semantic_type".
        The frame is empty (but still has both columns) when nothing matches.

    Raises:
        KeyError: If a column entry has no "semanticType" key.
    """
    if headers is None:
        # Fall back to the mapping loaded earlier in the notebook.
        headers = valid_headers

    results = [
        [table + "+" + column, meta["semanticType"]]
        for table, columns in headers.items()
        for column, meta in columns.items()
        if meta["semanticType"] == semantic_type
    ]
    return pd.DataFrame(results, columns=["dataset_id", "semantic_type"])


In [None]:
# Statistics read from `Series.describe()`, in the order they are stored as columns.
describe_stats = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]

# One DataFrame per semantic type; each row describes one table column.
results = []
for semantic_type in numeric_types:
    df = get_all_cols_with_type(semantic_type)
    # Create the statistic columns up front so every frame has the same
    # layout, even when no column of this type exists.
    for statistic in describe_stats + ["nunique"]:
        df[statistic] = None

    for row_idx, row in df.iterrows():
        # dataset_id is "<table>+<column>"; the text before the first "_" is
        # passed as the domain, and the second "_"-separated token of the
        # column name is parsed as the integer column index.
        domain = row["dataset_id"].split("_")[0]
        tablename = row["dataset_id"].split("+")[0]
        column = int(row["dataset_id"].split("+")[1].split("_")[1])
        df_table = load_public_bi_table_by_cols(domain, tablename, [column], [semantic_type])

        # Non-numeric cells become NaN and are dropped. The original called
        # dropna() but discarded its result; keeping it here makes the intent
        # effective (describe()/nunique() skip NaN, so the numbers are unchanged).
        values = pd.to_numeric(df_table[semantic_type], errors="coerce").dropna()
        value_stats = values.describe()

        for statistic in describe_stats:
            df.loc[row_idx, statistic] = value_stats[statistic]
        df.loc[row_idx, "nunique"] = values.nunique()
    results.append(df)


In [None]:
# Stack the per-type frames, renumber the rows, and write them to a CSV
# in the current working directory (without the index column).
(
    pd.concat(results)
    .reset_index(drop=True)
    .to_csv("public_bi_stats_num_cols.csv", index=False)
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reload the statistics written by the export cell.
df_stats = pd.read_csv("public_bi_stats_num_cols.csv")

# Build one box-plot entry per distinct (semantic type, min, quartiles, max)
# combination. `number_of_groups` counts every group, including the ones
# that are skipped below.
number_of_groups = 0
plot_data = []
for key, group in df_stats.groupby(by=["semantic_type", "min", "25%", "50%", "75%", "max"]):
    number_of_groups += 1
    label, whislo, q1, med, q3, whishi = key
    # These three types are left out of the plot.
    if label in ("latitude", "longitude", "year"):
        continue
    # Whiskers span min..max; `fliers` is empty, so no outlier points are drawn.
    plot_data.append(dict(
        label=label,
        whislo=whislo,
        q1=q1,
        med=med,
        q3=q3,
        whishi=whishi,
        fliers=[],
    ))
print(number_of_groups)

# Horizontal boxes drawn from the precomputed statistics.
fig, ax = plt.subplots(figsize=(25, 50))
ax.bxp(plot_data, boxprops={"color": "red"}, widths=0.75, vert=False)
ax.set_xlim([-1, 150])


In [None]:
# Write the columns with fewer than two distinct values next to the
# valid-header files.
df_stats[df_stats["nunique"] < 2].to_csv(
    join(valid_headers_path, "public_bi_num_type_public_bi_nums_to_delete.csv"),
    index=False,
)

In [None]:
# Ad-hoc load of a single column (index 2, requested as "year") from table
# TrainsUK2_1 in the TrainsUK2 domain, for manual inspection.
df = load_public_bi_table_by_cols(
    "TrainsUK2",
    "TrainsUK2_1",
    [2],
    ["year"],
)

In [None]:
# Display the table loaded in the previous cell (a bare last expression
# is rendered by the notebook).
df

In [None]:
# Rows whose minimum and all three quartiles are exactly zero.
df_stats[(df_stats[["min", "25%", "50%", "75%"]] == 0).all(axis=1)]