# Bet Vault 

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import hashlib
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
# Lift pandas' display limits so frames are rendered without column or row truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Relative path of the SQLite database file that get_db() opens.
connection_string = "../data/raw/database.sqlite"

In [3]:
# Names of all tables and views, skipping SQLite's internal `sqlite_*` objects.
query_table_names = """
            SELECT name FROM sqlite_master
            WHERE type IN ('table', 'view')
            AND name NOT LIKE 'sqlite_%';
        """

# Column metadata of one table; get_primary_key() reads its `pk` and `name` columns.
query_table_info = "PRAGMA table_info([{table_name}]);"

# Removes the named table when it is present.
query_drop_table_if_exists = "DROP TABLE IF EXISTS {table_name};"

# Reads every row and column of the named table.
query_select = "SELECT * FROM {table}"

# Layout of a hub table: HK (text primary key), BK, LDTS and RS.
# NOTE(review): presumably hash key / business key / load date / record source
# in Data Vault terms - confirm.
query_create_hub = """
    CREATE TABLE {table} (
        HK TEXT NOT NULL PRIMARY KEY,
        BK INTEGER,
        LDTS DATE, 
        RS INTEGER
    );
"""

# query_create_link_fk2 = """
#     CREATE TABLE {table} (
#         HK TEXT NOT NULL PRIMARY KEY,
#         HK_{FK0} TEXT NOT NULL,
#         HK_{FK1} TEXT NOT NULL,
#         LDTS DATE,
#         RS INTEGER
#     );
# """

# Layout of the match link table: its own HK, three HK_* columns and two tmp_* columns.
# The last tmp_ column is named through the placeholder {HK4_column} here but through
# {date_column} in query_insert_link_match; Link.create_table_match fills both with
# the same value (fks[4]["name"]).
query_create_link_match = """
    CREATE TABLE {table} (
        HK TEXT NOT NULL PRIMARY KEY,
        HK_{HK0_column} TEXT NOT NULL,
        HK_{HK1_column} TEXT NOT NULL,
        HK_{HK2_column} TEXT NOT NULL,
        tmp_{HK3_column} TEXT NOT NULL,
        tmp_{HK4_column} TEXT NOT NULL,
        LDTS DATE,
        RS INTEGER
    );
"""

# Fills a hub from its source table. HK and BK both receive the source key column;
# hash_key_generator_hub() later computes the hash for the first column.
# LDTS is SQLite's current date and RS is the constant 1.
query_insert_hub = """
    INSERT INTO {table_hub} (HK,BK,LDTS,RS)
    SELECT 
        {primary_key}, 
        {primary_key}, 
        date("now"), 
        1
    FROM {source_table} 
    ORDER BY {primary_key} ASC;
"""

# Fills the match link table with raw values from the source table. HK starts as the
# source `id`; hash_key_generator_link() later replaces HK and the three HK_* columns
# with hashes. LDTS is SQLite's current date and RS is the constant 1.
query_insert_link_match = """
    INSERT INTO {table_link} (HK, HK_{HK0_column}, HK_{HK1_column}, HK_{HK2_column}, tmp_{HK3_column},tmp_{date_column}, LDTS, RS)
    SELECT 
        id, 
        {HK0_fk}, 
        {HK1_fk}, 
        {HK2_fk}, 
        {HK3_fk}, 
        {date_fk},
        date("now"), 
        1
    FROM {table_source} 
    ORDER BY {HK0_fk} ASC;
"""

In [4]:
def get_db(connection_string):
    """Announce and open a SQLite connection for the given database path.

    Args:
        connection_string: Path (or SQLite URI/":memory:") handed to sqlite3.connect.

    Returns:
        The newly opened sqlite3 connection.
    """
    print(f"Get db for {connection_string}")
    connection = sqlite3.connect(connection_string)
    return connection

def get_cursor(db):
    """Create a cursor on an open connection and announce it.

    Attaching an in-memory database (``ATTACH DATABASE ':memory:' AS memdb1`` or the
    shared-cache ``file::memory:?cache=shared`` variant) was tried here at some
    point but is not used.

    Args:
        db: Open database connection exposing ``cursor()``.

    Returns:
        The cursor returned by ``db.cursor()``.
    """
    new_cursor = db.cursor()
    print("Connection established!")
    return new_cursor

def close_connection(cursor):
    """Close the given cursor and print a closing banner.

    Only the cursor is closed; the connection it belongs to is not touched.
    """
    cursor.close()
    for banner_line in ("Connection closed!", "******************"):
        print(banner_line)
    

In [5]:
# Open the shared database connection and a cursor on it; later cells reuse `db`.
db = get_db(connection_string)
cursor = db.cursor()
# Last expression of the cell: displays the cursor object.
cursor

Get db for ../data/raw/database.sqlite


<sqlite3.Cursor at 0x11e7c8e30>

In [6]:

def get_table_names(db) -> list:
    """List the names of all tables and views found by `query_table_names`.

    Args:
        db: Open database connection passed to pandas.

    Returns:
        The values of the result's 'name' column as a plain list.
    """
    names_frame = pd.read_sql(query_table_names, con=db)
    name_column = names_frame['name']
    return name_column.to_list()

def drop_table_if_exists(table_name,cursor):
    """Drop `table_name` if it exists, on a best-effort basis.

    A database error raised by the DROP statement is reported on stdout and
    otherwise ignored, so callers can continue. Anything that is not a database
    error (for example a keyboard interrupt or a programming mistake) is no
    longer swallowed.

    Args:
        table_name: Name substituted into `query_drop_table_if_exists`.
        cursor: Cursor used to execute the statement.
    """
    try:
        cursor.execute(query_drop_table_if_exists.format(table_name=table_name))
    except sqlite3.Error as error:
        # Keep the original "do not fail the build" behaviour, but leave a trace.
        print(f"Could not drop table '{table_name}': {error}")

def get_primary_key(table: str, db) -> str:
    """Return the name of the first primary-key column of `table`.

    The column list comes from `query_table_info`; the row whose `pk` value is 1
    names the key column.

    Args:
        table: Name of the table to inspect.
        db: Open database connection passed to pandas.

    Returns:
        The primary-key column name. When no row has `pk == 1`, a message string
        is returned instead of a column name (callers in this notebook only
        print the result).
    """
    try:
        table_info = pd.read_sql(query_table_info.format(table_name=table), con=db)
        pk_names = table_info.loc[table_info['pk'] == 1, 'name']
        # Positional access: the filtered Series keeps the original row labels, so
        # a label lookup with [0] raises KeyError unless the key column happens to
        # be the first row. .iloc[0] works for any position and raises IndexError
        # when there is no key column at all.
        return pk_names.iloc[0]
    except IndexError:
        return f"Table 'index_info' not found, can't get primary key of {table}"

def get_hash(string):
    """Return the hexadecimal MD5 digest of `string` (encoded with str.encode())."""
    digest = hashlib.md5()
    digest.update(string.encode())
    return digest.hexdigest()

def hash_key_generator_hub(df):
    """Return a copy of a hub frame whose first column holds the hash key.

    For every row the hash input is the upper-cased text "<BK>_<RS>", hashed
    with get_hash().

    The input frame is left untouched. The previous implementation converted
    BK/RS to strings inside the caller's frame (through .loc assignment, which
    pandas deprecates for integer columns) while writing the hashes into a
    different copy, leaving the caller with a half-processed frame.

    Args:
        df: Hub rows containing at least the columns "BK" and "RS". Whatever
            the first column is, it is overwritten with the hash.

    Returns:
        A new DataFrame with "BK" and "RS" as strings, remaining missing values
        replaced by " ", and the first column set to the hash keys.
    """
    key_columns = ["BK", "RS"]
    hashed = df.copy()
    for column in key_columns:
        # Cells that are exactly " " become "", then everything becomes text.
        # Replacing the whole column avoids writing strings into an integer
        # column in place.
        # NOTE(review): missing values turn into the text "nan" here and are
        # hashed as "NAN", as before - confirm that is intended.
        hashed[column] = hashed[column].replace(" ", "").astype(str)
    # Fill missing values in the remaining columns.
    hashed = hashed.fillna(" ")
    # Join the key columns with "_" (vectorised, also fine for an empty frame).
    joined_keys = hashed[key_columns[0]].str.cat(
        [hashed[column] for column in key_columns[1:]], sep="_"
    )
    hashed[hashed.columns[0]] = [get_hash(value.upper()) for value in joined_keys]
    return hashed

def hash_key_generator_link(df, table_information):
    """Hash the link key and the first three key columns of `df` in place.

    Columns are addressed by position. The link key 'HK' becomes the hash of
    columns 1-5 and column 7 (cast to str) joined with "_"; columns 1, 2 and 3
    are then replaced by the hash of their own value. For the match link the
    printed column list shows these to be the three HK_* columns, the two tmp_*
    columns and RS.

    NOTE(review): columns 1-3 hash the raw value only, whereas
    hash_key_generator_hub hashes "<BK>_<RS>" - confirm the link keys are meant
    to match the hub keys.

    Args:
        df: Link rows; modified in place.
        table_information: Not used by this function.

    Returns:
        The same DataFrame object that was passed in.
    """
    print(df.columns)
    raw_parts = [df.iloc[:, position] for position in (1, 2, 3, 4, 5)]
    raw_parts.append(df.iloc[:, 7].astype(str))
    composite_key = raw_parts[0]
    for part in raw_parts[1:]:
        composite_key = composite_key + "_" + part
    # The composite key is built from the raw values, before they are hashed below.
    df['HK'] = composite_key.apply(get_hash)
    for position in (1, 2, 3):
        df.iloc[:, position] = df.iloc[:, position].apply(get_hash)
    return df

# FOR TEST REASONS
# Sanity check: print every table/view in the database together with the value
# get_primary_key() returns for it.
tables = get_table_names(db)
for table in tables:
    print(f"Table:{table}         \tKey:{get_primary_key(table,db)}")

Table:Player_Attributes         	Key:id
Table:Player         	Key:id
Table:Match         	Key:id
Table:League         	Key:id
Table:Country         	Key:id
Table:Team         	Key:id
Table:Team_Attributes         	Key:id
Table:HUB_Player         	Key:HK
Table:HUB_Team         	Key:HK
Table:HUB_League         	Key:HK
Table:LINK_Match         	Key:HK


In [7]:
class Hub:
    """Builds one hub table in the database and keeps its rows as a DataFrame.

    Creating an instance does all the work: the hub table is dropped and
    re-created, filled from the source table, hashed and written back.

    Attributes:
        name: Name of the hub table.
        source_db_path: Path of the source database (stored, not used here).
        db: Open connection used for every read and write.
        cursor: Cursor opened for the build; closed again before __init__ returns.
        df: The hashed hub rows, identical to what is written to the database.
    """

    def __init__(self, table_name, table_information, db, source_db_path):
        """Build the hub table `table_name`.

        Args:
            table_name: Name of the hub table to (re)create.
            table_information: Mapping with 'source' (source table name) and
                'pk' (list whose first entry is the source key column).
            db: Open database connection.
            source_db_path: Path of the source database file.
        """
        self.name = table_name
        self.source_db_path = source_db_path
        self.db = db
        self.cursor = get_cursor(db)
        self.df = None
        try:
            # Use this instance's cursor rather than a notebook-level global.
            self.create_table(
                table_name,
                table_information['source'],
                table_information['pk'][0],
                self.cursor,
            )
        finally:
            # Release the cursor even when the build fails.
            close_connection(self.cursor)

    def create_table(self, table_name, source_table, primary_key_source, cursor):
        """Drop, create, fill and hash the hub table.

        Args:
            table_name: Name of the hub table.
            source_table: Table the keys are read from.
            primary_key_source: Column of `source_table` used as the key.
            cursor: Cursor used for the DDL and INSERT statements.
        """
        print(f"Drop table '{table_name}' if exists")
        drop_table_if_exists(table_name, cursor)
        print(f"Get data from source '{source_table}'")
        # Create the empty hub table, then fill it with the raw keys.
        cursor.execute(query_create_hub.format(table=table_name))
        cursor.execute(query_insert_hub.format(
            table_hub=table_name,
            primary_key=primary_key_source,
            source_table=source_table
        ))

        hub_raw = pd.read_sql(query_select.format(table=table_name), con=self.db)
        self.df = hash_key_generator_hub(hub_raw)

        print(f"Create table: {table_name}")
        # Write the hashed frame. Writing the frame that was passed into the hash
        # function would store the raw key in HK, because the hashes are placed
        # in the frame the function returns.
        self.df.to_sql(f"{table_name}", con=self.db, if_exists="replace", index=False,
                       dtype={"HK": "TEXT NOT NULL PRIMARY KEY", "BK": "INTEGER",
                              "LDTS": "DATETIME", "RS": "INTEGER"})



In [8]:
# One entry per hub: the source table and the column used as its key
# (Hub reads only the first element of "pk").
hub_source_tables = [
    {"source": "Player", "pk": ['player_api_id']}, 
    {"source": "Team", "pk": ['team_api_id']}, 
    {"source": "League", "pk": ['id']}
]
# Build every hub table "HUB_<source>" and keep the Hub objects by table name.
hubs = {}
for hub_table in hub_source_tables:
    hub_table_name = f"HUB_{hub_table['source']}"
    hub = Hub(hub_table_name, hub_table, db, connection_string)
    hubs[hub_table_name] = hub

hubs

Connection established!
Drop table 'HUB_Player' if exists
Get data from source 'Player'
Create table: HUB_Player
Connection closed!
******************
Connection established!
Drop table 'HUB_Team' if exists
Get data from source 'Team'
Create table: HUB_Team
Connection closed!
******************
Connection established!
Drop table 'HUB_League' if exists
Get data from source 'League'
Create table: HUB_League
Connection closed!
******************


{'HUB_Player': <__main__.Hub at 0x10c48b760>,
 'HUB_Team': <__main__.Hub at 0x11e8658b0>,
 'HUB_League': <__main__.Hub at 0x11e842910>}

In [9]:
# Preview the first rows of the player hub.
hubs["HUB_Player"].df.head()

Unnamed: 0,HK,BK,LDTS,RS
0,1adb0c2cf78aa85dd55a30506d9695ba,2625,2021-02-10,1
1,04b1883afc7fd50e7ff15a3b0f5a87b8,2752,2021-02-10,1
2,3c1f0cffb54784fffcfad0d9d2819910,2768,2021-02-10,1
3,8bf1b8616904be9aee4364ae18653479,2770,2021-02-10,1
4,5793dcf6b65b060e64fec83607281293,2790,2021-02-10,1


In [10]:
# Preview the first rows of the team hub.
hubs["HUB_Team"].df.head()

Unnamed: 0,HK,BK,LDTS,RS
0,ba7c94fb0a31d94e8feae513dc647b25,1601,2021-02-10,1
1,0cbd21d1866272c1d445ae2f92955258,1773,2021-02-10,1
2,a6611c630d79c37771cf69ff622e51fb,1957,2021-02-10,1
3,a101f3e4f0ea7531035ba171aa0ec77e,2033,2021-02-10,1
4,76da31f9307c5da46e162bf0585f5fce,2182,2021-02-10,1


In [11]:
# Preview the first rows of the league hub.
hubs["HUB_League"].df.head()

Unnamed: 0,HK,BK,LDTS,RS
0,ec308451c1d095c528cfa3c009ea7235,1,2021-02-10,1
1,0bdc65e4364db54d79b628670f347490,1729,2021-02-10,1
2,248eeffc41601122c9cbcb4ff65441a6,4769,2021-02-10,1
3,e0bd49f6b7486603d558fc3a81018a9c,7809,2021-02-10,1
4,2963f320cccf82d389a3b13ac26373a7,10257,2021-02-10,1


In [12]:
class Link:
    """Builds a link table in the database and keeps its rows as a DataFrame.

    Only the table named "LINK_Match" is supported; for any other name the
    constructor prints "other" and builds nothing.

    Attributes:
        name: Name of the link table.
        source_db_path: Path of the source database (stored, not used here).
        db: Open connection used for every read and write.
        cursor: Cursor opened for the build; closed again before __init__ returns.
        df: The link rows as read back from the database after the build
            (None when nothing was built).
    """

    def __init__(self, table_name, table_information, db, source_db_path):
        """Build the link table `table_name`.

        Args:
            table_name: Name of the link table to (re)create.
            table_information: Mapping with 'name' (source table) and 'fks'
                (list of five mappings, each with 'name' and 'pk').
            db: Open database connection.
            source_db_path: Path of the source database file.
        """
        self.name = table_name
        self.source_db_path = source_db_path
        self.db = db
        self.cursor = get_cursor(db)
        self.df = None
        try:
            if table_name == "LINK_Match":
                # Use this instance's cursor rather than a notebook-level global.
                self.create_table_match(table_name, table_information, self.cursor)
            else:
                print("other")
        finally:
            # Release the cursor even when the build fails.
            close_connection(self.cursor)

    def create_table_match(self, table_name, table_information, cursor):
        """Drop, create, fill and hash the match link table.

        Args:
            table_name: Name of the link table.
            table_information: Mapping with 'name' and 'fks' (see __init__).
            cursor: Cursor used for the DDL and INSERT statements.
        """
        fks = table_information["fks"]
        print(f"Get data from source '{table_information['name']}'")
        drop_table_if_exists(table_name, cursor)
        # Create the empty link table (three HK_* columns, two tmp_* columns).
        cursor.execute(query_create_link_match.format(
            table=table_name,
            HK0_column=fks[0]["name"],
            HK1_column=fks[1]["name"],
            HK2_column=fks[2]["name"],
            HK3_column=fks[3]["name"],
            HK4_column=fks[4]["name"],
        ))
        # Fill it with the raw values from the source table.
        cursor.execute(query_insert_link_match.format(
            table_link=table_name,
            HK0_column=fks[0]["name"],
            HK0_fk=fks[0]["pk"],
            HK1_column=fks[1]["name"],
            HK1_fk=fks[1]["pk"],
            HK2_column=fks[2]["name"],
            HK2_fk=fks[2]["pk"],
            HK3_column=fks[3]["name"],
            HK3_fk=fks[3]["pk"],
            date_column=fks[4]["name"],
            date_fk=fks[4]["pk"],
            table_source=table_information["name"]
        ))

        link_raw = pd.read_sql(query_select.format(table=table_name), con=self.db)
        self.df = hash_key_generator_link(link_raw, table_information)

        print(f"Create table: {table_name}")
        # Persist only the hashed key columns; the tmp_* helper columns are dropped.
        hub_key_columns = [f"HK_{fk['name']}" for fk in fks[:3]]
        columns = ["HK", *hub_key_columns, "LDTS", "RS"]
        column_types = {
            "HK": "TEXT NOT NULL PRIMARY KEY",
            **{column: "TEXT" for column in hub_key_columns},
            "LDTS": "DATETIME",
            "RS": "INTEGER",
        }
        self.df[columns].to_sql(f"{table_name}", con=self.db, if_exists="replace",
                                index=False, dtype=column_types)
        # Keep exactly what ended up in the database.
        self.df = pd.read_sql(query_select.format(table=table_name), con=self.db)


In [13]:
# Description of each link: the source table ("name") and five "fks" entries.
# Link.create_table_match reads only "name" and each entry's "name"/"pk";
# "hk_columns" and "source_table" are not read by the code in this notebook.
link_source_tables = [
    {"name": "Match", 
     "hk_columns": ["date","home_team_api_id","away_team_api_id","league_id"],
     "fks": [
        {"name": "home_team" , "pk": 'home_team_api_id', "source_table": "Team"},
        {"name": "away_team" , "pk": 'away_team_api_id', "source_table": "Team"},
        {"name": "league" , "pk": 'league_id', "source_table": "HUB_League"},
        {"name": "id" , "pk": 'id', "source_table": "Match"},
        {"name": "date" , "pk": 'date', "source_table": "Match"},
    ]}, 
]
# Build every link table "LINK_<name>" and keep the Link objects by table name.
links = {}
for link_table in link_source_tables:
    link_table_name = f"LINK_{link_table['name']}"
    link = Link(link_table_name, link_table, db, connection_string)
    links[link_table_name] = link

# links
# Preview the first rows of the match link as stored in the database.
links['LINK_Match'].df.head()

Connection established!
Get data from source 'Match'
Index(['HK', 'HK_home_team', 'HK_away_team', 'HK_league', 'tmp_id', 'tmp_date',
       'LDTS', 'RS'],
      dtype='object')
Create table: LINK_Match
Connection closed!
******************


Unnamed: 0,HK,HK_home_team,HK_away_team,HK_league,LDTS,RS
0,ad20b19c638021427e9df64dcd9755f6,c559da2ba967eb820766939a658022c8,71969a804c28c0961383008958497b1b,8b3ecc6c4da9bf7e321df2d89de60aa8,2021-02-10,1
1,225ec43101960d267b16ac8767fb00fc,c559da2ba967eb820766939a658022c8,62d2b7ba91f34c0ac08aa11c359a8d2c,8b3ecc6c4da9bf7e321df2d89de60aa8,2021-02-10,1
2,e826ccfafd5740050ea08e599a8a306a,c559da2ba967eb820766939a658022c8,5ad2c993fa4f162c255867250267de48,8b3ecc6c4da9bf7e321df2d89de60aa8,2021-02-10,1
3,96c83767da951a6f320a5fdcb7eb9aa8,c559da2ba967eb820766939a658022c8,88591b4d3219675bdeb33584b755f680,8b3ecc6c4da9bf7e321df2d89de60aa8,2021-02-10,1
4,de6f3269f417738877b4b2211c9a039d,c559da2ba967eb820766939a658022c8,07b1c04a30f798b5506c1ec5acfb9031,8b3ecc6c4da9bf7e321df2d89de60aa8,2021-02-10,1


In [14]:
# link_table = "Match"
# link_table_name = f"Link_{link_table}"
# link = Link(link_table_name, link_table, db, connection_string)
# link.df