In [1]:
import sqlite3
import pandas as pd
import os


## Copy .sqlite files to common directory

I did this through terminal commands to copy from data/raw/spider/database/ to data/processed/db.

## Create Functions For Getting Schema Info

I already went through this with a really complicated script to get the details from a supplied JSON file. But that was with the intention of getting everything into a PostgreSQL database. Right now I want to prioritize the application and model itself rather than fussing with the data, so I'm going to pivot to using the supplied .sqlite files.

So while I already have some schema info, the supplied dtypes were really lacking (only text and number), so I'm going to clean up the process and build functions to extract the info directly from the databases.

In [2]:
def get_filenames(filepath, filetype):
    """Return the names of all files under ``filepath`` that end with ``filetype``.

    The directory tree is walked recursively with ``os.walk``. Only each file's
    own name is kept (the directory it was found in is discarded), and the
    trailing ``filetype`` is stripped from it.

    Args:
        filepath: Directory to search, e.g. "../data/processed/db/".
        filetype: Suffix to match and strip, e.g. ".sqlite".

    Returns:
        A list of matching file names without the trailing ``filetype``, in the
        order ``os.walk`` yields them.
    """
    suffix_len = len(filetype)
    filenames = []

    for _root, _dirs, files in os.walk(filepath):
        for file in files:
            if file.endswith(filetype):
                # Strip only the trailing suffix; str.replace would also delete
                # occurrences of `filetype` in the middle of the name.
                # Slicing by explicit end index keeps the full name when
                # `filetype` is empty (file[:-0] would yield "").
                filenames.append(file[:len(file) - suffix_len])

    return filenames

In [3]:
def get_table_names(db_path):
    """Return the names of all tables in the SQLite database at ``db_path``.

    Used by build_schema_info. The names are read from ``sqlite_master``
    (rows with ``type='table'``).

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A list of table-name strings, in the order the query returns them.

    Raises:
        sqlite3.Error: Propagated unchanged if the query fails; the cursor and
            connection are closed before the exception leaves this function.
    """
    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            # Each row is a 1-tuple (name,); unwrap it.
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query raises.
        db.close()


In [4]:
def get_column_info(db_path, schema_name, table_name):
    """Return the ``PRAGMA table_info`` rows for one table of a SQLite database.

    Used by build_schema_info.

    Args:
        db_path: Path to the SQLite database file.
        schema_name: Label for the database; passed through to the result untouched.
        table_name: Name of the table to inspect.

    Returns:
        A tuple ``(schema_name, table_name, columns)`` where ``columns`` is the
        list of rows returned by ``PRAGMA table_info`` for ``table_name``.

    Raises:
        sqlite3.Error: Propagated unchanged if the PRAGMA fails; the cursor and
            connection are closed before the exception leaves this function.
    """
    # PRAGMA arguments cannot be bound as parameters, so quote the identifier
    # instead: wrap it in double quotes and double any embedded quote. This
    # makes names containing spaces or reserved words (e.g. "order") work.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        try:
            cursor.execute("PRAGMA table_info(" + quoted_table + ");")
            columns = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the PRAGMA raises.
        db.close()

    return (schema_name, table_name, columns)

In [5]:
def build_schema_info(filepath, filetype):
    """Build a DataFrame describing every column of every table in a set of databases.

    Combines get_filenames, get_table_names, and get_column_info: lists the
    database files under ``filepath``, reads each database's table names, and
    runs ``PRAGMA table_info`` against each table.

    The path of each database is built as ``filepath/<name><filetype>``, so the
    database files are expected to sit directly inside ``filepath``.

    Args:
        filepath: Directory holding the database files, e.g. "../data/processed/db/".
            A trailing path separator is optional.
        filetype: Database file suffix, e.g. ".sqlite".

    Returns:
        A pandas DataFrame with one row per column and the columns
        ``schema, table, c_id, c_name, c_type, notnull, dflt_value, is_pk``.
        If no columns are found the frame is empty but still has these columns.
    """
    result_columns = ['schema', 'table', 'c_id', 'c_name', 'c_type',
                      'notnull', 'dflt_value', 'is_pk']
    rows = []

    for schema_name in get_filenames(filepath, filetype):
        # os.path.join inserts the separator only when `filepath` lacks one.
        db_path = os.path.join(filepath, str(schema_name) + filetype)

        for table_name in get_table_names(db_path):
            _, _, column_info = get_column_info(db_path, schema_name, table_name)
            # Flatten to one record per column; a table that reports no columns
            # simply contributes no rows.
            for column in column_info:
                rows.append((schema_name, table_name) + tuple(column))

    # Building the frame once from flat records avoids the explode/tolist
    # column assignment, which raises when there is nothing to unpack.
    return pd.DataFrame(rows, columns=result_columns)
    

In [6]:
# Collect column-level schema details from every .sqlite database in the processed folder
schema_info = build_schema_info(filepath='../data/processed/db/', filetype='.sqlite')

In [7]:
# Preview the first five rows as a quick sanity check
schema_info.head(n=5)

Unnamed: 0,schema,table,c_id,c_name,c_type,notnull,dflt_value,is_pk
0,coffee_shop,shop,0,Shop_ID,INT,0,,1
1,coffee_shop,shop,1,Address,TEXT,0,,0
2,coffee_shop,shop,2,Num_of_staff,TEXT,0,,0
3,coffee_shop,shop,3,Score,REAL,0,,0
4,coffee_shop,shop,4,Open_Year,TEXT,0,,0


## Save to .pkl file

In [8]:
# Persist the schema DataFrame as a pickle file
filepath = '../data/interim/schema_info.pkl'
schema_info.to_pickle(path=filepath)