In [6]:
import sqlite3
import pandas as pd
import os
import shutil
import json
import string
import re

## Copy .sqlite files to common directory

Use terminal commands to copy from data/raw/spider/database/ to data/processed/db.

In [2]:
###commented out so we don't run again###
#create a 'db' directory in the data/processed folder
#! mkdir ../data/processed/db

In [8]:
###commented out so we don't run again###
#run command to copy all .sqlite files to our newly created directory

# src_dir = "/Users/brettly/Sboard/projects/text-to-sql/data/raw/spider/database"
# dst_dir = "/Users/brettly/Sboard/projects/text-to-sql/data/processed/db"
# for root, dirs, files in os.walk(src_dir):
#     for f in files:
#         if f.endswith('.sqlite'):
#             shutil.copy(os.path.join(root,f), dst_dir)

## Create Functions For Getting Schema Info

I already went through this with a really complicated script to get the details from a supplied json file. But that was with the intention of getting everything into a PostgreSQL database. Right now I want to prioritize the application and model itself rather than fussing with the data, so I'm going to pivot to using the supplied .sqlite files.

So while I already have some schema info, the supplied dtypes were really lacking (only text and number), so I'm going to clean up the process and build functions to extract the info directly from the databases.

In [2]:
def get_filenames(filepath, filetype):
    """Collect the base names of all files of a given type under a directory.

    The directory tree rooted at ``filepath`` is walked recursively with
    ``os.walk``. Only the bare file name is kept (no directory component) and
    the trailing ``filetype`` is stripped from it.

    Parameters
    ----------
    filepath : str
        Directory to search, e.g. "../data/processed/db/".
    filetype : str
        File extension to match, e.g. ".sqlite".

    Returns
    -------
    list of str
        Matching file names without the extension, in ``os.walk`` order.
    """
    suffix_len = len(filetype)

    # Strip the extension by slicing rather than str.replace: replace() would
    # also delete any earlier occurrence of the extension inside the name
    # (e.g. "my.sqlite_db.sqlite" -> "my_db"). Slicing relative to len(file)
    # keeps the whole name when filetype is the empty string.
    return [
        file[:len(file) - suffix_len]
        for _root, _dirs, files in os.walk(filepath)
        for file in files
        if file.endswith(filetype)
    ]

In [3]:
def get_table_names(db_path):
    """Return the names of all tables recorded in a SQLite database file.

    Used by build_schema_info. The names are read from ``sqlite_master``
    (rows with ``type='table'``) in the order the query returns them.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.

    Returns
    -------
    list of str
        One entry per table.
    """
    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            # Each fetched row is a 1-tuple holding the table name.
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query raises.
        db.close()


In [4]:
def get_column_info(db_path, schema_name, table_name):
    """Fetch the column metadata of one table via ``PRAGMA table_info``.

    Used by build_schema_info.

    Parameters
    ----------
    db_path : str
        Path to the SQLite database file.
    schema_name : str
        Label for the database; passed through unchanged into the result.
    table_name : str
        Name of the table to inspect.

    Returns
    -------
    tuple
        ``(schema_name, table_name, columns)`` where ``columns`` is the list of
        rows returned by the PRAGMA. build_schema_info unpacks each row as
        (c_id, c_name, c_type, notnull, dflt_value, is_pk).
    """
    # PRAGMA arguments cannot be bound as "?" parameters, so the table name is
    # embedded as a double-quoted identifier (inner quotes doubled). This keeps
    # names containing spaces or other special characters from breaking the
    # statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        try:
            cursor.execute("PRAGMA table_info(" + quoted_table + ");")
            columns = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the PRAGMA raises.
        db.close()

    return (schema_name, table_name, columns)

In [12]:
#lower-casing and punctuation stripping are switched off; only the additional "_split" columns are created
def df_text_processing(df, text_columns=['schema', 'table', 'c_name']):
    """Add a space-separated companion column for each text column.

    For every name in ``text_columns`` a new column ``<name>_split`` is written
    onto ``df`` holding the original values with each '_' replaced by ' '
    (e.g. 'Num_of_staff' -> 'Num of staff'). The frame is modified in place
    and the same object is returned.
    """
    for source_col in text_columns:
        spaced = df[source_col].str.replace('_', ' ')  # break '_'-joined titles into separate words
        df[source_col + '_split'] = spaced

    return df

In [13]:
def build_schema_info(filepath, filetype):
    """Build a dataframe describing every column of every table in a folder of databases.

    Combines get_filenames, get_table_names and get_column_info: each database
    file found for ``filepath``/``filetype`` is opened, ``PRAGMA table_info``
    is collected for each of its tables, and the results are flattened into
    one row per column. df_text_processing then adds the ``*_split`` columns.

    Parameters
    ----------
    filepath : str
        Directory holding the database files, e.g. "../data/processed/db/".
        A trailing path separator is optional.
    filetype : str
        Database file extension, e.g. ".sqlite".

    Returns
    -------
    pandas.DataFrame
        Columns: schema, table, c_id, c_name, c_type, schema_split,
        table_split, c_name_split.
    """
    schema_data = []

    for schema_name in get_filenames(filepath, filetype):
        # os.path.join so the path is correct with or without a trailing
        # separator on filepath.
        db_path = os.path.join(filepath, str(schema_name) + filetype)

        # get_filenames walks sub-directories but returns bare names, so a
        # name may not resolve to a file directly under filepath. Skip those:
        # sqlite3.connect would otherwise create a new, empty database file at
        # the non-existent path.
        if not os.path.isfile(db_path):
            continue

        for table_name in get_table_names(db_path):
            schema_data.append(get_column_info(db_path, schema_name, table_name))

    if not schema_data:
        # Nothing to unpack; return an empty frame with the usual columns.
        empty_df = pd.DataFrame(columns=['schema', 'table', 'c_id', 'c_name', 'c_type'])
        return df_text_processing(empty_df)

    # One row per (schema, table) holding a list of PRAGMA rows -> one row per column.
    schema_df = (
        pd.DataFrame(schema_data, columns=['schema', 'table', 'column_info'])
        .explode('column_info', ignore_index=True)
    )

    # Unpack each PRAGMA row tuple into its own named columns.
    schema_df[['c_id', 'c_name', 'c_type', 'notnull', 'dflt_value', 'is_pk']] = schema_df.column_info.tolist()

    # Keep only the fields used downstream.
    schema_df = schema_df.drop(columns=['column_info', 'notnull', 'dflt_value', 'is_pk'])

    return df_text_processing(schema_df)
    

In [14]:
#build the schema table from every .sqlite file in the processed db folder
schema_info = build_schema_info(filepath='../data/processed/db/', filetype='.sqlite')

In [15]:
#preview the first rows to sanity-check the extracted columns and the *_split versions
schema_info.head()

Unnamed: 0,schema,table,c_id,c_name,c_type,schema_split,table_split,c_name_split
0,coffee_shop,shop,0,Shop_ID,INT,coffee shop,shop,Shop ID
1,coffee_shop,shop,1,Address,TEXT,coffee shop,shop,Address
2,coffee_shop,shop,2,Num_of_staff,TEXT,coffee shop,shop,Num of staff
3,coffee_shop,shop,3,Score,REAL,coffee shop,shop,Score
4,coffee_shop,shop,4,Open_Year,TEXT,coffee shop,shop,Open Year


### Explore Schema Info Columns

## Create parallel json file
A JSON version of the schema info should be easier to use in some of our future langchain steps.

In [17]:
#create one record per schema-table combo, each carrying a list of its column dictionaries.
#The groups are iterated directly instead of using groupby.apply + reset_index + rename:
#this does not rely on the auto-generated column label 0, and avoids the warning newer
#pandas releases emit when apply operates on the grouping columns.
group_keys = ['schema', 'schema_split', 'table', 'table_split']  #identify each schema-table combo
column_fields = ['c_id', 'c_name', 'c_name_split', 'c_type']  #stored per column under 'columns'

schema_json = [
    {**dict(zip(group_keys, key)), 'columns': group[column_fields].to_dict('records')}
    for key, group in schema_info.groupby(group_keys)
]

In [18]:
#inspect the first schema-table record to check the nested structure
schema_json[0]

{'schema': 'academic',
 'schema_split': 'academic',
 'table': 'author',
 'table_split': 'author',
 'columns': [{'c_id': 0,
   'c_name': 'aid',
   'c_name_split': 'aid',
   'c_type': 'INT'},
  {'c_id': 1,
   'c_name': 'homepage',
   'c_name_split': 'homepage',
   'c_type': 'TEXT'},
  {'c_id': 2, 'c_name': 'name', 'c_name_split': 'name', 'c_type': 'TEXT'},
  {'c_id': 3, 'c_name': 'oid', 'c_name_split': 'oid', 'c_type': 'INT'}]}

In [19]:
#write the schema records to a json file - saving in our interim data folder
with open('../data/interim/schema_info.json', 'w') as out_file:
    json.dump(schema_json, fp=out_file)

## Save dataframe to .pkl file

I already saved to a json, but this is in case I want to load the dataframe directly from a pickle file, if that turns out to be easier.

In [20]:
#export the dataframe as a pickle file in our interim data folder
filepath = '../data/interim/schema_info.pkl'
pd.to_pickle(schema_info, filepath)