In [1]:
import sqlite3
import pandas as pd
import os


## Get Schema Info

In [2]:
# Inspect one table's column metadata by hand before generalizing into functions.
schema_name = 'academic'
table_name = 'author'

db = sqlite3.connect('../data/processed/db/academic.sqlite')
try:
    cursor = db.cursor()
    # PRAGMA table_info yields one row per column of the table.
    cursor.execute("PRAGMA table_info(" + table_name + ");")
    columns = cursor.fetchall()
    cursor.close()
finally:
    # Release the connection even if the PRAGMA raises.
    db.close()

schema_info = [schema_name, table_name, columns]

schema_info

['academic',
 'author',
 [(0, 'aid', 'INT', 0, None, 1),
  (1, 'homepage', 'TEXT', 0, None, 0),
  (2, 'name', 'TEXT', 0, None, 0),
  (3, 'oid', 'INT', 0, None, 0)]]

# os Work

In [135]:
# More flexible: walks every subdirectory and keeps only a specified file type.

file_list = []
for root, dirs, files in os.walk('../data/processed/db/'):
    for file in files:
        if file.endswith(".sqlite"):
            # `file` is already a bare file name; os.path.join with a single
            # argument returned it unchanged, so append it directly.
            file_list.append(file)

# Strip only the trailing extension; str.replace would also delete any
# ".sqlite" that happens to occur earlier in the name.
filenames = [file[:-len(".sqlite")] for file in file_list]

filenames[:3]

['coffee_shop', 'news_report', 'program_share']

## Create Functions

In [136]:
def get_filenames(filepath, filetype):
    """Return the names of all files under a directory tree that end with a given file type.

    Walks ``filepath`` and all of its subdirectories, keeps every file whose name
    ends with ``filetype``, and returns those names with the trailing ``filetype``
    removed. Only bare file names are returned (no directory part), in the order
    os.walk yields them.

    filepath example: "../data/processed/db/"
    filetype example: ".sqlite"
    """
    filenames = []

    for _root, _dirs, files in os.walk(filepath):
        for file in files:
            if file.endswith(filetype):
                # Cut off only the trailing filetype. str.replace would also remove
                # the same text anywhere else in the name
                # (e.g. "my.sqlite_backup.sqlite" -> "my_backup").
                filenames.append(file[:len(file) - len(filetype)])

    return filenames

In [137]:
# Call get_filenames on the processed DB directory and preview the first three names.
files = get_filenames('../data/processed/db/', '.sqlite')

files[:3]

['coffee_shop', 'news_report', 'program_share']

In [None]:
# Get all table names within one schema (one database file).

table_list = []

db = sqlite3.connect('../data/processed/db/academic.sqlite')
try:
    cursor = db.cursor()

    # Keep only the sqlite_master entries whose type is 'table'.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    print(tables)
    # Each fetched row is a 1-tuple such as ('author',); unwrap the name.
    for table in tables:
        table_list.append(table[0])

    print(table_list[:3])

    cursor.close()
finally:
    # Release the connection even if the query raises.
    db.close()

In [116]:
def get_table_names(db_path):
    """Return the names of all tables in the SQLite database at ``db_path``.

    Used within build_schema_info. Queries sqlite_master for entries of type
    'table' and returns their names as a list of strings.

    Note: sqlite3.connect opens a new, empty database when ``db_path`` does not
    exist, so a wrong path yields an empty list rather than an error.
    """
    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        # Each row is a 1-tuple (name,); unwrap it.
        table_list = [row[0] for row in cursor.fetchall()]
        cursor.close()
    finally:
        # Close the connection even if the query raises.
        db.close()

    return table_list


In [176]:
def get_column_info(db_path, schema_name, table_name):
    """Return the column metadata of one table as ``(schema_name, table_name, columns)``.

    Used within build_schema_info. Runs PRAGMA table_info against ``table_name``
    in the SQLite database at ``db_path``. ``columns`` is the list of rows
    returned by the PRAGMA (one row per column of the table); ``schema_name`` is
    passed through unchanged so the caller can label the result.
    """
    # Quote the table name as an SQL identifier (doubling any embedded quotes) so
    # that names containing spaces, punctuation or reserved words still work.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    db = sqlite3.connect(db_path)
    try:
        cursor = db.cursor()
        cursor.execute("PRAGMA table_info(" + quoted_table + ");")
        columns = cursor.fetchall()
        cursor.close()
    finally:
        # Close the connection even if the PRAGMA raises.
        db.close()

    return (schema_name, table_name, columns)

In [189]:
def build_schema_info(filepath, filetype):
    """Build a DataFrame of column metadata for every table of every database file under ``filepath``.

    Combines get_filenames (to list the database files of the given ``filetype``),
    get_table_names (to list each database's tables) and get_column_info (to run
    PRAGMA table_info against each table).

    filepath example: "../data/processed/db/"
    filetype example: ".sqlite"

    Returns a pandas DataFrame with one row per table column and the fields
    schema, table, c_id, c_name, c_type, notnull, dflt_value, is_pk. The frame is
    empty (but has these fields) when no tables are found.
    """
    records = []

    for schema_name in get_filenames(filepath, filetype):
        # os.path.join works whether or not filepath ends with a path separator.
        db_path = os.path.join(filepath, str(schema_name) + filetype)

        # get_filenames returns bare names, including files found in nested
        # folders, which are not located at db_path. Skip those instead of letting
        # sqlite3.connect open a new, empty database at a path that does not exist.
        if not os.path.isfile(db_path):
            continue

        for table_name in get_table_names(db_path):
            _, _, columns = get_column_info(db_path, schema_name, table_name)
            # Flatten to one record per column: (schema, table, <PRAGMA row fields>).
            records.extend((schema_name, table_name, *column) for column in columns)

    schema_df = pd.DataFrame(
        records,
        columns=['schema', 'table', 'c_id', 'c_name', 'c_type', 'notnull', 'dflt_value', 'is_pk'],
    )

    return schema_df
    

In [148]:
# Quick check of get_filenames: preview the first three database names.
test1 = get_filenames('../data/processed/db/', '.sqlite')
test1[:3]

['coffee_shop', 'news_report', 'program_share']

In [147]:
# Quick check of get_table_names against the academic database: preview three table names.
test2 = get_table_names('../data/processed/db/academic.sqlite')

test2[:3]

['author', 'conference', 'domain']

In [149]:
# Quick check of get_column_info for the academic.author table.
# NOTE(review): the saved output below is wrapped in a list, but get_column_info
# returns a bare tuple — the output looks stale; re-run this cell to confirm.
test3 = get_column_info('../data/processed/db/academic.sqlite', 'academic', 'author')

test3

[('academic',
  'author',
  [(0, 'aid', 'INT', 0, None, 1),
   (1, 'homepage', 'TEXT', 0, None, 0),
   (2, 'name', 'TEXT', 0, None, 0),
   (3, 'oid', 'INT', 0, None, 0)])]

In [190]:
# Build the full schema table for every .sqlite file in the directory and preview it.
test = build_schema_info('../data/processed/db/', '.sqlite')

test.head()

Unnamed: 0,schema,table,c_id,c_name,c_type,notnull,dflt_value,is_pk
0,coffee_shop,shop,0,Shop_ID,INT,0,,1
1,coffee_shop,shop,1,Address,TEXT,0,,0
2,coffee_shop,shop,2,Num_of_staff,TEXT,0,,0
3,coffee_shop,shop,3,Score,REAL,0,,0
4,coffee_shop,shop,4,Open_Year,TEXT,0,,0


In [191]:
# Prototype of the DataFrame-building step for a single table (coffee_shop.shop).
# NOTE(review): column_list is not used in this cell — confirm no other cell
# reads it before removing.
column_list = []

schema_name = 'coffee_shop'
table_name = 'shop'

db = sqlite3.connect('../data/processed/db/coffee_shop.sqlite')
try:
    cursor = db.cursor()
    cursor.execute("PRAGMA table_info(" + table_name + ");")
    columns = cursor.fetchall()
    cursor.close()
finally:
    # Release the connection even if the PRAGMA raises.
    db.close()

# A single (schema, table, columns) record.
table_data = [(schema_name, table_name, columns)]

# explode turns the list of column rows into one DataFrame row per column.
schema_df = (pd.DataFrame(table_data, columns=['schema','table','column_info']).explode('column_info', ignore_index=True))

# Spread each column tuple across its own named fields.
schema_df[['c_id','c_name','c_type','notnull','dflt_value','is_pk']] = schema_df.column_info.tolist()

# Reassign rather than drop in place, so the drop is an explicit, visible step.
schema_df = schema_df.drop(columns='column_info')

print(table_data)
schema_df

[('coffee_shop', 'shop', [(0, 'Shop_ID', 'INT', 0, None, 1), (1, 'Address', 'TEXT', 0, None, 0), (2, 'Num_of_staff', 'TEXT', 0, None, 0), (3, 'Score', 'REAL', 0, None, 0), (4, 'Open_Year', 'TEXT', 0, None, 0)])]


Unnamed: 0,schema,table,c_id,c_name,c_type,notnull,dflt_value,is_pk
0,coffee_shop,shop,0,Shop_ID,INT,0,,1
1,coffee_shop,shop,1,Address,TEXT,0,,0
2,coffee_shop,shop,2,Num_of_staff,TEXT,0,,0
3,coffee_shop,shop,3,Score,REAL,0,,0
4,coffee_shop,shop,4,Open_Year,TEXT,0,,0


[('author',), ('conference',), ('domain',), ('domain_author',), ('domain_conference',), ('journal',), ('domain_journal',), ('keyword',), ('domain_keyword',), ('publication',), ('domain_publication',), ('organization',), ('publication_keyword',), ('writes',), ('cite',)]
['author', 'conference', 'domain']
