## Load Data Into Postgres Database

### Merge all .sql files into one for easy loading into database

The sample data comes with full .sql files that create the tables and insert rows into them. The added challenge is to specify a schema and then load the tables into it: the standard structure assumes many separate databases, each with its own tables, but I want a single database with a separate schema for each of those databases and the tables underneath those schemas.

In [1]:
import os
import json
import pandas as pd
import numpy as np

In [2]:
# Collect the name of every directory found under the Spider `database`
# folder into `schemas`.
directory = os.fsencode('/Users/brettly/Sboard/projects/text-to-sql/data/raw/spider/database')
schemas = []

# os.walk descends recursively, so directory names from every level are
# collected. The path is bytes, so each name is decoded back to str.
# The loop names avoid shadowing the built-in `dir`.
for _subdir, dir_names, _files in os.walk(directory):
    schemas.extend(os.fsdecode(dir_name) for dir_name in dir_names)

In [3]:
# Preview the first ten collected directory (schema) names.
schemas[:10]

['browser_web',
 'musical',
 'farm',
 'voter_1',
 'game_injury',
 'hospital_1',
 'manufacturer',
 'station_weather',
 'perpetrator',
 'storm_record']

## Break Out tables.json into Schema Info

In [4]:
# Load the Spider schema metadata file (tables.json) into `data`.
path = '../data/raw/spider/'

# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default; JSON files are conventionally UTF-8.
with open(os.path.join(path, 'tables.json'), "r", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
#path to schema name: `db_id` of an entry (here the second entry in the file)
data[1]['db_id']

'college_2'

In [6]:
#path to a table name: index 1 is the second table of the first entry
data[0]['table_names_original'][1]

'people'

In [7]:
#path for fields in first table
# Each entry of column_names_original is indexed as [table index, column name];
# print the names whose table index is 0.
for column_entry in data[0]['column_names_original']:
    if column_entry[0] == 0:
        print(column_entry[1])

Perpetrator_ID
People_ID
Date
Year
Location
Country
Killed
Injured


In [8]:
#path for column types -- these are in a big list, not by table
# (one flat list covering all columns of the entry)
data[0]['column_types']

['text',
 'number',
 'number',
 'text',
 'number',
 'text',
 'text',
 'number',
 'number',
 'number',
 'text',
 'number',
 'number',
 'text']

In [9]:
#path to primary keys -- values are positions in column_names_original
data[0]['primary_keys']

[1, 9]

In [10]:
#path to foreign keys -- each entry is a pair of column positions;
# this shows the first pair
data[0]['foreign_keys'][0]

[2, 9]

In [11]:
# First element of the first foreign-key pair: the position of the column
# that holds the foreign key.
data[0]['foreign_keys'][0][0]

2

## Build a Nested List

In [12]:
#Create a nested list with the schema info
#
# schema_info gets one entry per table, shaped as
#   [[schema_name], [table_name], [column_tuple, ...]]
# where each column_tuple is
#   (column_name, column_type, is_primary_key, is_foreign_key,
#    fk_reference_table, fk_reference_column)

# Number of leading entries of `data` to process. Slicing (rather than
# indexing 0..N-1) tolerates a file with fewer entries. Use None for all.
MAX_SCHEMAS = 2

schema_info = []

for db in data[:MAX_SCHEMAS]:
    schema_name = db['db_id']
    table_names = db['table_names_original']
    columns = db['column_names_original']  # entries: [table index, column name]
    column_types = db['column_types']      # indexed by the same column position
    primary_keys = db['primary_keys']

    # Map "position of the foreign-key column" -> "position of the referenced
    # column". A later pair for the same column overwrites an earlier one.
    fk_target = {fk[0]: fk[1] for fk in db['foreign_keys']}

    for table_idx, table_name in enumerate(table_names):
        column_info = []

        for col_idx, column in enumerate(columns):
            # Keep only the columns that belong to the current table.
            if column[0] != table_idx:
                continue

            ref_idx = fk_target.get(col_idx)
            if ref_idx is None:
                is_foreign_key = False
                ref_table = None
                ref_column = None
            else:
                is_foreign_key = True
                # Resolve the referenced column's name and its owning table.
                ref_column = columns[ref_idx][1]
                ref_table = table_names[columns[ref_idx][0]]

            column_info.append((
                column[1],
                column_types[col_idx],
                col_idx in primary_keys,
                is_foreign_key,
                ref_table,
                ref_column,
            ))

        schema_info.append([[schema_name], [table_name], column_info])

In [13]:
# Inspect one entry: [[schema name], [table name], [column tuples]].
schema_info[5]

[['college_2'],
 ['instructor'],
 [('ID', 'text', True, False, None, None),
  ('name', 'text', False, False, None, None),
  ('dept_name', 'text', False, True, 'department', 'dept_name'),
  ('salary', 'number', False, False, None, None)]]

## Convert Nested List into Pandas Dataframe

In [14]:
# Start with one row per table, each cell still holding a list, then explode
# the list-valued columns one at a time until there is one row per column.
schema_df = pd.DataFrame(schema_info, columns=['schema', 'table', 'column_info'])
for list_column in ('schema', 'table', 'column_info'):
    schema_df = schema_df.explode(list_column, ignore_index=True)

In [15]:
# Preview the exploded frame; column_info still holds the raw tuples.
schema_df.head()

Unnamed: 0,schema,table,column_info
0,perpetrator,perpetrator,"(Perpetrator_ID, number, True, False, None, None)"
1,perpetrator,perpetrator,"(People_ID, number, False, True, people, Peopl..."
2,perpetrator,perpetrator,"(Date, text, False, False, None, None)"
3,perpetrator,perpetrator,"(Year, number, False, False, None, None)"
4,perpetrator,perpetrator,"(Location, text, False, False, None, None)"


In [16]:
# Split each column_info tuple into one named column per tuple position.
tuple_fields = [
    'column_name',
    'column_type',
    'is_primary_key',
    'is_foreign_key',
    'fk_reference_table',
    'fk_reference_column',
]
expanded = pd.DataFrame(schema_df['column_info'].tolist(), index=schema_df.index)
schema_df[tuple_fields] = expanded

schema_df.head()

Unnamed: 0,schema,table,column_info,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
0,perpetrator,perpetrator,"(Perpetrator_ID, number, True, False, None, None)",Perpetrator_ID,number,True,False,,
1,perpetrator,perpetrator,"(People_ID, number, False, True, people, Peopl...",People_ID,number,False,True,people,People_ID
2,perpetrator,perpetrator,"(Date, text, False, False, None, None)",Date,text,False,False,,
3,perpetrator,perpetrator,"(Year, number, False, False, None, None)",Year,number,False,False,,
4,perpetrator,perpetrator,"(Location, text, False, False, None, None)",Location,text,False,False,,


In [17]:
#drop the raw tuple column
schema_df = schema_df.drop(columns=['column_info'])

In [18]:
#replace missing values (None) with NaN
schema_df = schema_df.fillna(value=np.nan)

In [19]:
# Show the first 50 rows of the finished schema table.
schema_df.head(50)

Unnamed: 0,schema,table,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
0,perpetrator,perpetrator,Perpetrator_ID,number,True,False,,
1,perpetrator,perpetrator,People_ID,number,False,True,people,People_ID
2,perpetrator,perpetrator,Date,text,False,False,,
3,perpetrator,perpetrator,Year,number,False,False,,
4,perpetrator,perpetrator,Location,text,False,False,,
5,perpetrator,perpetrator,Country,text,False,False,,
6,perpetrator,perpetrator,Killed,number,False,False,,
7,perpetrator,perpetrator,Injured,number,False,False,,
8,perpetrator,people,People_ID,number,True,False,,
9,perpetrator,people,Name,text,False,False,,
