## Load Data Into Postgres Database

### Merge all .sql files into one for easy loading into database

The sample data comes with full .sql files that create and insert into tables. The added challenege will be to specify the schema and then load in the tables. As the standard structure assumes a lot of different databases and then tables. But I want to do a single database with different schemas and tables underneath those.

In [50]:
import os
import json
import pandas as pd
import numpy as np

In [51]:
directory = os.fsencode('/Users/brettly/Sboard/projects/text-to-sql/data/raw/spider/database')
schemas = []

for subdir, dirs, files in os.walk(directory):
    for dir in dirs:
        schemas.append(os.fsdecode(dir))
    #for file in files:
    #    filename = os.fsdecode(file)
    #    if filename.endswith('.sql'):
    #        path = os.path.join(subdir, file)
    #        print(path)
    #    continue
    #    else:
    #        continue

In [52]:
schemas[:10]

['browser_web',
 'musical',
 'farm',
 'voter_1',
 'game_injury',
 'hospital_1',
 'manufacturer',
 'station_weather',
 'perpetrator',
 'storm_record']

## Breakout tables.json into Schema info

In [53]:
path = '../data/raw/spider/'

with open(path+'tables.json', "r") as f:
    data = json.load(f)

In [54]:
#path to schema name
data[1]['db_id']

'college_2'

In [55]:
#path to first table
data[0]['table_names_original'][1]

'people'

In [56]:
#path for fields in first table
i = 0
for field in data[0]['column_names_original']:
    if data[0]['column_names_original'][i][0] == 0:
        print(data[0]['column_names_original'][i][1])

    i+=1

Perpetrator_ID
People_ID
Date
Year
Location
Country
Killed
Injured


In [57]:
#path for column types -- these are in a big list, not by table
data[0]['column_types']

['text',
 'number',
 'number',
 'text',
 'number',
 'text',
 'text',
 'number',
 'number',
 'number',
 'text',
 'number',
 'number',
 'text']

In [58]:
#path to primary keys
data[0]['primary_keys']

[1, 9]

In [59]:
#path to foreign keys
data[0]['foreign_keys'][0]

[2, 9]

In [60]:
data[0]['foreign_keys'][0][0]

2

## Build a Nested List

In [61]:
#Create a nested list with the schema info
#This won't continually build overtime, but reset the lists each time the parent cycle changes in order to create a complete list of unique schema, table, column, column info

schema_info = [] #list for savings total schema info

i = 0 #set initial i value, we will increase this each time we move to the next schema in the json file
for item in data:
#loop through schemas in the json file and add the schema name to a new list (schem_list)
    schem_list = [] #initialize empty list
    schema = data[i]['db_id'] #set variable to the db_id, which we will eventually title the schema
    schem_list.append(schema) #append the variable to the schem_list
    
    k=0 #set initial k value, we will increase this each time we move to the next table within a schema
    for table in data[i]['table_names_original']:
    #loop through the tables in the current schema and add the table name to a new list (tab_list)
        tab_list = [] #initialize empty list
        tab_value=data[i]['table_names_original'][k] #set variable to the name of the table
        tab_list.append(tab_value) #append the variable to the tab_list

        #create empty lists for the column name, type, if its a primary key, if its a foreign key (and if so: the reference table and field)
        col_list=[]
        col_type_list=[]
        col_pk_list=[]
        col_fk_list=[]
        col_fk_ref_tab_list=[]
        col_fk_ref_col_list=[]

        n = 0 #set initial n value, we will increase this each time we move to the next field within a table
        for field in data[i]['column_names_original']:
        #loop through the fields in each table and write the field info to seperate lists we will eventually zip
            if data[i]['column_names_original'][n][0] == k: #the columns are not nested underneath the tables in the json, rather the first index is the table # and the second is the col name. So this checks to be sure we don't assign a col to the wrong table.
                col_value = data[i]['column_names_original'][n][1] #set variable equal to the column name
                col_list.append(col_value) #append name to the col_list table

                col_type = data[i]['column_types'][n] #set variable equal to the column type
                col_type_list.append(col_type) #append type to the col_type_list table

                pk_flag = n in data[i]['primary_keys'] #set variable True/False for if the column is the primary key.
                col_pk_list.append(pk_flag) #append the boolean to the list

                b=0 #set initial b value, we will increase this each time we check the next foreign key column. The foreign key is not listed in the same way as the primary key, but rather it lists all the 
                fk_tup = (None, None, None, None) #create a blank tuple as a baseline so it doesn't error when we try to call on fk_tup within an if_statement
                for fk in data[i]['foreign_keys']:
                #loop through the foreign key list and parse out the field name, reference table, and reference field
                    if data[i]['foreign_keys'][b][0] == n: #only parse out data if the foreign key matches the column we're currently looping through
                        #expand tuple variables here
                        fk_column = data[i]['foreign_keys'][b][0] #go through json and grab # of foreign key column
                        fk_ref_table = data[i]['table_names_original'][data[i]['column_names_original'][data[i]['foreign_keys'][b][1]][0]] #find corresponding the reference table in the 1st index under foreign key and return the name
                        fk_ref_column = data[i]['column_names_original'][data[i]['foreign_keys'][b][1]][1] #find corresponding field in reference table  and retun the name
                        fk_tup = (fk_column, fk_ref_table, fk_ref_column) #create a tuple from our variables
                    b+=1 #increment the foreign key item

                if fk_tup[0] == n: #only perform if foreign key matches the current column
                    col_fk_list.append(1==1) #append True
                    col_fk_ref_tab_list.append(fk_tup[1]) #append reference table
                    col_fk_ref_col_list.append(fk_tup[2]) #append reference field
                else:
                    col_fk_list.append(0==1) #append False
                    col_fk_ref_tab_list.append(None) #append reference table
                    col_fk_ref_col_list.append(None) #append reference field

            n+=1 #increment the column item
        column_info = list(zip(col_list, col_type_list, col_pk_list, col_fk_list, col_fk_ref_tab_list, col_fk_ref_col_list)) #zip together all the lists created with column info
        combined = [schem_list, tab_list, column_info] #create a combined list with schema name, table name, and zipped column info. We'll unzip in the dataframe later
        schema_info.append(combined) #append this combined info to the master schema info list
        k+=1 #increment the table item
    i+=1 #increment the schema item

In [62]:
schema_info[0]

[['perpetrator'],
 ['perpetrator'],
 [('Perpetrator_ID', 'number', True, False, None, None),
  ('People_ID', 'number', False, True, 'people', 'People_ID'),
  ('Date', 'text', False, False, None, None),
  ('Year', 'number', False, False, None, None),
  ('Location', 'text', False, False, None, None),
  ('Country', 'text', False, False, None, None),
  ('Killed', 'number', False, False, None, None),
  ('Injured', 'number', False, False, None, None)]]

## Convert Nested List into Pandas Dataframe

Using the explode function to breakout the unique combos of schema->table->column->info

In [63]:
schema_df = (pd.DataFrame(schema_info, columns=['schema','table','column_info']).explode('schema', ignore_index=True).explode('table', ignore_index=True).explode('column_info', ignore_index=True))

In [64]:
schema_df.head()

Unnamed: 0,schema,table,column_info
0,perpetrator,perpetrator,"(Perpetrator_ID, number, True, False, None, None)"
1,perpetrator,perpetrator,"(People_ID, number, False, True, people, Peopl..."
2,perpetrator,perpetrator,"(Date, text, False, False, None, None)"
3,perpetrator,perpetrator,"(Year, number, False, False, None, None)"
4,perpetrator,perpetrator,"(Location, text, False, False, None, None)"


In [65]:
#unzeip the column info into their own columns
schema_df[['column_name','column_type','is_primary_key','is_foreign_key','fk_reference_table','fk_reference_column']] = pd.DataFrame(schema_df.column_info.tolist(), index=schema_df.index)

schema_df.head()

Unnamed: 0,schema,table,column_info,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
0,perpetrator,perpetrator,"(Perpetrator_ID, number, True, False, None, None)",Perpetrator_ID,number,True,False,,
1,perpetrator,perpetrator,"(People_ID, number, False, True, people, Peopl...",People_ID,number,False,True,people,People_ID
2,perpetrator,perpetrator,"(Date, text, False, False, None, None)",Date,text,False,False,,
3,perpetrator,perpetrator,"(Year, number, False, False, None, None)",Year,number,False,False,,
4,perpetrator,perpetrator,"(Location, text, False, False, None, None)",Location,text,False,False,,


In [66]:
#drop tuple data and sort alphabetically
schema_df = schema_df.drop(columns=['column_info'])

In [67]:
#replace 'None' with NaN
schema_df = schema_df.fillna(value=np.nan)

In [68]:
schema_df.head()

Unnamed: 0,schema,table,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
0,perpetrator,perpetrator,Perpetrator_ID,number,True,False,,
1,perpetrator,perpetrator,People_ID,number,False,True,people,People_ID
2,perpetrator,perpetrator,Date,text,False,False,,
3,perpetrator,perpetrator,Year,number,False,False,,
4,perpetrator,perpetrator,Location,text,False,False,,


## Write Create Schema Statements

In [69]:
schemas = (schema_df['schema'].unique()).tolist()

In [70]:
schemas[:5]

['perpetrator', 'college_2', 'flight_company', 'icfp_1', 'body_builder']

In [71]:
#write new .sql file
#with open('/Users/brettly/Sboard/projects/text-to-sql/references/sql_files/create_schema.sql', 'w') as myFile:
#    for schema in schemas:
#        statement = "CREATE SCHEMA "+schema+";\n"
#        myFile.write(statement)

## Write Create Table Statements

The result doesn't have to be perfect, but the tables.json had only basic column_types (text, number) while the .sql files had a much better representation of types. If possible I'll do some updating but I won't worry too much about it.

In [72]:
#view col types
schema_df['column_type'].value_counts().sort_values(ascending=False)

column_type
number     2178
text       2097
time        215
others        8
boolean       5
Name: count, dtype: int64

In [73]:
#what is that "others"?
schema_df[schema_df['column_type'] == 'others']

#debate is boolean
#concert_singer is boolean
#school_bus is boolean
#apartment_rentals is 'BIT', we'll use the same for postgres
#employee_hire_evaluation is boolean
#sakila_1 is 'BLOB'. We'll use 'bytea' type for postgres
#orchestra is boolean
#party_host is boolean

Unnamed: 0,schema,table,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
693,debate,debate_people,If_Affirmative_Win,others,False,False,,
1223,concert_singer,singer,Is_male,others,False,False,,
1865,school_bus,school_bus,If_full_time,others,False,False,,
2187,apartment_rentals,View_Unit_Status,available_yn,others,False,False,,
2618,employee_hire_evaluation,hiring,Is_full_time,others,False,False,,
3070,sakila_1,staff,picture,others,False,False,,
3750,orchestra,show,If_first_show,others,False,False,,
4473,party_host,party_host,Is_Main_in_Charge,others,False,False,,


### Update 'other' datatypes

In [74]:
#we'll do some manual and specific updates
schema_df.at[693,'column_type'] = 'boolean'
schema_df.at[1223,'column_type'] = 'boolean'
schema_df.at[1865,'column_type'] = 'boolean'
schema_df.at[2187,'column_type'] = 'bit'
schema_df.at[2618,'column_type'] = 'boolean'
schema_df.at[3070,'column_type'] = 'bytea' #postgres equivalent of BLOB
schema_df.at[3750,'column_type'] = 'boolean'
schema_df.at[4473,'column_type'] = 'boolean'

In [75]:
schema_df['column_type'].value_counts().sort_values(ascending=False)

column_type
number     2178
text       2097
time        215
boolean      11
bit           1
bytea         1
Name: count, dtype: int64

### Fix Date Types
I'm note a huge fan of the number and text being so generic, but with so many tables, I don't have time to validate them all so I may just try to change anything with Date in it to Dates. I'll also change any that have a pure 'Date" name to something that's not also a dtype.

In [76]:
schema_df[schema_df['column_name'].str.contains("Date") | schema_df['column_name'].str.contains("date")]

Unnamed: 0,schema,table,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column
2,perpetrator,perpetrator,Date,text,False,False,,
73,flight_company,flight,Date,text,False,False,,
100,body_builder,people,Birth_Date,text,False,False,,
104,storm_record,storm,Dates_active,text,False,False,,
132,pilot_record,pilot_record,Date,text,False,False,,
...,...,...,...,...,...,...,...,...
4426,company_1,department,Mgr_start_date,text,False,False,,
4434,company_1,dependent,Bdate,text,False,False,,
4439,workshop_paper,workshop,Date,text,False,False,,
4480,product_catalog,Catalogs,date_of_publication,time,False,False,,


In [77]:
#change column_type
schema_df.loc[schema_df.column_name.str.contains("Date") | schema_df.column_name.str.contains("date"), 'column_type'] = 'date'

In [78]:
#change column name
schema_df.loc[(schema_df.column_name == "Date") | (schema_df.column_name == "date"), 'column_name'] = 'date_value'

In [79]:
schema_df['column_type'].value_counts().sort_values(ascending=False)

column_type
number     2173
text       2049
date        252
time         16
boolean      11
bit           1
bytea         1
Name: count, dtype: int64

### Convert Number to Numeric

Turns out Number isn't a postgres datatype, so let's update that

In [80]:
schema_df.loc[schema_df.column_type=='number', 'column_type'] = 'numeric'

In [81]:
schema_df['column_type'].value_counts().sort_values(ascending=False)

column_type
numeric    2173
text       2049
date        252
time         16
boolean      11
bit           1
bytea         1
Name: count, dtype: int64

### Remove Spaces and Make Lowercase

This will hold up the integrity of the data

In [82]:
schema_df.column_name = schema_df.column_name.str.replace(' ','_')

In [83]:
schema_df.column_name = schema_df.column_name.str.lower()

### Fix Some Naming Erros
An initial test of the load revelaed some column naming issues, so I'l fix those here.

In [84]:
col_upd_dict = {'from': 'from_value', 'start': 'start_time', 'end': 'end_time', '%_change_2007': 'change_perc', 'official_ratings_(millions)': 'official_ratings_mil', '18_49_rating_share':'rating_share'}


In [85]:
for key, value in col_upd_dict.items():
    schema_df.loc[schema_df.column_name == key, 'column_name'] = value

In [86]:
schema_df.loc[schema_df.column_name == 'offical_ratings_(millions)']

Unnamed: 0,schema,table,column_name,column_type,is_primary_key,is_foreign_key,fk_reference_table,fk_reference_column


### Function to Write .sql statements

Thanks to the chaptGPT assist. This should generate the create statements for me

In [87]:
# SQL CREATE TABLE statement builder
table_columns = {}
for row in schema_df.itertuples(index=False):
    schema, table, column, data_type, primary_key = row[:5]  # Unpack the row values

    # Create or update the table_columns dictionary
    key = (schema, table)
    if key not in table_columns:
        table_columns[key] = []
    
    column_def = f"{column} {data_type}"
    if primary_key:
        column_def += " PRIMARY KEY"

    table_columns[key].append(column_def)

# Generate SQL CREATE TABLE statements
sql_statements = []
for (schema, table), columns in table_columns.items():
    column_defs = ',\n'.join(columns)
    sql_statement = f"CREATE TABLE {schema}.{table} (\n{column_defs}\n);\n"
    sql_statements.append(sql_statement)


In [88]:
for i in range(3):
    print(sql_statements[i])

CREATE TABLE perpetrator.perpetrator (
perpetrator_id numeric PRIMARY KEY,
people_id numeric,
date_value date,
year numeric,
location text,
country text,
killed numeric,
injured numeric
);

CREATE TABLE perpetrator.people (
people_id numeric PRIMARY KEY,
name text,
height numeric,
weight numeric,
home_town text
);

CREATE TABLE college_2.classroom (
building text PRIMARY KEY,
room_number text,
capacity numeric
);



### Write to .SQL File

In [89]:
#write new .sql file
#with open('/Users/brettly/Sboard/projects/text-to-sql/references/sql_files/create_tables.sql', 'w') as sqlFile:
#    for statement in sql_statements:
#        sqlFile.write(statement)

## Prep Foreign Key Alteration Script

Thanks chatGPT for the help!

In [90]:
# SQL ALTER TABLE statement builder
alter_statements = []
for row in schema_df.itertuples(index=False):
    schema = row[0]
    table = row[1]
    column_name = row[2]
    is_foreign_key = row[5]
    reference_table = row[6]
    reference_column = row[7]

    if is_foreign_key and reference_table and reference_column:
        alter_statement = (
            f"ALTER TABLE {schema}.{table}\n"
            f"ADD CONSTRAINT {column_name}_fk\n"
            f"FOREIGN KEY ({column_name})\n"
            f"REFERENCES {schema}.{reference_table}({reference_column});\n\n"
        )
        alter_statements.append(alter_statement)

### Write to .SQL File

In [91]:
#write new .sql file
#with open('/Users/brettly/Sboard/projects/text-to-sql/references/sql_files/alter_tables.sql', 'w') as fkFile:
#    for statement in alter_statements:
#        fkFile.write(statement)