#### **First Step**: Write the queries for the PostgreSQL database based on the data.


Task:

- Identify the column data types needed to write the database schema (`schema.sql`).
- Convert the CSV data into `INSERT` statements that load every row, saved as `seed_data.sql`.


In [1]:
import pandas as pd
import numpy as np

import logging

logging.basicConfig(level=logging.INFO)


def infer_sql_type(dtype):
    """
    Infers the corresponding SQL data type for a given pandas dtype.

    Classification goes through ``pandas.api.types`` so that pandas extension
    dtypes (nullable ``Int64``/``boolean``, ``category``, timezone-aware
    datetimes, ...) are handled as well as plain numpy dtypes; calling
    ``np.issubdtype`` on those extension dtypes raises ``TypeError``.

    Args:
        dtype (numpy.dtype or pandas extension dtype): The data type of the
            pandas DataFrame column.

    Returns:
        str: The inferred SQL data type as a string. Possible values include "INTEGER",
        "FLOAT", "BOOLEAN", "TIMESTAMP", or "TEXT". Anything that is not boolean,
        integer, floating point or datetime falls back to "TEXT".
    """
    logging.info(f"Infering SQL type for {dtype}")
    dtype_checks = pd.api.types
    # Booleans are tested first so they can never be classified as integers.
    if dtype_checks.is_bool_dtype(dtype):
        return "BOOLEAN"
    if dtype_checks.is_integer_dtype(dtype):
        return "INTEGER"
    if dtype_checks.is_float_dtype(dtype):
        return "FLOAT"
    if dtype_checks.is_datetime64_any_dtype(dtype):
        return "TIMESTAMP"
    return "TEXT"


def generate_schema(df, table_name='my_table'):
    """
    Generates a SQL schema for a given pandas DataFrame.

    Each DataFrame column becomes one column definition: the column name as a
    double-quoted identifier followed by the SQL type returned by
    ``infer_sql_type`` for that column's dtype.

    Args:
        df (pandas.DataFrame): The DataFrame for which the schema is to be generated.
        table_name (str, optional): The name of the SQL table. It is inserted into
            the statement as-is (unquoted). Defaults to 'my_table'.

    Returns:
        str: A string containing the SQL statement to create the table with the inferred
        column types.
    """
    logging.info(f"Generating schema for {table_name}")
    column_defs = []
    for col in df.columns:
        # A double quote inside a quoted identifier must be doubled; otherwise
        # the name would end early and the statement would be malformed.
        quoted_name = '"' + str(col).replace('"', '""') + '"'
        column_defs.append(f"{quoted_name} {infer_sql_type(df[col].dtype)}")
    schema = f"CREATE TABLE IF NOT EXISTS {table_name} (\n" + ",\n".join(column_defs) + "\n);"
    return schema


def _sql_literal(value):
    """Render a single cell value as a SQL literal: NULL, a bare number/boolean, or quoted text."""
    if pd.isna(value):
        return "NULL"
    if isinstance(value, (bool, int, float, np.bool_, np.integer, np.floating)):
        return str(value)
    # Strings, timestamps and any other object are written as quoted text, with
    # embedded single quotes doubled. The escaping is done outside the f-string
    # because a backslash inside a replacement field is a syntax error before
    # Python 3.12.
    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"


def generate_seed_data(df, table_name='candidates'):
    """
    Generates SQL insert statements to seed data into a SQL table.

    One ``INSERT INTO <table_name> VALUES (...);`` statement is produced per
    DataFrame row, with values listed in DataFrame column order. Missing values
    become ``NULL``, numbers and booleans are written bare, and everything else
    is written as a single-quoted string.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data to be inserted.
        table_name (str, optional): The name of the SQL table. It is inserted into
            the statements as-is (unquoted). Defaults to 'candidates'.

    Returns:
        str: A string containing the SQL insert statements separated by newlines
        (an empty string when the DataFrame has no rows).
    """
    logging.info(f"Generating seed data for {table_name}")
    # itertuples keeps each column's own dtype; iterrows builds a Series per row
    # and would turn the integers of a row that also holds floats into "1.0".
    return "\n".join(
        f"INSERT INTO {table_name} VALUES ("
        + ", ".join(_sql_literal(cell) for cell in record)
        + ");"
        for record in df.itertuples(index=False, name=None)
    )


def main(csv_file, schema_file, seed_file, table_name, delimiter):
    """
    Turn a CSV file into two SQL scripts: a table schema and its seed data.

    The CSV is read with pandas, spaces in the column names are replaced by
    underscores, and the text returned by ``generate_schema`` and
    ``generate_seed_data`` is written to the given output files.

    Args:
        csv_file (str): Path to the input CSV file.
        schema_file (str): Path of the file that receives the SQL schema.
        seed_file (str): Path of the file that receives the SQL seed data.
        table_name (str): Name of the SQL table to be created and seeded.
        delimiter (str): Delimiter used in the CSV file.

    Returns:
        None
    """
    logging.info(f"Loading CSV file {csv_file}")
    frame = pd.read_csv(csv_file, delimiter=delimiter)
    frame.columns = frame.columns.str.replace(' ', '_')

    # Schema first, then seed data; each script is fully generated before its
    # output file is opened for writing.
    outputs = (
        (schema_file, generate_schema),
        (seed_file, generate_seed_data),
    )
    for target_path, build_sql in outputs:
        sql_text = build_sql(frame, table_name)
        with open(target_path, 'w') as handle:
            handle.write(sql_text)


if __name__ == "__main__":
    # All paths are relative, so they resolve against the current working directory.
    main(
        csv_file='../data/raw/candidates.csv',
        schema_file='../sql/schema.sql',
        seed_file='../sql/seed_data.sql',
        table_name='candidates',
        delimiter=';',
    )

INFO:root:Loading CSV file ../data/raw/candidates.csv
INFO:root:Generating schema for candidates
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for int64
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for object
INFO:root:Infering SQL type for int64
INFO:root:Infering SQL type for int64
INFO:root:Generating seed data for candidates


---


#### **Second Step**: Upload data to database


Task:

- Import the `DB` class to use the database connector.
- Establish connection and execute the queries to create the schema and send the data.
- Validate that the table has been created and that all records have been loaded.


In [2]:
import sys
import os

# Make the sibling 'src' folder importable by appending its absolute path to sys.path
src_dir = os.path.abspath(os.path.join('..', 'src'))
sys.path.append(src_dir)

from connections.db import DB
db = DB()

In [3]:
# Remove the table if it already exists.
# The generated schema uses CREATE TABLE IF NOT EXISTS, so without a drop a re-run
# would keep the old table and the seed step would insert the rows again.
# NOTE(review): the SQL in this file is not shown here — presumably it drops 'candidates'.
db.execute("../sql/queries/002_drop_tables.sql")

INFO:root:✔ Connected to database
INFO:root:✔ Query executed
INFO:root:✔ Cursor closed
INFO:root:✔ Connection closed


In [4]:
# Create schema: run the CREATE TABLE script that the first step wrote to ../sql/schema.sql
db.execute("../sql/schema.sql")

INFO:root:✔ Connected to database
INFO:root:✔ Query executed
INFO:root:✔ Cursor closed
INFO:root:✔ Connection closed


In [5]:
# Seed data: run the INSERT statements that the first step wrote to ../sql/seed_data.sql
db.execute("../sql/seed_data.sql")

INFO:root:✔ Connected to database
INFO:root:✔ Query executed
INFO:root:✔ Cursor closed
INFO:root:✔ Connection closed


In [6]:
# View tables: list the tables in the database to confirm that 'candidates' was created
# (the query text lives in the .sql file and is not shown in this notebook)
db.execute("../sql/queries/001_view_tables.sql")

INFO:root:✔ Connected to database
INFO:root:✔ Query executed
INFO:root:✔ Cursor closed
INFO:root:✔ Connection closed


[('candidates',)]

In [7]:
# View size of tables, to check that all records were loaded.
# NOTE(review): the second value of each result row is presumably the row count —
# confirm against the query in the .sql file.
db.execute("../sql/queries/003_view_tables_sizes.sql")

INFO:root:✔ Connected to database
INFO:root:✔ Query executed
INFO:root:✔ Cursor closed
INFO:root:✔ Connection closed


[('public.candidates', 50000)]

---


#### **Results**:


- Created the query that defines the database schema based on the data.
- Created the query that inserts the seed data into the database.
- Established a connection with the database.
- Created the table with the defined schema.
- Uploaded the data to the table.


---
