In [None]:
import sqlite3
import pandas as pd
import yaml
import os
import mysql.connector
import numpy as np

In [None]:
# Location of the YAML settings file. Set the SQLITE_TO_MYSQL_SETTINGS
# environment variable to use a different file without editing the notebook.
settings_path = os.environ.get(
    'SQLITE_TO_MYSQL_SETTINGS',
    '/Users/aliceaakerberg/Sites/scripts/settings.yaml',
)

# The context manager closes the file handle as soon as parsing is done.
# NOTE(review): only plain values are read from this file below, so
# yaml.safe_load would presumably suffice - confirm and switch if so.
with open(settings_path, 'r') as settings_file:
    settings = yaml.load(settings_file, Loader=yaml.FullLoader)

# The settings document is indexed as a sequence: item 0 carries the
# 'database' mapping and item 1 carries the 'output' mapping.
database_settings = settings[0]['database']
db_path = database_settings['database_path']
username = database_settings['username']
password = database_settings['password']
host = database_settings['host']
database_name = database_settings['database_name']
output_file_path = settings[1]['output']['output_file_path']

In [None]:
# Make sure the output directory exists; exist_ok=True avoids an error when
# the directory is already there.
os.makedirs(output_file_path, exist_ok=True)

In [None]:
def get_table_names(db_path):
    """Return the names of the user tables in a SQLite database.

    Parameters
    ----------
    db_path : str
        Path handed to ``sqlite3.connect``.

    Returns
    -------
    list of str
        Table names read from ``sqlite_master``, in the order SQLite returns
        them. SQLite's own bookkeeping tables (names starting with
        ``sqlite_``, such as ``sqlite_sequence``) are left out.

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()

    # The "sqlite_" prefix is reserved for internal tables; they are not user
    # data and have no counterpart in the target database.
    return [name for (name,) in rows if not name.lower().startswith("sqlite_")]

In [None]:
# Collect the table names from the SQLite database at db_path and show them.
table_names = get_table_names(db_path)
print(table_names)

In [None]:
def _quote_mysql_identifier(name):
    """Wrap a table or column name in backticks, doubling embedded backticks."""
    return '`' + name.replace('`', '``') + '`'


def transfer_sqlite_to_df(db_path, table_name, output_file_path):
    """Copy one SQLite table into the same-named MySQL table, skipping rows already there.

    The table is read into a DataFrame and its column names are normalised
    (spaces and hyphens become underscores). Empty strings become NULL and
    infinities become the strings 'inf' / '-inf'. Every row that is not
    already present in the MySQL table is then inserted, and the inserts are
    committed together at the end.

    The MySQL connection is opened with the notebook-level variables
    ``username``, ``password``, ``host`` and ``database_name``.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file.
    table_name : str
        Table to read from SQLite and to insert into in MySQL.
    output_file_path : str
        Kept so existing calls continue to work. The TSV export is disabled,
        so nothing is written to this location.

    Raises
    ------
    sqlite3.Error, pandas.errors.DatabaseError
        If the SQLite table cannot be read.
    mysql.connector.Error
        If connecting fails or the existing rows cannot be selected (for
        example because the table or one of the columns does not exist in
        MySQL). Errors on individual inserts are printed and the row is
        skipped instead.
    """
    # Quote identifiers so names that are reserved words or contain special
    # characters still produce valid SQL.
    sqlite_table = '"' + table_name.replace('"', '""') + '"'
    mysql_table = _quote_mysql_identifier(table_name)

    sqlite_conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(f'SELECT * FROM {sqlite_table}', sqlite_conn)
    finally:
        # Close the SQLite connection even when the read fails.
        sqlite_conn.close()

    # Normalise column names: spaces and hyphens become underscores, and a
    # ' - ' separator collapses to a single underscore.
    original_columns = df.columns.tolist()
    new_columns = [col.replace(' ', '_').replace('_-_', '_').replace('-', '_') for col in original_columns]

    # Report every column whose name was changed.
    for original, new in zip(original_columns, new_columns):
        if original != new:
            print(f"Column name changed in '{table_name}': '{original}' to '{new}'")

    df.columns = new_columns

    # Empty strings are stored as NULL; infinities are stored as text.
    df = df.replace('', np.nan)
    df = df.replace({np.inf: 'inf', -np.inf: '-inf'})

    columns = ', '.join(_quote_mysql_identifier(col) for col in df.columns)
    placeholders = ', '.join(['%s'] * len(df.columns))
    insert_query = f"INSERT INTO {mysql_table} ({columns}) VALUES ({placeholders})"

    failed_rows = 0
    conn = mysql.connector.connect(user=username, password=password, host=host, database=database_name)
    try:
        cursor = conn.cursor()
        try:
            # Select the DataFrame's columns in the DataFrame's order so that
            # existing rows are comparable with the tuples built below, even
            # when the MySQL table orders its columns differently.
            cursor.execute(f"SELECT {columns} FROM {mysql_table}")
            existing_rows_set = set(cursor.fetchall())

            # astype(object) keeps each column's own value type (iterrows
            # would upcast rows that mix ints and floats) and yields plain
            # Python numbers instead of NumPy scalars.
            for row in df.astype(object).itertuples(index=False, name=None):
                row_tuple = tuple(None if pd.isna(val) else val for val in row)
                if row_tuple in existing_rows_set:
                    continue
                try:
                    cursor.execute(insert_query, row_tuple)
                except mysql.connector.Error as err:
                    failed_rows += 1
                    print(f"Error: {err}")
                    print(f"Failed to insert row: {row_tuple}")

            conn.commit()
        finally:
            cursor.close()
    finally:
        # Close the MySQL connection on success and on failure alike.
        conn.close()

    if failed_rows:
        print(f"Data import into table {table_name} finished with {failed_rows} failed row(s).")
    else:
        print(f"Data imported successfully into table {table_name}!")

In [None]:
# Run the transfer for every name in table_names, one table at a time.
for table in table_names:
    transfer_sqlite_to_df(db_path, table, output_file_path)