In [1]:
import csv
import re

def generate_insert_statements(csv_file, database_name, table_name, create_table_stmt, output_file):
    """Convert the rows of a CSV file into SQL INSERT statements.

    Column names and types are taken from ``create_table_stmt``. Only columns
    that also appear in the CSV header are emitted, in the order they are
    declared in the CREATE TABLE statement. One INSERT statement is written
    per CSV row, one statement per line.

    Value rendering:
      * an empty or missing cell becomes ``NULL``;
      * a ``FLOAT`` column holding a plain numeric literal is emitted unquoted;
      * everything else is emitted as a single-quoted string literal with
        embedded single quotes doubled (``'`` -> ``''``). This includes
        non-numeric text found in a ``FLOAT`` column, so the generated SQL is
        always well-formed and the database decides whether the cast is valid.

    Args:
        csv_file: Path of the UTF-8 CSV file to read. The first row is the header.
        database_name: Qualifier written before the table name. Interpolated
            verbatim (not quoted), so it must be a trusted identifier.
        table_name: Target table name. Interpolated verbatim as well.
        create_table_stmt: CREATE TABLE statement describing the target table.
        output_file: Path of the SQL file to write. An existing file is
            overwritten. The file is only opened after the whole CSV has been
            read successfully.

    Raises:
        ValueError: If no column of the CREATE TABLE statement appears in the
            CSV header (including the case of an empty CSV file).
        OSError: If either file cannot be opened.
    """
    def parse_create_table(create_table_stmt):
        """Return (ordered column names, {column name: upper-cased type})."""
        # Restrict parsing to the column list between the outermost parentheses
        # so that "CREATE TABLE" itself is never mistaken for a column.
        start = create_table_stmt.find('(')
        end = create_table_stmt.rfind(')')
        body = create_table_stmt[start + 1:end] if 0 <= start < end else create_table_stmt

        # Split on top-level commas only, so types such as DECIMAL(10, 2) stay intact.
        definitions = []
        current = []
        depth = 0
        for ch in body:
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
            if ch == ',' and depth == 0:
                definitions.append(''.join(current))
                current = []
            else:
                current.append(ch)
        definitions.append(''.join(current))

        # Table-level constraint clauses are not columns.
        constraint_keywords = {'PRIMARY', 'FOREIGN', 'UNIQUE', 'CONSTRAINT', 'CHECK'}

        names = []
        types = {}
        for definition in definitions:
            # Only the first two words of a definition are name and type; trailing
            # modifiers such as NOT NULL are ignored.
            match = re.match(r'\s*(\w+)\s+(\w+)', definition)
            if match is None or match.group(1).upper() in constraint_keywords:
                continue
            name = match.group(1)
            if name not in types:
                names.append(name)
            types[name] = match.group(2).upper()
        return names, types

    # Plain decimal/scientific literal that is safe to emit unquoted.
    numeric_literal = re.compile(r'[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?')

    def quote_string(text):
        # Standard SQL escaping: a single quote inside a literal is doubled.
        # NOTE(review): backslashes are left untouched; confirm the target
        # database does not treat them as escape characters.
        return "'" + text.replace("'", "''") + "'"

    def render_value(raw, sql_type):
        # DictReader yields None for cells missing from a short row.
        if not raw:
            return 'NULL'
        if sql_type == 'FLOAT':
            candidate = raw.strip()
            if not candidate:
                return 'NULL'
            if numeric_literal.fullmatch(candidate):
                return candidate
            # Not a number: quote it rather than emitting raw text into the SQL.
        return quote_string(raw)

    # Parse the CREATE TABLE statement to get column names and types
    column_names, column_types = parse_create_table(create_table_stmt)

    insert_statements = []
    # Single pass over the CSV: the header and the rows come from the same reader.
    with open(csv_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        # fieldnames is None for an empty file.
        csv_headers = set(reader.fieldnames or [])

        # Keep only the declared columns that are actually present in the CSV
        column_names = [col for col in column_names if col in csv_headers]
        if not column_names:
            raise ValueError(
                f"No column of the CREATE TABLE statement matches a header in {csv_file!r}"
            )

        # The statement prefix is identical for every row, so build it once.
        prefix = f"INSERT INTO {database_name}.{table_name} ({', '.join(column_names)}) VALUES ("
        for row in reader:
            values_str = ', '.join(render_value(row[col], column_types[col]) for col in column_names)
            insert_statements.append(f"{prefix}{values_str});")

    # Write INSERT statements to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(stmt + '\n' for stmt in insert_statements)

In [3]:
# Generate the INSERT script for the texas_ois_central table.
#
# NOTE(review): despite its name, this variable holds a CREATE TABLE statement.
# It is passed to generate_insert_statements as create_table_stmt, which reads
# only the column names and types from it. Columns typed FLOAT are the only ones
# whose values can be written unquoted in the generated SQL.
create_stream_statement = """
CREATE TABLE texas_ois_central (
    dist VARCHAR,
    district_edit VARCHAR,
    date_called_in VARCHAR,
    date_called_in_edit VARCHAR,
    date_of_spill VARCHAR,
    date_of_spill_edit VARCHAR,
    spill_number VARCHAR,
    rrc_job_number VARCHAR,
    operator_rp VARCHAR,
    operator_edit VARCHAR,
    lease_facility_name VARCHAR,
    rrc_id_number VARCHAR,
    county VARCHAR,
    county_edit VARCHAR,
    type_operation VARCHAR,
    source VARCHAR,
    probable_cause VARCHAR,
    probable_cause_edit VARCHAR,
    release_crude_oil VARCHAR,
    release_crude_oil_edit FLOAT,
    release_cond VARCHAR,
    release_prod_wtr VARCHAR,
    release_prod_water_edit FLOAT,
    release_gas VARCHAR,
    recovery_crude_oil VARCHAR,
    recovery_crude_oil_edit FLOAT,
    recovery_cond VARCHAR,
    recovery_prod_wtr VARCHAR,
    recovery_prod_water_edit VARCHAR,
    basis VARCHAR,
    other_rptd_loss_type VARCHAR,
    loss VARCHAR,
    recovery_num FLOAT,
    affected_area VARCHAR,
    spill_on_water VARCHAR,
    spill_on_water_edit VARCHAR,
    ospra VARCHAR,
    swr_ VARCHAR,
    swr_9exempt VARCHAR,
    cleanup_criteria_swr_91 VARCHAR,
    cleanup_criteria_7_00_doc VARCHAR,
    cleanup_criteria_case_specific VARCHAR,
    form_h_rqrd VARCHAR,
    form_h_rqrd_edit VARCHAR,
    date_h_rcvd VARCHAR,
    cleanup_oversight_district VARCHAR,
    cleanup_oversight_austin VARCHAR,
    status_or_phase VARCHAR,
    comments TEXT,
    compliance_date TEXT,
    file_name VARCHAR,
    sheet VARCHAR,
    cleanup_criteria VARCHAR,
    cleanup_oversight VARCHAR,
    rrc_job_number_2 VARCHAR,
    my_of_spill VARCHAR
)
"""

# Qualifier written before the table name in every generated INSERT statement.
database_name = 'texas_ois'

# NOTE(review): absolute, machine-specific paths. This cell only runs where this
# exact directory layout exists; consider deriving both paths from a configurable
# base directory.
# CSV file whose rows are converted to INSERT statements.
input_data_file = '/Users/briandunn/Desktop/Kafka Connect/Texas Oil Industry Spills - CockroachDB to Many DBs/orig-data/central_cleaned.csv'

# Destination SQL script. generate_insert_statements opens it in 'w' mode, so an
# existing file at this path is overwritten.
output_file_location = '/Users/briandunn/Desktop/Kafka Connect/Texas Oil Industry Spills - CockroachDB to Many DBs/source-scripts/2_inserts_central.sql'

# The table name is passed separately from the DDL above (it is not parsed out of
# the CREATE TABLE text), so the two must be kept in sync by hand.
generate_insert_statements(
    input_data_file, 
    database_name, 
    "texas_ois_central",
    create_stream_statement, 
    output_file_location
)

In [4]:
# Generate the INSERT script for the texas_ois_district table.
#
# This cell rebinds the same notebook-level names (create_stream_statement,
# database_name, input_data_file, output_file_location) that the central-table
# cell assigns, so their values depend on which cell ran last.
#
# NOTE(review): despite its name, this variable holds a CREATE TABLE statement.
# It is passed to generate_insert_statements as create_table_stmt, which reads
# only the column names and types from it. Columns typed FLOAT are the only ones
# whose values can be written unquoted in the generated SQL.
create_stream_statement = """
CREATE TABLE texas_ois_district (
    dist VARCHAR,
    district_edit VARCHAR NOT NULL,
    date_of_spill VARCHAR,
    date_of_spill_edit VARCHAR,
    date_called_in VARCHAR,
    date_called_in_edit VARCHAR,
    spill_number VARCHAR,
    rrc_job_number VARCHAR,
    operator_rp VARCHAR,
    operator_edit VARCHAR,
    lease_facility_name VARCHAR,
    rrc_id_number VARCHAR,
    county_name VARCHAR,
    county_edited VARCHAR,
    type_operation VARCHAR,
    source VARCHAR,
    probable_cause VARCHAR,
    probable_cause_edit VARCHAR,
    release_crude_oil VARCHAR,
    release_crude_oil_edit VARCHAR,
    release_cond VARCHAR,
    release_prod_wtr VARCHAR,
    release_prod_water_edit FLOAT,
    release_gas VARCHAR,
    recovery_crude_oil VARCHAR,
    recovery_crude_oil_edit FLOAT,
    recovery_cond VARCHAR,
    recovery_prod_wtr VARCHAR,
    recovery_prod_water_edit VARCHAR,
    basis VARCHAR,
    other_rptd_loss_type VARCHAR,
    loss VARCHAR,
    recovery_num VARCHAR,
    affected_area VARCHAR,
    spill_on_water VARCHAR,
    spill_on_water_edit VARCHAR,
    ospra VARCHAR,
    swr_ VARCHAR,
    swr_9exempt VARCHAR,
    cleanup_criteria_swr_91 VARCHAR,
    cleanup_criteria_7_00_doc VARCHAR,
    cleanup_criteria_case_specific VARCHAR,
    form_h_rqrd VARCHAR,
    form_hrqrd_edit VARCHAR,
    date_h_rcvd VARCHAR,
    date_hrcvd_edit VARCHAR,
    cleanup_oversight_district VARCHAR,
    cleanup_oversight_austin VARCHAR,
    status_or_phase VARCHAR,
    comments TEXT,
    compliance_date VARCHAR,
    isocspill FLOAT,
    ocinsp FLOAT,
    ispwspill FLOAT,
    pwinsp FLOAT,
    isotherspill FLOAT,
    otinsp FLOAT,
    last_inspection_date VARCHAR,
    cleanup_criteria VARCHAR,
    cleanup_oversight VARCHAR,
    notes TEXT,
    inspection_id VARCHAR,
    soil_water_samples_required VARCHAR,
    spill_letter_date VARCHAR,
    inspector VARCHAR,
    witn_tech VARCHAR,
    date_witnessed VARCHAR,
    witn_results VARCHAR,
    cleanup_method VARCHAR,
    tph_rcvd VARCHAR,
    tph_comments VARCHAR,
    duplicate VARCHAR
)
"""

# Qualifier written before the table name in every generated INSERT statement.
database_name = 'texas_ois'

# NOTE(review): absolute, machine-specific paths. This cell only runs where this
# exact directory layout exists; consider deriving both paths from a configurable
# base directory.
# CSV file whose rows are converted to INSERT statements.
input_data_file = '/Users/briandunn/Desktop/Kafka Connect/Texas Oil Industry Spills - CockroachDB to Many DBs/orig-data/district_cleaned.csv'

# Destination SQL script. generate_insert_statements opens it in 'w' mode, so an
# existing file at this path is overwritten.
output_file_location = '/Users/briandunn/Desktop/Kafka Connect/Texas Oil Industry Spills - CockroachDB to Many DBs/source-scripts/2_inserts_district.sql'

# The table name is passed separately from the DDL above (it is not parsed out of
# the CREATE TABLE text), so the two must be kept in sync by hand.
generate_insert_statements(
    input_data_file, 
    database_name,
    "texas_ois_district", 
    create_stream_statement, 
    output_file_location
    )