In [1]:
import csv, re, os

# SQL types whose CSV values are emitted verbatim (unquoted) in the INSERT.
_NUMERIC_TYPES = frozenset({
    "INTEGER", "INT", "BIGINT", "SMALLINT", "TINYINT",
    "DOUBLE", "FLOAT", "REAL", "DECIMAL", "NUMERIC",
})


def _extract_column_block(statement):
    """Return the text between the first '(' and its matching ')'."""
    start = statement.find('(')
    if start == -1:
        raise ValueError("CREATE statement has no parenthesised column list")
    depth = 0
    for index in range(start, len(statement)):
        char = statement[index]
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                return statement[start + 1:index]
    raise ValueError("CREATE statement has unbalanced parentheses")


def _split_top_level(text, sep=','):
    """Split text on sep, ignoring separators nested inside parentheses."""
    parts = []
    current = []
    depth = 0
    for char in text:
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        if char == sep and depth == 0:
            parts.append(''.join(current))
            current = []
        else:
            current.append(char)
    parts.append(''.join(current))
    return parts


def _parse_column_types(create_statement):
    """Map lower-cased column name -> upper-cased base type from a CREATE statement."""
    column_types = {}
    for definition in _split_top_level(_extract_column_block(create_statement)):
        # \w also accepts digits, so names such as `zone_1` are recognised.
        # Only the first two tokens are used: `price DECIMAL(10,2) NOT NULL`
        # yields ("price", "DECIMAL").
        match = re.match(r'\s*[`"]?(\w+)[`"]?\s+([A-Za-z_]\w*)', definition)
        if match:
            column_types[match.group(1).lower()] = match.group(2).upper()
    return column_types


def generate_insert_statements(
    csv_file,
    create_stream_statement,
    output_file,
    table_name,
    encoding='utf-8'
):
    """Convert a CSV file into a file of SQL INSERT statements.

    One ``INSERT INTO <table_name> (...) VALUES (...);`` line is written per
    CSV row. The column list of each statement comes from the CSV header; the
    CREATE statement is used only to decide how each value is rendered.

    Args:
        csv_file: Path of the CSV file to read; its first row is the header.
        create_stream_statement: A CREATE TABLE / CREATE STREAM statement.
            The column definitions inside its first balanced pair of
            parentheses supply the column types.
        output_file: Path of the SQL file to write (overwritten if present).
        table_name: Table name interpolated verbatim into every statement.
        encoding: Text encoding used for both the input and the output file.

    Returns:
        None. The statements are written to ``output_file``.

    Raises:
        ValueError: If ``create_stream_statement`` has no balanced
            parenthesised column list.

    Value rendering:
        * missing, empty or whitespace-only cell -> ``NULL``
        * column whose declared type is numeric  -> value written as-is
        * anything else (strings, dates, unknown columns) -> single-quoted
          with embedded ``'`` doubled

    Column names and type names are matched case-insensitively.

    Note:
        ``table_name`` and the CSV header names are not escaped; only pass
        trusted identifiers.
    """
    column_types = _parse_column_types(create_stream_statement)
    insert_statements = []

    # errors='ignore' silently drops bytes that cannot be decoded; this
    # best-effort behaviour is kept from the original implementation.
    with open(csv_file, newline='', encoding=encoding, errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):
            columns = []
            values = []

            for field, value in row.items():
                if field is None:
                    # DictReader stores surplus cells (row longer than the
                    # header) under the key None; they belong to no column.
                    continue

                columns.append(field)
                if value is None or value.strip() == "":
                    # Short rows yield None for the missing cells.
                    values.append("NULL")
                elif column_types.get(field.strip().lower()) in _NUMERIC_TYPES:
                    values.append(value)
                else:
                    values.append("'" + value.replace("'", "''") + "'")

            insert_statements.append(
                f"INSERT INTO {table_name} ({','.join(columns)}) VALUES ({', '.join(values)});"
            )

    with open(output_file, 'w', encoding=encoding, errors='ignore') as f:
        f.writelines(statement + '\n' for statement in insert_statements)


In [2]:
# DDL for the target table. generate_insert_statements() reads the column
# definitions between the parentheses to decide how each CSV value is quoted.
# NOTE: despite the "stream" in the variable name, this is a CREATE TABLE
# statement.
create_stream_statement = """
CREATE TABLE IF NOT EXISTS epc_table (
    epc_datetime VARCHAR,
    temperature FLOAT,
    humidity FLOAT,
    wind_speed FLOAT,
    general_diffuse_flows FLOAT,
    diffuse_flows FLOAT,
    power_consumption_zone_1 FLOAT,
    power_consumption_zone_2 FLOAT,
    power_consumption_zone_3 FLOAT
)
"""

In [3]:
# Root folder of the Electric Power Consumption example. Set the
# EPC_PROJECT_DIR environment variable to run this notebook on another
# machine; the default is the original author's local checkout.
project_dir = os.environ.get(
    'EPC_PROJECT_DIR',
    '/Users/briandunn/Desktop/Apache_Kafka-Kafka_Connect_non_postgres/Electric Power Consumption',
)

# Source CSV and the SQL script generated from it, both under project_dir.
input_data_file = os.path.join(project_dir, 'orig-data', 'powerconsumption.csv')

output_file_location = os.path.join(project_dir, 'source-scripts', '3_inserts.sql')

# Writes one INSERT statement per CSV row into output_file_location.
generate_insert_statements(
    input_data_file,
    create_stream_statement,
    output_file_location,
    "epc_table",
)