In [327]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [328]:
FILE_NAME = 'Account_fixed.csv'

PRIMARY_KEY = 'Account_Account'

# add tupel like: (column_name, foreign_key_table_name.foreign_key_column_name) || leave empty if not foreign key
FOREIGN_KEY = []

URL = os.path.join('../data_clean', FILE_NAME)

In [329]:
def create_column_names(dataframe):
    columns = dataframe.columns
    columns = [col + '_id' if col == PRIMARY_KEY else col for col in columns]
    columns = [re.sub(r'\W+', '', col) for col in columns]
    columns = [col.lower() for col in columns]
    dict_columns = dict(zip(dataframe.columns, columns))
    return dict_columns

def change_fk_name(string):
    string = string + '_id'
    string = string.lower()
    return string

In [330]:
data = pd.read_csv(URL, sep=',')
data.head()

Unnamed: 0,Account_Account,Account_Adres_Geografische_regio,Account_Adres_Geografische_subregio,Account_Adres_Plaats,Account_Adres_Postcode,Account_Adres_Provincie,Account_Industriezone_Naam_,Account_Is_Voka_entiteit,Account_Ondernemingsaard,Account_Ondernemingstype,Account_Oprichtingsdatum,Account_Primaire_activiteit,Account_Reden_van_status,Account_Status,Account_Voka_Nr_,Account_Adres_Land
0,00002561-6762-EC11-8F8F-000D3A2E7738,Mechelen-Kempen,Kempen-West,Herentals,2200,Antwerpen,,0,,Bedrijf,8-9-2013,,Actief,1,291484,België
1,00002DAC-0A69-E111-B43A-00505680000A,Oost-Vlaanderen,Oudenaarde,BEVERE,9700,Oost-Vlaanderen,,0,Diensten,Bedrijf,2-10-1989,Overige industrie & diensten,Actief,1,128093,België
2,00002F96-DC68-E111-B43A-00505680000A,Limburg,Oost Limburg,KINROOI,3640,Limburg,,0,,Bedrijf,17-2-1999,,Actief,1,166644,België
3,00005967-2959-E411-8F25-005056B06EB4,Vlaams-Brabant,Regio West Side,HALLE,1500,Vlaams-Brabant,,0,,Bedrijf,,Financiële diensten,Inactief,0,217534,België
4,00006DCD-DF68-E111-B43A-00505680000A,West-Vlaanderen,Roeselare/Tielt,EMELGEM,8870,West-Vlaanderen,,0,,Bedrijf,27-12-2001,Technologische industrie & diensten,Inactief,0,81445,België


In [331]:
new_col_names = create_column_names(data)

In [332]:
data.rename(
    columns=new_col_names, inplace=True)

PRIMARY_KEY = new_col_names[PRIMARY_KEY]
FOREIGN_KEY = [(new_col_names[fk[0]], fk[1]) for fk in FOREIGN_KEY]
FOREIGN_KEY = [(fk[0], change_fk_name(fk[1])) for fk in FOREIGN_KEY]

In [333]:
def generate_create_table_query():
    sql_data_types = {
        'int64': 'INT',
        'float64': 'FLOAT',
        'object': 'VARCHAR(255)',
        'datetime64[ns]': 'DATE',
    }

    table_name = FILE_NAME[:-10]  # Remove the '.csv' extension and the 'fixed' to use as the table name
    table_name = table_name.replace(' ', '_')  # Replace spaces with underscores to prevent SQL errors

    # Create a list of column definitions with each column on a new line
    columns_list = [f'{col} {sql_data_types.get(str(dtype), "VARCHAR(255)")}' for col, dtype in zip(data.columns, data.dtypes)]

    # Include PRIMARY KEY and FOREIGN KEY columns if provided
    if PRIMARY_KEY:
        # find which column is the primary key
        primary_key_index = data.columns.get_loc(PRIMARY_KEY)
        # add PRIMARY KEY to the column definition
        columns_list[primary_key_index] = f'{PRIMARY_KEY} INT NOT NULL PRIMARY KEY IDENTITY(1,1)'
    if FOREIGN_KEY:
        # loop through the list of FOREIGN KEY tuples and add each FOREIGN KEY to the column definition
        for foreign_key in FOREIGN_KEY:
            columns_list.append(f'FOREIGN KEY ({foreign_key[0]}) REFERENCES {foreign_key[1].split(".")[0]}({foreign_key[1].split(".")[1]})')
            
    # Join the column definitions into a string with each column on a new line
    columns = ',\n'.join(columns_list)

    # Generate the CREATE TABLE query
    create_table_query = f'IF NOT EXISTS (SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = \'{table_name}\')\nBEGIN\nCREATE TABLE {table_name} (\n{columns});\nEND'

    # Print the generated CREATE TABLE query
    print(create_table_query)
    print()

    ################################################
    ### Generate the SQL_Alchemy class notation ####
    ################################################
    
    # Generate the SQLAlchemy class representation
    class_representation = f'class {table_name.capitalize()}(Base):\n'
    class_representation += f'    __tablename__ = \'{table_name}\'\n'

    for i, (col, dtype) in enumerate(zip(data.columns, data.dtypes)):
        # Change VARCHAR(255) to String(255)
        sqlalchemy_type = 'String(255)' if sql_data_types.get(str(dtype), None) == 'VARCHAR(255)' else sql_data_types.get(str(dtype), 'String(255)')
        
        # Skip the column with the primary key, if applicable
        if primary_key_index is not None and i == primary_key_index:
            class_representation += f'    {col} = Column({sqlalchemy_type}, primary_key=True)\n'
        else:
            class_representation += f'    {col} = Column({sqlalchemy_type})\n'
                
    # Add foreign keys to the class representation, if applicable
    if FOREIGN_KEY:
        for foreign_key_tuple in FOREIGN_KEY:
            # find datatype of foreign key column
            foreign_key_column = data[foreign_key_tuple[0]]
            foreign_key_dtype = str(foreign_key_column.dtype)
            sqlalchemy_type = 'String(255)' if sql_data_types.get(foreign_key_dtype, None) == 'VARCHAR(255)' else sql_data_types.get(foreign_key_dtype, 'String(255)')
            # replace the column definition with the column definition and foreign key
            fk_str_rep = f'{foreign_key_tuple[0]} = Column({sqlalchemy_type}, ForeignKey(\'{foreign_key_tuple[1]}\'))'
            class_representation = class_representation.replace(foreign_key_tuple[0], fk_str_rep)
            class_representation = class_representation.replace(f'{fk_str_rep} = Column({sqlalchemy_type})', fk_str_rep)

    print(class_representation)
    print()

In [334]:
generate_create_table_query()

IF NOT EXISTS (SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 'Account')
BEGIN
CREATE TABLE Account (
account_account_id INT NOT NULL PRIMARY KEY IDENTITY(1,1),
account_adres_geografische_regio VARCHAR(255),
account_adres_geografische_subregio VARCHAR(255),
account_adres_plaats VARCHAR(255),
account_adres_postcode VARCHAR(255),
account_adres_provincie VARCHAR(255),
account_industriezone_naam_ VARCHAR(255),
account_is_voka_entiteit INT,
account_ondernemingsaard VARCHAR(255),
account_ondernemingstype VARCHAR(255),
account_oprichtingsdatum VARCHAR(255),
account_primaire_activiteit VARCHAR(255),
account_reden_van_status VARCHAR(255),
account_status INT,
account_voka_nr_ INT,
account_adres_land VARCHAR(255));
END

class Account(Base):
    __tablename__ = 'Account'
    account_account_id = Column(String(255), primary_key=True)
    account_adres_geografische_regio = Column(String(255))
    account_adres_geografische_subregio = Column(String(255))
    account_adres_plaats = Column(