## Imports

In [1]:
# Imports for all modules needed.
import mysql.connector
import sys
import json
from datetime import datetime
import pandas as pd
sys.path.append('../DataGeneration')
from license_data_generator import License, Corrupt

## Database Setup

In [2]:
# Connect to mySQL.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password"
)

mycursor = mydb.cursor()

# Create a database for license data if it doesn't already exist.
try:
    mycursor.execute("CREATE DATABASE licensedatav2")
except:
   pass

# List all available databases.
mycursor.execute("SHOW DATABASES")

for x in mycursor:
  print(x)

('information_schema',)
('licensedata',)
('mysql',)
('performance_schema',)
('sys',)


## Normal License Data

In [3]:
# Creating a normal dataset.
test_dataset = License.generate_dataset(100)
test_dataset.head()

Unnamed: 0,First Name,Last Name,Date of Birth,Place of Birth,Gender,Date of Issue,Date of Expiry,Issuing Authority,License Number,Address
0,Trevor,Sutton,31.05.1979,Cyprus,Male,14.08.2020,12.08.2030,DA1,SUTTO705319T99EY,"706 Allen meadows, Elliotchester, LD99 9SE"
1,Charlotte,Gibbons,06.10.1982,Albania,Female,29.10.2011,26.10.2021,DA1,GIBBO860062C99HU,"312 Long village, North Ellie, L8 6UZ"
2,Heather,Ingram,29.11.1991,Monaco,Male,03.01.2019,31.12.2028,DA1,INGRA911291H99BW,"654 Rosie locks, Murphyland, DT06 5DX"
3,Josh,Douglas,26.10.1962,Uganda,Female,25.03.2006,22.03.2016,DA1,DOUGL660262J99MC,"527 Stuart ridges, Lake Jordanfurt, L3C 6AD"
4,Leanne,Fraser,30.11.1985,Syrian Arab Republic,Male,11.02.2007,08.02.2017,DA1,FRASE811305L99TH,"237 Sam glens, East Josephine, PH60 9AP"


In [4]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)
mycursor = mydb.cursor()

# Create a table in MySQL (if it doesn't exist) based on the columns in the dataframe. 
table_name = 'test_license_dataset'
create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, "

# Extract column names from DataFrame and generate SQL query with backticks for column names to avoid conflicts with variable names.
for column in test_dataset.columns:
    column_name = column
    create_table_query += f"`{column_name}` VARCHAR(255), "

# Complete the SQL query and create the table.
try:
    create_table_query = create_table_query.rstrip(', ') + ");"
    mycursor.execute(create_table_query)
    mydb.commit()
except:
    pass

# Insert DataFrame data into the MySQL table.
insert_query = f"INSERT INTO {table_name} ("
insert_query += ", ".join([f"`{col}`" for col in test_dataset.columns]) + ") VALUES ("
insert_query += ", ".join(["%s" for _ in range(len(test_dataset.columns))]) + ")"

# Iterate through DataFrame rows to insert data.
try:
    for _, row in test_dataset.iterrows():
        mycursor.execute(insert_query, tuple(row))
except:
    pass

mydb.commit()

## Corrupt License Data

In [5]:
# Creating a dataset with corrupt entries.
corrupted_test_dataset = Corrupt.introduce_corruptions(License.generate_dataset(100), 0.1)
corrupted_test_dataset.head()

Unnamed: 0,First Name,Last Name,Date of Birth,Place of Birth,Gender,Date of Issue,Date of Expiry,Issuing Authority,License Number,Address
0,Nicole,Jo<nson,18.05.1988,Hond$ras,Female,15.10.2015,12.10.2025,DA1,JOHNS855188N99KY,"850 Peter inlet, East Amanda, N6 2EB"
1,6ack,Hilton,15.02.1985,Saint B]rthel'my,Female,12.12.2020,10.12.2030,DA1,HILTO852155J99IM,"104 Wendy fort, Catherineside, WA23 2PD"
2,Naomi,King,17.07.1994,Ukrai\e,Male,18.01.2015,15.01.2025,DA1,KING9907174N99UI,"835 Angela roads, Harrisonhaven, BD2A 5WT"
3,Jack,Ri?h}rdson,22.07.1956,Namibia,Female,23.08.2019,20.08.2029,DA1,RICHA557226J99BY,"831 Cooke street, Lake Andreaberg, S1H 4NE"
4,Valerie,Shaw,01.04.2006,Sierra Leone,Male,13.03.2023,10.03.2033,DA1,SHAW9001046V99BT,"307 Payne via, Lukemouth, E65 0TX"


In [6]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)
mycursor = mydb.cursor()


# Create a table in MySQL (if it doesn't exist) based on the columns in the dataframe. 
table_name = 'corrupt_test_license_dataset'
create_table_query = f"CREATE TABLE IF NOT EXISTS {table_name} (id INT AUTO_INCREMENT PRIMARY KEY, "

# Extract column names from DataFrame and generate SQL query with backticks for column names to avoid conflicts with variable names.
for column in corrupted_test_dataset.columns:
    column_name = column
    create_table_query += f"`{column_name}` VARCHAR(255), "

# Complete the SQL query and create the table.
try:
    create_table_query = create_table_query.rstrip(', ') + ");"
    mycursor.execute(create_table_query)
    mydb.commit()
except:
    pass

# Insert DataFrame data into the MySQL table.
insert_query = f"INSERT INTO {table_name} ("
insert_query += ", ".join([f"`{col}`" for col in corrupted_test_dataset.columns]) + ") VALUES ("
insert_query += ", ".join(["%s" for _ in range(len(corrupted_test_dataset.columns))]) + ")"

# Iterate through DataFrame rows to insert data.
try:
    for _, row in corrupted_test_dataset.iterrows():
        mycursor.execute(insert_query, tuple(row))
except:
    pass

mydb.commit()

## Summary Table Creation

In [7]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedata"
)
mycursor = mydb.cursor()

# Create a table to store general information and the metadata about each dataset.
metadata_table_query = """
    CREATE TABLE IF NOT EXISTS dataset_registry (
        table_id INT AUTO_INCREMENT PRIMARY KEY,
        table_name VARCHAR(255),
        num_rows INT,
        num_columns INT,
        metadata JSON
    )
"""

try:
    mycursor.execute(metadata_table_query)
    mydb.commit()
except:
    pass

In [8]:
dataset_metadata = {   
    # Metadata for the test dataset.
    'test_license_dataset' : {
        'name': 'English',
        'description': 'This dataset contains synthetic data for driver license data.',
        'language': 'English',
        'creation_date': datetime.now().strftime("%d.%m.%y %H:%M:%S"),
        'dataframe_size': test_dataset.shape,
        'columns': test_dataset.columns.tolist(),
        'corruption': 'Needs to be created',
        'num_corrupted_entries': 'Needs to be created',
        'dataset_source': 'https://github.com/AatishDA1/PracticeSDProject.git'
    },

    # Metadata for the corrupt test dataset.
    'corrupt_test_license_dataset' : {
        'description': 'This dataset contains synthetic data for driver license data, with some corruptions.',
        'language': 'English',
        'creation_date': datetime.now().strftime("%d.%m.%y %H:%M:%S"),
        'dataframe_size': test_dataset.shape,
        'columns': test_dataset.columns.tolist(),
        'corruption': 'Needs to be created',
        'num_corrupted_entries': 'Needs to be created',
        'dataset_source': 'https://github.com/AatishDA1/PracticeSDProject.git'
    }
}

In [10]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedata"
)
mycursor = mydb.cursor()

# Fetch all tables in the database.
all_tables_query = "SHOW TABLES"
mycursor.execute(all_tables_query)
all_tables = mycursor.fetchall()

# Filter out the 'database_registry' table as that is what we are trying to populate. 
tables = [table for table in all_tables if table not in tables]

# Iterate through every table.
for table in tables:
    table_name = table[0]
    
    # Query to count the number of rows in the table.
    count_rows_query = f"SELECT COUNT(*) FROM {table_name}"
    mycursor.execute(count_rows_query)
    num_rows = mycursor.fetchone()[0]
    
    # Query to count the number of columns in the table.
    count_columns_query = f"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}'"
    mycursor.execute(count_columns_query)
    num_columns = mycursor.fetchone()[0]

    # Get metadata for the current table from the previously defined dictionairy.
    if table_name in dataset_metadata:
        metadata = dataset_metadata[table_name]
    else:
        metadata = {}  # Set the default to be empty metadata if not found.

    # Insert table information and metadata into the dataset registry table. 
    dataset_registry_query = "INSERT INTO dataset_registry (table_name, num_rows, num_columns, metadata) VALUES (%s, %s, %s, %s)"
    mycursor.execute(dataset_registry_query, (table_name, num_rows, num_columns, json.dumps(metadata)))


mydb.commit()
mydb.close()
