## Imports

In [2]:
# Imports for all modules needed.
import mysql.connector
import sys
import json
from datetime import datetime
import pandas as pd
sys.path.append('../DataGeneration')
from license_data_generator import License, Corrupt, Validate, Stats

## Database Setup

In [3]:
# Connect to the MySQL server (no database is selected yet).
# NOTE(review): credentials are hard-coded; acceptable for a local sandbox only.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
)

try:
    mycursor = mydb.cursor()

    # Create the license database only when it is missing. IF NOT EXISTS keeps
    # the cell re-runnable without a bare `except`, which would also hide real
    # failures such as missing privileges or a lost connection.
    mycursor.execute("CREATE DATABASE IF NOT EXISTS licensedatav2")

    # List all available databases.
    mycursor.execute("SHOW DATABASES")
    for database_row in mycursor:
        print(database_row)
finally:
    # This connection is only used for setup, so release it here.
    mydb.close()

('information_schema',)
('licensedata',)
('licensedatav2',)
('mysql',)
('performance_schema',)
('sys',)


### Dataset Registry Table Creation

In [4]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)

# CREATE TABLE query for dataset_registry.
# dataset_id is the surrogate primary key. Dataset (the dataset name) is
# declared UNIQUE so each dataset is registered once and so the column can be
# the target of a foreign key: MySQL only accepts a foreign key whose
# referenced column is indexed.
# NOTE: IF NOT EXISTS leaves an already existing table untouched, so a
# dataset_registry table created earlier keeps its old definition.
create_dataset_registry_query = """
CREATE TABLE IF NOT EXISTS dataset_registry (
    dataset_id INT AUTO_INCREMENT PRIMARY KEY,
    Dataset VARCHAR(255) UNIQUE,
    Total_Entries INT,
    Corrupt_Fields INT,
    Corruption_Percentage DECIMAL(5,2),
    Creation_Time TIMESTAMP,
    `First Name Corruption` INT,
    `Last Name Corruption` INT,
    `Date of Birth Corruption` INT,
    `Place of Birth Corruption` INT,
    `Gender Corruption` INT,
    `Date of Issue Corruption` INT,
    `Date of Expiry Corruption` INT,
    `Issuing Authority Corruption` INT,
    `License Number Corruption` INT,
    `Address Corruption` INT,
    Metadata VARCHAR(255)
);
"""

try:
    mycursor = mydb.cursor()

    # Execute the CREATE TABLE query.
    mycursor.execute(create_dataset_registry_query)
    mydb.commit()
finally:
    # Always release the connection, even when the statement fails.
    mydb.close()

### License Data Table Creation

In [5]:
# Build a zero-row dataset whose only purpose is to expose the column names,
# including the extra "Dataset" column.
header = Validate.validate(License.generate_dataset(0)).assign(Dataset="header")
header.head(0)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset


In [6]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)

table_name = 'license_data'

# One VARCHAR(255) column per DataFrame column, after the auto-increment id.
# Column names are wrapped in backticks because they contain spaces.
# `header` already carries a "Dataset" column, so that column is created here
# and must not be added a second time by the ALTER TABLE below.
column_definitions = ["id INT AUTO_INCREMENT PRIMARY KEY"]
column_definitions += [f"`{column}` VARCHAR(255)" for column in header.columns]
create_table_query = (
    f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(column_definitions)});"
)

# ALTER TABLE query that links license_data.Dataset to dataset_registry.Dataset.
alter_license_data_query = f"""
ALTER TABLE {table_name}
ADD CONSTRAINT fk_dataset
    FOREIGN KEY (Dataset)
    REFERENCES dataset_registry(Dataset);
"""

try:
    mycursor = mydb.cursor()
    mycursor.execute(create_table_query)

    try:
        mycursor.execute(alter_license_data_query)
    except mysql.connector.Error as err:
        # The constraint stays best-effort (it fails, for example, when it
        # already exists or when dataset_registry.Dataset is not indexed), but
        # the reason is reported instead of being silently discarded.
        print(f"Foreign key fk_dataset was not added: {err}")

    mydb.commit()
finally:
    # Always release the connection, even when table creation fails.
    mydb.close()

## Dataset Creation

### Normal License Data

In [7]:
# Generate 1000 licenses, pass them through Validate.validate and tag every
# row with the dataset name.
normal_dataset = Validate.validate(License.generate_dataset(1000)).assign(
    Dataset="normal_dataset"
)
normal_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Carole,0,Nolan,0,21.02.1964,0,Greenland,0,Female,0,...,0,19.06.2030,0,DA1,0,NOLAN652214C99IZ,0,"286 Robert cape, Davieschester, S99 3EY",0,normal_dataset
1,Steven,0,Matthews,0,04.05.1952,0,Greece,0,Male,0,...,0,30.12.1983,0,DA1,0,MATTH505042S99BB,0,"50 Yvonne station, Newtonmouth, E0 2UZ",0,normal_dataset
2,Russell,0,Patel,0,13.12.1957,0,North Macedonia,0,Male,0,...,0,05.11.2002,0,DA1,0,PATEL512137R99WA,0,"669 Charlene drives, Lake Marieland, SG1 3TH",0,normal_dataset
3,Gavin,0,Roberts,0,14.02.1974,0,Svalbard & Jan Mayen Islands,1,Female,0,...,0,20.12.2024,0,DA1,0,ROBER752144G99RK,0,"661 Lewis circles, Port Deanhaven, W5S 7UF",0,normal_dataset
4,Jeremy,0,Walton,0,05.01.2004,0,Yemen,0,Female,0,...,0,14.03.2033,0,DA1,0,WALTO051054J99LB,0,"88 O'Sullivan underpass, North Lauren, G2E 3RF",1,normal_dataset


### Corrupt License Data

In [8]:
# Generate 1000 licenses, corrupt them with Corrupt.introduce_corruptions
# (second argument 0.2, presumably the corruption rate), then validate the
# result and tag every row with the dataset name.
all_corrupt_dataset = Validate.validate(
    Corrupt.introduce_corruptions(License.generate_dataset(1000), 0.2)
).assign(Dataset="all_corrupt_dataset")
all_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,<len,1,Dale,0,28.05.1958,0,Tur.menistan,1,Male,0,...,1,19.01.2000,1,DA1,0,DAL19505288G99UA,1,"389 Johnson trafficway, Joannemouth, G24 6TW",0,all_corrupt_dataset
1,K[mb<[_ey,1,Jones,0,07.10.1956,0,Canada,0,Female,0,...,0,01.08.1987,0,DA1,0,JONES560076K99BB,0,,1,all_corrupt_dataset
2,^hris@ian,1,G%een,1,08.09.1950,0,Malaysia,0,Male,0,...,0,31.07.1981,0,DA1,0,GREEN50J090C99KIZ,1,"335 Jeremy ports, Port Gareth, L1J 1RG",0,all_corrupt_dataset
3,Holli\,1,Pa#mer,1,13.08.1984,0,G1or4ia,1,Adipisci,1,...,0,19.01.2017,0,01N,1,PALME85813H99FZ,1,"217 Olivia springs, Mohammadmouth, DH2 0ZD",0,all_corrupt_dataset
4,Ricky,0,Bri2gs,1,12.10.2004,0,Ma<ot.e,1,Female,0,...,0,09.08.2033,0,DDEF,1,BRIGG060124R99ZA,1,"124 Jade track, North Lyndabury",1,all_corrupt_dataset


## Database Population

### Dataset Registry Table

In [9]:
# Run Stats.attribute_corruption on both datasets; each result is later
# indexed by license field name (e.g. 'First Name').
normal_dataset_attributes, all_corrupt_dataset_attributes = map(
    Stats.attribute_corruption, (normal_dataset, all_corrupt_dataset)
)

In [10]:
# Display what Stats.is_corrupt reports for the normal dataset.
# NOTE(review): confirm whether this value is a count of corrupt fields or a
# yes/no flag before relying on it as a "corrupt fields" figure.
Stats.is_corrupt(normal_dataset)

1

In [11]:
# Summarize the information for each dataset to be uploaded into the database registry table.

# License fields that each have a "<field> Corruption" entry in the summary.
_LICENSE_FIELDS = (
    'First Name',
    'Last Name',
    'Date of Birth',
    'Place of Birth',
    'Gender',
    'Date of Issue',
    'Date of Expiry',
    'Issuing Authority',
    'License Number',
    'Address',
)


def _registry_entry(name, dataset, attributes, description):
    """Build the registry summary dict for one dataset.

    Args:
        name: Dataset name stored under 'Dataset'.
        dataset: The dataset itself; passed to len(), Stats.is_corrupt and
            Stats.corrupt_percent.
        attributes: Mapping indexed by license field name, as returned by
            Stats.attribute_corruption for this dataset.
        description: Free-text description stored in the metadata.

    Returns:
        A dict whose keys appear in this order: the five general fields, one
        '<field> Corruption' entry per license field, then 'Metadata'.
    """
    entry = {
        'Dataset': name,
        'Total_Entries': len(dataset),
        'Corrupt_Fields': Stats.is_corrupt(dataset),
        'Corruption_Percentage': Stats.corrupt_percent(dataset),
        # Year-first format: MySQL reads delimited date strings as
        # year-month-day, so a day.month.year string ends up stored as the
        # wrong date or is rejected as invalid.
        'Creation_Time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    entry.update(
        {f'{field} Corruption': attributes[field] for field in _LICENSE_FIELDS}
    )
    entry['Metadata'] = {
        'description': description,
        'language': 'English',
        'license': 'ODC-By',
        'dataset_source': 'https://github.com/AatishDA1/PracticeSDProject.git',
    }
    return entry


datasets_info = [
    _registry_entry(
        'normal_dataset',
        normal_dataset,
        normal_dataset_attributes,
        'This dataset contains uncorrupted synthetic data for driver license data.',
    ),
    _registry_entry(
        'all_corrupt_dataset',
        all_corrupt_dataset,
        all_corrupt_dataset_attributes,
        'This dataset contains corrupted synthetic data for driver license data.',
    ),
]

In [12]:
def insert_dataset_info(dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
                        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr,
                        authority_corr, license_num_corr, address_corr, metadata, cursor=None):
    """Insert one summary row into the dataset_registry table.

    The statement is parameterized; nothing is committed here, so the caller
    is responsible for committing on its connection.

    Args:
        dataset_name: Value for the Dataset column.
        total_entries: Value for Total_Entries.
        corrupt_fields: Value for Corrupt_Fields.
        corruption_percentage: Value for Corruption_Percentage.
        creation_time: Value for Creation_Time.
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr,
        expiry_corr, authority_corr, license_num_corr, address_corr: Values
            for the ten per-field "... Corruption" columns, in that order.
        metadata: JSON-serializable object; stored as its json.dumps() text.
        cursor: Optional cursor to execute on. When omitted, the module-level
            `mycursor` is used, which is how existing callers work.
    """
    insert_query = """
    INSERT INTO dataset_registry (
        Dataset, Total_Entries, Corrupt_Fields, Corruption_Percentage, Creation_Time,
        `First Name Corruption`, `Last Name Corruption`, `Date of Birth Corruption`, `Place of Birth Corruption`,
        `Gender Corruption`, `Date of Issue Corruption`, `Date of Expiry Corruption`, `Issuing Authority Corruption`,
        `License Number Corruption`, `Address Corruption`, Metadata
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    values = (
        dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr, authority_corr,
        license_num_corr, address_corr, json.dumps(metadata)
    )
    # An explicit cursor removes the hidden dependency on a global; the global
    # remains the fallback so existing 16-argument calls keep working.
    active_cursor = mycursor if cursor is None else cursor
    active_cursor.execute(insert_query, values)

In [13]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)

# Keys of a datasets_info entry, listed in the positional order that
# insert_dataset_info expects its arguments.
registry_keys = (
    'Dataset',
    'Total_Entries',
    'Corrupt_Fields',
    'Corruption_Percentage',
    'Creation_Time',
    'First Name Corruption',
    'Last Name Corruption',
    'Date of Birth Corruption',
    'Place of Birth Corruption',
    'Gender Corruption',
    'Date of Issue Corruption',
    'Date of Expiry Corruption',
    'Issuing Authority Corruption',
    'License Number Corruption',
    'Address Corruption',
    'Metadata',
)

try:
    # insert_dataset_info executes through the module-level `mycursor`.
    mycursor = mydb.cursor()

    # Insert information for each dataset into the dataset_registry table.
    for dataset in datasets_info:
        insert_dataset_info(*(dataset[key] for key in registry_keys))

    # Commit once, after every row was accepted; if an insert raises, nothing
    # is committed.
    mydb.commit()
finally:
    # Always release the connection, even when an insert fails.
    mydb.close()

### License Data Table

In [14]:
# Connect to mySQL database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)

# Create list of datasets to be uploaded.
datasets_list = [normal_dataset, all_corrupt_dataset]

try:
    mycursor = mydb.cursor()

    # Insert the rows of each DataFrame into the 'license_data' table.
    for dataset_df in datasets_list:
        # The statement depends only on the frame's columns, so build it once
        # per frame instead of once per row. Values stay parameterized.
        columns = ", ".join(f"`{col}`" for col in dataset_df.columns)
        placeholders = ", ".join(["%s"] * len(dataset_df.columns))
        insert_query = f"INSERT INTO license_data ({columns}) VALUES ({placeholders})"

        # Cast to object so values are plain Python objects, and turn missing
        # values (NaN) into None so they are sent as SQL NULL.
        cleaned_df = dataset_df.astype(object).where(dataset_df.notna(), None)
        rows = list(cleaned_df.itertuples(index=False, name=None))
        if not rows:
            continue

        # One batched call per frame instead of one round trip per row.
        mycursor.executemany(insert_query, rows)

    # Commit once, after every frame was inserted.
    mydb.commit()
finally:
    # Always release the connection, even when an insert fails.
    mydb.close()