## Imports

In [1]:
# Imports for all modules needed.
import mysql.connector
import sys
import json
from datetime import datetime
import pandas as pd
sys.path.append('../DataGeneration')
from license_data_generator import License, Corrupt, Validate, Stats

## Database Setup

In [2]:
# Connect to the MySQL server. No schema is selected here because the
# database may not exist yet.
# NOTE(review): the credentials are hard-coded; consider loading them from
# the environment before sharing this notebook.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password"
)

try:
    mycursor = mydb.cursor()

    # IF NOT EXISTS makes the cell safe to re-run without swallowing every
    # exception; real failures (permissions, lost connection) now surface.
    mycursor.execute("CREATE DATABASE IF NOT EXISTS licensedatav2")

    # List all available databases.
    mycursor.execute("SHOW DATABASES")

    for x in mycursor:
        print(x)
finally:
    # Later cells open their own connections, so release this one.
    mydb.close()

('information_schema',)
('licensedata',)
('licensedatav2',)
('mysql',)
('performance_schema',)
('sys',)


### Dataset Registry Table Creation

In [3]:
# CREATE TABLE query for dataset_registry.
# NOTE: Dataset is deliberately not the primary key; the surrogate
# dataset_id is used instead so that the same dataset name may appear in
# more than one row, which a primary key would not allow.
create_dataset_registry_query = """
CREATE TABLE IF NOT EXISTS dataset_registry (
    dataset_id INT AUTO_INCREMENT PRIMARY KEY,
    Dataset VARCHAR(255), 
    Total_Entries INT,
    Corrupt_Fields INT,
    Corruption_Percentage DECIMAL(5,2),
    Creation_Time TIMESTAMP,
    `First Name Corruption` INT,
    `Last Name Corruption` INT,
    `Date of Birth Corruption` INT,
    `Place of Birth Corruption` INT,
    `Gender Corruption` INT,
    `Date of Issue Corruption` INT,
    `Date of Expiry Corruption` INT,
    `Issuing Authority Corruption` INT,
    `License Number Corruption` INT,
    `Address Corruption` INT,
    Metadata LONGTEXT
);
"""

# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)

try:
    mycursor = mydb.cursor()

    # Execute the CREATE TABLE query and commit it.
    mycursor.execute(create_dataset_registry_query)
    mydb.commit()
finally:
    # Close the connection even when the DDL statement fails.
    mydb.close()

### License Data Table Creation

In [4]:
# Only the column names are needed here, so a dataset is generated with a
# size argument of 0, validated, and tagged like the real datasets.
header = License.generate_dataset(0)
header = Validate.validate(header)
header["Dataset"] = "header"
header.head(0)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset


In [5]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)
mycursor = mydb.cursor()

table_name = 'license_data'

# Every DataFrame column becomes a VARCHAR(255) column. The names are wrapped
# in backticks because they contain spaces.
column_definitions = ["id INT AUTO_INCREMENT PRIMARY KEY"]
column_definitions += [f"`{column}` VARCHAR(255)" for column in header.columns]
create_table_query = (
    f"CREATE TABLE IF NOT EXISTS {table_name} ("
    + ", ".join(column_definitions)
    + ");"
)

# `header` normally already carries a Dataset column, so adding it again
# would fail with a duplicate-column error and take the foreign key down
# with it. Only add the column when it is genuinely missing.
alter_clauses = []
if "Dataset" not in header.columns:
    alter_clauses.append("ADD COLUMN Dataset VARCHAR(255)")
alter_clauses.append(
    "ADD CONSTRAINT fk_dataset FOREIGN KEY (Dataset) "
    "REFERENCES dataset_registry(Dataset)"
)
alter_license_data_query = (
    f"ALTER TABLE {table_name} " + ", ".join(alter_clauses) + ";"
)

# InnoDB only accepts a foreign key when the referenced column is indexed,
# so the index on dataset_registry.Dataset is created before the constraint.
# NOTE(review): the index is non-unique so repeated registry rows keep
# working; some MySQL configurations may insist on a unique key for foreign
# key targets - confirm against the server in use.
schema_steps = [
    ("create the license_data table", create_table_query),
    (
        "index dataset_registry.Dataset",
        "CREATE INDEX idx_dataset_registry_dataset ON dataset_registry (Dataset);",
    ),
    ("add the fk_dataset foreign key", alter_license_data_query),
]

try:
    for step_description, statement in schema_steps:
        try:
            mycursor.execute(statement)
        except mysql.connector.Error as err:
            # Still best-effort (re-running the cell reports the index and
            # constraint as already existing), but no longer silent.
            print(f"Could not {step_description}: {err}")

    # Commit changes.
    mydb.commit()
finally:
    # Close the connection whatever happened above.
    mydb.close()

## Dataset Creation

### Normal License Dataset

In [6]:
# Baseline dataset: generated and validated with no corruption step applied.
normal_dataset = Validate.validate(License.generate_dataset(100))
normal_dataset["Dataset"] = "normal_dataset"
normal_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Lawrence,0,Poole,0,31.05.1957,0,Senegal,0,Male,0,...,0,14.09.2006,0,DA1,0,POOLE505317L99AY,0,"904 King brooks, Port Edwardland, N5 4PS",0,normal_dataset
1,Ian,0,Williams,0,16.01.2000,0,Tonga,0,Male,0,...,0,06.10.2027,0,DA1,0,WILLI001160I99YN,0,"421 Pope forks, West Michaelview, TR7 4TR",0,normal_dataset
2,Hannah,0,Davies,0,08.02.1968,0,Ethiopia,0,Female,0,...,0,07.04.2007,0,DA1,0,DAVIE652088H99SU,0,"450 Berry via, East Albertstad, FY9 7NU",0,normal_dataset
3,Donna,0,Edwards,0,12.12.1960,0,Chile,0,Female,0,...,0,04.06.2015,0,DA1,0,EDWAR662120D99TX,0,"850 Edwards overpass, Deborahport, G5 0RA",0,normal_dataset
4,Bethan,0,Davies,0,28.02.1980,0,Netherlands,0,Male,0,...,0,14.06.2024,0,DA1,0,DAVIE802280B99GO,0,"971 O'Donnell mount, Taylortown, SG3H 8UQ",1,normal_dataset


### All Corrupt License Dataset

In [7]:
# Dataset passed through introduce_corruptions with an argument of 1
# (intended to contain only corrupt entries).
all_corrupt_dataset = License.generate_dataset(100)
all_corrupt_dataset = Corrupt.introduce_corruptions(all_corrupt_dataset, 1)
all_corrupt_dataset = Validate.validate(all_corrupt_dataset)
all_corrupt_dataset["Dataset"] = "all_corrupt_dataset"
all_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,)(]&%],1,"""#9"".*",1,09/05/1995,1,"05[^-2@2!!""})5-#\#81*1:$`|,/&""3&;8%.${&;=<1\",1,Culpa,1,...,1,69.91.2028,1,A,1,TURNE905095A994M,1,,1,all_corrupt_dataset
1,")2*;?#,",1,")""./",1,89.58.1951,1,"^""/""~'^+`{&^]7]",1,Quae,1,...,1,83.18.1984,1,ZPBF,1,FOD9556161J99LY,1,395 Sally passage,1,all_corrupt_dataset
2,~1:.,1,"(,02",1,34.37.1970,1,0639,1,Rem,1,...,1,32.57.2028,1,I8FI,1,WEST976150J99UL,1,,1,all_corrupt_dataset
3,",'|5;[__'",1,]+`>9,1,80.45.1978,1,"/,#@*_~*]-7%/3;1^",1,Dolores,1,...,1,16/08/2017,1,Q,1,SMETH708138S99IH,1,652 Law keys,1,all_corrupt_dataset
4,<6-.,1,{$-&(,1,05/11/1956,1,"@^4!$\|=7:""1/9$_0&}",1,Facere,1,...,1,70.72.1992,1,S,1,JOYCE511056M99IW7,1,547 Smith expressway,1,all_corrupt_dataset


### Some Corrupt License Dataset

In [8]:
# Dataset passed through introduce_corruptions with an argument of 0.2
# (intended to contain some corrupt entries).
some_corrupt_dataset = License.generate_dataset(100)
some_corrupt_dataset = Corrupt.introduce_corruptions(some_corrupt_dataset, 0.2)
some_corrupt_dataset = Validate.validate(some_corrupt_dataset)
some_corrupt_dataset["Dataset"] = "some_corrupt_dataset"
some_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Raymo}d,1,"!c""onal\",1,26.01.1954,0,Al4eria,1,Male,0,...,0,21.05.2013,0,21NQ,1,MGDON50164R99OJ5,1,"116 Wendy freeway, West Howardfort, KA8E 3LR",0,some_corrupt_dataset
1,Emma,0,Pe~co\k,1,23.10.1988,0,Gh>na,1,Ut,1,...,1,19.01.2021,1,DA1,0,PEACO860238E99IS,1,"340 Ali rapids, West Marcus, WS0 3QB",0,some_corrupt_dataset
2,Josh,0,Patel,0,16.09.1987,0,Je)sey,1,Female,0,...,0,09.01.2031,0,DA1,0,PATEL859167J99VD,0,"38 Eric dale, East Dominicburgh, N6 9FR",0,some_corrupt_dataset
3,63eanor,1,C'llins,1,02.07.1987,0,Peru,0,Male,0,...,0,01.02.2033,0,DA1,0,COLLI802077E99YA,1,859 Christian wall,1,some_corrupt_dataset
4,#arre@,1,K/ng,1,21.12.1953,0,Sw$tzerla7d,1,Female,0,...,0,05.01.2017,1,DA1,0,KING9562213D99RC,1,"926 Malcolm vista, Port Amanda, S3S 9ZE",0,some_corrupt_dataset


### Only License Number Corrupt Dataset

In [9]:
# Dataset in which the corruption step is applied to the 'License Number'
# column only; no other column is passed to a corruption routine.
only_lnum_corrupt_dataset = License.generate_dataset(100)
only_lnum_corrupt_dataset = Corrupt.introduce_drivernum_corruption(
    only_lnum_corrupt_dataset, 'License Number', 1
)
only_lnum_corrupt_dataset = Validate.validate(only_lnum_corrupt_dataset)
only_lnum_corrupt_dataset["Dataset"] = "only_lnum_corrupt_dataset"
only_lnum_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Olivia,0,Freeman,0,01.08.1977,0,Greece,0,Male,0,...,0,27.01.2014,0,DA1,0,FREEMJ0801A99DN,1,"606 Joan road, Halltown, WD0 3JW",0,only_lnum_corrupt_dataset
1,Harry,0,Barber,0,13.10.1974,0,Philippines,0,Female,0,...,0,02.01.2028,0,DA1,0,BARBE6013H99C,1,"612 Carole key, Scottville, EN7 9JL",0,only_lnum_corrupt_dataset
2,Jessica,0,Lloyd,0,19.10.1957,0,Guam,0,Male,0,...,0,08.12.2020,0,DA1,0,LLOYD51019799SJ,1,"676 Jonathan orchard, North Maxburgh, NE49 2JL",0,only_lnum_corrupt_dataset
3,Olivia,0,Byrne,0,14.09.1995,0,Gabon,0,Female,0,...,0,26.06.2026,0,DA1,0,BYRE95945O99EZ,1,"186 Glenn walk, Justinberg, M3U 2SN",0,only_lnum_corrupt_dataset
4,Sharon,0,White,0,27.06.1983,0,Holy See (Vatican City State),1,Female,0,...,0,15.02.2011,0,DA1,0,WQITE85273S99KG,1,"883 Ashley hill, New Graemeville, IM72 6YQ",0,only_lnum_corrupt_dataset


### Only Names Corrupt Dataset

In [10]:
# Dataset in which the corruption step is applied to 'First Name' and then
# 'Last Name'; no other column is passed to a corruption routine.
only_names_corrupt_dataset = License.generate_dataset(100)
only_names_corrupt_dataset = Corrupt.introduce_name_corruption(
    only_names_corrupt_dataset, 'First Name', 1
)
only_names_corrupt_dataset = Corrupt.introduce_name_corruption(
    only_names_corrupt_dataset, 'Last Name', 1
)
only_names_corrupt_dataset = Validate.validate(only_names_corrupt_dataset)
only_names_corrupt_dataset["Dataset"] = "only_names_corrupt_dataset"
only_names_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,<6[2@=,1,*&+>!,1,11.08.1987,0,Bosnia and Herzegovina,0,Female,0,...,0,01.02.2023,0,DA1,0,BROWN858117G99RC,1,"785 Forster isle, Port Rickyside, E07 3FZ",0,only_names_corrupt_dataset
1,"+1`$_/""/-",1,9<86!`,1,21.09.1968,0,Cambodia,0,Male,0,...,0,20.04.2014,0,DA1,0,ABBOT609218J99KL,1,"760 Brennan port, North Robert, N8 6LH",0,only_names_corrupt_dataset
2,+=},1,1|:},1,20.05.1965,0,Greenland,0,Female,0,...,0,23.08.2005,0,DA1,0,WOOD9655205T99KL,1,"711 Mark village, New Jean, M0 5JQ",0,only_names_corrupt_dataset
3,^'891,1,"%}"">",1,22.11.1981,0,France,0,Female,0,...,0,17.04.2030,0,DA1,0,HOPE9861221A99XA,1,"513 Kyle pines, Liamberg, E0K 7LL",0,only_names_corrupt_dataset
4,(:*,1,]<.&9,1,13.12.1968,0,Liechtenstein,0,Male,0,...,0,18.06.2006,0,DA1,0,JONES612138R99ZZ,1,"874 Bernard isle, South Jeremy, NP2 2LL",0,only_names_corrupt_dataset


### Only Dates Corrupt Dataset

In [11]:
# Dataset in which the corruption step is applied to the three date columns
# (birth, issue, expiry) in that order; no other column is passed to a
# corruption routine.
only_dates_corrupt_dataset = License.generate_dataset(100)
only_dates_corrupt_dataset = Corrupt.introduce_date_corruption(
    only_dates_corrupt_dataset, 'Date of Birth', 1
)
only_dates_corrupt_dataset = Corrupt.introduce_date_corruption(
    only_dates_corrupt_dataset, 'Date of Issue', 1
)
only_dates_corrupt_dataset = Corrupt.introduce_date_corruption(
    only_dates_corrupt_dataset, 'Date of Expiry', 1
)
only_dates_corrupt_dataset = Validate.validate(only_dates_corrupt_dataset)
only_dates_corrupt_dataset["Dataset"] = "only_dates_corrupt_dataset"
only_dates_corrupt_dataset.head()

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Frank,0,Bennett,0,25/11/2002,1,Paraguay,0,Male,0,...,1,43.49.2033,1,DA1,0,BENNE011252F99EM,0,"282 Hawkins loaf, West Sean, BN9 3ZT",0,only_dates_corrupt_dataset
1,Jeffrey,0,Coates,0,50.56.1954,1,Samoa,0,Female,0,...,1,07.19.2011,1,DA1,0,COATE557124J99LY,1,"321 Gray light, Anthonyborough, HG28 9FX",0,only_dates_corrupt_dataset
2,Samuel,0,Jones,0,82.84.2002,1,Bahrain,0,Female,0,...,1,78.83.2031,1,DA1,0,JONES059132S99LA,1,"843 Clifford mills, Lake Darren, CA6V 3AT",0,only_dates_corrupt_dataset
3,Phillip,0,Curtis,0,19/09/1989,1,Slovakia (Slovak Republic),1,Female,0,...,1,81.18.2023,1,DA1,0,CURTI859199P99NI,0,"890 Warner islands, Hayleyton, M4U 9DF",0,only_dates_corrupt_dataset
4,Lynda,0,Davis,0,04.13.1956,1,Andorra,0,Male,0,...,1,73.83.2009,1,DA1,0,DAVIS504136L99RN,1,"678 Debra terrace, Port Rebeccamouth, FK79 9JR",0,only_dates_corrupt_dataset


## Database Population

### Dataset Registry Table

In [12]:
# Run Stats.attribute_corruption over every dataset, in the order the
# datasets were created, and bind each result to its own name.
(
    normal_dataset_attributes,
    all_corrupt_dataset_attributes,
    some_corrupt_dataset_attributes,
    only_lnum_corrupt_dataset_attributes,
    only_names_corrupt_dataset_attributes,
    only_dates_corrupt_dataset_attributes,
) = map(
    Stats.attribute_corruption,
    (
        normal_dataset,
        all_corrupt_dataset,
        some_corrupt_dataset,
        only_lnum_corrupt_dataset,
        only_names_corrupt_dataset,
        only_dates_corrupt_dataset,
    ),
)

In [13]:
# Summarize the information for each dataset to be uploaded into the database registry table.

# License fields that have a matching "<field> Corruption" registry column,
# in the order those columns appear in dataset_registry.
_CORRUPTIBLE_FIELDS = (
    'First Name',
    'Last Name',
    'Date of Birth',
    'Place of Birth',
    'Gender',
    'Date of Issue',
    'Date of Expiry',
    'Issuing Authority',
    'License Number',
    'Address',
)


def _describe_dataset(name, frame, attributes, description):
    """Build the registry entry (one dict) for a single dataset.

    Args:
        name: Value stored under 'Dataset'.
        frame: The dataset itself; its length and the results of
            Stats.is_corrupt / Stats.corrupt_percent are recorded.
        attributes: Result of Stats.attribute_corruption(frame), indexed by
            field name.
        description: Free-text description stored in the metadata.

    Returns:
        A dict whose keys match the parameters of insert_dataset_info.
    """
    entry = {
        'Dataset': name,
        'Total_Entries': len(frame),
        'Corrupt_Fields': Stats.is_corrupt(frame),
        'Corruption_Percentage': Stats.corrupt_percent(frame),
        # Creation_Time is a TIMESTAMP column. MySQL reads a two-digit,
        # dot-separated string as YY.MM.DD, so the former "%d.%m.%y" format
        # silently stored the day as the year. Use the unambiguous
        # YYYY-MM-DD form instead.
        'Creation_Time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    for field in _CORRUPTIBLE_FIELDS:
        entry[f'{field} Corruption'] = attributes[field]
    entry['Metadata'] = {
        'description': description,
        'language': 'English',
        'license': 'ODC-By',
        'dataset_source': 'https://github.com/AatishDA1/PracticeSDProject.git'
    }
    return entry


datasets_info = [
    _describe_dataset(
        'normal_dataset',
        normal_dataset,
        normal_dataset_attributes,
        'This dataset contains uncorrupted synthetic data for driver license data.',
    ),
    _describe_dataset(
        'all_corrupt_dataset',
        all_corrupt_dataset,
        all_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where every value has been corrupted.',
    ),
    _describe_dataset(
        'some_corrupt_dataset',
        some_corrupt_dataset,
        some_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where some values have been corrupted.',
    ),
    _describe_dataset(
        'only_lnum_corrupt_dataset',
        only_lnum_corrupt_dataset,
        only_lnum_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the license number values have been corrupted, everything else is fine.',
    ),
    _describe_dataset(
        'only_names_corrupt_dataset',
        only_names_corrupt_dataset,
        only_names_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the first and last name values have been corrupted, everything else is fine.',
    ),
    _describe_dataset(
        'only_dates_corrupt_dataset',
        only_dates_corrupt_dataset,
        only_dates_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the date values (Birth/Issue/Expiry) have been corrupted, everything else is fine.',
    ),
]

In [14]:
def insert_dataset_info(dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
                        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr,
                        authority_corr, license_num_corr, address_corr, metadata, *, cursor=None):
    """Insert one row into the dataset_registry table.

    The statement is parameterized, so the values are passed separately
    from the SQL text. The caller is responsible for committing.

    Args:
        dataset_name: Value for the Dataset column.
        total_entries: Value for Total_Entries.
        corrupt_fields: Value for Corrupt_Fields.
        corruption_percentage: Value for Corruption_Percentage.
        creation_time: Value for the Creation_Time TIMESTAMP column.
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr,
        expiry_corr, authority_corr, license_num_corr, address_corr: Values
            for the ten "<field> Corruption" columns, in table order.
        metadata: JSON-serializable object; stored as a JSON string in the
            Metadata column.
        cursor: Optional cursor to execute on. When omitted, the
            module-level `mycursor` is used, as before.
    """
    insert_query = """
    INSERT INTO dataset_registry (
        Dataset, Total_Entries, Corrupt_Fields, Corruption_Percentage, Creation_Time,
        `First Name Corruption`, `Last Name Corruption`, `Date of Birth Corruption`, `Place of Birth Corruption`,
        `Gender Corruption`, `Date of Issue Corruption`, `Date of Expiry Corruption`, `Issuing Authority Corruption`,
        `License Number Corruption`, `Address Corruption`, Metadata
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    values = (
        dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr, authority_corr,
        license_num_corr, address_corr, json.dumps(metadata)
    )
    # Fall back to the notebook-wide cursor only when none was supplied, so
    # existing positional calls keep working unchanged.
    target_cursor = mycursor if cursor is None else cursor
    target_cursor.execute(insert_query, values)

In [15]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)
# insert_dataset_info executes on this module-level cursor.
mycursor = mydb.cursor()

try:
    # Insert information for each dataset into the dataset_registry table.
    for dataset in datasets_info:
        insert_dataset_info(
            dataset['Dataset'],
            dataset['Total_Entries'],
            dataset['Corrupt_Fields'],
            dataset['Corruption_Percentage'],
            dataset['Creation_Time'],
            dataset['First Name Corruption'],
            dataset['Last Name Corruption'],
            dataset['Date of Birth Corruption'],
            dataset['Place of Birth Corruption'],
            dataset['Gender Corruption'],
            dataset['Date of Issue Corruption'],
            dataset['Date of Expiry Corruption'],
            dataset['Issuing Authority Corruption'],
            dataset['License Number Corruption'],
            dataset['Address Corruption'],
            dataset['Metadata']
        )

    # Commit only after every dataset was inserted successfully.
    mydb.commit()
finally:
    # Always release the connection, even when an insert fails part-way.
    mydb.close()

### License Data Table

In [16]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="password",
  database="licensedatav2"
)
mycursor = mydb.cursor()

# Create list of datasets to be uploaded.
# NOTE(review): six datasets are registered in dataset_registry but only two
# are uploaded here - confirm whether the other four should be added.
datasets_list = [normal_dataset, all_corrupt_dataset]

try:
    # Iterate through each DataFrame and insert its rows into the 'license_data' table.
    for dataset_df in datasets_list:
        # The statement depends only on the DataFrame's columns, so build it
        # once per DataFrame rather than once per row.
        columns = ", ".join(f"`{col}`" for col in dataset_df.columns)
        placeholders = ", ".join(["%s"] * len(dataset_df.columns))

        # Parameterized placeholders keep the row values out of the SQL text.
        insert_query = f"INSERT INTO license_data ({columns}) VALUES ({placeholders})"

        for _, row in dataset_df.iterrows():
            # Execute the INSERT query with the row values.
            mycursor.execute(insert_query, tuple(row))

    # Commit only after every row was inserted successfully.
    mydb.commit()
finally:
    # Always release the connection, even when an insert fails part-way.
    mydb.close()