## Imports

In [1]:
# Imports for all modules needed.
import mysql.connector
import sys
import json
from datetime import datetime
import pandas as pd
sys.path.append('../DataGeneration')
from license_data_generator import License, Corrupt, Validate, Stats

## Database Setup

In [2]:
# Connect to the MySQL server. No schema is selected here because the target
# database may not exist yet.
# NOTE(review): the root credentials are hard-coded in this notebook; consider
# loading them from the environment before sharing it.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
)
mycursor = mydb.cursor()

# Create the license database only when it is missing. IF NOT EXISTS makes the
# statement safe to re-run, so real failures (bad credentials, missing
# privileges, server down) are raised instead of being silently discarded.
mycursor.execute("CREATE DATABASE IF NOT EXISTS licensedatav2")

# List all available databases.
mycursor.execute("SHOW DATABASES")
for x in mycursor:
    print(x)

# Release the server-level connection; every later cell opens its own
# connection bound to the licensedatav2 schema.
mydb.close()

('information_schema',)
('licensedata',)
('licensedatav2',)
('mysql',)
('performance_schema',)
('sys',)


### Dataset Registry Table Creation

In [3]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)
mycursor = mydb.cursor()

# CREATE TABLE query for dataset_registry.
# dataset_id is the surrogate primary key. Dataset (the dataset name) is declared
# UNIQUE so that the column is indexed: license_data.Dataset is declared as a
# foreign key to it, and InnoDB rejects a foreign key whose referenced column has
# no index. Many license_data rows may still point at the same registry row.
# NOTE: IF NOT EXISTS leaves an already-existing table untouched, so a table
# created by an earlier version of this notebook keeps its old definition.
create_dataset_registry_query = """
CREATE TABLE IF NOT EXISTS dataset_registry (
    dataset_id INT AUTO_INCREMENT PRIMARY KEY,
    Dataset VARCHAR(255) UNIQUE,
    Total_Entries INT,
    Corrupt_Fields INT,
    Corruption_Percentage DECIMAL(5,2),
    Creation_Time TIMESTAMP,
    `First Name Corruption` INT,
    `Last Name Corruption` INT,
    `Date of Birth Corruption` INT,
    `Place of Birth Corruption` INT,
    `Gender Corruption` INT,
    `Date of Issue Corruption` INT,
    `Date of Expiry Corruption` INT,
    `Issuing Authority Corruption` INT,
    `License Number Corruption` INT,
    `Address Corruption` INT,
    Metadata LONGTEXT
);
"""

try:
    # Execute the CREATE TABLE query and commit it.
    mycursor.execute(create_dataset_registry_query)
    mydb.commit()
finally:
    # Always release the connection, even when the statement fails.
    mydb.close()

### License Data Table Creation

In [4]:
# Build a zero-row dataset and validate it; only its column layout is needed.
empty_licenses = License.generate_dataset(0)
header = Validate.validate(empty_licenses)
# Add the "Dataset" label column so the layout matches the real datasets below.
header["Dataset"] = "header"
header.head(0)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset


In [5]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)
mycursor = mydb.cursor()

table_name = 'license_data'

# One VARCHAR(255) column per DataFrame column, after the auto-increment id.
# Column names are wrapped in backticks because they contain spaces.
column_definitions = ["id INT AUTO_INCREMENT PRIMARY KEY"]
column_definitions += [f"`{column}` VARCHAR(255)" for column in header.columns]
create_table_query = (
    f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(column_definitions)});"
)

try:
    mycursor.execute(create_table_query)

    # `header` already contains a "Dataset" column, so CREATE TABLE has created
    # it; only the foreign-key constraint has to be added here. (Re-adding the
    # column would fail with a duplicate-column error and the constraint would
    # never be created.)
    alter_license_data_query = """
    ALTER TABLE license_data
    ADD CONSTRAINT fk_dataset
        FOREIGN KEY (Dataset)
        REFERENCES dataset_registry(Dataset);
    """
    try:
        mycursor.execute(alter_license_data_query)
    except mysql.connector.Error as err:
        # Best effort: the constraint already exists when this cell is re-run,
        # and it cannot be created if dataset_registry.Dataset is not indexed.
        # Report the reason instead of hiding it.
        print(f"Foreign key fk_dataset was not added: {err}")

    mydb.commit()
finally:
    # Always release the connection, even when table creation fails.
    mydb.close()

## Dataset Creation

### Normal License Dataset

In [6]:
# Uncorrupted baseline: generate 100 licenses, validate them, and label the rows
# with the name used for this dataset in the registry.
normal_dataset = Validate.validate(License.generate_dataset(100))
normal_dataset["Dataset"] = "normal_dataset"
normal_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Guy,0,Barnes,0,05.11.1993,0,Madagascar,0,Male,0,...,0,04.03.2028,0,DA1,0,BARNE911053G99PL,0,"320 Karl bypass, Griffithsmouth, S03 5NB",0,normal_dataset
1,Hannah,0,Jordan,0,12.04.1987,0,Canada,0,Female,0,...,0,30.12.2032,0,DA1,0,JORDA854127H99JB,0,"993 Leah crescent, New Zoeland, E9T 4FE",0,normal_dataset
2,Jasmine,0,Harrison,0,13.11.2003,0,Syrian Arab Republic,0,Female,0,...,0,24.02.2031,0,DA1,0,HARRI061133J99RD,0,"639 Taylor common, Heatherfort, B1D 6WX",0,normal_dataset
3,Diane,0,Parker,0,06.11.1963,0,Macao,0,Male,0,...,0,14.01.2028,0,DA1,0,PARKE611063D99NM,0,"247 Johnson mission, Walkerberg, W17 7QN",0,normal_dataset
4,Jamie,0,Edwards,0,15.11.1953,0,Ethiopia,0,Male,0,...,0,17.09.2010,0,DA1,0,EDWAR511153J99ME,0,"590 Bates highway, Lake Lorraineshire, MK14 3BX",0,normal_dataset


### All Corrupt License Dataset

In [7]:
# Fully corrupted dataset: 100 generated licenses passed through
# introduce_corruptions with a rate argument of 1, then validated and labelled.
all_corrupt_dataset = Validate.validate(
    Corrupt.introduce_corruptions(License.generate_dataset(100), 1)
)
all_corrupt_dataset["Dataset"] = "all_corrupt_dataset"
all_corrupt_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,`;)^2/,1,>!:21,1,48.25.1979,1,"+~)""~`|",1,Molestias,1,...,1,67.41.2008,1,PTR,1,SMITH756059J99NU,1,"996 Derek center, Smithmouth, L11 4PN",0,all_corrupt_dataset
1,3~;1$,1,",69$!:}",1,94.92.1954,1,"),6+40",1,Occaecati,1,...,1,69.55.1993,1,FC6N,1,JOHNS555164W99XMEI,1,,1,all_corrupt_dataset
2,_-?>,1,<8)_,1,54.68.1972,1,<&>7]<*+@96,1,Eum,1,...,1,93.70.2019,1,5DEG,1,REEDJ707P82KS9FX,1,423 Christopher haven,1,all_corrupt_dataset
3,*'8-{2,1,}*5|#!,1,72.19.2000,1,)9+!7<,1,Odio,1,...,1,60.24.2032,1,QF57,1,HNTE012310N99AU,1,"165 Chelsea field, Lake Lindsey, OX5 8DE",0,all_corrupt_dataset
4,+?:?(,1,1<.`{<,1,21/06/1950,1,`[{4702=018\7^;^@453#,1,Laudantium,1,...,1,07.16.2000,1,2A,1,MURPH55621B99WO,1,"600 Matthew plains, Bellfurt",1,all_corrupt_dataset


### Some Corrupt License Dataset

In [8]:
# Partially corrupted dataset: same pipeline as the fully corrupt one, but with a
# rate argument of 0.2 (presumably the share of values to corrupt; confirm in
# license_data_generator).
some_corrupt_dataset = Validate.validate(
    Corrupt.introduce_corruptions(License.generate_dataset(100), 0.2)
)
some_corrupt_dataset["Dataset"] = "some_corrupt_dataset"
some_corrupt_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Ca0h=rin},1,Blake,0,71.20.2002,1,Tanz-nia,1,Female,0,...,1,92.52.2033,1,DA1,0,BLAKE056162C99WU,1,"654 Middleton ridge, Patelmouth, GL2 1GY",0,some_corrupt_dataset
1,Antony,0,Jones,0,11.08.1951,0,Pale@t*4ian *e?#ito`@,1,Female,0,...,1,84.95.2029,1,DA1,0,JONES558111A99JD,0,"687 Hollie ville, Robertsside, L73 1PJ",0,some_corrupt_dataset
2,G@;lia),1,W~t%s,1,03.02.1999,0,Saint 4arthelemy,1,Eveniet,1,...,1,43.22.2033,1,6L8,1,WATTS952039G99FV,1,"456 Katie locks, New Marc, M44 9UU",0,some_corrupt_dataset
3,Ann,0,"Ba,ley",1,15.07.2001,0,Cuba,0,Female,0,...,1,24.09.2028,1,DA1,0,BAILE057151A99LT,1,"700 Martin unions, Port Rita, MK8 6RD",0,some_corrupt_dataset
4,Marga4et,1,Miller,0,30.05.1971,0,A|dorra,1,Male,0,...,1,51.21.2009,1,M59,1,MILLE705301M99BL,0,"831 Tomlinson landing, Robertsview, B6D 7DW",0,some_corrupt_dataset


### Only License Number Corrupt Dataset

In [9]:
# Dataset in which only the 'License Number' column is corrupted (rate argument 1);
# no other column is passed through a corruption step.
only_lnum_corrupt_dataset = Validate.validate(
    Corrupt.introduce_drivernum_corruption(License.generate_dataset(100), 'License Number', 1)
)
only_lnum_corrupt_dataset["Dataset"] = "only_lnum_corrupt_dataset"
only_lnum_corrupt_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Craig,0,Price,0,09.05.1984,0,Liechtenstein,0,Male,0,...,0,01.01.2032,0,DA1,0,PRCE805094C99JWT,1,"820 Reece mount, Abigailton, B54 3HP",0,only_lnum_corrupt_dataset
1,Holly,0,Jennings,0,24.09.1963,0,Australia,0,Female,0,...,0,02.01.2013,0,DA1,0,CENNI659243H99WB,1,"113 Anderson glens, New Robert, B5 7JB",0,only_lnum_corrupt_dataset
2,Jill,0,Fraser,0,25.11.1996,0,Iraq,0,Female,0,...,0,06.09.2025,0,DA1,0,FBASE961256J99UW2W,1,"644 Matthew path, Reynoldsborough, S9 4JP",0,only_lnum_corrupt_dataset
3,Mary,0,Burrows,0,27.03.1955,0,South Georgia and the South Sandwich Islands,0,Male,0,...,0,01.03.2010,0,DA1,0,BURRO50327M99DO,1,"159 Reynolds circle, Morrisbury, TQ26 1QL",0,only_lnum_corrupt_dataset
4,Leon,0,Mason,0,01.10.1980,0,Madagascar,0,Male,0,...,0,17.05.2024,0,DA1,0,MASON81001WL99PW,1,"34 Smith parkway, Lake Victorton, EX87 0XW",0,only_lnum_corrupt_dataset


### Only Names Corrupt Dataset

In [10]:
# Dataset in which only the two name columns are passed through a corruption step.
# NOTE(review): the preview below also shows License Number Corruption = 1 for
# these rows, presumably because validation checks the number against the
# (now corrupted) names; confirm in Validate.validate.
only_names_corrupt_dataset = License.generate_dataset(100)
for name_column in ('First Name', 'Last Name'):
    only_names_corrupt_dataset = Corrupt.introduce_name_corruption(
        only_names_corrupt_dataset, name_column, 1
    )
only_names_corrupt_dataset = Validate.validate(only_names_corrupt_dataset)
only_names_corrupt_dataset["Dataset"] = "only_names_corrupt_dataset"
only_names_corrupt_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,)3~&9%>,1,\!<]:0,1,25.08.1991,0,Costa Rica,0,Female,0,...,0,08.04.2028,0,DA1,0,BARKE958251R99DO,1,"594 Allen point, Coleborough, DE6 0DT",0,only_names_corrupt_dataset
1,%;\:4_'3,1,"{+(\._^,%*",1,01.08.1978,0,Ghana,0,Male,0,...,0,15.08.2006,0,DA1,0,RICHA708018M99DK,1,"367 Malcolm lane, Jordanstad, L2U 3FQ",0,only_names_corrupt_dataset
2,\(|_?#,1,(57+.,1,03.07.2005,0,Australia,0,Female,0,...,0,07.03.2033,0,DA1,0,BOYLE057035J99AP,1,"512 Whitehouse lane, North Janetmouth, B46 0BN",0,only_names_corrupt_dataset
3,[1',1,)_=_@,1,29.06.2001,0,Guyana,0,Female,0,...,0,21.09.2031,0,DA1,0,LEWIS056291A99VZ,1,"92 Phillips drive, New Tobymouth, L4G 3WU",0,only_names_corrupt_dataset
4,":3,!=",1,55%*9,1,21.05.1999,0,Benin,0,Male,0,...,0,25.10.2026,0,DA1,0,JONES905219D99WL,1,"452 Glover mountain, Frenchville, OL0R 5XP",0,only_names_corrupt_dataset


### Only Dates Corrupt Dataset

In [11]:
# Dataset in which only the three date columns are passed through a corruption step.
# NOTE(review): the preview below also shows License Number Corruption = 1 on
# most rows, presumably because validation checks the number against the
# (now corrupted) date of birth; confirm in Validate.validate.
only_dates_corrupt_dataset = License.generate_dataset(100)
for date_column in ('Date of Birth', 'Date of Issue', 'Date of Expiry'):
    only_dates_corrupt_dataset = Corrupt.introduce_date_corruption(
        only_dates_corrupt_dataset, date_column, 1
    )
only_dates_corrupt_dataset = Validate.validate(only_dates_corrupt_dataset)
only_dates_corrupt_dataset["Dataset"] = "only_dates_corrupt_dataset"
only_dates_corrupt_dataset.head(5)

Unnamed: 0,First Name,First Name Corruption,Last Name,Last Name Corruption,Date of Birth,Date of Birth Corruption,Place of Birth,Place of Birth Corruption,Gender,Gender Corruption,...,Date of Issue Corruption,Date of Expiry,Date of Expiry Corruption,Issuing Authority,Issuing Authority Corruption,License Number,License Number Corruption,Address,Address Corruption,Dataset
0,Marie,0,Mitchell,0,89.61.1961,1,Brazil,0,Male,0,...,1,81.62.2017,1,DA1,0,MITCH607051M99IN,1,"324 Bentley knoll, Mohamedfort, EN5W 5WS",0,only_dates_corrupt_dataset
1,Nicola,0,Turnbull,0,19/05/1952,1,Cote d'Ivoire,1,Female,0,...,1,34.75.1980,1,DA1,0,TURNB555192N99VA,0,"876 Cross shores, Suttonmouth, ST2R 5EX",0,only_dates_corrupt_dataset
2,Wendy,0,Robinson,0,59.98.1956,1,France,0,Female,0,...,1,09.21.1998,1,DA1,0,ROBIN553206W99ZY,1,"712 June curve, Walkerland, DT66 3QE",0,only_dates_corrupt_dataset
3,Sara,0,Bell,0,65.58.1951,1,Tonga,0,Male,0,...,1,22/04/2005,1,DA1,0,BELL9507201S99NV,1,"89 Rhys keys, New Callum, RH0 0ER",0,only_dates_corrupt_dataset
4,Suzanne,0,Gray,0,70.22.1980,1,Mexico,0,Female,0,...,1,63.97.2033,1,DA1,0,GRAY9857310S99UN,1,"17 Miller mills, Port Peterfort, TS90 3FS",0,only_dates_corrupt_dataset


## Database Population

### Dataset Registry Table

In [12]:
# Compute the per-attribute corruption statistics of every dataset, in the same
# order in which the datasets were created above.
(
    normal_dataset_attributes,
    all_corrupt_dataset_attributes,
    some_corrupt_dataset_attributes,
    only_lnum_corrupt_dataset_attributes,
    only_names_corrupt_dataset_attributes,
    only_dates_corrupt_dataset_attributes,
) = [
    Stats.attribute_corruption(frame)
    for frame in (
        normal_dataset,
        all_corrupt_dataset,
        some_corrupt_dataset,
        only_lnum_corrupt_dataset,
        only_names_corrupt_dataset,
        only_dates_corrupt_dataset,
    )
]

In [13]:
# Summarize the information for each dataset to be uploaded into the database registry table.

# Attributes whose corruption statistic is stored in the registry, in the order
# of the "<attribute> Corruption" columns of dataset_registry.
_TRACKED_ATTRIBUTES = (
    'First Name',
    'Last Name',
    'Date of Birth',
    'Place of Birth',
    'Gender',
    'Date of Issue',
    'Date of Expiry',
    'Issuing Authority',
    'License Number',
    'Address',
)


def _summarize_dataset(name, dataset, attributes, description):
    """Build the registry entry (a dict) describing one dataset.

    Args:
        name: Dataset label; stored under 'Dataset'.
        dataset: The DataFrame being summarized.
        attributes: Result of Stats.attribute_corruption(dataset), indexed by
            attribute name.
        description: Free-text description stored in the metadata.

    Returns:
        A dict whose keys follow the dataset_registry column order, with
        'Metadata' holding a nested dict.
    """
    summary = {
        'Dataset': name,
        'Total_Entries': len(dataset),
        'Corrupt_Fields': Stats.is_corrupt(dataset),
        'Corruption_Percentage': Stats.corrupt_percent(dataset),
        # ISO-style timestamp. MySQL reads a two-digit, dot-separated date such
        # as "05.11.23" as YY.MM.DD, so a day-first format would be stored with
        # the day as the year.
        'Creation_Time': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    summary.update(
        {f'{attribute} Corruption': attributes[attribute] for attribute in _TRACKED_ATTRIBUTES}
    )
    summary['Metadata'] = {
        'description': description,
        'language': 'English',
        'license': 'ODC-By',
        'dataset_source': 'https://github.com/AatishDA1/PracticeSDProject.git',
    }
    return summary


datasets_info = [
    _summarize_dataset(
        'normal_dataset',
        normal_dataset,
        normal_dataset_attributes,
        'This dataset contains uncorrupted synthetic data for driver license data.',
    ),
    _summarize_dataset(
        'all_corrupt_dataset',
        all_corrupt_dataset,
        all_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where every value has been corrupted.',
    ),
    _summarize_dataset(
        'some_corrupt_dataset',
        some_corrupt_dataset,
        some_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where some values have been corrupted.',
    ),
    _summarize_dataset(
        'only_lnum_corrupt_dataset',
        only_lnum_corrupt_dataset,
        only_lnum_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the license number values have been corrupted, everything else is fine.',
    ),
    _summarize_dataset(
        'only_names_corrupt_dataset',
        only_names_corrupt_dataset,
        only_names_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the first and last name values have been corrupted, everything else is fine.',
    ),
    _summarize_dataset(
        'only_dates_corrupt_dataset',
        only_dates_corrupt_dataset,
        only_dates_corrupt_dataset_attributes,
        'This dataset contains synthetic data for driver license data where only the date values (Birth/Issue/Expiry) have been corrupted, everything else is fine.',
    ),
]

In [14]:
def insert_dataset_info(dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
                        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr,
                        authority_corr, license_num_corr, address_corr, metadata, cursor=None):
    """Insert one dataset summary row into the dataset_registry table.

    The statement is parameterized; the values are bound in the same order as
    the column list. The row is only executed here: committing is left to the
    caller.

    Args:
        dataset_name: Value for the Dataset column.
        total_entries: Value for Total_Entries.
        corrupt_fields: Value for Corrupt_Fields.
        corruption_percentage: Value for Corruption_Percentage.
        creation_time: Value for Creation_Time.
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr,
        expiry_corr, authority_corr, license_num_corr, address_corr: Values for
            the ten "<attribute> Corruption" columns, in column order.
        metadata: JSON-serializable object; stored as a JSON string in Metadata.
        cursor: Optional cursor to execute on. When omitted, the notebook-level
            `mycursor` is used, as before.
    """
    insert_query = """
    INSERT INTO dataset_registry (
        Dataset, Total_Entries, Corrupt_Fields, Corruption_Percentage, Creation_Time,
        `First Name Corruption`, `Last Name Corruption`, `Date of Birth Corruption`, `Place of Birth Corruption`,
        `Gender Corruption`, `Date of Issue Corruption`, `Date of Expiry Corruption`, `Issuing Authority Corruption`,
        `License Number Corruption`, `Address Corruption`, Metadata
    )
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    values = (
        dataset_name, total_entries, corrupt_fields, corruption_percentage, creation_time,
        fname_corr, lname_corr, dob_corr, pob_corr, gender_corr, issue_corr, expiry_corr, authority_corr,
        license_num_corr, address_corr, json.dumps(metadata)
    )
    # Fall back to the cursor opened by the calling cell when none is passed in.
    target_cursor = mycursor if cursor is None else cursor
    target_cursor.execute(insert_query, values)

In [15]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)
mycursor = mydb.cursor()

# Keys of each datasets_info entry, in the positional order expected by
# insert_dataset_info.
registry_fields = (
    'Dataset',
    'Total_Entries',
    'Corrupt_Fields',
    'Corruption_Percentage',
    'Creation_Time',
    'First Name Corruption',
    'Last Name Corruption',
    'Date of Birth Corruption',
    'Place of Birth Corruption',
    'Gender Corruption',
    'Date of Issue Corruption',
    'Date of Expiry Corruption',
    'Issuing Authority Corruption',
    'License Number Corruption',
    'Address Corruption',
    'Metadata',
)

try:
    # Insert information for each dataset into the dataset_registry table.
    for dataset in datasets_info:
        insert_dataset_info(*(dataset[field] for field in registry_fields))

    # Commit only after every row was accepted.
    mydb.commit()
finally:
    # Always release the connection; if an insert raised, nothing was committed.
    mydb.close()

### License Data Table

In [16]:
# Connect to the licensedatav2 database.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="licensedatav2",
)
mycursor = mydb.cursor()

# Create list of datasets to be uploaded.
datasets_list = [
    normal_dataset,
    all_corrupt_dataset,
    some_corrupt_dataset,
    only_lnum_corrupt_dataset,
    only_names_corrupt_dataset,
    only_dates_corrupt_dataset,
]

try:
    # Upload each DataFrame into the 'license_data' table.
    for dataset_df in datasets_list:
        # The statement text depends only on the column names, so it is built
        # once per DataFrame rather than once per row. Column names are
        # backtick-quoted because they contain spaces; the values themselves
        # are bound through %s placeholders.
        columns = ", ".join(f"`{col}`" for col in dataset_df.columns)
        placeholders = ", ".join(["%s"] * len(dataset_df.columns))
        insert_query = f"INSERT INTO license_data ({columns}) VALUES ({placeholders})"

        # One tuple of values per row, in column order.
        rows = [tuple(row) for _, row in dataset_df.iterrows()]

        # Send all rows of this DataFrame in a single call.
        mycursor.executemany(insert_query, rows)

    # Commit only after every dataset was accepted.
    mydb.commit()
finally:
    # Always release the connection; if an insert raised, nothing was committed.
    mydb.close()