In [None]:
import os
import glob
import pandas as pd
import numpy as np
import mysql.connector
import requests
from mysql.connector.connection import MySQLConnection
from typing import Dict,List,Tuple,Any,Optional
import shutil
import py7zr  # For handling .7z files
import zipfile
from tqdm import tqdm # progress bar for download

# Connection Variables
# NOTE(review): credentials are hard-coded in the notebook; move them to
# environment variables or a secrets store before sharing this file.
USER = 'Sudo'
PASSWORD = 'password'
DATABASE = 'sys' # Do not change! This is the default database for MySQL


# Server connection configuration, unpacked as keyword arguments into
# mysql.connector.connect(). Tables are addressed with schema-qualified names,
# so the connection itself always opens against DATABASE.
CONN_CONFIG: Dict[str, str]  = {
    "host": "localhost",
    "user": USER,
    "password": PASSWORD,
    "database": DATABASE
}

CONTOSO_DOWNLOAD_LINK = r"https://github.com/sql-bi/Contoso-Data-Generator-V2-Data/releases/download/ready-to-use-data/csv-10m.7z"
CONTOSO_FILENAME = "csv-10m.7z"  # DO NOT CHANGE THIS VALUE!

DBSTART = 'Contoso' # Name of the database to create and where all other tables will be created

# Build the path with os.path.join instead of literal backslashes so it is valid
# on every OS. The trailing empty component keeps the trailing path separator,
# so the value is unchanged on Windows.
EXTRACT_DIR = os.path.join(os.path.expanduser('~'), 'Downloads', DBSTART, '')


def download_file(url: str, filename: str, timeout: float = 30.0, chunk_size: int = 1024 * 1024) -> None:
    """
    Downloads a file from a given URL and saves it to the user's Downloads folder,
    with a progress bar displayed in the console.

    The payload is streamed into ``<filename>.part`` and only renamed to its final
    name once the transfer finished, so an interrupted download never leaves a
    truncated file under the name that later steps look for.

    :param url: The URL of the file to download.
    :param filename: The name of the file to save.
    :param timeout: Seconds to wait for the connection and for each read before
        giving up; without it a stalled server would block forever.
    :param chunk_size: Number of bytes requested per streamed chunk.
    :return: None
    :raises requests.RequestException: On connection errors, timeouts or a 4xx/5xx response.
    :raises OSError: If the target file cannot be written.
    """
    user_downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
    file_path = os.path.join(user_downloads_dir, filename)
    partial_path = file_path + ".part"

    # The Downloads folder is not guaranteed to exist (fresh profile, headless host).
    os.makedirs(user_downloads_dir, exist_ok=True)

    completed = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)

            # A missing content-length header means the size is unknown; passing
            # None lets tqdm show a running counter instead of a bogus 0-byte total.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(partial_path, 'wb') as f, \
                    tqdm(total=total_size, unit='B', unit_scale=True, desc="Downloading") as progress_bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue  # Skip keep-alive chunks that carry no data
                    f.write(chunk)
                    progress_bar.update(len(chunk))

        # Publish the file under its real name only after a complete transfer.
        os.replace(partial_path, file_path)
        completed = True
    finally:
        if not completed and os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"File downloaded to: {file_path}")


def extract_archive(filename: str) -> Optional[str]:
    """
    Searches for a given .7z archive file in the user's Downloads folder,
    and extracts its contents to EXTRACT_DIR with a per-member progress bar.

    Extraction is best-effort per member: a member that fails is reported and
    skipped, and the remaining members are still extracted.

    :param filename: The name of the .7z archive file to extract.
    :return: The path where files were extracted, or None if the archive is
        missing, is not a .7z file, or could not be opened/extracted.
    """
    user_downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
    file_path = os.path.join(user_downloads_dir, filename)

    # Check if the file exists in Downloads folder
    if not os.path.exists(file_path):
        print(f"Error: File '{filename}' not found in {user_downloads_dir}.")
        return None

    if not filename.endswith(".7z"):
        print(f"Error: '{filename}' is not a supported archive format (7z).")
        return None

    failed_members = []
    try:
        # Ensure the extraction directory exists
        os.makedirs(EXTRACT_DIR, exist_ok=True)

        with py7zr.SevenZipFile(file_path, mode='r', blocksize=1024 * 1024) as archive:
            member_names = archive.getnames()

            with tqdm(total=len(member_names), unit="Files Extracted", desc="Extracting") as files_progress_bar:
                # NOTE(review): members are extracted one at a time to drive the
                # progress bar; each extract() must be followed by reset() before
                # the archive can be read again.
                for member in member_names:
                    files_progress_bar.set_description(f"Extracting {member}...")
                    files_progress_bar.refresh()
                    try:
                        # Only the member itself is a valid target; the archive's
                        # own path does not name anything inside the archive.
                        archive.extract(targets=[member], path=EXTRACT_DIR, recursive=False)
                    except Exception as file_error:
                        print(f"Error extracting file '{member}': {file_error}")
                        failed_members.append(member)
                    finally:
                        # Rewind even after a failure so the next member starts
                        # from a clean reader state.
                        archive.reset()
                        files_progress_bar.update(1)

    except py7zr.Bad7zFile:
        print(f"Error: '{filename}' is not a valid 7z file.")
        return None

    except PermissionError:
        print(f"Error: Permission denied while accessing '{filename}' or writing to '{EXTRACT_DIR}'.")
        return None

    except FileNotFoundError:
        print(f"Error: File '{filename}' not found during extraction.")
        return None

    except Exception as e:
        print(f"Unexpected error while extracting '{filename}': {e}")
        return None

    if failed_members:
        print(f"Warning: {len(failed_members)} of {len(member_names)} archive members could not be extracted.")

    print(f"Extracted '{filename}' to '{EXTRACT_DIR}'.")
    return EXTRACT_DIR


def create_database(db_name: Optional[str] = None) -> None:
    """
    Creates a new MySQL database if it does not exist.

    Errors are reported on stdout rather than raised, matching the other
    helpers in this notebook.

    :param db_name: Name of the database to create. A missing or blank name is
        rejected instead of creating a database literally called ``None``.
    :return: None
    """
    if not db_name or not db_name.strip():
        print("Error: a non-empty database name is required.")
        return

    # Double embedded backticks so the name cannot terminate the quoted identifier.
    safe_name = db_name.replace("`", "``")

    conn: Optional[MySQLConnection] = None
    cursor = None
    try:
        # Connect using CONN_CONFIG (which opens against its configured default database)
        conn = mysql.connector.connect(**CONN_CONFIG)
        cursor = conn.cursor()

        # Create database if it doesn't exist
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{safe_name}`;")
        print(f"Database `{db_name}` created or already exists.")

    except mysql.connector.Error as err:
        print(f"Error: {err}")

    finally:
        # Release the cursor and connection even when connect/execute failed.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


# strptime layouts tried for string values, in order. Parsing is strict, so a
# date-only string never matches a DATETIME layout and vice versa.
_DATETIME_LAYOUTS = (
    ("DATETIME", ("%Y-%m-%d %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%m/%d/%Y %H:%M:%S")),
    ("DATE", ("%Y-%m-%d", "%d-%m-%Y", "%m/%d/%Y", "%d %b %Y", "%d %B %Y")),
)


def _mysql_integer_type(number: int) -> str:
    """Return the narrowest of INT / BIGINT / DECIMAL(65, 0) able to store *number*."""
    if -2**31 <= number <= 2**31 - 1:
        return "INT"
    if -2**63 <= number <= 2**63 - 1:
        return "BIGINT"
    return "DECIMAL(65, 0)"


def infer_mysql_dtype(value: object) -> str:
    """
    Infers the MySQL data type based on a sample value from CSV data.
    Tries BOOLEAN, INT/BIGINT, DECIMAL, DATETIME/DATE formats, then falls back
    to VARCHAR/TEXT. Always returns a type name (never None).

    :param value: A single sample cell; may be a Python or NumPy scalar, a
        timestamp, a string, or a missing-value marker (None, NaN, NA, NaT).
    :return: A MySQL column type such as "INT", "DECIMAL(18, 6)", "DATE" or "TEXT".
    """
    # Local import: the notebook header does not import datetime.
    import datetime as dt

    # Missing markers and blank strings carry no type information.
    if value is None or value is pd.NA or value is pd.NaT:
        return "TEXT"
    if isinstance(value, str) and value.strip() == "":
        return "TEXT"

    # bool must be tested before int because bool is a subclass of int.
    if isinstance(value, (bool, np.bool_)):
        return "BOOLEAN"

    # np.integer / np.floating cover every NumPy width, not only the 64-bit ones.
    if isinstance(value, (int, np.integer)):
        return _mysql_integer_type(int(value))

    if isinstance(value, (float, np.floating)):
        # pandas represents an empty numeric cell as NaN.
        if np.isnan(value):
            return "TEXT"
        return "DECIMAL(18, 6)"

    # pd.Timestamp is a datetime.datetime subclass, so it is covered here.
    if isinstance(value, (dt.datetime, np.datetime64)):
        return "DATETIME"
    if isinstance(value, dt.date):
        return "DATE"

    if isinstance(value, str):
        val = value.strip()

        # Try INT / BIGINT
        try:
            return _mysql_integer_type(int(val))
        except ValueError:
            pass

        # Try FLOAT
        try:
            float(val)
            return "DECIMAL(18, 6)"
        except ValueError:
            pass

        # Try known date/time layouts. strptime accepts the full 0001-9999 year
        # range that MySQL DATE supports (e.g. the sentinel 9999-12-31).
        for sql_type, layouts in _DATETIME_LAYOUTS:
            for layout in layouts:
                try:
                    dt.datetime.strptime(val, layout)
                except ValueError:
                    continue
                return sql_type

        # VARCHAR(255) holds up to and including 255 characters.
        return "VARCHAR(255)" if len(val) <= 255 else "TEXT"

    # Unknown object types: TEXT is always a valid column type.
    return "TEXT"


def _quote_mysql_identifier(name: object) -> str:
    """Wrap *name* in backticks, doubling embedded backticks, for safe use as an identifier."""
    return "`" + str(name).replace("`", "``") + "`"


def _create_table_for_csv(cursor, table_name: str, filepath: str, sample_rows: int = 100) -> bool:
    """Create `DBSTART`.`table_name` from the CSV header plus a small sample; return True if the table exists afterwards."""
    sample = pd.read_csv(filepath, nrows=sample_rows)

    column_defs = []
    for col in sample.columns:
        # Take the value from the column (not from a row) so pandas keeps the
        # column's own dtype, and skip leading blanks so an empty first cell
        # does not decide the type.
        non_null = sample[col].dropna()
        sample_value = non_null.iloc[0] if not non_null.empty else None
        # Fall back to TEXT if the inference helper yields nothing usable.
        dtype = infer_mysql_dtype(sample_value) or "TEXT"
        column_defs.append(f"{_quote_mysql_identifier(col)} {dtype}")

    qualified_name = f"{_quote_mysql_identifier(DBSTART)}.{_quote_mysql_identifier(table_name)}"
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {qualified_name} ({', '.join(column_defs)});")

    # Confirm the table exists in the target schema (not merely in any schema).
    cursor.execute(
        "SELECT COUNT(*) FROM information_schema.tables "
        "WHERE table_type = 'BASE TABLE' AND table_schema = %s AND table_name = %s;",
        (DBSTART, table_name),
    )
    (match_count,) = cursor.fetchone()
    return match_count > 0


def _load_csv_into_table(conn, cursor, table_name: str, filepath: str, chunksize: int) -> int:
    """Stream the CSV into `DBSTART`.`table_name` with parameterized INSERTs; return the number of rows written."""
    columns = pd.read_csv(filepath, nrows=0).columns.tolist()
    qualified_name = f"{_quote_mysql_identifier(DBSTART)}.{_quote_mysql_identifier(table_name)}"
    column_list = ", ".join(_quote_mysql_identifier(col) for col in columns)
    placeholders = ", ".join(["%s"] * len(columns))
    insert_sql = f"INSERT INTO {qualified_name} ({column_list}) VALUES ({placeholders})"

    rows_loaded = 0
    # The total is unknown without a second pass over the file, so the bar
    # counts rows instead of chunks.
    with tqdm(unit="rows", desc=f"Loading {table_name}") as progress_bar:
        for chunk_number, chunk in enumerate(pd.read_csv(filepath, chunksize=chunksize), start=1):
            # Plain Python values with None for missing cells, so the driver
            # writes SQL NULL.
            rows = [
                tuple(None if pd.isna(cell) else cell for cell in row)
                for row in chunk.itertuples(index=False, name=None)
            ]
            try:
                cursor.executemany(insert_sql, rows)
            except Exception as err:
                conn.rollback()  # Discard the partially written chunk
                raise RuntimeError(f"chunk # {chunk_number} failed: {err}") from err
            conn.commit()
            rows_loaded += len(rows)
            progress_bar.update(len(rows))

    return rows_loaded


def create_tables_from_csv(targetdirectory: str = EXTRACT_DIR, chunksize: int = 1000) -> None:
    """
    Scans CSV files in a given directory, infers table schema,
    creates MySQL tables dynamically based on CSV headers and data types,
    and loads each file's rows into its table.

    Each table is named after its CSV file (without extension) and created in
    the DBSTART schema. Column types are inferred from the first non-empty
    value found in a small sample of rows, so a column whose later rows do not
    fit the inferred type will fail to load. Rows are appended: re-running the
    function against already-populated tables inserts the data again. A failure
    on one file is reported and does not stop the remaining files; chunks
    committed before a failure stay in the table.

    :param targetdirectory: Directory that is searched (non-recursively) for ``*.csv`` files.
    :param chunksize: Number of CSV rows read and inserted per batch.
    :return: None
    """
    csv_paths = sorted(glob.glob(os.path.join(targetdirectory, "*.csv")))
    if not csv_paths:
        print(f"No CSV files found in '{targetdirectory}'.")
        return

    conn = mysql.connector.connect(**CONN_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            for filepath in csv_paths:
                table_name = os.path.splitext(os.path.basename(filepath))[0]  # Remove .csv extension

                # Table creation
                try:
                    table_exists = _create_table_for_csv(cursor, table_name, filepath)
                except Exception as err:
                    print(f"Table creation unsuccessfull, Error: {err}")
                    continue
                if not table_exists:
                    print(f"Table creation unsuccessfull, Error: `{table_name}` not found in `{DBSTART}` after CREATE TABLE.")
                    continue
                print(f"Table `{table_name}` created successfully.")

                # Table insert
                try:
                    rows_loaded = _load_csv_into_table(conn, cursor, table_name, filepath, chunksize)
                except Exception as err:
                    print(f"Error loading data into table `{table_name}`: {err}")
                    continue
                print(f"Data loaded into table `{table_name}` successfully ({rows_loaded} rows).")
        finally:
            cursor.close()
    finally:
        # Close DB connection even if an unexpected error escaped the loop
        conn.close()


def create_table(databse_name: str, table_name: str, columns: Dict[str, str]) -> None:
    """
    Creates a new MySQL table with the specified columns.

    Errors are reported on stdout rather than raised.

    :param databse_name: The schema (database) in which the table is created.
    :param table_name: The name of the table to create.
    :param columns: A dictionary where keys are column names and values are MySQL data types.
    :return: None
    """
    if not columns:
        # CREATE TABLE with an empty column list is not valid SQL.
        print("Error: at least one column definition is required.")
        return

    def quote(identifier: object) -> str:
        # Backtick-quote one identifier, doubling embedded backticks.
        return "`" + str(identifier).replace("`", "``") + "`"

    columns_sql = ", ".join(f"{quote(col)} {dtype}" for col, dtype in columns.items())

    # Schema and table are quoted separately; quoting "db.table" as a single
    # identifier would create a table literally named "db.table" in the
    # connection's default schema.
    create_table_sql = f"CREATE TABLE IF NOT EXISTS {quote(databse_name)}.{quote(table_name)} ({columns_sql});"

    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**CONN_CONFIG)
        cursor = conn.cursor()

        cursor.execute(create_table_sql)

        # Confirm the table was created in the requested schema
        cursor.execute(
            "SELECT COUNT(*) FROM information_schema.tables "
            "WHERE table_type = 'BASE TABLE' AND table_schema = %s AND table_name = %s;",
            (databse_name, table_name),
        )
        (match_count,) = cursor.fetchone()
        if match_count:
            print(f"Table `{table_name}` created successfully.")
        else:
            print(f"Table not created successfully, Error: `{table_name}` not found in `{databse_name}`.")

    except mysql.connector.Error as err:
        print(f"Error: {err}")

    finally:
        # Close DB resources even when connect/execute failed
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


# def load_to_table(table_name : str) -> None:
        
#     try:
#         # load the CSV lazily; the reader yields DataFrames of 1000 rows each
#         data_df = pd.read_csv(filepath, chunksize=1000)
#         # find the number of chunks
#         num_chunks = sum(1 for _ in data_df)
#         counter = 0
        
#         with tqdm(total=num_chunks, unit="chunks", desc=f"Loading {table_name}") as progress_bar:

#             for chunk in data_df:
#                 try:
#                     chunk.to_sql(f"{DBSTART}.{table_name}", conn, if_exists='append', index=False)
#                     print(f"Chunk of data loaded into table `{table_name}` successfully.")
#                     counter += 1
                
#                 except Exception as err:
#                     print(f"Error loading chunk # {counter} into table `{table_name}`: {err}")
#                     break
                    
        
#             # Load data into the table
#             df.to_sql(table_name, conn, if_exists='append', index=False)
#             print(f"Data loaded into table `{table_name}` successfully.")
        
    
#     except Exception as err:
#         print(f"Error loading data into table `{table_name}`: {err}")

# Pipeline steps in execution order. The first three are commented out (the
# author marked each as working); uncomment them for a run that has to start
# from the download.
# download_file(CONTOSO_DOWNLOAD_LINK, CONTOSO_FILENAME) # works
# extract_archive(CONTOSO_FILENAME) # works
# create_database(DBSTART) # works
create_tables_from_csv(targetdirectory=EXTRACT_DIR)


  df.to_sql(table_name, conn, if_exists='append', index=False)
Loading store:   0%|          | 0/366 [00:00<?, ?chunks/s]

Error loading data into table `store`: Execution failed on sql '
        SELECT
            name
        FROM
            sqlite_master
        WHERE
            type IN ('table', 'view')
            AND name=?;
        ': Not all parameters were used in the SQL statement





In [None]:
    
# Dry run of the table insert loop: walks date.csv chunk by chunk and reports
# progress without writing anything to the database.
# The path is derived from EXTRACT_DIR (defined in the first cell) instead of a
# user-specific absolute path.
csv_files = [os.path.join(EXTRACT_DIR, "date.csv")]
file_list = [(os.path.basename(file), file) for file in csv_files]
# Table Insert Loop
for filename, filepath in file_list:

    table_name = os.path.splitext(filename)[0]

    try:
        # A chunked reader is a one-shot iterator: counting its chunks consumes
        # it. Count with a throw-away reader, then open a fresh one to iterate.
        num_chunks = sum(1 for _ in pd.read_csv(filepath, chunksize=10))

        # Read the CSV lazily in chunks of 10 rows
        data = pd.read_csv(filepath, chunksize=10)
        counter = 0

        with tqdm(total=num_chunks, unit="chunks", desc=f"Loading {table_name}") as progress_bar:

            for chunk in data:
                counter += 1
                # Nothing is written to the table here, so report a read, not a load.
                print(f"Chunk # {counter} of `{table_name}` read successfully ({len(chunk)} rows).")
                progress_bar.update(1)

    except Exception as err:
        print(f"Error loading data into table `{table_name}`: {err}")

Creating table `currencyexchange` with SQL:  CREATE TABLE IF NOT EXISTS Contoso.currencyexchange (Date DATE, FromCurrency VARCHAR(255), ToCurrency VARCHAR(255), Exchange DECIMAL(18, 6));
Creating table `customer` with SQL:  CREATE TABLE IF NOT EXISTS Contoso.customer (CustomerKey INT, GeoAreaKey INT, StartDT DATE, EndDT DATE, Continent VARCHAR(255), Gender VARCHAR(255), Title VARCHAR(255), GivenName VARCHAR(255), MiddleInitial VARCHAR(255), Surname VARCHAR(255), StreetAddress VARCHAR(255), City VARCHAR(255), State VARCHAR(255), StateFull VARCHAR(255), ZipCode INT, Country VARCHAR(255), CountryFull VARCHAR(255), Birthday DATE, Age INT, Occupation VARCHAR(255), Company VARCHAR(255), Vehicle VARCHAR(255), Latitude DECIMAL(18, 6), Longitude DECIMAL(18, 6));
Creating table `date` with SQL:  CREATE TABLE IF NOT EXISTS Contoso.date (Date DATE, DateKey INT, Year INT, YearQuarter VARCHAR(255), YearQuarterNumber INT, Quarter VARCHAR(255), YearMonth VARCHAR(255), YearMonthShort VARCHAR(255), Year

In [None]:

# # confirm the tables were created in the SQL server
# conn = mysql.connector.connect(**CONN_CONFIG)
# cursor = conn.cursor()

# cursor.execute(
#     f"""
#     SELECT table_name 
#     FROM information_schema.tables 
#     WHERE table_schema = '{DBSTART}';
#     """
# )
# table_list = cursor.fetchall()

# # glob the extract directory for all files
# file_list = glob.glob(os.path.join(EXTRACT_DIR, "*.csv"))    

    
# # only if all tables in the filelist are created in the SQL server, then delete the files
# for file in file_list:
#     table_name = os.path.splitext(file)[0]  # Remove .csv extension
#     if table_name in table_list:
#         print(f"Deleting {file} from {EXTRACT_DIR}")
#         os.remove(os.path.join(EXTRACT_DIR, file))
#     else:
#         print(f"File {file} not found in SQL server, will not delete.")
        
# # check if all files were deleted
# for file in file_list:
#     if os.path.exists(os.path.join(EXTRACT_DIR, file)):
#         print(f"File {file} still exists in {EXTRACT_DIR}")
#         # attempt to create the table again 2 more times, else exit the script
#         counter = 0
#         while counter < 2:
#             table_name = os.path.splitext(file)[0]
#             columns = []
#             df = pd.read_csv(file, nrows=2)
#             columns = df.columns.tolist()
#             inferred_types = [infer_mysql_dtype(df.iloc[0][col]) for col in columns]
#             columns = {col: dtype for col, dtype in zip(columns, inferred_types)}
#             create_table(DBSTART, file, columns=columns)
#             # confirm the table was created in the SQL server
#             cursor.execute(
#                 f"""
#                 SELECT EXISTS(
#                     SELECT * 
#                     FROM information_schema.tables 
#                     WHERE
#                         table_type = 'BASE TABLE'
#                         AND table_name = '{table_name}');
#                 """
#             )
#             table_check = cursor.fetchall()
#             try:
#                 if len(table_check) == 1 & table_check[0][0] == table_name:
#                     print(f"Table `{table_name}` created successfully.")
#                     os.remove(os.path.join(EXTRACT_DIR, file))
#                     break
#             except Exception as err:
#                 print(f"Table not created successfully, Error: {err}")
#                 counter += 1
#                 if counter == 2:
#                     print(f"File {file} still exists in {EXTRACT_DIR}, exiting script.")
#                     exit()
#                 else:
#                     print(f"Retrying to create table {table_name}...")
#     else:
#         print(f"File {file} deleted from {EXTRACT_DIR}")
  