Notebook para poder validar la transferencia correcta de la data comparando valores aleatorios de la data en MySQL y la data en Snowflake para cada tabla.

In [1]:
import mysql.connector
import random   
import snowflake.connector
from os import getenv


In [14]:
def compare_data(table_name, id_column_name, no_comparisons, rango, mysqlcnx, mysqlcursor,snowflakecnx, snowflakecursor):
    ids=[]
    for _ in range(no_comparisons):
        id = random.randint(1, rango)
        ids.append(id)
    for id in ids:
        mysqlcursor.execute("SELECT * FROM %s WHERE %s = %%s" % (table_name, id_column_name), (id,))
        mysqlresult = mysqlcursor.fetchall()
        snowflakecursor.execute("SELECT * FROM %s WHERE %s = %%s" % (table_name, id_column_name), (id,))
        snowflakeresult = snowflakecursor.fetchall()
        if mysqlresult != snowflakeresult:
            print("Data mismatch for id "+str(id))
            print("MySQL: "+str(mysqlresult))
            print("Snowflake: "+str(snowflakeresult))
            return False
    return True
        
    

In [3]:
config_mysql = {
    'user': getenv('DB_USER'),
    'password': getenv('DB_PASSWORD'),
    'host': '127.0.0.1',
    'database': 'instacart_db',
    'port': '3306'}
config_snowflake = {
    'user': getenv('SNOW_USER'),
    'password': getenv('SNOW_PASSWORD'),
    'account': getenv('SNOW_ACCOUNT'),
    'schema': 'RAW',
    'database': 'INSTACART_DB',}
mysqlcnx = mysql.connector.connect(**config_mysql)
mysqlcursor = mysqlcnx.cursor()
snowflakecnx = snowflake.connector.connect(**config_snowflake)
snowflakecursor = snowflakecnx.cursor()
    

In [8]:
#Consultar tabla aisles en MySQL para validar la conexión
mysqlcursor.execute("SELECT * FROM departments")
for x in mysqlcursor:
    print(x)

(1, 'frozen')
(2, 'other')
(3, 'bakery')
(4, 'produce')
(5, 'alcohol')
(6, 'international')
(7, 'beverages')
(8, 'pets')
(9, 'dry goods pasta')
(10, 'bulk')
(11, 'personal care')
(12, 'meat seafood')
(13, 'pantry')
(14, 'breakfast')
(15, 'canned goods')
(16, 'dairy eggs')
(17, 'household')
(18, 'babies')
(19, 'snacks')
(20, 'deli')
(21, 'missing')


In [9]:
#Consultar tabla aisles en Snowflake para validar la conexión
snowflakecursor.execute("SELECT * FROM departments")
for x in snowflakecursor:
    print(x)

(1, 'frozen')
(2, 'other')
(3, 'bakery')
(4, 'produce')
(5, 'alcohol')
(6, 'international')
(7, 'beverages')
(8, 'pets')
(9, 'dry goods pasta')
(10, 'bulk')
(11, 'personal care')
(12, 'meat seafood')
(13, 'pantry')
(14, 'breakfast')
(15, 'canned goods')
(16, 'dairy eggs')
(17, 'household')
(18, 'babies')
(19, 'snacks')
(20, 'deli')
(21, 'missing')


In [16]:
# Tabla aisles, 134 filas
print("Comparing aisles")
print("Data is consistent" if compare_data("aisles", "aisle_id", 10, 134, mysqlcnx, mysqlcursor, snowflakecnx, snowflakecursor)
      else "Data is inconsistent")
print()
# Tabla departments, 21 filas
print("Comparing departments")
print("Data is consistent" if compare_data("departments", "department_id", 10, 21, mysqlcnx, mysqlcursor, snowflakecnx, snowflakecursor)
      else "Data is inconsistent")
print()
# Tabla products, 49694 filas
print("Comparing products")
print("Data is consistent" if compare_data("products", "product_id", 10, 49694, mysqlcnx, mysqlcursor, snowflakecnx, snowflakecursor)
      else "Data is inconsistent")
print()
# Tabla order_products, 4545007 filas
print("Comparing order_products")
print("Data is consistent" if compare_data("order_products", "order_id", 10, 4545007, mysqlcnx, mysqlcursor, snowflakecnx, snowflakecursor)
      else "Data is inconsistent")
print()
# Tabla instacart_orders, 478967 filas
print("Comparing instacart_orders")
print("Data is consistent" if compare_data("instacart_orders", "order_id", 10, 478967, mysqlcnx, mysqlcursor, snowflakecnx, snowflakecursor)
      else "Data is inconsistent")


Comparing aisles
Data is consistent

Comparing departments
Data is consistent

Comparing products
Data is consistent

Comparing order_products
Data is consistent

Comparing instacart_orders
Data is consistent
