In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import re
import os
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

df_test = pd.read_csv("../ventas_limpias.csv")

print(df_test)
# [1.048.574  rows x 11 columns]
# 11 Columnas
# 1.048.575 rows


                   ciudad       fecha producto      tipo_producto  cantidad  \
0                santiago  2025-10-30    arepa          abarrotes       2.0   
1                 cordoba  2025-11-17    arepa          abarrotes       7.0   
2            barranquilla  2025-10-22    leche             lacteo       9.0   
3                new york  2025-10-20   cereal             lacteo       3.0   
4                  madrid  2025-10-20    leche              hogar       2.0   
...                   ...         ...      ...                ...       ...   
1048570       guadalajara  2025-10-23   yogurt             lacteo       9.0   
1048571  ciudad de mexico  2025-11-13  gaseosa              hogar       7.0   
1048572              lima  2025-10-30    arepa             bebida       8.0   
1048573            madrid  2025-10-23     cafe          abarrotes      10.0   
1048574             cusco  2025-10-22    queso  alimentopercedero       1.0   

         precio_unitario tipo_de_venta tipo_cliente

In [2]:
#Carga el archivo .env
load_dotenv(override=True)

#Variables de entorno

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")


URL = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(URL)

conn = engine.connect()
try:
    print("Connection Succesfull!!" if conn else "")

except Exception as e:
    print("Error al conectar la base datos en", e)
    
set = conn.execute(text("SET search_path TO riwi_ventas"))


Connection Succesfull!!


### Functions

In [3]:
with engine.connect() as conn:
    # Establecer el schema
    conn.execute(text("SET search_path TO riwi_ventas"))
    
    # 1. Get tables list en schema
    table_list = pd.read_sql(text("SELECT table_name FROM information_schema.tables WHERE table_schema = 'riwi_ventas';"), conn)

    print("Tables founded")

    idx = 0
       
    for table in table_list["table_name"]:
        idx +=1
        print(f"{idx}. {table}")

    

Tables founded
1. tipo_producto
2. producto
3. ciudad
4. factura_ventas
5. tipo_venta
6. tipo_cliente


In [4]:
with engine.connect() as conn:
    conn.execute(text("SET search_path TO riwi_ventas"))
    for table in table_list["table_name"]:
        verify = pd.read_sql(text(f"SELECT * FROM {table} limit 1000;"),conn)

        if not verify.empty:
            print(f"- {table}: exist and is filled!.\n")
        else:
            print(f"- {table}: doesn't exist or is empty.\n")

- tipo_producto: exist and is filled!.

- producto: exist and is filled!.

- ciudad: exist and is filled!.

- factura_ventas: exist and is filled!.

- tipo_venta: exist and is filled!.

- tipo_cliente: exist and is filled!.



In [5]:
with engine.connect() as conn:
    # Set the schema
    conn.execute(text("SET search_path TO riwi_ventas"))
    
    # 1. Get tables
    table_list = pd.read_sql(text("SELECT table_name FROM information_schema.tables WHERE table_schema = 'riwi_ventas';"), conn)
    
    # 2. Process each table
    for table in table_list["table_name"]:
        print(f"\nProcessing table: {table}")
        
        try:
            # Read the complete table
            query = text(f'SELECT * FROM riwi_ventas."{table}"')
            df = pd.read_sql(query, conn)
            
            print(f"Original rows: {len(df):,}")
            
            # Count nulls BEFORE
            nulls_before = df.isnull().sum().sum()
            if nulls_before > 0:
                print(f"Null values found: {nulls_before:,}")
            
            # Delete all rows with any null value
            clean_df = df.dropna()
            
            # Calculate removed rows
            removed_rows = len(df) - len(clean_df)
            
            print(f"Rows removed: {removed_rows:,}")
            print(f"Clean rows: {len(clean_df):,}")
            
            if removed_rows > 0:
                # 3. Delete ALL data from table
                print(f"Deleting old data...")
                conn.execute(text(f'DELETE FROM riwi_ventas."{table}"'))
                
                # 4. Insert clean data by chunks
                CHUNK_SIZE = 10000 
                
                # Read clean data by chunks
                for i in range(0, len(clean_df), CHUNK_SIZE):
                    # Get chunk
                    chunk = clean_df.iloc[i:i + CHUNK_SIZE]
                    
                    # Replace NaN with None for SQL
                    chunk = chunk.where(pd.notnull(chunk), None)
                    
                    # Get column names
                    cols = chunk.columns.tolist()
                    colnames = ", ".join([f'"{c}"' for c in cols])
                    placeholders = ", ".join([f":{c}" for c in cols])
                    
                    # Create query
                    query = text(f"""
                        INSERT INTO riwi_ventas."{table}" ({colnames})
                        VALUES ({placeholders});
                    """)
                    
                    # Execute all rows in chunk
                    conn.execute(query, chunk.to_dict(orient="records"))
                    
                    print(f"Chunk {i//CHUNK_SIZE + 1} inserted ({len(chunk)} rows)")
                
                print(f"Table {table} was inserted successfully!")
            else:
                print(f"Table {table} was already clean, no changes")
            
        except Exception as e:
            print(f"Error processing {table}: {str(e)}")
            continue
    
    # Commit all changes
    conn.commit()

print("\nAll tables processed successfully!")


Processing table: tipo_producto
Original rows: 7
Null values found: 1
Rows removed: 1
Clean rows: 6
Deleting old data...
Error processing tipo_producto: (psycopg2.errors.ForeignKeyViolation) update or delete on table "tipo_producto" violates foreign key constraint "producto_tipo_producto_id_fkey" on table "producto"
DETAIL:  Key (tipo_producto_id)=(1) is still referenced from table "producto".

[SQL: DELETE FROM riwi_ventas."tipo_producto"]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

Processing table: producto
Error processing producto: (psycopg2.errors.InFailedSqlTransaction) current transaction is aborted, commands ignored until end of transaction block

[SQL: SELECT * FROM riwi_ventas."producto"]
(Background on this error at: https://sqlalche.me/e/20/2j85)

Processing table: ciudad
Error processing ciudad: (psycopg2.errors.InFailedSqlTransaction) current transaction is aborted, commands ignored until end of transaction block

[SQL: SELECT * FROM riwi_ventas."ciuda

# **REPORTE DE CALIDAD DE DATOS**

## **RESUMEN GENERAL**

| ASPECTO | VALOR | OBSERVACIONES | CALIFICACION |
|---------|-------|---------------|--------------|
| Total Tablas Analizadas | 6 tablas | Todas las tablas del schema riwi_ventas | COMPLETO |
| Total Registros | 1,048,745 registros | Suma de todas las filas de todas las tablas | ALTO VOLUMEN |
| Registros con Problemas | 1,154 registros | Aproximadamente 0.11% del total | BUENA CALIDAD |
| Tablas con Problemas | 4 de 6 tablas | 66% de las tablas tienen algun problema | ATENCION REQUERIDA |
| Principal Problema | Valores Nulos | 1,146 nulos en fechas de ventas | PROBLEMA CRITICO |

---

## **DETALLE POR TABLA**

| NOMBRE TABLA | TOTAL FILAS | FILAS CON PROBLEMAS | % PROBLEMAS | TIPO DE PROBLEMA | RECOMENDACION |
|--------------|-------------|---------------------|-------------|------------------|---------------|
| factura_ventas | 1,048,575 | 1,146 | 0.11% | Fechas nulas (1,146) | ELIMINAR registros |
| ciudad | 34 | 1 | 2.94% | Nombre ciudad nulo | Completar manualmente |
| producto | 90 | 6 | 6.67% | Nombre producto nulo | Completar manualmente |
| tipo_cliente | 5 | 0 | 0.00% | Sin problemas | MANTENER |
| tipo_producto | 7 | 1 | 14.29% | Tipo producto nulo | Completar manualmente |
| tipo_venta | 5 | 1 | 20.00% | Tipo venta nulo | Completar manualmente |

**ESCALA DE CALIDAD:**
- 0-1% = EXCELENTE
- 1-5% = ACEPTABLE  
- 5-10% = REGULAR
- 10% = CRITICO

---

## **TIPOS DE PROBLEMAS ENCONTRADOS**

| TIPO DE PROBLEMA | CANTIDAD | % DEL TOTAL | TABLAS AFECTADAS | GRAVEDAD |
|------------------|----------|-------------|------------------|----------|
| Valores Nulos | 1,155 | 99.9% | factura_ventas, ciudad, producto, tipo_producto, tipo_venta | ALTA |
| Duplicados | 0 | 0% | Ninguna | NULA |
| Formatos Incorrectos | 0 | 0% | Ninguna | NULA |
| Valores Fuera de Rango | 0 | 0% | Ninguna | NULA |
| Relaciones Rotas | 0 | 0% | Ninguna | NULA |