# CSV a txts

Vamos a generar los inserts para las tablas de Autosummit Perú SAC

Importar librerías

In [29]:
import pandas as pd
import numpy as np

Abrir CSV

In [30]:
# Load the source export once; the cells below read their columns from it.
# (comma is already the default separator for read_csv)
ds_ASP = pd.read_csv('ASP.csv')

Ejemplos

In [31]:
# Preview of the raw 'Cliente' column.
# NOTE(review): the rendered output shows customer names and ID numbers —
# clear it before sharing the notebook.
ds_ASP['Cliente']

0                (42206340) DANIELLA MARIA BOLAÑOS GAMERO
1                         (20100115663) PANDERO S.A. EAFC
2       (09468059) KATIA NATHALI DE LOAYZA WONG DE PAC...
3        (46472213) JHONATHAN MITCHELL ANTEZANA ESCALANTE
4                (42607724) KRISCIA ZULAY REATEGUI ZAMORA
                              ...                        
1253                                                  NaN
1254                                                  NaN
1255                                                  NaN
1256                                                  NaN
1257                                                  NaN
Name: Cliente, Length: 1258, dtype: object

<h3>Filtrado de colores</h3>

In [None]:
# Distinct, non-blank colour names from the CSV. Sorting keeps the ids stable
# between runs: a colour's id is its 1-based position in `colors`.
colors_rep = ds_ASP['Color']
colors = sorted({
    c for c in colors_rep
    if isinstance(c, str) and c.strip() != '' and c.lower() != 'nan'
})

# Migration text: inserts under "migrate:up", full delete under "migrate:down".
lineas_colors = []
for color_id, c in enumerate(colors, start=1):  # `color_id` avoids shadowing builtin id()
    c_escaped = c.replace("'", "''")  # double single quotes inside SQL literals
    lineas_colors.append(
        f"INSERT INTO colors (id, nombre) VALUES ({color_id}, '{c_escaped}');\n"
    )

text_colors = (
    '-- migrate:up\n\n'
    + ''.join(lineas_colors)
    + '\n-- migrate:down\nDELETE FROM colors;'
)

# Explicit UTF-8, like the other SQL writers in this notebook, so accented
# names are written identically on every platform.
with open('inserts_colors.sql', 'w', encoding='utf-8') as f:
    f.write(text_colors)


<p>Filtrado por Asesor </p>

In [33]:
import re
# Raw 'Asesor' column. The `asesores` container is created below as a dict,
# so the unused list placeholder that used to follow this line was dropped.
asesores_rep = ds_ASP['Asesor']

# Words that mark the cell as a note/location instead of a person's name
PALABRAS_INVALIDAS = {
    "PDI", "EXHIBICION", "ATE", "CASO", "ENTREGA"
}

# Dict used as an ordered set: keys are (nombres, apellidos) tuples
asesores = {}

def es_asesor_valido(texto):
    """Return True when `texto` looks like a person's full name.

    Rules, applied to the stripped, upper-cased text:
      * it must be a string with at least two words,
      * it may contain only letters (including Á É Í Ó Ú Ü Ñ) and spaces,
        which also rules out commas and digits,
      * none of its words may be in PALABRAS_INVALIDAS.

    Invalid words are compared word by word, not as substrings, so surnames
    such as "ZARATE" or names such as "MATEO" (which contain "ATE") are kept.
    """
    if not isinstance(texto, str):
        return False

    texto = texto.strip().upper()
    palabras = texto.split()

    # At least first name + surname
    if len(palabras) < 2:
        return False

    # Letters and spaces only
    if not re.fullmatch(r"[A-ZÁÉÍÓÚÜÑ ]+", texto):
        return False

    # Whole-word comparison against the blacklist
    return PALABRAS_INVALIDAS.isdisjoint(palabras)


# Collect the distinct advisors as (nombres, apellidos) pairs.
for c in asesores_rep:
    if not es_asesor_valido(c):
        continue

    # Collapse repeated blanks so "ANA  RUIZ" and "ANA RUIZ" are one advisor
    # and the surname never starts with a stray space.
    c = " ".join(c.upper().split())

    # Heuristic: first word is the given name, the rest are surnames.
    nombres, apellidos = c.split(" ", 1)
    asesores[(nombres, apellidos)] = True

# Ids follow the alphabetical order of the pairs. Values need no quote
# escaping: es_asesor_valido only lets letters and spaces through.
sql_asesores = "-- migrate:up\n\n"

for id_asesor, (nombres, apellidos) in enumerate(sorted(asesores), start=1):
    sql_asesores += (
        "INSERT INTO asesores (id, nombres, apellidos) "
        f"VALUES ({id_asesor}, '{nombres}', '{apellidos}');\n"
    )

sql_asesores += "\n-- migrate:down\nDELETE FROM asesores;\n"

with open("inserts_asesores.sql", "w", encoding="utf-8") as f:
    f.write(sql_asesores)


<h3>Descomposición en marca (nombre) y modelo (nombre, version, año)</h3>

<p> Considerar que Marca no está de forma explícita en el CSV. Solución: inferir y filtrar con diccionarios </p>

In [34]:
import re

# =========================
# 1. COLUMNA CSV
# =========================
# Raw 'Modelo' column: one free-text vehicle description per CSV row
modelos_rep = ds_ASP['Modelo']

# Accumulators (dict keys de-duplicate automatically)
marcas = dict()      # brand name -> numeric id, e.g. { "FORD": 1 }
modelos = dict()     # (modelo_base, version, traccion, anio, motor, marca) -> True

# ---- Controlled catalogues ----

# Brand -> keywords whose presence in the description identifies that brand
MAPA_MARCAS = dict(
    FORD=[
        "TERRITORY",
        "RANGER",
        "F-150",
        "MAVERICK",
        "EXPLORER",
        "ESCAPE",
        "BRONCO",
        "MUSTANG",
        "EXPEDITION",
    ],
    CHERY=[
        "TIGGO",
        "ARRIZO",
        "M7",
        "HIMLA",
    ],
)

# Tokens accepted as part of the trim/version name
VERSIONES_VALIDAS = set([
    "TITANIUM", "TREND", "XLS", "XLT", "XL", "LTD",
    "PLATINUM", "RAPTOR", "BADLANDS", "LARIAT",
    "ACTIVE", "ST", "PREMIUM", "PRO", "MAX",
    "BIG", "BEND", "TREMOR", "GT",
])

# Drivetrain tokens
TRACCIONES = set(["4X2", "4X4", "AWD", "4WD"])

# Technical tokens recognised but not stored
DESCARTES = set(["MT", "AT", "CVT", "DCT", "FHEV", "MHEV", "PHEV", "GLP", "GNV"])

# =========================
# FUNCIONES
# =========================

def detectar_marca(texto: str):
    """Return the first brand of MAPA_MARCAS with a keyword inside `texto`.

    `texto` is upper-cased and each catalogue keyword is looked up with a
    plain substring test, brand by brand in dictionary order. Returns None
    when no keyword is found.
    """
    en_mayusculas = texto.upper()
    return next(
        (
            nombre_marca
            for nombre_marca, palabras_clave in MAPA_MARCAS.items()
            if any(clave in en_mayusculas for clave in palabras_clave)
        ),
        None,
    )


def extraer_anio(texto: str):
    """Return the first four-digit 20xx year found in `texto`, or None.

    The year must not be glued to other digits, so a fragment of a longer
    number (for example "2024" inside "120245") is not taken as a year.
    Letters may still touch it ("MY2024" yields 2024).
    """
    match = re.search(r"(?<!\d)(20\d{2})(?!\d)", texto)
    return int(match.group(1)) if match else None


def separar_modelo(texto: str):
    """Split a model description into its parts.

    Returns a tuple (modelo_base, version_modelo, traccion, motor):
      * modelo_base: first token left after removing the technical tokens,
        or None when nothing is left (empty text or only technical tokens);
      * version_modelo: remaining tokens found in VERSIONES_VALIDAS, joined
        by spaces ("" when there are none);
      * traccion: last token found in TRACCIONES, or None;
      * motor: displacement as a float taken from the last token that starts
        like "1.5", "2.0T" or "3.2L", or None.
    """
    tokens = texto.upper().split()

    motor = None
    traccion = None
    usados = set()

    for t in tokens:
        # Engine size: a digit, a dot and a digit at the start of the token
        motor_match = re.match(r"(\d\.\d)(T|L)?", t)
        if motor_match:
            motor = float(motor_match.group(1))
            usados.add(t)

        # Drivetrain
        elif t in TRACCIONES:
            traccion = t
            usados.add(t)

        # Technical tokens that are recognised but not stored
        elif t in DESCARTES:
            usados.add(t)

    # Drop everything already consumed above
    limpio = [t for t in tokens if t not in usados]

    # Nothing left to name the model: report it as missing instead of
    # failing on limpio[0], so callers can skip the row.
    if not limpio:
        return None, "", traccion, motor

    modelo_base = limpio[0]
    version_modelo = " ".join(t for t in limpio[1:] if t in VERSIONES_VALIDAS)

    return modelo_base, version_modelo, traccion, motor

# =========================
# PROCESAMIENTO DEL CSV
# =========================

# Parse every description; keep only rows where brand, year, base model,
# version, drivetrain and engine could all be determined.
for fila in modelos_rep:
    # NaN / non-text cells and blank strings carry no model
    if not (isinstance(fila, str) and fila.strip() != ""):
        continue

    fila = fila.strip().upper()

    marca = detectar_marca(fila)
    anio = extraer_anio(fila)
    if not (marca and anio):
        continue

    partes = separar_modelo(fila)
    modelo_base, version, traccion, motor = partes
    if not all(partes):
        continue

    # Brand ids are handed out in order of first appearance
    marcas.setdefault(marca, len(marcas) + 1)

    key = (modelo_base, version, traccion, anio, motor, marca)
    modelos[key] = True

# =========================
# SQL: MARCAS
# =========================

# Migration text for the brand catalogue, using the ids assigned while parsing
lineas_marcas = [
    "INSERT INTO marcas (id_marca, nombre) " + f"VALUES ({id_marca}, '{marca}');\n"
    for marca, id_marca in marcas.items()
]

sql_marcas = (
    "-- migrate:up\n\n"
    + "".join(lineas_marcas)
    + "\n-- migrate:down\nDELETE FROM marcas;\n"
)

with open("inserts_marcas.sql", "w", encoding="utf-8") as f:
    f.write(sql_marcas)

# =========================
# SQL: MODELOS
# =========================

# Migration text for the models; ids follow the iteration order of `modelos`
id_modelo = 1
lineas_modelos = []

for clave in modelos:
    modelo_base, version, traccion, anio, motor, marca = clave
    marca_id = marcas[marca]  # foreign key to the brand catalogue

    encabezado = (
        "INSERT INTO modelos "
        "(id_modelo, modelo_base, version_modelo, traccion, anio, motor, marca_id) "
    )
    valores = (
        f"VALUES ({id_modelo}, '{modelo_base}', '{version}', "
        f"'{traccion}', {anio}, {motor}, {marca_id});\n"
    )
    lineas_modelos.append(encabezado + valores)
    id_modelo += 1

sql_modelos = (
    "-- migrate:up\n\n"
    + "".join(lineas_modelos)
    + "\n-- migrate:down\nDELETE FROM modelos;\n"
)

with open("inserts_modelos.sql", "w", encoding="utf-8") as f:
    f.write(sql_modelos)


<h2>Filtrado de GPS</h2>
<p>Eliminar si tiene la palabra "GPS" para dejar el resto del texto </p>

In [35]:
import pandas as pd
import re

def limpiar_gps(texto):
    """Reduce a raw GPS cell to upper-case letters separated by single spaces.

    Returns None for non-string input; otherwise the cleaned text, which may
    be the empty string.
    """
    if not isinstance(texto, str):
        return None

    # Phrases removed wherever they occur (plain substring removal, in order)
    frases_descartadas = (
        "OK", "INSTALADO", "COORDINAR", "CLIENTE",
        "NO APLICA", "CAMPAÑA", "AUTOPLAN",
    )

    limpio = texto.strip().upper()
    limpio = re.sub(r"\(.*?\)", "", limpio)   # anything in parentheses
    limpio = re.sub(r"\d+/\d+", "", limpio)   # digits/digits fragments such as 12/05

    for frase in frases_descartadas:
        limpio = limpio.replace(frase, "")

    # Keep only A-Z and blanks, then squeeze runs of whitespace
    solo_letras = re.sub(r"[^A-Z ]", "", limpio)
    return re.sub(r"\s+", " ", solo_letras).strip()

def detectar_gps_base(texto):
    """Return "GPS <PROVIDER>" for the first known provider found in `texto`.

    Providers are checked in the fixed order below, so a text naming two of
    them always resolves to the same one. Returns None when `texto` is not a
    string or names no known provider.
    """
    # Tuple, not set: iteration order of a set of strings can change between
    # interpreter runs, which made the winner (and the ids) unstable.
    GPS_VALIDOS = (
        "COMSATEL", "HUNTER", "SUPRA", "PANDERO",
        "PROTEMAX", "MAQUISISTEMAS", "EURORENTING",
    )

    # limpiar_gps returns None for non-text cells; treat that as "no provider"
    if not isinstance(texto, str):
        return None

    # Canonical form: 'GPS {PROVIDER}'
    for palabraGPS in GPS_VALIDOS:
        if palabraGPS in texto:
            return f"GPS {palabraGPS}"
    return None

ds = pd.read_csv("ASP.csv", encoding="utf-8")

# Canonical GPS name -> id, numbered in order of first appearance
gps_dict = {}
id_gps = 1

for g in ds['GPS']:
    nombreGPSLimpio = limpiar_gps(g)
    gps_base = detectar_gps_base(nombreGPSLimpio) if nombreGPSLimpio else None

    # Unusable cells and already-registered providers add nothing
    if gps_base and gps_base not in gps_dict:
        gps_dict[gps_base] = id_gps
        id_gps += 1

# Migration text for the GPS catalogue
lineas_gps = [
    "INSERT INTO gps (id, nombre) " + f"VALUES ({id_}, '{nombre}');\n"
    for nombre, id_ in gps_dict.items()
]

sql_gps = (
    "-- migrate:up\n\n"
    + "".join(lineas_gps)
    + "\n-- migrate:down\nDELETE FROM gps;\n"
)

with open("inserts_gps.sql", "w", encoding="utf-8") as f:
    f.write(sql_gps)


<h2>Filtrado de Clientes </h2>
<p>La columna Cliente contiene el nombre de la empresa o persona natural junto con su DNI/RUC. El objetivo principal es separar el número de identificación del nombre y clasificar cada cliente como persona natural o empresa.</p>

In [36]:
import re

# Raw 'Cliente' column; per the preview near the top of the notebook, cells
# look like "(<identification number>) <name>" or are NaN.
clientes_rep = ds_ASP['Cliente']

# ===================== FUNCIONES =====================

def limpiar_cliente(texto):
    """Split a client cell into identification number and name.

    Expected shape: "(20100115663) PANDERO S.A. EAFC".

    Returns (numero, nombre) where `numero` is the digit string found between
    the leading parentheses and `nombre` is the rest, stripped and
    upper-cased. Returns (None, None) when `texto` is not a string or does
    not start with that pattern.
    """
    encontrado = None
    if isinstance(texto, str):
        encontrado = re.match(r"\((\d+)\)\s*(.+)", texto.strip())

    if encontrado is None:
        return None, None

    numero, nombre = encontrado.groups()
    return numero, nombre.strip().upper()


# Words (legal forms and generic terms) that mark a name as a company
PALABRAS_EMPRESA = {
    "S.A", "S.A.", "S.A.C", "SAC", "SOCIEDAD",
    "EMPRESA", "E.A.F.C", "EAFC", "CORPORACION",
    "GRUPO", "GROUP", "E.I.R.L", "SERVICIOS",
    "SRL", "S.R.L"
}

def es_empresa(nombre):
    """Return True when one of the words of `nombre` is in PALABRAS_EMPRESA.

    The comparison is done word by word (after trimming surrounding
    punctuation), not as a substring search, so surnames that merely contain
    a marker — "SACO" or "PASACHE" contain "SAC" — are not taken for
    companies. "S.A.", "S.A.C." and similar forms with a trailing dot match
    their entry without it.
    """
    for palabra in nombre.upper().split():
        if palabra in PALABRAS_EMPRESA or palabra.strip(".,;()") in PALABRAS_EMPRESA:
            return True
    return False


# ===================== ESTRUCTURAS =====================

clientes = {}          # id_cliente -> (numero, nombre)
dict_cliente = {}      # numero -> id_cliente (reverse lookup used by the vehicle cell)
personas = set()       # ids of natural persons
empresas = set()       # ids of companies

id_cliente = 1

# ===================== PROCESSING =====================

for fila in clientes_rep:
    numero, nombre = limpiar_cliente(fila)

    if not numero or not nombre:
        continue

    # The CSV repeats a client on every row that mentions it; register each
    # identification number once so the catalogue has no duplicate rows.
    if numero in dict_cliente:
        continue

    clientes[id_cliente] = (numero, nombre)
    dict_cliente[numero] = id_cliente

    if es_empresa(nombre):
        empresas.add(id_cliente)
    else:
        personas.add(id_cliente)

    id_cliente += 1


# ===================== SQL CLIENTES =====================

# One INSERT per registered client; single quotes in names are doubled
lineas_clientes = []
for id_cliente, datos in clientes.items():
    numero, nombre = datos
    nombre = nombre.replace("'", "''")
    lineas_clientes.append(
        "INSERT INTO clientes (id_cliente, Numero_Identificacion, nombre) "
        + f"VALUES ({id_cliente}, '{numero}', '{nombre}');\n"
    )

sql_clientes = (
    "-- migrate:up\n\n"
    + "".join(lineas_clientes)
    + "\n-- migrate:down\nDELETE FROM clientes;\n"
)

with open("inserts_clientes.sql", "w", encoding="utf-8") as f:
    f.write(sql_clientes)


# ===================== SQL PERSONA NATURAL =====================

# Natural persons: the row id reuses the client id it points to
lineas_personas = []
for id_cliente in personas:
    lineas_personas.append(
        "INSERT INTO persona_natural (id_persona, cliente_id) "
        + f"VALUES ({id_cliente}, {id_cliente});\n"
    )

sql_personas = (
    "-- migrate:up\n\n"
    + "".join(lineas_personas)
    + "\n-- migrate:down\nDELETE FROM persona_natural;\n"
)

with open("inserts_persona_natural.sql", "w", encoding="utf-8") as f:
    f.write(sql_personas)


# ===================== SQL EMPRESA =====================

# Companies: the row id reuses the client id it points to
lineas_empresas = []
for id_cliente in empresas:
    lineas_empresas.append(
        "INSERT INTO empresa (id_empresa, cliente_id) "
        + f"VALUES ({id_cliente}, {id_cliente});\n"
    )

sql_empresas = (
    "-- migrate:up\n\n"
    + "".join(lineas_empresas)
    + "\n-- migrate:down\nDELETE FROM empresa;\n"
)

with open("inserts_empresa.sql", "w", encoding="utf-8") as f:
    f.write(sql_empresas)


Filtrado de vehículos

In [None]:
import pandas as pd

# ===============================
# IMPORTAR DICCIONARIOS
# ===============================

from colors import dict_color
from modelo import dict_modelo
from gps import gps_dict, limpiar_gps, detectar_gps_base
from clientes import dict_cliente, limpiar_cliente


# ===============================
# FUNCIONES
# ===============================

def limpiar(texto):
    """Return `texto` as a stripped string with single quotes doubled for SQL.

    Missing values (anything pd.isna reports as missing) give None.
    """
    return None if pd.isna(texto) else str(texto).strip().replace("'", "''")


# ===============================
# LEER CSV
# ===============================

df = pd.read_csv("ASP.csv", encoding="utf-8")


# ===============================
# UNIQUE ROWS
# ===============================

# Column names as the rest of the notebook reads them from the same CSV:
# 'Color', 'Modelo' and 'Cliente' are not upper-case in the header.
vehiculos = df[['PLACA', 'VIN', 'Color', 'Modelo', 'GPS', 'Cliente']].drop_duplicates().to_numpy()


# ===============================
# BUILD INSERTS
# ===============================

text = '-- migrate:up \n\n'

for v in vehiculos:

    placa = limpiar(v[0])
    vin = limpiar(v[1])

    color_text = limpiar(v[2])
    modelo_text = limpiar(v[3])
    gps_text = limpiar_gps(v[4])
    # limpiar_cliente returns (numero, nombre); only the number is a lookup key
    cliente_num, _ = limpiar_cliente(v[5])

    color_id = dict_color.get(color_text)
    modelo_id = dict_modelo.get(modelo_text)
    # limpiar_gps gives None/"" for unusable cells; do not search those
    gps_base = detectar_gps_base(gps_text) if gps_text else None
    gps_id = gps_dict.get(gps_base)
    # NOTE(review): assumes dict_cliente is keyed by DNI/RUC — confirm against
    # the module it is imported from.
    cliente_id = dict_cliente.get(cliente_num)

    # Rows with any unresolved foreign key are reported and skipped
    if None in (color_id, modelo_id, gps_id, cliente_id):
        print(f"Error en datos: {placa}, {vin}")
        continue

    text += f"""INSERT INTO vehiculos 
(placa, vin, color_id, modelo_id, gps_id, cliente_id)
VALUES ('{placa}', '{vin}', {color_id}, {modelo_id}, {gps_id}, {cliente_id});
"""

text += '\n-- migrate:down \n\nDELETE FROM vehiculos;'


# ===============================
# WRITE FILE
# ===============================

with open('inserts_vehiculos.sql', 'w', encoding='utf-8') as archivo:
    archivo.write(text)

In [68]:
import pandas as pd
import re

# ===============================
# Cargar dataset
# ===============================

# Fresh read of the same CSV that the first cells loaded into ds_ASP.
ds = pd.read_csv("ASP.csv", encoding="utf-8")

# ===============================
# FUNCIONES
# ===============================

def limpiar_texto(t):
    """Return `t` stripped and upper-cased, or None when there is no text.

    Non-strings, blank strings and the literal "NAN" (any case) give None.
    """
    if isinstance(t, str):
        normalizado = t.strip().upper()
        if normalizado and normalizado != "NAN":
            return normalizado
    return None


def extraer_numero_cliente(texto):
    """Return the digits inside the leading "(...)" of a client cell, or None.

    None is returned for non-strings and for text that does not start (after
    stripping) with digits in parentheses.
    """
    if isinstance(texto, str):
        coincidencia = re.match(r"\((\d+)\)", texto.strip())
        if coincidencia:
            return coincidencia.group(1)
    return None


# ===============================
# REQUISITOS PREVIOS
# ===============================

# Must already exist (built by the earlier cells of this notebook):
# colors      -> sorted list of colour names (id = position + 1)
# modelos     -> dict keyed by (modelo_base, version, traccion, anio, motor, marca)
# gps_dict    -> dict { "GPS <PROVIDER>" : id_gps }
# clientes    -> dict { id_cliente : (numero, nombre) }
# plus the helpers detectar_marca, extraer_anio, separar_modelo,
# limpiar_gps and detectar_gps_base.

# ===============================
# LOOKUP TABLES
# ===============================

# Colour text (stripped, upper-cased like limpiar_texto does) -> id
color_ids = {}
for posicion, nombre_color in enumerate(colors, start=1):
    color_ids.setdefault(nombre_color.strip().upper(), posicion)

# Model key -> id; the models INSERT file numbers the keys in this same order
modelo_ids = {clave: posicion for posicion, clave in enumerate(modelos, start=1)}

# Identification number -> client id (first id wins if a number repeats)
cliente_ids = {}
for id_existente, (numero_doc, _nombre) in clientes.items():
    cliente_ids.setdefault(numero_doc, id_existente)


def _clave_modelo(texto_modelo):
    """Rebuild the `modelos` key for a model description, or None if unparseable.

    Applies the same steps as the model-parsing cell so the resulting tuple
    can be looked up in `modelo_ids`.
    """
    marca_detectada = detectar_marca(texto_modelo)
    anio_detectado = extraer_anio(texto_modelo)
    if not marca_detectada or not anio_detectado:
        return None
    base, version_txt, traccion_txt, motor_val = separar_modelo(texto_modelo)
    return (base, version_txt, traccion_txt, anio_detectado, motor_val, marca_detectada)


# ===============================
# VEHICLE INSERTS
# ===============================

sql_vehiculos = "-- migrate:up\n\n"
id_vehiculo = 1

for _, row in ds.iterrows():

    placa  = limpiar_texto(row['PLACA'])
    vin    = limpiar_texto(row['VIN'])
    color  = limpiar_texto(row['Color'])
    modelo = limpiar_texto(row['Modelo'])

    cliente_num = extraer_numero_cliente(row['Cliente'])

    if not all([placa, vin, color, modelo, cliente_num]):
        continue

    # ===== COLOUR =====
    color_id = color_ids.get(color)
    if color_id is None:
        continue

    # ===== MODEL =====
    modelo_id = modelo_ids.get(_clave_modelo(modelo))
    if modelo_id is None:
        continue

    # ===== GPS (optional) =====
    # gps_dict is keyed by the canonical "GPS <PROVIDER>" name, so the raw
    # cell has to go through the same cleaning used to build it.
    gps_limpio = limpiar_gps(row['GPS'])
    gps_base = detectar_gps_base(gps_limpio) if gps_limpio else None
    gps_id = gps_dict.get(gps_base) if gps_base else None

    # ===== CLIENT =====
    cliente_id = cliente_ids.get(cliente_num)
    if cliente_id is None:
        continue

    gps_sql = "NULL" if gps_id is None else gps_id

    # Double single quotes so the values are valid SQL literals
    placa_sql = placa.replace("'", "''")
    vin_sql = vin.replace("'", "''")

    sql_vehiculos += (
        "INSERT INTO vehiculos "
        "(id, placa, vin, colores_id, modelos_id, gps_id, clientes_id) "
        f"VALUES ({id_vehiculo}, '{placa_sql}', '{vin_sql}', "
        f"{color_id}, {modelo_id}, {gps_sql}, {cliente_id});\n"
    )

    id_vehiculo += 1


sql_vehiculos += "\n-- migrate:down\nDELETE FROM vehiculos;\n"

with open("inserts_vehiculos.sql", "w", encoding="utf-8") as f:
    f.write(sql_vehiculos)


Filtrado de recepciones_campañas

In [None]:
import pandas as pd

# ===============================
# Cargar dataset
# ===============================

ds = pd.read_csv("ASP.csv", encoding="utf-8")


def _normalizar_campania(valor):
    """Return the stripped, upper-cased campaign name, or None when unusable.

    Non-text cells, blank text and the placeholders "SIN CAMPAÑA" / "NO APLICA"
    give None.
    """
    if not isinstance(valor, str):
        return None
    nombre = valor.strip().upper()
    if nombre == "" or nombre in {"SIN CAMPAÑA", "NO APLICA"}:
        return None
    return nombre


# ===============================
# CAMPAIGNS
# ===============================

# Ids follow the alphabetical order of the valid names, which is how the
# campañas INSERT cell numbers them, so the foreign keys written below line up.
# NOTE(review): confirm that cell is the source of truth for campaign ids.
nombres_campania = sorted(
    {n for n in map(_normalizar_campania, ds['CAMPAÑA']) if n}
)
campanias = {nombre: i for i, nombre in enumerate(nombres_campania, start=1)}
id_campania = len(campanias) + 1

# ===============================
# RECEPTIONS
# ===============================

# Reception date text -> id, numbered in order of first appearance
recepciones = {}
id_recepcion = 1

for fecha_cruda in ds['FECHA DE RECEPCION DEL VEHICULO']:
    if pd.isna(fecha_cruda):
        continue

    fecha = str(fecha_cruda).strip()

    if fecha not in recepciones:
        recepciones[fecha] = id_recepcion
        id_recepcion += 1

# ===============================
# N:M RELATION
# ===============================

relaciones = set()

for valor_camp, valor_fecha in zip(ds['CAMPAÑA'], ds['FECHA DE RECEPCION DEL VEHICULO']):

    camp = _normalizar_campania(valor_camp)

    # Test missing dates with pd.isna: str(NaN) is "nan" in lower case, so a
    # comparison with "NAN" lets it through and the lookup then fails.
    if camp is None or pd.isna(valor_fecha):
        continue

    fecha = str(valor_fecha).strip()
    if not fecha:
        continue

    camp_id = campanias.get(camp)
    rec_id = recepciones.get(fecha)
    if camp_id is None or rec_id is None:
        continue

    relaciones.add((camp_id, rec_id))

# ===============================
# SQL
# ===============================

sql_rel = "-- migrate:up\n\n"

for camp_id, rec_id in sorted(relaciones):
    sql_rel += (
        "INSERT INTO recepciones_campañas (campañas_id, recepciones_id) "
        f"VALUES ({camp_id}, {rec_id});\n"
    )

sql_rel += "\n-- migrate:down\nDELETE FROM recepciones_campañas;\n"

with open("inserts_recepciones_campañas.sql", "w", encoding="utf-8") as f:
    f.write(sql_rel)


Filtrado de ubicaciones

In [None]:
import pandas as pd

# ===============================
# Cargar dataset
# ===============================

# Fresh read of the same CSV; this cell takes the 'UBICACIÓN' column from it.
ds = pd.read_csv("ASP.csv", encoding="utf-8")

# ===============================
# LIMPIEZA
# ===============================

def limpiar_texto(txt):
    """Return `txt` stripped and upper-cased, or None when there is no text.

    This cell redefines the helper of the same name declared in the vehicle
    cell. Both definitions now agree: non-strings, blank strings and the
    literal "NAN" (any case) all give None, so the notebook behaves the same
    whichever definition was run last.
    """
    if not isinstance(txt, str):
        return None
    txt = txt.strip().upper()
    return txt if txt and txt != "NAN" else None

# ===============================
# DISTRITOS (CATALOGO)
# ===============================

# Distinct location texts, numbered in order of first appearance
distritos = {}
id_distrito = 1

for u in ds['UBICACIÓN']:
    u = limpiar_texto(u)
    if u and u not in distritos:
        distritos[u] = id_distrito
        id_distrito += 1

# ===============================
# LOCATIONS
# ===============================

# Each location text points at the catalogue entry with the same text
textos_ubicacion = (limpiar_texto(u) for u in ds['UBICACIÓN'])
ubicaciones = {u: distritos[u] for u in textos_ubicacion if u}

# ===============================
# SQL DISTRITOS
# ===============================

# Migration text for the district catalogue
sql_distritos = "-- migrate:up\n\n"

for nombre, id_ in distritos.items():
    # Names come from free text in the CSV: double any single quote so the
    # statement stays valid, as the colour and client writers do.
    nombre_sql = nombre.replace("'", "''")
    sql_distritos += (
        "INSERT INTO distritos (id, nombre) "
        f"VALUES ({id_}, '{nombre_sql}');\n"
    )

sql_distritos += "\n-- migrate:down\nDELETE FROM distritos;\n"

with open("inserts_distritos.sql", "w", encoding="utf-8") as f:
    f.write(sql_distritos)

# ===============================
# SQL UBICACIONES
# ===============================

# Migration text for the locations and their district foreign key
sql_ubicaciones = "-- migrate:up\n\n"

for ubicacion, distrito_id in ubicaciones.items():
    # Free text from the CSV: double any single quote so the statement stays
    # valid, as the colour and client writers do.
    ubicacion_sql = ubicacion.replace("'", "''")
    sql_ubicaciones += (
        "INSERT INTO ubicaciones (ubicacion, distritos_id) "
        f"VALUES ('{ubicacion_sql}', {distrito_id});\n"
    )

sql_ubicaciones += "\n-- migrate:down\nDELETE FROM ubicaciones;\n"

with open("inserts_ubicaciones.sql", "w", encoding="utf-8") as f:
    f.write(sql_ubicaciones)


Filtrado de teléfonos

In [None]:
import re

# Raw 'Celular' column from the CSV
telefonos_rep = ds_ASP['Celular']

# Filled by the loop below; dict keys de-duplicate the numbers
telefonos = {}

def telefono_valido(t):
  """Return True when `t` is a nine-digit phone number.

  Accepts text (surrounding blanks ignored) and also numeric cells: pandas
  reads an all-digit column as integers, or as floats when it has gaps, so
  987654321 and 987654321.0 are valid too. NaN, fractional numbers, booleans
  and any other type are rejected.
  """
  import numbers

  if isinstance(t, bool):
    return False

  if isinstance(t, numbers.Integral):
    t = str(int(t))
  elif isinstance(t, numbers.Real):
    valor = float(t)
    # NaN and infinities are not integers either
    if not valor.is_integer():
      return False
    t = str(int(valor))
  elif not isinstance(t, str):
    return False

  return re.fullmatch(r"\d{9}", t.strip()) is not None





# Identification number -> client id, from the catalogue built by the
# clients cell (first id wins if a number repeats there).
ids_por_documento = {}
for id_existente, (numero_doc, _nombre) in clientes.items():
  ids_por_documento.setdefault(numero_doc, id_existente)

# phone number -> id of the client on the same CSV row.
# A running counter used to stand in for cliente_id, which tied each phone
# to an unrelated client and could point past the last client id.
telefonos = {}

for celda_cliente, t in zip(ds_ASP['Cliente'], ds_ASP['Celular']):
  if not telefono_valido(t):
    continue

  numero_doc, _nombre = limpiar_cliente(celda_cliente)
  cliente_id = ids_por_documento.get(numero_doc)
  if cliente_id is None:
    # No registered client on this row: nothing to attach the phone to
    continue

  # Dict keys de-duplicate; the first client seen with a number keeps it
  telefonos.setdefault(int(t), cliente_id)


sql_tel = "-- migrate:up\n\n"

for id_telefono, numero in enumerate(sorted(telefonos), start=1):
  sql_tel += (
    "INSERT INTO telefonos (id_telefono, numero, cliente_id) "
    f"VALUES ({id_telefono}, {numero}, {telefonos[numero]});\n"
  )

sql_tel += "\n-- migrate:down\nDELETE FROM telefonos;\n"

with open("inserts_telefono.sql", "w", encoding="utf-8") as f:
  f.write(sql_tel)



Filtrado de formas de pago

In [None]:
import pandas as pd

# Distinct payment-method texts. Strip BEFORE de-duplicating so "BCP" and
# "BCP " do not become two catalogue rows; blanks are dropped; alphabetical
# order keeps the ids stable.
formas_unicas = sorted(
    {str(forma).strip() for forma in ds_ASP['FORMA DE PAGO'].dropna()} - {""}
)

# name -> id, kept for the later payments step
formatos_diccionario = {}

text = '-- migrate:up \n\n'

id_counter = 1

for forma in formas_unicas:
    forma_escaped = forma.replace("'", "''")  # only the SQL literal is escaped

    text += f"INSERT INTO formatos_pago (id, nombre) VALUES ({id_counter}, '{forma_escaped}');\n"

    # Key by the real name, not the escaped one, so later lookups by the CSV
    # text succeed even when a name contains a quote.
    formatos_diccionario[forma] = id_counter

    id_counter += 1

text += '\n-- migrate:down \n\nDELETE FROM formatos_pago;'

with open('inserts_formatos_pago.sql', 'w', encoding='utf-8') as f:
    f.write(text)

print("Diccionario generado (para usarlo después en pagos):")

print(formatos_diccionario)

print(f"\nTotal de formatos únicos: {len(formatos_diccionario)}")

Diccionario generado (para usarlo después en pagos):
{'AUTOPLAN': 1, 'AUTOPLAN VB ENTREGA+BCP': 2, 'BBVA': 3, 'BBVA BCP': 4, 'BBVA BCP INTBK': 5, 'BBVA BCP INTBK SCTBK': 6, 'BBVA BCP INTERBANK': 7, 'BBVA BCP ITBK': 8, 'BBVA BCP ITBK NIUBIZ': 9, 'BBVA BCP SCTBK': 10, 'BBVA IBK BCP': 11, 'BBVA INTBK': 12, 'BBVA INTBK BCP': 13, 'BBVA INTERBANK': 14, 'BBVA ITBK': 15, 'BBVA NIUBIZ': 16, 'BBVA SCOTBK': 17, 'BBVA SCOTIABANK': 18, 'BBVA SCOTIABANK MAF': 19, 'BBVA SCTBK BCP': 20, 'BBVA VISA': 21, 'BCP': 22, 'BCP BBVA': 23, 'BCP BBVA INTBK': 24, 'BCP BBVA MAF': 25, 'BCP INTBK': 26, 'BCP INTBK FONBIENES': 27, 'BCP INTBK NIUBIZ': 28, 'BCP INTERBANK': 29, 'BCP INTERBANK SCOTIABANK BBVA': 30, 'BCP ITBK': 31, 'BCP ITBK NIUBIZ': 32, 'BCP NIUBIZ': 33, 'BCP NIUBIZ+AUTOPLAN PEDIR VB': 34, 'BCP SANTANDER': 35, 'BCP SCOTBK': 36, 'BCP SCOTIABANK': 37, 'BCP SCTBK': 38, 'BCP SCTBK INTBK': 39, 'BCP VISA': 40, 'BCP VISA+PANDERO': 41, 'BCP+AUTOPLAN': 42, 'BCP+AUTOPLAN PEDIR VB PARA ENTREGA': 43, 'BCP+MAQUIMAS': 

Filtrado de campañas

In [None]:
import re



# Raw 'CAMPAÑA' column from the CSV
campanas_rep = ds_ASP['CAMPAÑA']



# Filled by the loop below; dict keys de-duplicate the names
campanas = {}



def campaña_valida(nombre):
  """Return True when `nombre` is text naming an actual campaign.

  Non-strings, blank text and the placeholders "SIN CAMPAÑA" / "NO APLICA"
  (compared after stripping and upper-casing) are rejected.
  """
  if isinstance(nombre, str):
    normalizado = nombre.strip().upper()
    return normalizado not in ("", "SIN CAMPAÑA", "NO APLICA")
  return False





# Collect the distinct valid campaign names (stripped, upper-cased)
for c in campanas_rep:
  if not campaña_valida(c):
    continue

  nombre = c.strip().upper()
  campanas[nombre] = True


sql_cam = "-- migrate:up\n\n"

id_campaña = 1

for nombre in sorted(campanas.keys()):
  # Same fixed discount for every campaign (assigned by criterion)
  descuento = 0.10

  # Campaign names are free text: double any single quote so the statement
  # stays valid, as the colour and client writers do.
  nombre_sql = nombre.replace("'", "''")

  sql_cam += (
    "INSERT INTO campañas (id_campaña, nombre, descuento) "
    f"VALUES ({id_campaña}, '{nombre_sql}', {descuento});\n"
  )

  id_campaña += 1


sql_cam += "\n-- migrate:down\nDELETE FROM campañas;\n"

with open("inserts_campañas.sql", "w", encoding="utf-8") as f:
  f.write(sql_cam)

Filtrado bancos

In [None]:
import pandas as pd
import re

# Lista de bancos/entidades financieras conocidas (ajustada a tus datos)

# Names and abbreviations of banks / financing entities seen in the data.
# NOTE(review): several entries look like spellings of the same entity
# (INTERBANK / INTBK / ITBK / IBK, SCOTIABANK / SCTBK / SCOTBK,
# NIUBIZ / NIUBIS); each currently becomes its own row — confirm whether
# they should be merged.
bancos_conocidos = [
    'BCP', 'BBVA', 'SANTANDER', 'INTERBANK', 'INTBK', 'ITBK', 'IBK',
    'SCOTIABANK', 'SCTBK', 'SCOTBK', 'NIUBIZ', 'NIUBIS',
    'MAF', 'VISA', 'PANDERO', 'LEASING', 'AUTOPLAN', 'FONBIENES',
    'MAQUIMAS', 'MAQUISITEMA', 'PROMOTORA OPCION', 'BANBIF'
]


def extraer_bancos(texto):
    """Return the set of entries of `bancos_conocidos` mentioned in `texto`.

    Missing values give an empty set. The text is upper-cased and an entry
    counts only when it is not glued to other letters, so "VISA" is found in
    "BCP VISA+PANDERO" but not inside "REVISAR". Digits and symbols next to
    the name do not prevent a match.
    """
    if pd.isna(texto):
        return set()

    texto_upper = str(texto).upper()

    return {
        banco
        for banco in bancos_conocidos
        if re.search(r'(?<![A-ZÑ])' + re.escape(banco) + r'(?![A-ZÑ])', texto_upper)
    }



# Colectar todos los bancos únicos

# Union of the banks found in every non-missing payment-method cell
all_bancos = set().union(
    *(extraer_bancos(forma) for forma in ds_ASP['FORMA DE PAGO'].dropna())
)

bancos_unicos = sorted(all_bancos)

# Ids follow alphabetical order, starting at 1
bancos_diccionario = {}
lineas_bancos = []
id_counter = 1

for banco in bancos_unicos:
    banco_escaped = banco.replace("'", "''")
    lineas_bancos.append(
        f"INSERT INTO bancos (id, nombre) VALUES ({id_counter}, '{banco_escaped}');\n"
    )
    bancos_diccionario[banco] = id_counter
    id_counter += 1

text = '-- migrate:up \n\n' + ''.join(lineas_bancos) + '\n-- migrate:down \n\nDELETE FROM bancos;'

with open('inserts_bancos.sql', 'w', encoding='utf-8') as f:
    f.write(text)

print("Bancos únicos encontrados:", bancos_unicos)

print("Diccionario de bancos:", bancos_diccionario)

print(f"Total de bancos únicos: {len(bancos_unicos)}")

Bancos únicos encontrados: ['AUTOPLAN', 'BANBIF', 'BBVA', 'BCP', 'FONBIENES', 'IBK', 'INTBK', 'INTERBANK', 'ITBK', 'LEASING', 'MAF', 'MAQUIMAS', 'MAQUISITEMA', 'NIUBIS', 'NIUBIZ', 'PANDERO', 'PROMOTORA OPCION', 'SANTANDER', 'SCOTBK', 'SCOTIABANK', 'SCTBK', 'VISA']
Diccionario de bancos: {'AUTOPLAN': 1, 'BANBIF': 2, 'BBVA': 3, 'BCP': 4, 'FONBIENES': 5, 'IBK': 6, 'INTBK': 7, 'INTERBANK': 8, 'ITBK': 9, 'LEASING': 10, 'MAF': 11, 'MAQUIMAS': 12, 'MAQUISITEMA': 13, 'NIUBIS': 14, 'NIUBIZ': 15, 'PANDERO': 16, 'PROMOTORA OPCION': 17, 'SANTANDER': 18, 'SCOTBK': 19, 'SCOTIABANK': 20, 'SCTBK': 21, 'VISA': 22}
Total de bancos únicos: 22


FILTRADO DISTRITOS

In [None]:
# ===============================
# IMPORTS
# ===============================
import pandas as pd
import random
import re


# ===============================
# DICCIONARIO DE DISTRITOS (LIMA)
# ===============================
# Keyword (upper-case, as it may appear in the location text) -> district name
DIC_DISTRITOS = {
    "ATE": "Ate",
    "SAN ISIDRO": "San Isidro",
    "MIRAFLORES": "Miraflores",
    "SURCO": "Santiago de Surco",
    "SANTIAGO DE SURCO": "Santiago de Surco",
    "LA MOLINA": "La Molina",
    "MOLINA": "La Molina",
    "CAMACHO": "La Molina",
    "SAN BORJA": "San Borja",
    "SJL": "San Juan de Lurigancho",
    "SAN JUAN DE LURIGANCHO": "San Juan de Lurigancho",
    "SJM": "San Juan de Miraflores",
    "SAN JUAN DE MIRAFLORES": "San Juan de Miraflores",
    "VES": "Villa El Salvador",
    "VILLA EL SALVADOR": "Villa El Salvador",
    "VMT": "Villa María del Triunfo",
    "VILLA MARIA DEL TRIUNFO": "Villa María del Triunfo",
    "LOS OLIVOS": "Los Olivos",
    "COMAS": "Comas",
    "INDEPENDENCIA": "Independencia",
    "PUENTE PIEDRA": "Puente Piedra",
    "CARABAYLLO": "Carabayllo",
    "RIMAC": "Rímac",
    "BREÑA": "Breña",
    "PUEBLO LIBRE": "Pueblo Libre",
    "MAGDALENA": "Magdalena del Mar",
    "MAGDALENA DEL MAR": "Magdalena del Mar",
    "LINCE": "Lince",
    "JESUS MARIA": "Jesús María",
    "BARRANCO": "Barranco",
    "CHORRILLOS": "Chorrillos",
    "SAN MIGUEL": "San Miguel",
    "CALLAO": "Callao"
}

# Fallback pool. Sorted so its order is the same on every run (a bare
# list(set(...)) of strings can come out in a different order each time).
DISTRITOS_ALEATORIOS = sorted(set(DIC_DISTRITOS.values()))

# Keyword patterns, longest keyword first so "SAN JUAN DE MIRAFLORES" is
# tried before "MIRAFLORES"; whole words only so "ATE" does not fire inside
# words such as "ZARATE".
_PATRONES_DISTRITO = [
    (re.compile(r"\b" + re.escape(clave) + r"\b"), DIC_DISTRITOS[clave])
    for clave in sorted(DIC_DISTRITOS, key=len, reverse=True)
]


# ===============================
# DISTRICT LOOKUP
# ===============================
def obtener_distrito(ubicacion):
    """Return the district named in `ubicacion`.

    The text is upper-cased and searched for the keywords of DIC_DISTRITOS
    (whole words, longest keyword first). When `ubicacion` is not a string or
    names no known district, a district is picked at random from
    DISTRITOS_ALEATORIOS.
    """
    if isinstance(ubicacion, str):
        texto = ubicacion.upper()
        for patron, distrito in _PATRONES_DISTRITO:
            if patron.search(texto):
                return distrito

    # NOTE(review): the fallback invents a district and is not seeded, so
    # results differ between runs — confirm this is wanted.
    return random.choice(DISTRITOS_ALEATORIOS)


# ===============================
# LEER CSV
# ===============================
ds_ASP = pd.read_csv("ASP.csv")


# ===============================
# DISTRICT COLUMN
# ===============================
ds_ASP["Distrito"] = ds_ASP["UBICACIÓN"].apply(obtener_distrito)
ds_ASP = ds_ASP[ds_ASP["Distrito"].notna()]


# ===============================
# DISTINCT DISTRICTS
# ===============================
distritos = {d: True for d in ds_ASP["Distrito"]}


# ===============================
# SQL
# ===============================
sql_dist = "-- migrate:up\n\n"
id_distrito = 1

# Ids follow the alphabetical order of the district names
for nombre in sorted(distritos.keys()):
    nombre_sql = nombre.replace("'", "''")
    sql_dist += (
        "INSERT INTO distritos (id_distrito, nombre) "
        f"VALUES ({id_distrito}, '{nombre_sql}');\n"
    )
    id_distrito += 1

# Every other migration in this notebook can be rolled back; this one
# lacked its down section.
sql_dist += "\n-- migrate:down\nDELETE FROM distritos;\n"


# ===============================
# WRITE FILE
# ===============================
# NOTE(review): the locations cell writes this same file name with a
# different id column ("id") — the cell that runs last wins.
with open("inserts_distritos.sql", "w", encoding="utf-8") as f:
    f.write(sql_dist)