## Limpieza de la data

In [7]:
import pandas as pd
import sqlite3

In [8]:
ruta = "../data/processed/copia.db"

conexion = sqlite3.connect(ruta)
cursor = conexion.cursor()

Lo que haremos en esta fase:

2. Ahora si procedemos hacer la limpieza:

    ✅ Datos faltantes (Nulos).
    
    ✅ Registros duplicados.
    
    ✅ Formatos inconsistentes (fechas, nombres, números).
    
    ✅ Valores atípicos (outliers).

### ✅ Datos Faltantes (Nulos)

In [9]:
revisar_nulos = """
                SELECT 
                    SUM(CASE WHEN datetime IS NULL THEN 1 ELSE 0 END) AS datetime_nulos,
                    SUM(CASE WHEN city IS NULL THEN 1 ELSE 0 END) AS city_nulos,
                    SUM(CASE WHEN state IS NULL THEN 1 ELSE 0 END) AS state_nulos,
                    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) AS country_nulos,
                    SUM(CASE WHEN shape IS NULL THEN 1 ELSE 0 END) AS shape_nulos,
                    SUM(CASE WHEN `duration (seconds)` IS NULL THEN 1 ELSE 0 END) AS duration_seconds_nulos,
                    SUM(CASE WHEN `duration (hours/min)` IS NULL THEN 1 ELSE 0 END) AS duration_hours_nulos,
                    SUM(CASE WHEN comments IS NULL THEN 1 ELSE 0 END) AS comments_nulos,
                    SUM(CASE WHEN `date posted` IS NULL THEN 1 ELSE 0 END) AS date_posted_nulos,
                    SUM(CASE WHEN latitude IS NULL THEN 1 ELSE 0 END) AS latitude_nulos
                FROM copia.ufo_table;
                """
revisar_nulos = pd.read_sql_query(revisar_nulos, conexion)
revisar_nulos

DatabaseError: Execution failed on sql '
                SELECT 
                    SUM(CASE WHEN datetime IS NULL THEN 1 ELSE 0 END) AS datetime_nulos,
                    SUM(CASE WHEN city IS NULL THEN 1 ELSE 0 END) AS city_nulos,
                    SUM(CASE WHEN state IS NULL THEN 1 ELSE 0 END) AS state_nulos,
                    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) AS country_nulos,
                    SUM(CASE WHEN shape IS NULL THEN 1 ELSE 0 END) AS shape_nulos,
                    SUM(CASE WHEN `duration (seconds)` IS NULL THEN 1 ELSE 0 END) AS duration_seconds_nulos,
                    SUM(CASE WHEN `duration (hours/min)` IS NULL THEN 1 ELSE 0 END) AS duration_hours_nulos,
                    SUM(CASE WHEN comments IS NULL THEN 1 ELSE 0 END) AS comments_nulos,
                    SUM(CASE WHEN `date posted` IS NULL THEN 1 ELSE 0 END) AS date_posted_nulos,
                    SUM(CASE WHEN latitude IS NULL THEN 1 ELSE 0 END) AS latitude_nulos
                FROM copia.ufo_table;
                ': no such table: copia.ufo_table

In [10]:
# Reemplazar Nulos
consulta = """
           UPDATE copia.ufo_table
           SET
               state = COALESCE(state, "Desconocido"),
               country = COALESCE(country, "Desconocido"),
               shape = COALESCE(shape, "No especificado"),
               comments = COALESCE(comments, "Sin comentarios")
           """
conexion.execute(consulta)
conexion.commit()
print("Valores reemplazados con éxito")

OperationalError: no such table: copia.ufo_table

In [11]:
nombre_columnas = "PRAGMA table_info(ufo_table);"
nombre_columnas = pd.read_sql_query(nombre_columnas, conexion)
nombre_columnas

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,datetime,TEXT,0,,0
1,1,city,TEXT,0,,0
2,2,state,TEXT,0,,0
3,3,country,TEXT,0,,0
4,4,shape,TEXT,0,,0
5,5,duration (seconds),TEXT,0,,0
6,6,duration (hours/min),TEXT,0,,0
7,7,comments,TEXT,0,,0
8,8,date posted,TEXT,0,,0
9,9,latitude,TEXT,0,,0


### ✅ Registros Duplicados

In [12]:
duplicados = """
             SELECT *
             FROM (
                 SELECT *,
                 ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                 FROM copia.ufo_table
             ) subconsulta
             WHERE num_fila > 1;
             """
duplicados = pd.read_sql_query(duplicados, conexion)
duplicados

DatabaseError: Execution failed on sql '
             SELECT *
             FROM (
                 SELECT *,
                 ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                 FROM copia.ufo_table
             ) subconsulta
             WHERE num_fila > 1;
             ': no such table: copia.ufo_table

In [13]:
registros_unicos = """
                   SELECT DISTINCT datetime, city, state, country, shape
                   FROM copia.ufo_table;
                   """
registros_unicos = pd.read_sql_query(registros_unicos, conexion)
registros_unicos

DatabaseError: Execution failed on sql '
                   SELECT DISTINCT datetime, city, state, country, shape
                   FROM copia.ufo_table;
                   ': no such table: copia.ufo_table

In [14]:
# Eliminar duplicados
eliminar = """
           WITH cte AS (
                SELECT ROWID, ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                FROM copia.ufo_table
           )
           DELETE FROM copia.ufo_table
           WHERE ROWID IN (
               SELECT ROWID FROM cte WHERE num_fila > 1
           );
           """
conexion.execute(eliminar)
conexion.commit()
print("Duplicados eliminados con éxito")

OperationalError: no such table: copia.ufo_table

### ✅ Normalizar los tipos de datos (Fechas, numeros)

In [15]:
# Convertir datetime de TEXT a DATETIME(ya que es fecha y hora, si fuera solo fecha sería DATE y si solo fuera hora sería TIME)
# Convertir duration seconds de TEXT a REAL
formatear = """
            UPDATE copia.ufo_table
            SET `duration (seconds)` = CAST(`duration (seconds)` AS REAL)
            ;
            """
conexion.execute(formatear)
conexion.commit()
print("Datos normalizados con éxito")

OperationalError: no such table: copia.ufo_table

In [None]:
# Cerrar conexion
#conexion.close()