## Limpieza de la data

Lo que haremos en esta fase:

1. Hacer una copia a la base de datos original (siempre hay que hacer esto)
2. Ahora si procedemos hacer la limpieza:

    ✅ Datos faltantes (Nulos).
    
    ✅ Registros duplicados.
    
    ✅ Formatos inconsistentes (fechas, nombres, números).
    
    ✅ Valores atípicos (outliers).

In [58]:
# Cargar data
import pandas as pd
import sqlite3

# Creamos una variable y le asignamos la ruta del archivo
ruta = "../data/raw/ufo_data.db"
# Creamos la conexión
conexion = sqlite3.connect(ruta)
cursor = conexion.cursor()


In [59]:
# Crear una copia de la db
copia = "../data/processed/copia.db"

consulta = f"""
ATTACH DATABASE '{copia}' AS copia;
SELECT sql FROM sqlite_master WHERE type='table';
"""
cursor.executescript(consulta)

# Copiar las tablas a la base de datos de respaldo
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tablas = cursor.fetchall()

for tabla in tablas:
    nombre_tabla = tabla[0]
    consulta_copia = f"CREATE TABLE copia.{nombre_tabla} AS SELECT * FROM {nombre_tabla};"
    cursor.execute(consulta_copia)

print("✅ Copia de la BD creada correctamente...")
# Ya no volver a ejecutar esto
# Si lo hacemos primero hay que cerrar la conexión
# Y luego hay que eliminar el archivo copia.db

OperationalError: table ufo_table already exists

### ✅ Datos Faltantes (Nulos)

In [None]:
revisar_nulos = """
                SELECT 
                    SUM(CASE WHEN datetime IS NULL THEN 1 ELSE 0 END) AS datetime_nulos,
                    SUM(CASE WHEN city IS NULL THEN 1 ELSE 0 END) AS city_nulos,
                    SUM(CASE WHEN state IS NULL THEN 1 ELSE 0 END) AS state_nulos,
                    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) AS country_nulos,
                    SUM(CASE WHEN shape IS NULL THEN 1 ELSE 0 END) AS shape_nulos,
                    SUM(CASE WHEN `duration (seconds)` IS NULL THEN 1 ELSE 0 END) AS duration_seconds_nulos,
                    SUM(CASE WHEN `duration (hours/min)` IS NULL THEN 1 ELSE 0 END) AS duration_hours_nulos,
                    SUM(CASE WHEN comments IS NULL THEN 1 ELSE 0 END) AS comments_nulos,
                    SUM(CASE WHEN `date posted` IS NULL THEN 1 ELSE 0 END) AS date_posted_nulos,
                    SUM(CASE WHEN latitude IS NULL THEN 1 ELSE 0 END) AS latitude_nulos
                FROM copia.ufo_table;
                """
revisar_nulos = pd.read_sql_query(revisar_nulos, conexion)
revisar_nulos

Unnamed: 0,datetime_nulos,city_nulos,state_nulos,country_nulos,shape_nulos,duration_seconds_nulos,duration_hours_nulos,comments_nulos,date_posted_nulos,latitude_nulos
0,0,0,5797,9670,1932,0,0,15,0,0


In [None]:
# Reemplazar Nulos
consulta = """
           UPDATE copia.ufo_table
           SET
               state = COALESCE(state, "Desconocido"),
               country = COALESCE(country, "Desconocido"),
               shape = COALESCE(shape, "No especificado"),
               comments = COALESCE(comments, "Sin comentarios")
           """
conexion.execute(consulta)
conexion.commit()
print("Valores reemplazados con éxito")

Valores reemplazados con éxito


In [None]:
nombre_columnas = "PRAGMA table_info(ufo_table);"
nombre_columnas = pd.read_sql_query(nombre_columnas, conexion)
nombre_columnas

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,datetime,TEXT,0,,0
1,1,city,TEXT,0,,0
2,2,state,TEXT,0,,0
3,3,country,TEXT,0,,0
4,4,shape,TEXT,0,,0
5,5,duration (seconds),TEXT,0,,0
6,6,duration (hours/min),TEXT,0,,0
7,7,comments,TEXT,0,,0
8,8,date posted,TEXT,0,,0
9,9,latitude,TEXT,0,,0


### ✅ Registros Duplicados

In [None]:
duplicados = """
             SELECT *
             FROM (
                 SELECT *,
                 ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                 FROM copia.ufo_table
             ) subconsulta
             WHERE num_fila > 1;
             """
duplicados = pd.read_sql_query(duplicados, conexion)
duplicados

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,num_fila
0,1957-12-28 00:00:00,madison,wi,us,cigar,3600,1 hour,My mother and i worked as telephone swithcboar...,2004-06-18,43.0730556,-89.401111,2
1,1958-04-17 21:30:00,winooski,vt,us,oval,5400,1 hr 30 min,object hovered&#44 landed then left headed nor...,2005-02-22,44.4913889,-73.186111,2
2,1965-08-20 20:30:00,new york city (brooklyn),ny,us,cylinder,120.0,minutes,1965 Dark Grey Cylinder UFO Updated Statement,2006-05-15,40.7141667,-74.006389,2
3,1965-10-17 19:15:00,la jolla,ca,us,disk,6,about 6 seconds,Four UFO&#39s fly in open daylight&#44 close b...,2001-04-01,32.8472222,-117.273333,2
4,1965-12-09 16:48:00,enfield,ct,us,circle,120,2 minutes,Coincidence with later Sighting in 1982 in Gre...,2008-08-12,41.9761111,-72.592222,2
...,...,...,...,...,...,...,...,...,...,...,...,...
625,2014-04-18 22:01:00,chico,ca,us,disk,1740,~29 minutes,After going on the back porch for a smoke&#44 ...,2014-04-24,39.7286111,-121.836389,2
626,2014-04-19 13:15:00,atwater,oh,us,disk,8,8 seconds,Dull silver saucer moving northward.,2014-04-24,41.0238889,-81.163611,2
627,2014-04-26 21:45:00,cincinnati,oh,us,sphere,900,15 minutes,14 Red/White Spheres Flying Silently over Cinc...,2014-05-02,39.1619444,-84.456944,2
628,2014-04-27 21:05:00,kenmore,wa,us,light,540,9:00,Twenty (approx.) UFOs Observed Above Lake Wash...,2014-05-02,47.7575,-122.242778,2


In [None]:
registros_unicos = """
                   SELECT DISTINCT datetime, city, state, country, shape
                   FROM copia.ufo_table;
                   """
registros_unicos = pd.read_sql_query(registros_unicos, conexion)
registros_unicos

Unnamed: 0,datetime,city,state,country,shape
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder
1,1949-10-10 21:00:00,lackland afb,tx,Desconocido,light
2,1955-10-10 17:00:00,chester (uk/england),Desconocido,gb,circle
3,1956-10-10 21:00:00,edna,tx,us,circle
4,1960-10-10 20:00:00,kaneohe,hi,us,light
...,...,...,...,...,...
79697,2013-09-09 21:15:00,nashville,tn,us,light
79698,2013-09-09 22:00:00,boise,id,us,circle
79699,2013-09-09 22:00:00,napa,ca,us,other
79700,2013-09-09 22:20:00,vienna,va,us,circle


In [None]:
# Eliminar duplicados
eliminar = """
           WITH cte AS (
                SELECT ROWID, ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                FROM copia.ufo_table
           )
           DELETE FROM copia.ufo_table
           WHERE ROWID IN (
               SELECT ROWID FROM cte WHERE num_fila > 1
           );
           """
conexion.execute(eliminar)
conexion.commit()
print("Duplicados eliminados con éxito")

Duplicados eliminados con éxito


### ✅ Normalizar los tipos de datos (Fechas, numeros)

In [None]:
# Convertir datetime de TEXT a DATETIME(ya que es fecha y hora, si fuera solo fecha sería DATE y si solo fuera hora sería TIME)
# Convertir duration seconds de TEXT a REAL
formatear = """
            UPDATE copia.ufo_table
            SET `duration (seconds)` = CAST(`duration (seconds)` AS REAL)
            ;
            """
conexion.execute(formatear)
conexion.commit()
print("Datos normalizados con éxito")

Datos normalizados con éxito


In [None]:
# Confirmar cambios en la DB
agregar_nueva_columna = "ALTER TABLE copia.ufo_table ADD COLUMN duration_second REAL;"
conexion.execute(agregar_nueva_columna)
conexion.commit()
print("Nueva columna creada con éxito")

Nueva columna creada con éxito


In [None]:
copiar = """
        UPDATE copia.ufo_table
        SET duration_second = CAST(`duration (seconds)` AS REAL);
         """
conexion.execute(copiar)
conexion.commit()
print("Datos copiados correctamente")

Datos copiados correctamente


In [None]:
consulta = "SELECT * FROM copia.ufo_table LIMIT 1;"
consulta = pd.read_sql_query(consulta, conexion)
consulta

crear_tabla = """
              CREATE TABLE copia.ufo_t AS
              SELECT datetime, city, state, country, shape, `duration (seconds)` AS duration_second, `duration (hours/min)`, comments, `date posted`, latitude, longitude
              FROM copia.ufo_table;
              """
conexion.execute(crear_tabla)
conexion.commit()
print("Tabla nueva creada correctamente")

In [None]:
# Cerrar conexion
#conexion.close()