# Análisis Exploratorio de Datos (EDA)

Importación de librerías

In [2]:
import sqlite3 # Para manejo de bases de datos
import pandas as pd # Para manejo de dataframes

### Carga y exploración incial de datos

In [3]:
# Cargar datos
df = pd.read_csv("../data/raw/ufo_data.csv")
print("✅ Datos cargados")

✅ Datos cargados


  df = pd.read_csv("../data/raw/ufo_data.csv")


In [4]:
# Crearemos una conexión sqlite3
conexion = sqlite3.connect("../data/raw/ufo_data.db")

# Guardar el dataframe como tabla SQLite
df.to_sql("ufo_table", conexion, if_exists="replace", index=False) 
# El index col se usa para eliminar el Unnamed

print("✅ Base de datos creada correctamente...")

cursor = conexion.cursor()

✅ Base de datos creada correctamente...


### Antes de empezar a trabajar, debemos crear una copia de la DB para no alterar a la original.

In [5]:
# Crear una copia de la db
copia = "../data/processed/copia.db"

consulta = f"""
ATTACH DATABASE '{copia}' AS copia;
SELECT sql FROM sqlite_master WHERE type='table';
"""
cursor.executescript(consulta)

# Copiar las tablas a la base de datos de respaldo
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tablas = cursor.fetchall()

for tabla in tablas:
    nombre_tabla = tabla[0]
    consulta_copia = f"CREATE TABLE copia.{nombre_tabla} AS SELECT * FROM {nombre_tabla};"
    cursor.execute(consulta_copia)

print("✅ Copia de la BD creada correctamente...")
# Ya no volver a ejecutar esto
# Si lo hacemos primero hay que cerrar la conexión
# Y luego hay que eliminar el archivo copia.db

✅ Copia de la BD creada correctamente...


### Consultar todas las tablas que existen en el Dataset o DB

In [6]:
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('ufo_table',)]


### Explorar las 5 principales filas del Dataset(DB)

In [7]:
filtrar_5_filas = "SELECT * FROM copia.ufo_table LIMIT 5;"
filtrar_5_filas = pd.read_sql_query(filtrar_5_filas, conexion)
filtrar_5_filas

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,2004-04-27,29.8830556,-97.941111
1,1949-10-10 21:00:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,29.38421,-98.581082
2,1955-10-10 17:00:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,2008-01-21,53.2,-2.916667
3,1956-10-10 21:00:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,2004-01-17,28.9783333,-96.645833
4,1960-10-10 20:00:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,2004-01-22,21.4180556,-157.803611


### Obtener información sobre la estructura de la tabla

In [8]:
info = "PRAGMA table_info(ufo_table);"
info = pd.read_sql_query(info, conexion)
info

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,datetime,TEXT,0,,0
1,1,city,TEXT,0,,0
2,2,state,TEXT,0,,0
3,3,country,TEXT,0,,0
4,4,shape,TEXT,0,,0
5,5,duration (seconds),TEXT,0,,0
6,6,duration (hours/min),TEXT,0,,0
7,7,comments,TEXT,0,,0
8,8,date posted,TEXT,0,,0
9,9,latitude,TEXT,0,,0


### Estadísticas Descriptivas

#### Total de registros o filas

In [9]:
total_registros = "SELECT COUNT(*) AS total_registros FROM copia.ufo_table;"
total_registros = pd.read_sql_query(total_registros, conexion)
total_registros

Unnamed: 0,total_registros
0,80332


#### Estadísticas básicas con Python (pandas)

Usando `describe()` obtenemos:

- Conteo: count
- Media o promedio (AVG): mean
- Desviación Estandar: std
- Mínimo (Valor mínimo): min
- Perceltil 25% (P25): 25%
- Perceltil 50% (P50): 50%
- Percentil 75% (P75): 75%
- Máximo (Valor máximo): max

- Por defecto solo analiza columnas numéricas

In [None]:
df.describe()

Unnamed: 0,longitude
count,80332.0
mean,-86.772885
std,39.697205
min,-176.658056
25%,-112.073333
50%,-87.903611
75%,-78.755
max,178.4419


#### La única variable numérica es `longitude`

Para incluir columnas categóricas

In [13]:
df.describe(include='all')

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
count,80332,80332,74535,70662,78400,80332.0,80332,80317,80332,80332.0,80332.0
unique,69474,19900,67,5,29,705.0,8304,79997,317,23292.0,
top,2010-07-04 22:00:00,seattle,ca,us,light,300.0,5 minutes,Fireball,2009-12-12,47.6063889,
freq,36,525,9655,65114,16565,7070.0,4716,11,1510,481.0,
mean,,,,,,,,,,,-86.772885
std,,,,,,,,,,,39.697205
min,,,,,,,,,,,-176.658056
25%,,,,,,,,,,,-112.073333
50%,,,,,,,,,,,-87.903611
75%,,,,,,,,,,,-78.755


- unique: Número de valores únicos en la columna.
- top: El valor más frecuente en la columna.
- freq: Cuántas veces aparece el valor más frecuente.

1. Identificar o explorar:

    ✅ Datos faltantes (Nulos).
    
    ✅ Registros duplicados.
    
    ✅ Formatos inconsistentes (fechas, nombres, números).
    
    ✅ Valores atípicos (outliers).

### ✅ Identificar valores Nulos

In [14]:
revisar_nulos = """
                SELECT 
                    SUM(CASE WHEN datetime IS NULL THEN 1 ELSE 0 END) AS datetime_nulos,
                    SUM(CASE WHEN city IS NULL THEN 1 ELSE 0 END) AS city_nulos,
                    SUM(CASE WHEN state IS NULL THEN 1 ELSE 0 END) AS state_nulos,
                    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) AS country_nulos,
                    SUM(CASE WHEN shape IS NULL THEN 1 ELSE 0 END) AS shape_nulos,
                    SUM(CASE WHEN `duration (seconds)` IS NULL THEN 1 ELSE 0 END) AS duration_seconds_nulos,
                    SUM(CASE WHEN `duration (hours/min)` IS NULL THEN 1 ELSE 0 END) AS duration_hours_nulos,
                    SUM(CASE WHEN comments IS NULL THEN 1 ELSE 0 END) AS comments_nulos,
                    SUM(CASE WHEN `date posted` IS NULL THEN 1 ELSE 0 END) AS date_posted_nulos,
                    SUM(CASE WHEN latitude IS NULL THEN 1 ELSE 0 END) AS latitude_nulos
                FROM copia.ufo_table;
                """
revisar_nulos = pd.read_sql_query(revisar_nulos, conexion)
revisar_nulos

Unnamed: 0,datetime_nulos,city_nulos,state_nulos,country_nulos,shape_nulos,duration_seconds_nulos,duration_hours_nulos,comments_nulos,date_posted_nulos,latitude_nulos
0,0,0,5797,9670,1932,0,0,15,0,0


### ✅ Identificar registros duplicados

In [15]:
duplicados = """
             SELECT *
             FROM (
                 SELECT *,
                 ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                 FROM copia.ufo_table
             ) subconsulta
             WHERE num_fila > 1;
             """
duplicados = pd.read_sql_query(duplicados, conexion)
duplicados

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,num_fila
0,1957-12-28 00:00:00,madison,wi,us,cigar,3600,1 hour,My mother and i worked as telephone swithcboar...,2004-06-18,43.0730556,-89.401111,2
1,1958-04-17 21:30:00,winooski,vt,us,oval,5400,1 hr 30 min,object hovered&#44 landed then left headed nor...,2005-02-22,44.4913889,-73.186111,2
2,1965-08-20 20:30:00,new york city (brooklyn),ny,us,cylinder,120.0,minutes,1965 Dark Grey Cylinder UFO Updated Statement,2006-05-15,40.7141667,-74.006389,2
3,1965-10-17 19:15:00,la jolla,ca,us,disk,6,about 6 seconds,Four UFO&#39s fly in open daylight&#44 close b...,2001-04-01,32.8472222,-117.273333,2
4,1965-12-09 16:48:00,enfield,ct,us,circle,120,2 minutes,Coincidence with later Sighting in 1982 in Gre...,2008-08-12,41.9761111,-72.592222,2
...,...,...,...,...,...,...,...,...,...,...,...,...
625,2014-04-18 22:01:00,chico,ca,us,disk,1740,~29 minutes,After going on the back porch for a smoke&#44 ...,2014-04-24,39.7286111,-121.836389,2
626,2014-04-19 13:15:00,atwater,oh,us,disk,8,8 seconds,Dull silver saucer moving northward.,2014-04-24,41.0238889,-81.163611,2
627,2014-04-26 21:45:00,cincinnati,oh,us,sphere,900,15 minutes,14 Red/White Spheres Flying Silently over Cinc...,2014-05-02,39.1619444,-84.456944,2
628,2014-04-27 21:05:00,kenmore,wa,us,light,540,9:00,Twenty (approx.) UFOs Observed Above Lake Wash...,2014-05-02,47.7575,-122.242778,2


In [22]:
valores_duplicados = """
                    WITH duplicados AS (
                        SELECT *,
                            ROW_NUMBER() OVER(PARTITION BY datetime, city, state, country, shape) AS num_fila
                        FROM copia.ufo_table
                    )
                    SELECT *
                    FROM duplicados
                    WHERE num_fila > 1;
                     """
valores_duplicados = pd.read_sql_query(valores_duplicados, conexion)
valores_duplicados

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,num_fila
0,1957-12-28 00:00:00,madison,wi,us,cigar,3600,1 hour,My mother and i worked as telephone swithcboar...,2004-06-18,43.0730556,-89.401111,2
1,1958-04-17 21:30:00,winooski,vt,us,oval,5400,1 hr 30 min,object hovered&#44 landed then left headed nor...,2005-02-22,44.4913889,-73.186111,2
2,1965-08-20 20:30:00,new york city (brooklyn),ny,us,cylinder,120.0,minutes,1965 Dark Grey Cylinder UFO Updated Statement,2006-05-15,40.7141667,-74.006389,2
3,1965-10-17 19:15:00,la jolla,ca,us,disk,6,about 6 seconds,Four UFO&#39s fly in open daylight&#44 close b...,2001-04-01,32.8472222,-117.273333,2
4,1965-12-09 16:48:00,enfield,ct,us,circle,120,2 minutes,Coincidence with later Sighting in 1982 in Gre...,2008-08-12,41.9761111,-72.592222,2
...,...,...,...,...,...,...,...,...,...,...,...,...
625,2014-04-18 22:01:00,chico,ca,us,disk,1740,~29 minutes,After going on the back porch for a smoke&#44 ...,2014-04-24,39.7286111,-121.836389,2
626,2014-04-19 13:15:00,atwater,oh,us,disk,8,8 seconds,Dull silver saucer moving northward.,2014-04-24,41.0238889,-81.163611,2
627,2014-04-26 21:45:00,cincinnati,oh,us,sphere,900,15 minutes,14 Red/White Spheres Flying Silently over Cinc...,2014-05-02,39.1619444,-84.456944,2
628,2014-04-27 21:05:00,kenmore,wa,us,light,540,9:00,Twenty (approx.) UFOs Observed Above Lake Wash...,2014-05-02,47.7575,-122.242778,2


In [17]:
# Registros únicos
registros_unicos = """
                   SELECT DISTINCT datetime, city, state, country, shape
                   FROM copia.ufo_table;
                   """
registros_unicos = pd.read_sql_query(registros_unicos, conexion)
registros_unicos

Unnamed: 0,datetime,city,state,country,shape
0,1949-10-10 20:30:00,san marcos,tx,us,cylinder
1,1949-10-10 21:00:00,lackland afb,tx,,light
2,1955-10-10 17:00:00,chester (uk/england),,gb,circle
3,1956-10-10 21:00:00,edna,tx,us,circle
4,1960-10-10 20:00:00,kaneohe,hi,us,light
...,...,...,...,...,...
79697,2013-09-09 21:15:00,nashville,tn,us,light
79698,2013-09-09 22:00:00,boise,id,us,circle
79699,2013-09-09 22:00:00,napa,ca,us,other
79700,2013-09-09 22:20:00,vienna,va,us,circle


In [None]:
# Obtener información de los tipos de datos con: .info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              80332 non-null  object 
 1   city                  80332 non-null  object 
 2   state                 74535 non-null  object 
 3   country               70662 non-null  object 
 4   shape                 78400 non-null  object 
 5   duration (seconds)    80332 non-null  object 
 6   duration (hours/min)  80332 non-null  object 
 7   comments              80317 non-null  object 
 8   date posted           80332 non-null  object 
 9   latitude              80332 non-null  object 
 10  longitude             80332 non-null  float64
dtypes: float64(1), object(10)
memory usage: 6.7+ MB


### Indentificar de patrones y anomalías (Outliers)

##### Como en este caso tenemos a city, country, shape y son de tipo TEXT identificamos outlier con la siguiente consulta

In [None]:
# Para city
city_outliers = """
                SELECT city, COUNT(*) AS conteo
                FROM copia.ufo_table
                GROUP BY city
                ORDER BY COUNT(*) ASC
                LIMIT 30;
                """
city_outliers = pd.read_sql_query(city_outliers, conexion)
city_outliers

Unnamed: 0,city,COUNT(*)
0,&ccedil;anakkale (turkey),1
1,&iacute;safj&ouml;r&eth;ur (iceland),1
2,&ouml;lmstad (sweden),1
3,1-25 corridor (southbound&#44 65 miles north n...,1
4,100 mile (canada),1
5,100 mile house (canada),1
6,?,1
7,a field outside small town,1
8,a55 northope (uk/north wales),1
9,aachen (near cologne) (germany),1


In [34]:
country_outliers = """
                SELECT country, COUNT(*)
                FROM copia.ufo_table
                GROUP BY country
                ORDER BY COUNT(*) ASC;
                """
country_outliers = pd.read_sql_query(country_outliers, conexion)
country_outliers

Unnamed: 0,country,COUNT(*)
0,de,105
1,au,538
2,gb,1905
3,ca,3000
4,,9670
5,us,65114


In [35]:
shape_outliers = """
                SELECT shape, COUNT(*)
                FROM copia.ufo_table
                GROUP BY shape
                ORDER BY COUNT(*) ASC;
                 """
shape_outliers = pd.read_sql_query(shape_outliers, conexion)
shape_outliers

Unnamed: 0,shape,COUNT(*)
0,changed,1
1,dome,1
2,flare,1
3,hexagon,1
4,pyramid,1
5,crescent,2
6,round,2
7,delta,7
8,cross,233
9,cone,316


In [1]:
# Cerrar conexión una vez terminado
#conexion.close()