# Generacion del Dataset para DENStream

A partir del dataset con las features, generaremos un dataset especifico para hacer clustering

In [1]:
import pandas as pd
import numpy as np
from pyproj import Proj, transform
from sklearn.preprocessing import StandardScaler
import joblib
import geopandas as gpd
from pathlib import Path

In [2]:
df = pd.read_csv('max_df_all_features.csv')
df

Unnamed: 0,icao24,callsign,origin_country,time_position,last_contact,longitude,latitude,baro_altitude,on_ground,velocity,heading,timestamp,año,mes,dia,hora,minuto,lat_bin,lon_bin,zone_id
0,e8021b,LPE2368,Chile,1750548596,1750548596,-59.7568,-32.8455,10934.70,False,221.01,329.20,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1
1,e80204,LAN772,Chile,1750548577,1750548577,-48.4337,-25.7672,10652.76,False,287.70,65.04,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1
2,e0b244,ARG1255,Argentina,1750548596,1750548596,-51.8828,-29.2480,12192.00,False,208.18,226.80,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1
3,e07583,ARG1843,Argentina,1750548595,1750548596,-58.3590,-34.5908,335.28,False,61.21,303.69,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1
4,e49eef,GLO7705,Brazil,1750548595,1750548596,-48.9696,-28.2255,11277.60,False,262.97,47.85,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15617,347259,AVA217,Spain,1750560445,1750560445,-60.2880,-33.2445,11277.60,False,259.55,144.36,2025-06-22 02:47:00+00:00,2025,6,22,2,47,0,0,0_0
15618,e80218,LAN096,Chile,1750560084,1750560084,-73.3556,-45.3148,10972.80,False,249.04,5.33,2025-06-22 02:41:00+00:00,2025,6,22,2,41,0,0,0_0
15619,347259,AVA217,Spain,1750560082,1750560082,-60.8659,-32.5615,11277.60,False,252.97,144.44,2025-06-22 02:41:00+00:00,2025,6,22,2,41,0,0,0_0
15620,347259,AVA217,Spain,1750560359,1750560359,-60.4274,-33.0809,11277.60,False,259.96,144.42,2025-06-22 02:46:00+00:00,2025,6,22,2,46,0,0,0_0


Verificamos los tipos de datos, las columnas y que no hayan nulo

In [3]:
df.dtypes

icao24             object
callsign           object
origin_country     object
time_position       int64
last_contact        int64
longitude         float64
latitude          float64
baro_altitude     float64
on_ground            bool
velocity          float64
heading           float64
timestamp          object
año                 int64
mes                 int64
dia                 int64
hora                int64
minuto              int64
lat_bin             int64
lon_bin             int64
zone_id            object
dtype: object

In [4]:
df.columns

Index(['icao24', 'callsign', 'origin_country', 'time_position', 'last_contact',
       'longitude', 'latitude', 'baro_altitude', 'on_ground', 'velocity',
       'heading', 'timestamp', 'año', 'mes', 'dia', 'hora', 'minuto',
       'lat_bin', 'lon_bin', 'zone_id'],
      dtype='object')

In [5]:
df.isna().sum()

icao24            0
callsign          0
origin_country    0
time_position     0
last_contact      0
longitude         0
latitude          0
baro_altitude     0
on_ground         0
velocity          0
heading           0
timestamp         0
año               0
mes               0
dia               0
hora              0
minuto            0
lat_bin           0
lon_bin           0
zone_id           0
dtype: int64

Cambiamos el tipo de dato a timestamp para que sea de tipo datetime

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.dtypes

icao24                         object
callsign                       object
origin_country                 object
time_position                   int64
last_contact                    int64
longitude                     float64
latitude                      float64
baro_altitude                 float64
on_ground                        bool
velocity                      float64
heading                       float64
timestamp         datetime64[ns, UTC]
año                             int64
mes                             int64
dia                             int64
hora                            int64
minuto                          int64
lat_bin                         int64
lon_bin                         int64
zone_id                        object
dtype: object

Extraemos el tiempo en minutos desde el dia que inicio la absorcion de data

In [7]:
df['time_minutes'] = df['timestamp'].dt.hour * 60 + df['timestamp'].dt.minute
df

Unnamed: 0,icao24,callsign,origin_country,time_position,last_contact,longitude,latitude,baro_altitude,on_ground,velocity,...,timestamp,año,mes,dia,hora,minuto,lat_bin,lon_bin,zone_id,time_minutes
0,e8021b,LPE2368,Chile,1750548596,1750548596,-59.7568,-32.8455,10934.70,False,221.01,...,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1,1410
1,e80204,LAN772,Chile,1750548577,1750548577,-48.4337,-25.7672,10652.76,False,287.70,...,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1,1410
2,e0b244,ARG1255,Argentina,1750548596,1750548596,-51.8828,-29.2480,12192.00,False,208.18,...,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1,1410
3,e07583,ARG1843,Argentina,1750548595,1750548596,-58.3590,-34.5908,335.28,False,61.21,...,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1,1410
4,e49eef,GLO7705,Brazil,1750548595,1750548596,-48.9696,-28.2255,11277.60,False,262.97,...,2025-06-21 23:30:00+00:00,2025,6,21,23,30,0,1,0_1,1410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15617,347259,AVA217,Spain,1750560445,1750560445,-60.2880,-33.2445,11277.60,False,259.55,...,2025-06-22 02:47:00+00:00,2025,6,22,2,47,0,0,0_0,167
15618,e80218,LAN096,Chile,1750560084,1750560084,-73.3556,-45.3148,10972.80,False,249.04,...,2025-06-22 02:41:00+00:00,2025,6,22,2,41,0,0,0_0,161
15619,347259,AVA217,Spain,1750560082,1750560082,-60.8659,-32.5615,11277.60,False,252.97,...,2025-06-22 02:41:00+00:00,2025,6,22,2,41,0,0,0_0,161
15620,347259,AVA217,Spain,1750560359,1750560359,-60.4274,-33.0809,11277.60,False,259.96,...,2025-06-22 02:46:00+00:00,2025,6,22,2,46,0,0,0_0,166


Ahora normalizamos las cordenadas y las volvemos a kilometros

In [8]:
utm_proj = Proj(proj='utm', zone=20, south=True, ellps='WGS84')
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df.longitude, df.latitude),
    crs="EPSG:4326"
)
gdf = gdf.to_crs(utm_proj.srs)
df['x_km'] = gdf.geometry.x / 1_000
df['y_km'] = gdf.geometry.y / 1_000
df['altitude_km'] = df['baro_altitude'] / 1_000
df

Unnamed: 0,icao24,callsign,origin_country,time_position,last_contact,longitude,latitude,baro_altitude,on_ground,velocity,...,dia,hora,minuto,lat_bin,lon_bin,zone_id,time_minutes,x_km,y_km,altitude_km
0,e8021b,LPE2368,Chile,1750548596,1750548596,-59.7568,-32.8455,10934.70,False,221.01,...,21,23,30,0,1,0_1,1410,803.560394,6361.177907,10.93470
1,e80204,LAN772,Chile,1750548577,1750548577,-48.4337,-25.7672,10652.76,False,287.70,...,21,23,30,0,1,0_1,1410,1970.506554,7067.656114,10.65276
2,e0b244,ARG1255,Argentina,1750548596,1750548596,-51.8828,-29.2480,12192.00,False,208.18,...,21,23,30,0,1,0_1,1410,1583.783582,6712.750111,12.19200
3,e07583,ARG1843,Argentina,1750548595,1750548596,-58.3590,-34.5908,335.28,False,61.21,...,21,23,30,0,1,0_1,1410,925.762995,6162.532021,0.33528
4,e49eef,GLO7705,Brazil,1750548595,1750548596,-48.9696,-28.2255,11277.60,False,262.97,...,21,23,30,0,1,0_1,1410,1884.276458,6796.608623,11.27760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15617,347259,AVA217,Spain,1750560445,1750560445,-60.2880,-33.2445,11277.60,False,259.55,...,22,2,47,0,0,0_0,167,752.680601,6318.327375,11.27760
15618,e80218,LAN096,Chile,1750560084,1750560084,-73.3556,-45.3148,10972.80,False,249.04,...,22,2,41,0,0,0_0,161,-311.634511,4929.644380,10.97280
15619,347259,AVA217,Spain,1750560082,1750560082,-60.8659,-32.5615,11277.60,False,252.97,...,22,2,41,0,0,0_0,161,700.358672,6395.314685,11.27760
15620,347259,AVA217,Spain,1750560359,1750560359,-60.4274,-33.0809,11277.60,False,259.96,...,22,2,46,0,0,0_0,166,740.134875,6336.800648,11.27760


Separacion de caracteristicas que usaremos

In [9]:
selected_features = ['x_km', 'y_km', 'altitude_km', 'time_minutes']

Escalamos los features para usarlo:

In [10]:
scaler_path = Path('DENStream_Scaler/scaler.pkl')

if scaler_path.exists():
    scaler = joblib.load(scaler_path)
else:
    scaler = StandardScaler()
    scaler.fit(df[selected_features])
    joblib.dump(scaler, scaler_path)
df_scaled = scaler.transform(df[selected_features])

Generamos el Dataframe para manejarlo mejor:

In [11]:
df_scaled = pd.DataFrame(df_scaled, columns=selected_features)

Generamos el Dataset

In [12]:
df_scaled.to_csv('DENStream_Datasets/max_denstream_preprocessed.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b1c0c563-baec-4f70-b443-fb48e15d9efd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>