# Data exploration
## Overview


## Libraries


In [16]:
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import json, yaml, math, os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt

## Config


In [17]:
# Input locations, written relative to the notebook's working directory.
MASTER_CSV = "../data_interim/Skycam_Gallo/imagenes_con_datos_asignados_1a1.csv"
RUTA_IMGS = "../data_interim/Skycam_Gallo/Imagenes_Procesadas"  # root folder searched by img_exists

# Load the master table once; later cells filter it and re-bind df_master.
df_master = pd.read_csv(filepath_or_buffer=MASTER_CSV)

## Exploration

In [18]:
# First look at the table: its (rows, columns) shape and the column names.
print("Dimensions of master DataFrame:", df_master.shape)
column_names = df_master.columns.tolist()
print("\nAvailable columns:", column_names, sep="\n")

Dimensions of master DataFrame: (19884, 11)

Available columns:
['archivo', 'carpeta', 'probabilidad_lluvia', 'prediccion_binaria', 'timestamp_img', 'Fecha_Hora', 'Temperatura_°C', 'Humedad_%', 'Presión_hPa', 'Irradiancia_Lufft', 'Irradiancia_Meteocontrol']


In [19]:
# Preview the first five rows (left as the cell's last expression for rich display).
df_master.head(n=5)

Unnamed: 0,archivo,carpeta,probabilidad_lluvia,prediccion_binaria,timestamp_img,Fecha_Hora,Temperatura_°C,Humedad_%,Presión_hPa,Irradiancia_Lufft,Irradiancia_Meteocontrol
0,image-20250407101106.jpg,20250407,0.353816,1,2025-04-07 10:11:06,,,,,,
1,image-20250407112447.jpg,20250407,0.522174,1,2025-04-07 11:24:47,,,,,,
2,image-20250407112605.jpg,20250407,0.506453,1,2025-04-07 11:26:05,,,,,,
3,image-20250407112818.jpg,20250407,0.510108,1,2025-04-07 11:28:18,,,,,,
4,image-20250407112858.jpg,20250407,0.385175,1,2025-04-07 11:28:58,,,,,,


In [20]:
# Missing-value summary for the 'Irradiancia_Lufft' column.
# The NaN mask is computed once and reused for every figure printed below.
lufft_missing = df_master['Irradiancia_Lufft'].isna()

print("Initial analysis of Irradiance_Lufft column:")
print(f"Total records: {len(df_master)}")
print(f"Non-NaN values: {(~lufft_missing).sum()}")
print(f"NaN values: {lufft_missing.sum()}")
print(f"Percentage of NaN values: {lufft_missing.mean() * 100:.2f}%")

Initial analysis of Irradiance_Lufft column:
Total records: 19884
Non-NaN values: 2021
NaN values: 17863
Percentage of NaN values: 89.84%


## Validation

In [21]:
def img_exists(row):
    """Tell whether the processed image referenced by ``row`` is on disk.

    The file looked up is ``RUTA_IMGS/<carpeta>/<archivo>``, where every
    ``'.jpg'`` in ``archivo`` has been replaced by ``'_proc.jpg'``.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            keys ``'archivo'`` (a string) and ``'carpeta'`` (converted with
            ``str`` before use).

    Returns:
        bool: ``True`` if the resulting path is an existing regular file.
    """
    processed_name = row['archivo'].replace('.jpg', '_proc.jpg')
    folder = str(row['carpeta'])
    return os.path.isfile(os.path.join(RUTA_IMGS, folder, processed_name))

In [22]:
# 1) Keep only the rows whose processed image file is present on disk.
has_image = df_master.apply(img_exists, axis=1)
df_master = df_master[has_image].reset_index(drop=True)
print(f"verified: {len(df_master)} rows")

# 2) Parse the capture timestamps so the .dt accessor can be used below.
df_master['timestamp_img'] = pd.to_datetime(df_master['timestamp_img'])

# 3) Keep captures whose time of day lies in the window, both ends inclusive.
start_time, end_time = (
    pd.to_datetime(clock).time() for clock in ('07:00:00', '16:30:00')
)
time_mask = df_master['timestamp_img'].dt.time.between(start_time, end_time)

df_master = df_master[time_mask].reset_index(drop=True)
print(f"Time filter (07:00-16:30): {len(df_master)} rows remaining")

# 4) Order chronologically and renumber the index from zero.
df_master = df_master.sort_values(by='timestamp_img', ignore_index=True)

verified: 19884 rows
Time filter (07:00-16:30): 14796 rows remaining


In [26]:
# Earliest and latest capture timestamps in the current (filtered) table.
start_date = df_master['timestamp_img'].min()
end_date = df_master['timestamp_img'].max()

print("Date range of available data:")
print(f"Start: {start_date}")
print(f"End: {end_date}")
# Inclusive count of calendar days covered. Both ends are normalized to
# midnight first: subtracting the raw timestamps floors to whole 24-hour
# periods and undercounts by one whenever the last capture happens earlier in
# the day than the first one.
total_days = (end_date.normalize() - start_date.normalize()).days + 1
print(f"Total days: {total_days}")

print("\nRecords per day:")
# Rows per calendar date, in chronological order; dates with no rows are absent,
# so the statistics below only describe days that have at least one record.
daily_counts = df_master['timestamp_img'].dt.date.value_counts().sort_index()
print(daily_counts.mean(), daily_counts.min(), daily_counts.max())

Date range of available data:
Start: 2025-04-07 10:11:06
End: 2025-06-09 16:29:28
Total days: 64

Records per day:
328.8 6 535
