In [1]:
!pip install openmeteo_requests
!pip install requests_cache
!pip install retry_requests

Collecting openmeteo_requests
  Downloading openmeteo_requests-1.1.0-py3-none-any.whl (5.5 kB)
Collecting openmeteo-sdk>=1.4.0 (from openmeteo_requests)
  Downloading openmeteo_sdk-1.6.1-py3-none-any.whl (12 kB)
Installing collected packages: openmeteo-sdk, openmeteo_requests
Successfully installed openmeteo-sdk-1.6.1 openmeteo_requests-1.1.0
Collecting requests_cache
  Downloading requests_cache-1.1.1-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.3/60.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting cattrs>=22.2 (from requests_cache)
  Downloading cattrs-23.2.2-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting url-normalize>=1.4 (from requests_cache)
  Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: url-normalize, cattrs, requests_cache
Successfully installed cattrs-23.2.2 requests

In [2]:
def analizar_dataframe(df):
    # Calcula la cantidad de nulos por columna
    nulos_por_columna = df.isnull().sum()

    # Calcula la cantidad de datos duplicados
    duplicados = df.duplicated().sum()

    # Calcula la cantidad de filas con algún outlier
    # Para este caso, vamos a considerar un valor como outlier si está fuera del rango de 1.5 veces el rango intercuartil (IQR)
    # Solo consideramos las columnas numéricas para calcular los cuantiles
    columnas_numericas = df.select_dtypes(include=[np.number]).columns
    Q1 = df[columnas_numericas].quantile(0.25)
    Q3 = df[columnas_numericas].quantile(0.75)
    IQR = Q3 - Q1
    outliers = ((df[columnas_numericas] < (Q1 - 1.5 * IQR)) | (df[columnas_numericas] > (Q3 + 1.5 * IQR))).sum()

    return nulos_por_columna, duplicados, outliers

In [18]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
	"latitude": 40.7143,
	"longitude": -74.006,
	"start_date": "2022-01-01",
	"end_date": "2023-11-22",
	"daily": ["temperature_2m_max", "temperature_2m_min", "temperature_2m_mean", "apparent_temperature_max", "apparent_temperature_min", "apparent_temperature_mean", "precipitation_sum", "shortwave_radiation_sum"],
	"timezone": "America/New_York"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°E {response.Longitude()}°N")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_temperature_2m_max = daily.Variables(0).ValuesAsNumpy()
daily_temperature_2m_min = daily.Variables(1).ValuesAsNumpy()
daily_temperature_2m_mean = daily.Variables(2).ValuesAsNumpy()
daily_apparent_temperature_max = daily.Variables(3).ValuesAsNumpy()
daily_apparent_temperature_min = daily.Variables(4).ValuesAsNumpy()
daily_apparent_temperature_mean = daily.Variables(5).ValuesAsNumpy()
daily_precipitation_sum = daily.Variables(6).ValuesAsNumpy()
daily_shortwave_radiation_sum = daily.Variables(7).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s"),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s"),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}
daily_data["temperature_2m_max"] = daily_temperature_2m_max
daily_data["temperature_2m_min"] = daily_temperature_2m_min
daily_data["temperature_2m_mean"] = daily_temperature_2m_mean
daily_data["apparent_temperature_max"] = daily_apparent_temperature_max
daily_data["apparent_temperature_min"] = daily_apparent_temperature_min
daily_data["apparent_temperature_mean"] = daily_apparent_temperature_mean
daily_data["precipitation_sum"] = daily_precipitation_sum
daily_data["shortwave_radiation_sum"] = daily_shortwave_radiation_sum

daily_dataframe = pd.DataFrame(data = daily_data)
print(daily_dataframe)

Coordinates 40.738136291503906°E -74.04254150390625°N
Elevation 51.0 m asl
Timezone b'America/New_York' b'EST'
Timezone difference to GMT+0 -18000 s
                   date  temperature_2m_max  temperature_2m_min  \
0   2022-01-01 05:00:00             11.5725              7.4225   
1   2022-01-02 05:00:00             13.6225              2.0225   
2   2022-01-03 05:00:00              1.6225             -4.7775   
3   2022-01-04 05:00:00              0.6225             -6.5775   
4   2022-01-05 05:00:00              6.9725             -2.1275   
..                  ...                 ...                 ...   
694 2023-11-26 05:00:00             11.6725             -2.7775   
695 2023-11-27 05:00:00             12.5725              1.8225   
696 2023-11-28 05:00:00              3.7225             -2.0275   
697 2023-11-29 05:00:00                 NaN                 NaN   
698 2023-11-30 05:00:00                 NaN                 NaN   

     temperature_2m_mean  apparent_temperature

In [19]:
daily_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date                       699 non-null    datetime64[ns]
 1   temperature_2m_max         697 non-null    float32       
 2   temperature_2m_min         697 non-null    float32       
 3   temperature_2m_mean        696 non-null    float32       
 4   apparent_temperature_max   697 non-null    float32       
 5   apparent_temperature_min   697 non-null    float32       
 6   apparent_temperature_mean  696 non-null    float32       
 7   precipitation_sum          696 non-null    float32       
 8   shortwave_radiation_sum    696 non-null    float32       
dtypes: datetime64[ns](1), float32(8)
memory usage: 27.4 KB


In [20]:
daily_dataframe.head()

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,shortwave_radiation_sum
0,2022-01-01 05:00:00,11.5725,7.4225,9.605833,10.621353,5.668452,8.392408,26.5,1.61
1,2022-01-02 05:00:00,13.6225,2.0225,9.101666,11.198116,-3.44729,6.312523,8.8,2.57
2,2022-01-03 05:00:00,1.6225,-4.7775,-1.579583,-3.709321,-10.994707,-7.97313,1.9,1.36
3,2022-01-04 05:00:00,0.6225,-6.5775,-3.119166,-4.213054,-12.362391,-8.126157,0.0,9.23
4,2022-01-05 05:00:00,6.9725,-2.1275,2.074583,2.60159,-7.481497,-1.940145,0.3,3.19


In [21]:
daily_dataframe.tail()

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,shortwave_radiation_sum
694,2023-11-26 05:00:00,11.6725,-2.7775,4.562084,6.421145,-7.037745,0.695555,8.4,5.92
695,2023-11-27 05:00:00,12.5725,1.8225,6.830834,9.483438,-3.722879,2.447062,1.5,7.88
696,2023-11-28 05:00:00,3.7225,-2.0275,,-3.281451,-7.231588,,,
697,2023-11-29 05:00:00,,,,,,,,
698,2023-11-30 05:00:00,,,,,,,,


In [7]:
len(daily_dataframe)

691

In [8]:
import numpy as np

In [9]:
nulos_daily, duplicados_daily, outliers_daily = analizar_dataframe(daily_dataframe)

# Imprime los resultados
print("Climate en NY :")
print()
print("Nulos por columna:", nulos_daily)
print()
print("Duplicados:", duplicados_daily)
print()
print("Outliers:", outliers_daily)
print()

Climate en NY :

Nulos por columna: date                         0
temperature_2m_max           0
temperature_2m_min           0
temperature_2m_mean          0
apparent_temperature_max     0
apparent_temperature_min     0
apparent_temperature_mean    0
precipitation_sum            0
shortwave_radiation_sum      0
dtype: int64

Duplicados: 0

Outliers: temperature_2m_max             0
temperature_2m_min             0
temperature_2m_mean            0
apparent_temperature_max       0
apparent_temperature_min       0
apparent_temperature_mean      0
precipitation_sum            107
shortwave_radiation_sum        0
dtype: int64



In [10]:
daily_dataframe.drop_duplicates(inplace = True)

In [11]:
import pandas as pd

# Supongamos que tu DataFrame se llama daily_dataframe
# Aplicar filtro a las precipitaciones
daily_dataframe = daily_dataframe[(daily_dataframe['precipitation_sum'] >= 0) & (daily_dataframe['precipitation_sum'] <= 200)]

# Aplicar filtro a las temperaturas
may_sep_condition = (daily_dataframe['date'].dt.month >= 5) & (daily_dataframe['date'].dt.month <= 9)
oct_apr_condition = ~may_sep_condition

# Para mayo a septiembre: [-10, 45]
temperature_columns = ['temperature_2m_max', 'temperature_2m_min', 'temperature_2m_mean']
apparent_temperature_columns = ['apparent_temperature_max', 'apparent_temperature_min', 'apparent_temperature_mean']

for col in temperature_columns + apparent_temperature_columns:
    daily_dataframe.loc[may_sep_condition, col] = daily_dataframe.loc[may_sep_condition, col].clip(lower=-10, upper=45)

# Para octubre a abril: [-26, 36]
for col in temperature_columns + apparent_temperature_columns:
    daily_dataframe.loc[oct_apr_condition, col] = daily_dataframe.loc[oct_apr_condition, col].clip(lower=-26, upper=36)

# Esto aplicará los filtros según las condiciones especificadas en precipitaciones y temperaturas para cada mes.


In [12]:
daily_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691 entries, 0 to 690
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date                       691 non-null    datetime64[ns]
 1   temperature_2m_max         691 non-null    float32       
 2   temperature_2m_min         691 non-null    float32       
 3   temperature_2m_mean        691 non-null    float32       
 4   apparent_temperature_max   691 non-null    float32       
 5   apparent_temperature_min   691 non-null    float32       
 6   apparent_temperature_mean  691 non-null    float32       
 7   precipitation_sum          691 non-null    float32       
 8   shortwave_radiation_sum    691 non-null    float32       
dtypes: datetime64[ns](1), float32(8)
memory usage: 32.4 KB


In [13]:
daily_dataframe.head()

Unnamed: 0,date,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,shortwave_radiation_sum
0,2022-01-01 05:00:00,11.5725,7.4225,9.605833,10.621353,5.668452,8.392408,26.5,1.61
1,2022-01-02 05:00:00,13.6225,2.0225,9.101666,11.198116,-3.44729,6.312523,8.8,2.57
2,2022-01-03 05:00:00,1.6225,-4.7775,-1.579583,-3.709321,-10.994707,-7.97313,1.9,1.36
3,2022-01-04 05:00:00,0.6225,-6.5775,-3.119166,-4.213054,-12.362391,-8.126157,0.0,9.23
4,2022-01-05 05:00:00,6.9725,-2.1275,2.074583,2.60159,-7.481497,-1.940145,0.3,3.19


In [14]:
nulos_daily, duplicados_daily, outliers_daily = analizar_dataframe(daily_dataframe)

# Imprime los resultados
print("Climate en NY :")
print()
print("Nulos por columna:", nulos_daily)
print()
print("Duplicados:", duplicados_daily)
print()
print("Outliers:", outliers_daily)
print()

Climate en NY :

Nulos por columna: date                         0
temperature_2m_max           0
temperature_2m_min           0
temperature_2m_mean          0
apparent_temperature_max     0
apparent_temperature_min     0
apparent_temperature_mean    0
precipitation_sum            0
shortwave_radiation_sum      0
dtype: int64

Duplicados: 0

Outliers: temperature_2m_max             0
temperature_2m_min             0
temperature_2m_mean            0
apparent_temperature_max       0
apparent_temperature_min       0
apparent_temperature_mean      0
precipitation_sum            107
shortwave_radiation_sum        0
dtype: int64



In [15]:
daily_dataframe.to_csv('climate_ny.csv')
daily_dataframe.to_parquet('climate_ny.parquet')

In [16]:
diccionario_datos_daily = {
    'date': 'Fecha y hora',
    'temperature_2m_max': 'Temperatura máxima a 2 metros (°C)',
    'temperature_2m_min': 'Temperatura mínima a 2 metros (°C)',
    'temperature_2m_mean': 'Temperatura media a 2 metros (°C)',
    'apparent_temperature_max': 'Temperatura aparente máxima (°C)',
    'apparent_temperature_min': 'Temperatura aparente mínima (°C)',
    'apparent_temperature_mean': 'Temperatura aparente media (°C)',
    'precipitation_sum': 'Suma de precipitación (mm)',
    'shortwave_radiation_sum': 'Suma de radiación de onda corta (kJ/m²)',
}

# Crear DataFrame a partir del diccionario
df_diccionario_daily = pd.DataFrame(list(diccionario_datos_daily.items()), columns=['Nombre de la Columna', 'Descripción'])

# Mostrar el DataFrame
print(df_diccionario_daily)


        Nombre de la Columna                              Descripción
0                       date                             Fecha y hora
1         temperature_2m_max       Temperatura máxima a 2 metros (°C)
2         temperature_2m_min       Temperatura mínima a 2 metros (°C)
3        temperature_2m_mean        Temperatura media a 2 metros (°C)
4   apparent_temperature_max         Temperatura aparente máxima (°C)
5   apparent_temperature_min         Temperatura aparente mínima (°C)
6  apparent_temperature_mean          Temperatura aparente media (°C)
7          precipitation_sum               Suma de precipitación (mm)
8    shortwave_radiation_sum  Suma de radiación de onda corta (kJ/m²)


In [17]:
df_diccionario_daily.to_csv('climate_ny_dicc.csv')