Call data from open-meteo.com api

In [123]:
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas



In [171]:
import numpy as np
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import date, timedelta, datetime, timedelta
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import f1_score

In [125]:
def get_weather_api(lat, long):
	# Setup the Open-Meteo API client with cache and retry on error
	cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
	retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
	openmeteo = openmeteo_requests.Client(session = retry_session)

	# Make sure all required weather variables are listed here
	# The order of variables in hourly or daily is important to assign them correctly below
	url = "https://archive-api.open-meteo.com/v1/archive"
	params = {
		"latitude": lat,
		"longitude": long,
		"start_date": "2020-01-01",
		"end_date": "2023-10-31",
		"hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "rain", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high", "wind_speed_10m", "wind_speed_100m", "wind_direction_10m", "wind_direction_100m", "wind_gusts_10m"]
	}
	responses = openmeteo.weather_api(url, params=params)

	# Process first location. Add a for-loop for multiple locations or weather models
	response = responses[0]
	print(f"Coordinates {response.Latitude()}°E {response.Longitude()}°N")
	print(f"Elevation {response.Elevation()} m asl")
	print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
	print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

	# Process hourly data. The order of variables needs to be the same as requested.
	hourly = response.Hourly()
	hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
	hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
	hourly_dew_point_2m = hourly.Variables(2).ValuesAsNumpy()
	hourly_apparent_temperature = hourly.Variables(3).ValuesAsNumpy()
	hourly_rain = hourly.Variables(4).ValuesAsNumpy()
	hourly_pressure_msl = hourly.Variables(5).ValuesAsNumpy()
	hourly_surface_pressure = hourly.Variables(6).ValuesAsNumpy()
	hourly_cloud_cover = hourly.Variables(7).ValuesAsNumpy()
	hourly_cloud_cover_low = hourly.Variables(8).ValuesAsNumpy()
	hourly_cloud_cover_mid = hourly.Variables(9).ValuesAsNumpy()
	hourly_cloud_cover_high = hourly.Variables(10).ValuesAsNumpy()
	hourly_wind_speed_10m = hourly.Variables(11).ValuesAsNumpy()
	hourly_wind_speed_100m = hourly.Variables(12).ValuesAsNumpy()
	hourly_wind_direction_10m = hourly.Variables(13).ValuesAsNumpy()
	hourly_wind_direction_100m = hourly.Variables(14).ValuesAsNumpy()
	hourly_wind_gusts_10m = hourly.Variables(15).ValuesAsNumpy()

	hourly_data = {"date": pd.date_range(
		start = pd.to_datetime(hourly.Time(), unit = "s"),
		end = pd.to_datetime(hourly.TimeEnd(), unit = "s"),
		freq = pd.Timedelta(seconds = hourly.Interval()),
		inclusive = "left"
	)}
	hourly_data["temperature_2m"] = hourly_temperature_2m
	hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
	hourly_data["dew_point_2m"] = hourly_dew_point_2m
	hourly_data["apparent_temperature"] = hourly_apparent_temperature
	hourly_data["rain"] = hourly_rain
	hourly_data["pressure_msl"] = hourly_pressure_msl
	hourly_data["surface_pressure"] = hourly_surface_pressure
	hourly_data["cloud_cover"] = hourly_cloud_cover
	hourly_data["cloud_cover_low"] = hourly_cloud_cover_low
	hourly_data["cloud_cover_mid"] = hourly_cloud_cover_mid
	hourly_data["cloud_cover_high"] = hourly_cloud_cover_high
	hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
	hourly_data["wind_speed_100m"] = hourly_wind_speed_100m
	hourly_data["wind_direction_10m"] = hourly_wind_direction_10m
	hourly_data["wind_direction_100m"] = hourly_wind_direction_100m
	hourly_data["wind_gusts_10m"] = hourly_wind_gusts_10m

	hourly_dataframe = pd.DataFrame(data = hourly_data)
	return hourly_dataframe


In [126]:
# get dataframe of district 1 with latitude = 10.7807 and longtitude = 106.6994
data_quan_1 = get_weather_api(10.7807, 106.6994)

Coordinates 10.790861129760742°E 106.71087646484375°N
Elevation 18.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


In [127]:
hourly_dataframe = pd.DataFrame(data_quan_1)

In [128]:
hourly_dataframe

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,rain,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m
0,2020-01-01 00:00:00,24.039499,71.374519,18.539499,25.566103,0.0,1015.299988,1013.201843,24.000000,0.0,0.0,80.0,11.183201,23.400000,33.178535,36.869991,18.000000
1,2020-01-01 01:00:00,25.839500,64.712624,18.689499,26.984768,0.0,1016.099976,1014.012634,9.000000,0.0,0.0,30.0,14.264361,19.881649,47.045418,47.935760,24.119999
2,2020-01-01 02:00:00,27.689499,58.037663,18.689499,29.050800,0.0,1016.400024,1014.325073,5.400000,0.0,0.0,18.0,12.768586,16.087610,49.574005,49.537849,25.559999
3,2020-01-01 03:00:00,29.739500,49.473461,18.039499,31.584839,0.0,1016.099976,1014.039673,9.300000,0.0,0.0,31.0,11.200571,13.746156,44.999897,44.999897,24.840000
4,2020-01-01 04:00:00,31.689499,39.616940,16.289499,33.300831,0.0,1015.500000,1013.453918,24.300001,0.0,0.0,81.0,12.768586,15.580141,40.425995,40.314034,26.280001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33595,2023-10-31 19:00:00,24.789499,98.516823,24.539499,30.363483,0.0,1009.799988,1007.718262,24.900000,23.0,1.0,12.0,5.959060,12.758432,334.983124,343.610382,10.080000
33596,2023-10-31 20:00:00,24.639500,97.926659,24.289499,30.209759,0.0,1009.400024,1007.318298,1.500000,1.0,1.0,0.0,4.896529,10.853866,342.897186,354.289490,8.280000
33597,2023-10-31 21:00:00,24.539499,97.339287,24.089500,30.015854,0.0,1009.500000,1007.417358,18.900000,2.0,0.0,57.0,4.680000,9.255571,360.000000,13.495748,6.840000
33598,2023-10-31 22:00:00,24.539499,96.176743,23.889500,29.657166,0.0,1009.900024,1007.816528,30.300001,1.0,1.0,96.0,6.287130,9.504272,23.629398,37.304039,7.559999


In [129]:
weather_data = hourly_dataframe.iloc[:,:].values

In [130]:
# labeling rain data
# no rain => 0 || rainy => 1 || heavy rain => 2
predict_rain = []

for element in weather_data:
  rain = element[5]
  rain_label = -1
  if rain < 0.3:
    rain_label = 0
  else: rain_label = 1
  predict_rain.append(rain_label)

In [131]:
# check if exist any element mislabeled in rain data
nprain = np.array(predict_rain)
print('Rain label:', np.unique(nprain))
unique, counts = np.unique(nprain, return_counts=True)

infor_rain = dict(zip(unique, counts))
data_len = len(weather_data)
for i in range(2):
  print(f'Number of sample in label {i}:', infor_rain[i], '||\t||', ' Account', counts[i]/data_len*100, '% of total data.')

Rain label: [0 1]
Number of sample in label 0: 28434 ||	||  Account 84.625 % of total data.
Number of sample in label 1: 5166 ||	||  Account 15.375 % of total data.


In [132]:
# labeling heat data
# cool => 0 || hot => 1 || scorching => 2
predict_heat = []

for element in weather_data:
  heat = element[4]
  heat_label = -1
  if heat < 35:
    heat_label = 0
  elif heat < 37:
    heat_label = 1
  else: heat_label = 2
  predict_heat.append(heat_label)

In [133]:
# check if exist any element mislabeled in heat data
npheat = np.array(predict_heat)
print('Heat label:', np.unique(npheat))
unique, counts = np.unique(npheat, return_counts=True)

infor_heat = dict(zip(unique, counts))
for i in range(3):
  print(f'Number of sample in label {i}:', infor_heat[i], '||\t||', ' Account', counts[i]/data_len*100, '% of total data.')

Heat label: [0 1 2]
Number of sample in label 0: 28427 ||	||  Account 84.60416666666667 % of total data.
Number of sample in label 1: 2971 ||	||  Account 8.842261904761903 % of total data.
Number of sample in label 2: 2202 ||	||  Account 6.553571428571428 % of total data.


In [134]:
# labeling cloud data
# less cloud => 0 || much cloud => 1
predict_cloud = []

for element in weather_data:
  cloud = element[8]
  cloud_label = -1
  if cloud < 50:
    cloud_label = 0
  else: cloud_label = 1
  predict_cloud.append(cloud_label)

In [135]:
# check if exist any element mislabeled in cloud data
npcloud = np.array(predict_cloud)
print('cloud label:', np.unique(npcloud))
unique, counts = np.unique(npcloud, return_counts=True)

infor_cloud = dict(zip(unique, counts))
for i in range(2):
  print(f'Number of sample in label {i}:', infor_cloud[i], '||\t||', ' Account', counts[i]/data_len*100, '% of total data.')

cloud label: [0 1]
Number of sample in label 0: 22691 ||	||  Account 67.5327380952381 % of total data.
Number of sample in label 1: 10909 ||	||  Account 32.467261904761905 % of total data.


In [136]:
hourly_dataframe['predict_rain'] = nprain
hourly_dataframe['predict_heat'] = npheat
hourly_dataframe['predict_cloud'] = npcloud

In [137]:
hourly_dataframe['date'] = hourly_dataframe['date'].astype('string')

In [138]:
datetime = datetime.fromisoformat(hourly_dataframe.iloc[15000, 0])

In [139]:
# define a function to get feature date and target date
def get_date(datetime):
  feature_date = []
  target_date = []
  date = str(datetime)
  for i in range(1, 169):
    # calculate time and add it to list
    last_year_before = datetime.fromisoformat(date) - timedelta(days=365, hours=i)
    last_year_after = datetime.fromisoformat(date) + timedelta(hours=i) - timedelta(days=365)
    last_week_time = datetime.fromisoformat(date) - timedelta(hours=i)
    next_week_time = datetime.fromisoformat(date) + timedelta(hours=i)
    feature_date.append(str(last_year_before))
    feature_date.append(str(last_year_after))
    feature_date.append(str(last_week_time))
    target_date.append(str(next_week_time))
  return feature_date, target_date

In [140]:
feature_date, target_date = get_date(datetime)

In [141]:
print(len(feature_date), len(target_date))

504 168


In [142]:
datetime

datetime.datetime(2021, 9, 17, 0, 0)

In [143]:
print(feature_date[0],feature_date[1],feature_date[2])

2020-09-16 23:00:00 2020-09-17 01:00:00 2021-09-16 23:00:00


In [144]:
columns_rain=['relative_humidity_2m', 'apparent_temperature', 'surface_pressure', 'rain', 'cloud_cover']
columns_heat=['relative_humidity_2m', 'wind_gusts_10m', 'apparent_temperature', 'temperature_2m']
columns_cloud=['rain', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high']
target_rain=['predict_rain']
target_heat=['predict_heat']
target_cloud=['predict_cloud']

tuần trước năm trước (168h) + tuần sau năm trước (168h) + tuần sau năm này (168h)</br>
=> 1 tuần sau năm này (168h)

In [145]:
def get_data(feature_time, target_time, feature, target):
  data_feat = []
  target_feat = []

  # iterate through feature_time
  for time in feature_time:
    # append found data in list (feature got include week before-later of last year, one week before of this year)
    data = hourly_dataframe[feature].to_numpy()[hourly_dataframe['date'].to_numpy() == time]
    data_feat.append(data)

  # iterate through target_time
  for time in target_time:
    # append found data in list (feature got include one week after of this year)
    data = hourly_dataframe[target].to_numpy()[hourly_dataframe['date'].to_numpy() == time]
    target_feat.append(data)

  # convert the two list to numpy array and flatten them
  data_feat = np.array(data_feat).flatten()
  target_feat = np.array(target_feat).flatten()

  return data_feat, target_feat

In [146]:
feature_data, target_data = get_data(feature_date, target_date, columns_heat, target_heat)

In [147]:
print(feature_data.shape) # 2016 mean 504*4 -> 504 hours total * 4 feature (because i flatten it)
print(target_data.shape)

(2016,)
(168,)


In [148]:
def get_all_of_type(feature_type, target_type):
  feature = []
  target = []
  for i in range(len(hourly_dataframe)):
    date = datetime.fromisoformat(hourly_dataframe.iloc[i, 0])
    feature_date, target_date = get_date(date)
    feature_data, target_data = get_data(feature_date, target_date, feature_type, target_type)
    feature.append(feature_data)
    target.append(target_data)
  feature = np.array(feature)
  target = np.array(target)
  return feature, target


In [149]:
#heat_feature, heat_target = get_all_of_type(columns_heat, target_heat)

In [150]:
print(datetime.fromisoformat(hourly_dataframe.iloc[8784, 0]))

2021-01-01 00:00:00


In [151]:
print(hourly_dataframe[columns_heat].iloc[8784-365*24-1:8784-365*24-169])

       relative_humidity_2m  wind_gusts_10m  apparent_temperature  \
23                80.390739       10.440001             25.081156   
24                73.138367        9.720000             25.864468   
25                66.515404       18.359999             27.638216   
26                61.340786       19.440001             29.895203   
27                54.054295       18.719999             32.404476   
...                     ...             ...                   ...   
33450             94.477852       10.799999             30.268993   
33451             95.326515        9.360000             30.236565   
33452             96.470146        8.640000             30.163628   
33453             95.604889        6.120000             30.331520   
33454             96.175339        5.400000             29.808720   

       temperature_2m  
23          22.689499  
24          23.789499  
25          25.639500  
26          27.489500  
27          29.289499  
...               ...  
334

In [152]:
hourly_dataframe.index[hourly_dataframe['date'].values == str(datetime.fromisoformat('2023-10-01 23:00:00') + timedelta(days=i))]

Int64Index([32903], dtype='int64')

In [153]:
def get_train_test_data(start, end, feature_type, target_type):
  feature = []
  target = []
  start = datetime.fromisoformat(start)
  end = datetime.fromisoformat(end)
  for i in range((end-start).days+1):
    print('----------------------------------')
    print(f'Get {i}...')
    date = start + timedelta(days=i)
    print(f'Checking date:', date)

    # get prev week
    last_week_end = date - timedelta(hours=1)
    last_week_start = date - timedelta(hours=24*7)

    # get prev year
    last_year_start = date - timedelta(days=365, hours=24*7)
    last_year_end = date - timedelta(days=365) + timedelta(hours=24*7)

    # get index of those feature
    last_week_start_index = np.where(hourly_dataframe['date'].values == str(last_week_start))
    last_week_end_index = np.where(hourly_dataframe['date'].values == str(last_week_end))
    last_year_start_index = np.where(hourly_dataframe['date'].values == str(last_year_start))
    last_year_end_index = np.where(hourly_dataframe['date'].values == str(last_year_end))

    # get index of those target
    next_week_index = np.where(hourly_dataframe['date'].values == str(date))

    # error checking
    print('Check indexing:', last_year_start_index[0],last_year_end_index[0],last_week_start_index[0],last_week_end_index[0],next_week_index[0])

    # get data of those feature
    feature_year = hourly_dataframe.loc[int(last_year_start_index[0]):int(last_year_end_index[0]), feature_type].to_numpy(copy=True).flatten()
    feature_week = hourly_dataframe.loc[int(last_week_start_index[0]):int(last_week_end_index[0]), feature_type].to_numpy(copy=True).flatten()

    # target
    target_week = hourly_dataframe.loc[int(next_week_index[0]), target_type].to_numpy(copy=True).flatten()

    feature_np = np.concatenate((feature_year, feature_week), axis=None)
    print('Feature shape:', feature_np.shape)
    print('Target shape:', target_week.shape)
    #target_temp.clear()
    feature.append(feature_np)
    target.append(target_week)
    print('Done!')

  return feature, target

In [154]:
x_heat, y_heat = get_train_test_data('2022-01-01 00:00:00', '2023-10-01 23:00:00', columns_heat, target_heat)
x_heat = np.array(x_heat)
y_heat = np.array(y_heat).astype('int')

----------------------------------
Get 0...
Checking date: 2022-01-01 00:00:00
Check indexing: [8616] [8952] [17376] [17543] [17544]
Feature shape: (2020,)
Target shape: (1,)
Done!
----------------------------------
Get 1...
Checking date: 2022-01-02 00:00:00
Check indexing: [8640] [8976] [17400] [17567] [17568]
Feature shape: (2020,)
Target shape: (1,)
Done!
----------------------------------
Get 2...
Checking date: 2022-01-03 00:00:00
Check indexing: [8664] [9000] [17424] [17591] [17592]
Feature shape: (2020,)
Target shape: (1,)
Done!
----------------------------------
Get 3...
Checking date: 2022-01-04 00:00:00
Check indexing: [8688] [9024] [17448] [17615] [17616]
Feature shape: (2020,)
Target shape: (1,)
Done!
----------------------------------
Get 4...
Checking date: 2022-01-05 00:00:00
Check indexing: [8712] [9048] [17472] [17639] [17640]
Feature shape: (2020,)
Target shape: (1,)
Done!
----------------------------------
Get 5...
Checking date: 2022-01-06 00:00:00
Check indexing: 

In [155]:
print(x_heat.shape)
print(y_heat.shape)

(639, 2020)
(639, 1)


In [156]:
x_train_heat, x_test_heat, y_train_heat, y_test_heat = train_test_split(x_heat, y_heat, test_size=0.2, random_state=0)

In [168]:
heat_model = SVC()
heat_model.fit(x_train_heat, y_train_heat)
y_pred_heat = heat_model.predict(x_test_heat)
print(accuracy_score(y_test_heat, y_pred_heat))
print(f1_score(y_test_heat, y_pred_heat))

1.0
0.0


  y = column_or_1d(y, warn=True)
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [172]:
confusion_matrix(y_test_heat, y_pred_heat)

array([[128]])

In [158]:
x_rain, y_rain = get_train_test_data('2022-01-01 00:00:00', '2023-10-01 23:00:00', columns_rain, target_rain)
x_rain = np.array(x_rain)
y_rain = np.array(y_rain).astype('int')
x_train_rain, x_test_rain, y_train_rain, y_test_rain = train_test_split(x_rain, y_rain, test_size=0.2, random_state=0)

----------------------------------
Get 0...
Checking date: 2022-01-01 00:00:00
Check indexing: [8616] [8952] [17376] [17543] [17544]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 1...
Checking date: 2022-01-02 00:00:00
Check indexing: [8640] [8976] [17400] [17567] [17568]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 2...
Checking date: 2022-01-03 00:00:00
Check indexing: [8664] [9000] [17424] [17591] [17592]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 3...
Checking date: 2022-01-04 00:00:00
Check indexing: [8688] [9024] [17448] [17615] [17616]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 4...
Checking date: 2022-01-05 00:00:00
Check indexing: [8712] [9048] [17472] [17639] [17640]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 5...
Checking date: 2022-01-06 00:00:00
Check indexing: 

In [167]:
rain_model = SVC()
rain_model.fit(x_train_rain, y_train_rain)
y_pred_rain = rain_model.predict(x_test_rain)
print(accuracy_score(y_test_rain, y_pred_rain))
print(f1_score(y_test_rain, y_pred_rain))

0.96875
0.0


  y = column_or_1d(y, warn=True)


In [173]:
confusion_matrix(y_test_rain, y_pred_rain)

array([[124,   0],
       [  4,   0]])

In [160]:
x_cloud, y_cloud = get_train_test_data('2022-01-01 00:00:00', '2023-10-01 23:00:00', columns_cloud, target_cloud)
x_cloud = np.array(x_cloud)
y_cloud = np.array(y_cloud).astype('int')
x_train_cloud, x_test_cloud, y_train_cloud, y_test_cloud = train_test_split(x_cloud, y_cloud, test_size=0.2, random_state=0)

----------------------------------
Get 0...
Checking date: 2022-01-01 00:00:00
Check indexing: [8616] [8952] [17376] [17543] [17544]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 1...
Checking date: 2022-01-02 00:00:00
Check indexing: [8640] [8976] [17400] [17567] [17568]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 2...
Checking date: 2022-01-03 00:00:00
Check indexing: [8664] [9000] [17424] [17591] [17592]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 3...
Checking date: 2022-01-04 00:00:00
Check indexing: [8688] [9024] [17448] [17615] [17616]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 4...
Checking date: 2022-01-05 00:00:00
Check indexing: [8712] [9048] [17472] [17639] [17640]
Feature shape: (2525,)
Target shape: (1,)
Done!
----------------------------------
Get 5...
Checking date: 2022-01-06 00:00:00
Check indexing: 

In [166]:
cloud_model = SVC()
cloud_model.fit(x_train_cloud, y_train_cloud)
y_pred_cloud = cloud_model.predict(x_test_cloud)
print(accuracy_score(y_test_cloud, y_pred_cloud))
print(f1_score(y_test_cloud,y_pred_cloud))

  y = column_or_1d(y, warn=True)


0.7734375
0.06451612903225806


In [174]:
confusion_matrix(y_test_cloud, y_pred_cloud)

array([[98,  0],
       [29,  1]])