In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files referenced
# below under that path can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the weather CSV from the mounted Drive and preview its first rows.
data_path = "/content/drive/MyDrive/SE355/DoAn/Data/weather_data.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,rain,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,predict_rain,predict_heat,predict_cloud
0,01-01-20 0:00,24.0395,71.37452,18.5395,25.566103,0.0,1015.3,1013.20184,24.0,0,0,80,11.183201,23.4,33.178535,36.86999,18.0,0,0,0
1,01-01-20 1:00,25.8395,64.71262,18.689499,26.984768,0.0,1016.1,1014.01263,9.0,0,0,30,14.264361,19.881649,47.045418,47.93576,24.119999,0,0,0
2,01-01-20 2:00,27.689499,58.037663,18.689499,29.0508,0.0,1016.4,1014.3251,5.4,0,0,18,12.768586,16.08761,49.574005,49.53785,25.56,0,0,0
3,01-01-20 3:00,29.7395,49.47346,18.0395,31.584839,0.0,1016.1,1014.0397,9.3,0,0,31,11.200571,13.746156,44.999897,44.999897,24.84,0,0,0
4,01-01-20 4:00,31.689499,39.61694,16.2895,33.30083,0.0,1015.5,1013.4539,24.300001,0,0,81,12.768586,15.580141,40.425995,40.314034,26.28,0,0,0


In [None]:
def time_format(date_time):
  """Convert a raw ``AA-BB-YY H:MM`` timestamp into ``20YY-BB-AA HH:MM``.

  The three dash-separated date fields are reversed and the two-digit year is
  prefixed with "20" (so only years 2000-2099 can be represented). A
  four-character time such as ``0:00`` is left-padded to ``00:00``.
  NOTE(review): the raw field order is presumably day-month-year, since the
  rest of the notebook treats the result as an ISO ``YYYY-MM-DD`` date --
  confirm against the data source.

  The conversion is idempotent: a value whose first date field already has
  four characters (i.e. it is already in the output layout) is returned with
  its date part untouched. Without this guard, re-running the cell that applies
  this function would scramble the already-converted column.

  Args:
    date_time: string holding a date and a time separated by one space.

  Returns:
    The reformatted timestamp string.

  Raises:
    ValueError: if ``date_time`` does not contain exactly one space.
  """
  date, time = date_time.split(" ")
  if len(time) == 4:
    time = "0" + time
  fields = date.split("-")
  if len(fields[0]) == 4:
    # Already converted (year first): keep the date part as it is.
    return date + " " + time
  return "20" + fields[2] + "-" + fields[1] + "-" + fields[0] + " " + time

# Rewrite every raw timestamp of the "date" column with time_format.
data["date"] = data["date"].apply(time_format)

In [None]:
def remove_second(date):
  """Return ``str(date)`` without its last three characters.

  For a datetime whose string form ends in ``:SS`` this drops the seconds,
  e.g. ``2020-01-01 03:00:00`` becomes ``2020-01-01 03:00``.
  """
  return str(date)[:-3]

# Naming convention for window bounds: of the two timestamps that delimit a
# window, "near" is the earlier one (nearer to 1 BC) and "far" is the later one.
def get_last_year_week(date):
  """Return the bounds of the weeks before and after ``date`` one year earlier.

  The anchor is the same calendar date and time in the previous year
  (29 February falls back to 28 February). Anchoring on the calendar date
  replaces a fixed day offset that was only correct for some dates when a
  leap day lies between the two years.

  Args:
    date: a ``datetime`` (whole minutes, so that ``remove_second`` yields
      ``YYYY-MM-DD HH:MM``).

  Returns:
    A 4-tuple of ``YYYY-MM-DD HH:MM`` strings
    ``(before_near, before_far, after_near, after_far)`` where, relative to
    the anchor, the "before" window runs from 168 hours earlier to 1 hour
    earlier and the "after" window runs from 1 hour later to 168 hours later.
  """
  # 29 Feb does not exist in the previous (non-leap) year.
  anchor_day = 28 if (date.month, date.day) == (2, 29) else date.day
  anchor = date.replace(year=date.year - 1, day=anchor_day)
  one_hour = timedelta(hours=1)
  one_week = timedelta(hours=7*24)
  return (remove_second(anchor - one_week), remove_second(anchor - one_hour),
          remove_second(anchor + one_hour), remove_second(anchor + one_week))

def get_last_week(date):
  """Return the bounds of the 7-day window that ends one hour before ``date``.

  Returns:
    A pair ``(near, far)`` of strings produced by ``remove_second``: ``near``
    is the earlier bound (``date`` minus 7 days) and ``far`` is the later
    bound (``date`` minus 1 hour).
  """
  window_start = date - timedelta(days=7)
  window_end = date - timedelta(hours=1)
  return remove_second(window_start), remove_second(window_end)

In [None]:
def get_week_data(data, time_near, time_far):
  """Flatten the feature values of all rows between two timestamps.

  Args:
    data: DataFrame with a "date" column; column 0 and the last three columns
      are excluded from the result.
    time_near: "date" value of the first row of the window.
    time_far: "date" value of the last row of the window (inclusive).

  Returns:
    A 1-D NumPy array (a copy) holding the selected rows one after another.

  Raises:
    IndexError: if either timestamp does not occur in ``data["date"]``.
  """
  timestamps = data["date"].values
  # First occurrence of each bound; [0] fails with IndexError when absent.
  first_row = np.flatnonzero(timestamps == time_near)[0]
  last_row = np.flatnonzero(timestamps == time_far)[0]
  window = data.iloc[first_row:last_row + 1, 1:-3]
  flat = window.to_numpy(copy=True)
  return flat.reshape(-1)

def build_data(data, start, end):
  """Build one flattened sample per hour for every day from ``start`` to ``end``.

  Each sample is the concatenation of:
    * the hour's timestamp (``YYYY-MM-DD HH:MM``),
    * the features of the 7-day window ending one hour before midnight of
      that day (computed once per day, so all 24 samples of a day share it),
    * the features of the week before and of the week after the same hour
      one year earlier,
    * the values of the last three columns of ``data`` for that hour.

  Because the timestamp string is concatenated with the numeric values, every
  element of a sample ends up with a string dtype.

  Args:
    data: DataFrame whose "date" column holds ``YYYY-MM-DD HH:MM`` strings,
      whose column 0 is excluded from the features and whose last three
      columns are the labels.
    start: ISO timestamp string of the first day to build.
    end: ISO timestamp string; whole days are counted from ``start``, and the
      last counted day is built in full (24 samples).

  Returns:
    A list of 1-D NumPy arrays, 24 per day.

  Raises:
    IndexError: if a window bound is not present in ``data["date"]``.
    ValueError: if a target hour is missing from, or duplicated in,
      ``data["date"]`` (this would otherwise yield a row of the wrong length).
  """
  timestamps = data["date"].values  # loop-invariant, looked up once
  start = datetime.fromisoformat(start)
  end = datetime.fromisoformat(end)
  new_data = []
  for day in range(0, (end-start).days+1):
    date = start + timedelta(days=day)
    week_near, week_far = get_last_week(date)
    last_week_feats = get_week_data(data, week_near, week_far)
    for i in range(0, 24):
      hour = date + timedelta(hours = i)
      hour_label = remove_second(hour)
      before_near, before_far, after_near, after_far = get_last_year_week(hour)
      ly_before_feats = get_week_data(data, before_near, before_far)
      ly_after_feats = get_week_data(data, after_near, after_far)
      pos = np.flatnonzero(timestamps == hour_label)
      if len(pos) != 1:
        raise ValueError(
            "expected exactly one row dated " + hour_label +
            ", found " + str(len(pos)))
      labels = data.iloc[pos, -3:].to_numpy().flatten()
      features = np.concatenate([np.array([hour_label]), last_week_feats,
                                 ly_before_feats, ly_after_feats, labels])
      new_data.append(features)
  return new_data

# Build the samples for every day from the start date through the end date.
new_data = build_data(data, start="2023-01-09 00:00", end="2023-07-08 00:00")

"2021-01-08 00:00", "2021-07-08 23:00"

"2021-07-09 00:00", "2022-01-08 23:00"

"2022-01-09 00:00", "2022-07-08 00:00"

"2022-07-09 00:00", "2023-01-08 00:00"

"2023-01-09 00:00", "2023-07-08 00:00"

"2023-07-09 00:00", "2023-10-31 00:00"

In [None]:
# build_data appends 24 samples per day, so this is the number of days built.
len(new_data)/24

181.0

In [None]:
# Column labels of a flattened sample: "date", then the feature names repeated
# once for each of the 24*7*3 positions (three windows of 24*7 each), then the
# names of the last three columns of `data`.
pred_labels = data.columns[-3:].values
feats = np.tile(data.columns[1:-3].values, 24*7*3)
new_columns = np.concatenate([np.array(["date"]), feats, pred_labels])

In [None]:
# Wrap the flattened samples in a DataFrame labelled with new_columns and
# preview the first rows.
new_data_df = pd.DataFrame(new_data, columns=new_columns)
new_data_df.head()

Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,rain,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,...,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,predict_rain,predict_heat,predict_cloud
0,2023-01-09 00:00,23.5395,85.62514,20.9895,27.27233,0.0,1014.6,1012.4996,92.100006,3.0,...,0.0,43.0,5.0142193,11.246759,21.037588,39.805527,8.64,1,0,1
1,2023-01-09 01:00,23.5395,85.62514,20.9895,27.27233,0.0,1014.6,1012.4996,92.100006,3.0,...,0.0,83.0,5.8603754,8.435069,47.489597,50.194473,13.679999,0,0,1
2,2023-01-09 02:00,23.5395,85.62514,20.9895,27.27233,0.0,1014.6,1012.4996,92.100006,3.0,...,0.0,8.0,5.1165614,6.638072,39.289394,40.601215,15.4800005,0,0,1
3,2023-01-09 03:00,23.5395,85.62514,20.9895,27.27233,0.0,1014.6,1012.4996,92.100006,3.0,...,0.0,3.0,4.510787,5.495161,28.610369,31.607454,15.84,0,0,1
4,2023-01-09 04:00,23.5395,85.62514,20.9895,27.27233,0.0,1014.6,1012.4996,92.100006,3.0,...,0.0,0.0,6.28713,7.5685663,23.629398,25.346138,18.359999,0,0,1


In [None]:
# Save the built samples to Drive; index=False leaves the row index out of the file.
new_data_df.to_csv("/content/drive/MyDrive/SE355/DoAn/Data/test_data5.csv", index=False)