In [13]:
import pandas as pd
import numpy as np
import json

import seaborn as sns
import matplotlib.pyplot as plt
from colorama import Fore, Style
import polars as pl

import torch

In [2]:
Data_Dir = "../kaggle_data/"

train_data = pd.read_csv(Data_Dir + 'train.csv')
client_data = pd.read_csv(Data_Dir + 'client.csv')
historical_weather_data = pd.read_csv(Data_Dir + 'historical_weather.csv')
forecast_weather_data = pd.read_csv(Data_Dir + 'forecast_weather.csv')
electricity_data = pd.read_csv(Data_Dir + 'electricity_prices.csv')
gas_data = pd.read_csv(Data_Dir + 'gas_prices.csv')
weather_station_to_county_mapping_data = pd.read_csv(Data_Dir + 'weather_station_to_county_mapping.csv')


In [18]:
train_d = pl.read_csv(Data_Dir + 'train.csv')
train_d.with_columns(pl.col('datetime'))

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,str,i64,i64,i64
0,0,1,0.713,0,"""2021-09-01 00:…",0,0,0
0,0,1,96.59,1,"""2021-09-01 00:…",0,1,0
0,0,2,0.0,0,"""2021-09-01 00:…",0,2,1
0,0,2,17.314,1,"""2021-09-01 00:…",0,3,1
0,0,3,2.904,0,"""2021-09-01 00:…",0,4,2
0,0,3,656.859,1,"""2021-09-01 00:…",0,5,2
0,1,0,0.0,0,"""2021-09-01 00:…",0,6,3
0,1,0,59.0,1,"""2021-09-01 00:…",0,7,3
0,1,1,0.0,0,"""2021-09-01 00:…",0,8,4
0,1,1,501.76,1,"""2021-09-01 00:…",0,9,4


In [3]:
#county mapping json data load

dir = Data_Dir + 'county_id_to_name_map.json'
with open(dir, 'r') as file:
    json_data = json.load(file)

json_data

{'0': 'HARJUMAA',
 '1': 'HIIUMAA',
 '2': 'IDA-VIRUMAA',
 '3': 'JÄRVAMAA',
 '4': 'JÕGEVAMAA',
 '5': 'LÄÄNE-VIRUMAA',
 '6': 'LÄÄNEMAA',
 '7': 'PÄRNUMAA',
 '8': 'PÕLVAMAA',
 '9': 'RAPLAMAA',
 '10': 'SAAREMAA',
 '11': 'TARTUMAA',
 '12': 'UNKNOWN',
 '13': 'VALGAMAA',
 '14': 'VILJANDIMAA',
 '15': 'VÕRUMAA'}

In [5]:
# Color printing    
def PrintColor(text:str, color = Fore.BLUE, style = Style.BRIGHT):
    print(style + color + text + Style.RESET_ALL); 

def csv_data(data):
    PrintColor(text = f"data length: {len(data)}\ndata column len: {len(data.columns)}\ndata column compo: {list(data.columns)}\n")
    display(data.head())

### train.csv

- ```county```: county의 ID 코드
- ```is_business```: 비즈니스인지 여부
- ```product_type```: {0: "Combined", 1: "Fixed", 2: "General service", 3: "Spot"}
- ```target```: 예측값, 소비 or 생산량
- ```is_consumption```: 소비인지 생산인지 여부
- ```datatime```: 에스토니아 시간(Estonian time) in EET (UTC+2) / EEST (UTC+3).
- ```data_block_id```: 동일한 data_block_id를 공유하는 모든 행은 동일한 예측 시간에 사용할 수 있습니다.
- ```row_id```: row
- ```prediction_unit_id```: 예측 조합에 대한 고유 식별자

- ```총 4가지의 column에 의해 값이 결정```  
    -> *county(0 ~ 15), is_business(0, 1), product_type(0 ~ 3), is_consumption(0, 1)* 에 따라 값이 결정됨

In [6]:
csv_data(train_data)

[1m[34mdata length: 2018352
data column len: 9
data column compo: ['county', 'is_business', 'product_type', 'target', 'is_consumption', 'datetime', 'data_block_id', 'row_id', 'prediction_unit_id']
[0m


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [10]:
train_data['datetime'].unique()

array(['2021-09-01 00:00:00', '2021-09-01 01:00:00',
       '2021-09-01 02:00:00', ..., '2023-05-31 21:00:00',
       '2023-05-31 22:00:00', '2023-05-31 23:00:00'], dtype=object)

### client.csv

- ```eic_count```: 집계된 소비 포인트 수(EICs - European Identifier Code).
- ```installed_capacity```: 설치된 태양광 패널 용량(kw)

- ```날짜와 패널 용량에 따른 소비량 포인트```

In [7]:
csv_data(client_data)

[1m[34mdata length: 41919
data column len: 7
data column compo: ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date', 'data_block_id']
[0m


Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2
1,2,0,17,166.4,0,2021-09-01,2
2,3,0,688,7207.88,0,2021-09-01,2
3,0,0,5,400.0,1,2021-09-01,2
4,1,0,43,1411.0,1,2021-09-01,2


### gas_prices.csv

- **```예측 날짜의 가스 가격```**
- ```forecast_data```: 예측 가격 관련 날짜
- ```[lowest/highest]_price_per_mwh```: 해당 거래일 전날 시장에 나온 천연가스의 최저/최고 가격(메가와트당 등가물당 유로).

In [8]:
csv_data(gas_data)

[1m[34mdata length: 637
data column len: 5
data column compo: ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh', 'origin_date', 'data_block_id']
[0m


Unnamed: 0,forecast_date,lowest_price_per_mwh,highest_price_per_mwh,origin_date,data_block_id
0,2021-09-01,45.23,46.32,2021-08-31,1
1,2021-09-02,45.62,46.29,2021-09-01,2
2,2021-09-03,45.85,46.4,2021-09-02,3
3,2021-09-04,46.3,46.8,2021-09-03,4
4,2021-09-05,46.3,46.58,2021-09-04,5


### electricity_prices.csv

- **```예측 날짜의 전기 가격```**
- ```forecast_data```: 예측 날짜
- ```euros_per_mwh```: 다음날의 전기 사용량에 따른 가격

In [9]:
csv_data(electricity_data)

[1m[34mdata length: 15286
data column len: 4
data column compo: ['forecast_date', 'euros_per_mwh', 'origin_date', 'data_block_id']
[0m


Unnamed: 0,forecast_date,euros_per_mwh,origin_date,data_block_id
0,2021-09-01 00:00:00,92.51,2021-08-31 00:00:00,1
1,2021-09-01 01:00:00,88.9,2021-08-31 01:00:00,1
2,2021-09-01 02:00:00,87.35,2021-08-31 02:00:00,1
3,2021-09-01 03:00:00,86.88,2021-08-31 03:00:00,1
4,2021-09-01 04:00:00,88.43,2021-08-31 04:00:00,1


### forecast_weather.csv

- **```특정 위치의 날씨데이터```**
- ```latitude,lonitude```: 위,경도 
- ```origin_datatime```: 예측이 생성된 시점의 타임 스탬프  
- ```hours_ahead```: 예보 생성과 예보 날씨의 시간 수, 각 예보는 총 48시간동안 적용
- ```temperature```: 지상 2m지점의 대기온도를 섭씨 단위로 표기
- ```dewpoint```: 지상 2m지점의 이슬점 온도를 섭씨 단위로 표기
- ```cloudcover_[low/mid/high/total]```: 다음 고도 구간에서 구름으로 덮인 하늘의 비율 (0~2, 2~6, 6+, 합계)
- ```10_metre_[u/v]_wind_component```: 풍속의 [동쪽/북쪽] 성분, 지표면 10미터 상공에서 측정한 풍속을 초당 미터 단위로 표기
- ```forecast_datetime```: 예상 날씨의 타임스탬프. ```origin_datetime```에 ```hours_ahead```를 더하여 생성
- ```direct_solar_radiation```: 태양의 방향에 수직인 평면에서 지표면에 도달한 직사광선이 이전 시간동안 누적된 것, 평방 미터당 와트시 단위로 나타냄
- ```surface_solar_radiation_downwards```: 지구 표면의 수평면에 도달하는 직접 및 확산 태양 복사열을 평방미터당 와트시 단위로 나타냄
- ```snowfall```: 한 시간동안 적설량 단위


In [11]:
csv_data(forecast_weather_data)

[1m[34mdata length: 3424512
data column len: 18
data column compo: ['latitude', 'longitude', 'origin_datetime', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'data_block_id', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
[0m


Unnamed: 0,latitude,longitude,origin_datetime,hours_ahead,temperature,dewpoint,cloudcover_high,cloudcover_low,cloudcover_mid,cloudcover_total,10_metre_u_wind_component,10_metre_v_wind_component,data_block_id,forecast_datetime,direct_solar_radiation,surface_solar_radiation_downwards,snowfall,total_precipitation
0,57.6,21.7,2021-09-01 02:00:00,1,15.655786,11.553613,0.904816,0.019714,0.0,0.905899,-0.411328,-9.106137,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
1,57.6,22.2,2021-09-01 02:00:00,1,13.003931,10.689844,0.886322,0.004456,0.0,0.886658,0.206347,-5.355405,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
2,57.6,22.7,2021-09-01 02:00:00,1,14.206567,11.671777,0.729034,0.005615,0.0,0.730499,1.451587,-7.417905,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
3,57.6,23.2,2021-09-01 02:00:00,1,14.844507,12.264917,0.336304,0.074341,0.000626,0.385468,1.090869,-9.163999,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0
4,57.6,23.7,2021-09-01 02:00:00,1,15.293848,12.458887,0.102875,0.088074,1.5e-05,0.17659,1.268481,-8.975766,1,2021-09-01 03:00:00,0.0,0.0,0.0,0.0


In [48]:
forecast_weather_data['hours_ahead'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
      dtype=int64)

### historical_weather.csv

- **```과거 날씨 데이터```**
- ```rain```: 예보 규칙(forecast conventions)과 다릅니다. 이전 한 시간 동안의 대규모 기상 시스템에서 내린 비(밀리미터 단위).
- ```snowfall```: 예보 규칙과 다릅니다. 이전 시간 동안의 적설량(센티미터)입니다.
- ```surface_pressure```: 표면의 기압을 헥토파스칼 단위로 표시합니다.
- ```cloudcover_[low/mid/high/total]```: 예보 규칙과 다릅니다. 다음 고도 구간에서 구름으로 덮인 하늘의 비율 (0~3, 3~8, 8+, 및 총 구름 덮개)
- ```windspeed_10m``` - 예보 규칙과 다릅니다. 지상 10미터에서의 풍속(초당 미터)입니다.
- ```winddirection_10m``` - 예보 규칙과 다릅니다. 지상 10미터에서의 풍향(도)입니다.
- ```shortwave_radiation``` - 예측 규칙과 다릅니다. 평방미터당 와트시 단위의 전 세계 수평 조사입니다.
- ```direct_solar_radiation```
- ```diffuse_radiation``` - 예측 규칙과 다릅니다. 평방미터당 와트시 단위의 확산 일사량입니다

In [57]:
csv_data(historical_weather_data)
historical_weather_data.head()

data length: 1710802
data column len: 18
data column compo: ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure', 'cloudcover_total', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_high', 'windspeed_10m', 'winddirection_10m', 'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation', 'latitude', 'longitude', 'data_block_id']



Unnamed: 0,datetime,temperature,dewpoint,rain,snowfall,surface_pressure,cloudcover_total,cloudcover_low,cloudcover_mid,cloudcover_high,windspeed_10m,winddirection_10m,shortwave_radiation,direct_solar_radiation,diffuse_radiation,latitude,longitude,data_block_id
0,2021-09-01 00:00:00,14.2,11.6,0.0,0.0,1015.9,31,31,0,11,7.083333,8,0.0,0.0,0.0,57.6,21.7,1.0
1,2021-09-01 00:00:00,13.9,11.5,0.0,0.0,1010.7,33,37,0,0,5.111111,359,0.0,0.0,0.0,57.6,22.2,1.0
2,2021-09-01 00:00:00,14.0,12.5,0.0,0.0,1015.0,31,34,0,0,6.333333,355,0.0,0.0,0.0,57.6,22.7,1.0
3,2021-09-01 00:00:00,14.6,11.5,0.0,0.0,1017.3,0,0,0,0,8.083333,297,358.0,277.0,81.0,57.6,23.2,1.0
4,2021-09-01 00:00:00,15.7,12.9,0.0,0.0,1014.0,22,25,0,0,8.416667,5,0.0,0.0,0.0,57.6,23.7,1.0


In [19]:
print(min(client_data['data_block_id']),max(client_data['data_block_id']))

2 637


In [16]:
train_data[train_data['data_block_id'] == 637]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
2015232,0,0,1,2.403,0,2023-05-31 00:00:00,637,2015232,0
2015233,0,0,1,516.958,1,2023-05-31 00:00:00,637,2015233,0
2015234,0,0,2,0.000,0,2023-05-31 00:00:00,637,2015234,1
2015235,0,0,2,5.208,1,2023-05-31 00:00:00,637,2015235,1
2015236,0,0,3,16.874,0,2023-05-31 00:00:00,637,2015236,2
...,...,...,...,...,...,...,...,...,...
2018347,15,1,0,197.233,1,2023-05-31 23:00:00,637,2018347,64
2018348,15,1,1,0.000,0,2023-05-31 23:00:00,637,2018348,59
2018349,15,1,1,28.404,1,2023-05-31 23:00:00,637,2018349,59
2018350,15,1,3,0.000,0,2023-05-31 23:00:00,637,2018350,60


In [21]:
train_data[train_data['data_block_id'] == min(train_data['data_block_id'])]

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.590,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.000,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2
...,...,...,...,...,...,...,...,...,...
2923,15,0,3,57.519,1,2021-09-01 23:00:00,0,2923,58
2924,15,1,1,0.000,0,2021-09-01 23:00:00,0,2924,59
2925,15,1,1,20.209,1,2021-09-01 23:00:00,0,2925,59
2926,15,1,3,0.000,0,2021-09-01 23:00:00,0,2926,60


In [20]:
client_data[client_data['data_block_id'] == min(client_data['data_block_id'])]

Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
0,1,0,108,952.89,0,2021-09-01,2
1,2,0,17,166.40,0,2021-09-01,2
2,3,0,688,7207.88,0,2021-09-01,2
3,0,0,5,400.00,1,2021-09-01,2
4,1,0,43,1411.00,1,2021-09-01,2
...,...,...,...,...,...,...,...
56,3,14,67,3114.60,1,2021-09-01,2
57,1,15,10,83.20,0,2021-09-01,2
58,3,15,61,918.20,0,2021-09-01,2
59,1,15,7,325.00,1,2021-09-01,2


In [22]:
client_data[client_data['data_block_id'] == max(client_data['data_block_id'])]

Unnamed: 0,product_type,county,eic_count,installed_capacity,is_business,date,data_block_id
41854,1,0,508,4964.215,0,2023-05-29,637
41855,2,0,10,31.000,0,2023-05-29,637
41856,3,0,1515,15963.060,0,2023-05-29,637
41857,0,0,25,1273.200,1,2023-05-29,637
41858,1,0,97,2881.600,1,2023-05-29,637
...,...,...,...,...,...,...,...
41914,1,15,51,415.600,0,2023-05-29,637
41915,3,15,161,2035.750,0,2023-05-29,637
41916,0,15,15,620.000,1,2023-05-29,637
41917,1,15,20,624.500,1,2023-05-29,637


In [11]:
weather_station_to_county_mapping_data['size'] = 5

fig = px.scatter_mapbox(
    weather_station_to_county_mapping_data, 
    lat="latitude", 
    lon="longitude", 
    color="county",
    size='size',
    zoom=6,
    title='Weather Stations Locations'
)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()

NameError: name 'df_weather_station_to_county_mapping' is not defined