In [55]:
import pandas as pd
import matplotlib.pyplot as plt 
import csv
# Read the raw Estonian CSV into a list of rows; csv.reader yields every cell as a string.
with open("estonia2018_2022.csv", newline='', encoding="utf-8") as f:
    lines = list(csv.reader(f, delimiter=','))

# The first row carries the column names; everything after it is data.
header, data = lines[0], lines[1:]

# Build the frame using the CSV's own column names. The index stays the default
# RangeIndex -- "Period" remains an ordinary column here.
df_et = pd.DataFrame(data=data, columns=header)
df_et

Unnamed: 0,Period,Consumption
0,1/1/2018 0:00,829.7639
1,1/1/2018 1:00,815.4053
2,1/1/2018 2:00,786.5032
3,1/1/2018 3:00,780.0486
4,1/1/2018 4:00,778.6814
...,...,...
43819,12/31/2022 19:00,932.5
43820,12/31/2022 20:00,889.5
43821,12/31/2022 21:00,846.5
43822,12/31/2022 22:00,817.6


In [56]:
# List every raw "Period" string that contains a dash, i.e. is not written in
# the slash-separated form used by the other rows.
for period in (value for value in df_et["Period"] if "-" in value):
    print(period)

2022-12-31 23:00


In [57]:
# Row 43823 is the one Estonian timestamp written as "2022-12-31 23:00";
# rewrite it in the month/day/year form used by every other row so the whole
# column parses consistently.
# A single positional .iloc[row, col] assignment is used on purpose: the chained
# form df.iloc[row]["Period"] = ... assigns into a temporary row object and is
# not guaranteed to modify df_et (it never does under pandas Copy-on-Write).
df_et.iloc[43823, df_et.columns.get_loc("Period")] = "12/31/2022 23:00"

# Convert the timestamp strings to datetime64 values.
df_et['Period'] = pd.to_datetime(df_et['Period'])
df_et

Unnamed: 0,Period,Consumption
0,2018-01-01 00:00:00,829.7639
1,2018-01-01 01:00:00,815.4053
2,2018-01-01 02:00:00,786.5032
3,2018-01-01 03:00:00,780.0486
4,2018-01-01 04:00:00,778.6814
...,...,...
43819,2022-12-31 19:00:00,932.5
43820,2022-12-31 20:00:00,889.5
43821,2022-12-31 21:00:00,846.5
43822,2022-12-31 22:00:00,817.6


In [100]:
# Read the raw Latvian CSV into a list of rows; csv.reader yields every cell as a string.
with open("latvia2018_2022.csv", newline='', encoding="utf-8") as f:
    lines = list(csv.reader(f, delimiter=','))

# The first row carries the column names; everything after it is data.
header, data = lines[0], lines[1:]

# Build the frame and rename the timestamp column to "Period" so it matches the
# Estonian frame. The index stays the default RangeIndex.
df_lv = pd.DataFrame(data=data, columns=header).rename(columns={"DateTime": "Period"})

# Inspect row 17336 before parsing the timestamps.
print(df_lv.iloc[17336])

# Convert the timestamp strings to datetime64 values.
df_lv["Period"] = pd.to_datetime(df_lv["Period"])
df_lv

Period         
Consumption    
Name: 17336, dtype: object


Unnamed: 0,Period,Consumption
0,2018-01-01 00:00:00,671.0
1,2018-01-01 01:00:00,659.0
2,2018-01-01 02:00:00,631.0
3,2018-01-01 03:00:00,612.0
4,2018-01-01 04:00:00,596.0
...,...,...
43635,2022-12-31 19:00:00,777.0
43636,2022-12-31 20:00:00,737.0
43637,2022-12-31 21:00:00,695.0
43638,2022-12-31 22:00:00,667.0


In [84]:
# Read the raw Lithuanian CSV into a list of rows; csv.reader yields every cell as a string.
with open("Lithunia2018_2022.csv", newline='', encoding="utf-8") as f:
    lines = list(csv.reader(f, delimiter=','))

# The first row carries the column names; everything after it is data.
header, data = lines[0], lines[1:]

# Build the frame and rename the timestamp column to "Period" so it matches the
# other country frames. The index stays the default RangeIndex.
df_lt = pd.DataFrame(data=data, columns=header).rename(columns={"Date": "Period"})

# Convert the timestamp strings to datetime64 values.
df_lt["Period"] = pd.to_datetime(df_lt["Period"])
df_lt

Unnamed: 0,Period,Consumption
0,2018-01-01 00:00:00,1139.95
1,2018-01-01 01:00:00,1101.76
2,2018-01-01 02:00:00,1051.43
3,2018-01-01 03:00:00,1012.91
4,2018-01-01 04:00:00,989.41
...,...,...
43819,2022-12-31 19:00:00,1418.083
43820,2022-12-31 20:00:00,1282.263
43821,2022-12-31 21:00:00,1198.688
43822,2022-12-31 22:00:00,1136.912


In [85]:
# Report the position and value of every Lithuanian "Period" entry whose text
# form contains no dash.
for position, period in enumerate(df_lt["Period"]):
    if "-" in str(period):
        continue
    print(position, period)

In [96]:
# Outer-join the Estonian and Latvian frames on "Period"; the indicator column
# "_merge" records which side(s) each timestamp came from.
merged_df = df_et.merge(df_lv, how='outer', on='Period', indicator=True)

# Timestamps that appear only in the Estonian frame ...
dates_only_in_et = merged_df.loc[merged_df['_merge'] == 'left_only', 'Period']

# ... and timestamps that appear only in the Latvian frame.
dates_only_in_lv = merged_df.loc[merged_df['_merge'] == 'right_only', 'Period']

# Report how many timestamps are exclusive to each side.
print("Dates only in df_et:", len(dates_only_in_et), sep="\n")
print("\nDates only in df_lv:", len(dates_only_in_lv), sep="\n")

Dates only in df_et:
207

Dates only in df_lv:
23


In [93]:
# Count, per calendar day, how many timestamps exist only in the Estonian frame.
# The day key is the part of the timestamp's text form before the first whitespace.
dates_missing = {}
for timestamp in dates_only_in_et:
    day = str(timestamp).split()[0]
    dates_missing[day] = dates_missing.get(day, 0) + 1
dates_missing

{'2018-01-31': 23,
 '2018-03-31': 23,
 '2018-04-30': 23,
 '2018-05-31': 23,
 '2018-06-30': 23,
 '2018-07-31': 23,
 '2018-08-31': 23,
 '2018-09-30': 23,
 '2022-09-30': 23}

Comparing against the Estonian dataset, we found that the Latvian dataset is missing hourly energy-consumption measurements on several days. The dates below each lack 23 hourly readings: <br>
`{'2018-01-31': 23,
 '2018-03-31': 23,
 '2018-04-30': 23,
 '2018-05-31': 23,
 '2018-06-30': 23,
 '2018-07-31': 23,
 '2018-08-31': 23,
 '2018-09-30': 23,
 '2022-09-30': 23}`
 <br>
 
**TODO:** Decide how to handle these gaps: replace the missing hours with a mean value, delete the affected dates from all the datasets, or train a deep-learning neural network to predict the missing values.

The Latvian dataset also contains 23 empty rows (no timestamp and no consumption value). These rows carry no information, so they have to be removed.

In [101]:
# Remove the Latvian rows that have no timestamp. "Period" has already been
# parsed with pd.to_datetime, so those rows hold NaT.
missing_period = df_lv["Period"].isna()

# Show which rows are about to be removed (index label and value).
for label, period in df_lv.loc[missing_period, "Period"].items():
    print(label, period)

# Drop by index label in a single call. The previous loop passed the positional
# counter from enumerate() to drop(index=...), which only matches the labels on
# an untouched RangeIndex, and it rebuilt the frame once per removed row.
df_lv = df_lv.drop(index=df_lv.index[missing_period])

17336 NaT
17337 NaT
17338 NaT
17339 NaT
17340 NaT
17341 NaT
17342 NaT
17343 NaT
17344 NaT
17345 NaT
17346 NaT
17347 NaT
17348 NaT
17349 NaT
17350 NaT
17351 NaT
17352 NaT
17353 NaT
17354 NaT
17355 NaT
17356 NaT
17357 NaT
17358 NaT
