In [126]:
import pandas as pd
import matplotlib.pyplot as plt 
import csv

## Energy consumption data

In [127]:
# Read the estonian data
df_et = pd.read_csv("estonia2018_2022.csv")
df_et

Unnamed: 0,Period,Consumption
0,1/1/2018 0:00,829.7639
1,1/1/2018 1:00,815.4053
2,1/1/2018 2:00,786.5032
3,1/1/2018 3:00,780.0486
4,1/1/2018 4:00,778.6814
...,...,...
43819,12/31/2022 19:00,932.5000
43820,12/31/2022 20:00,889.5000
43821,12/31/2022 21:00,846.5000
43822,12/31/2022 22:00,817.6000


In one of the "Period" cells date is in wrong format. We will fix it manually.

In [128]:
for row in df_et["Period"]:
    if "-" in row:
        print(row)

2022-12-31 23:00


In [129]:
df_et.loc[df_et.index == 43823, "Period"] = "12/31/2022 23:00"
df_et['Period'] = pd.to_datetime(df_et['Period'], format="%m/%d/%Y %H:%M")
df_et

Unnamed: 0,Period,Consumption
0,2018-01-01 00:00:00,829.7639
1,2018-01-01 01:00:00,815.4053
2,2018-01-01 02:00:00,786.5032
3,2018-01-01 03:00:00,780.0486
4,2018-01-01 04:00:00,778.6814
...,...,...
43819,2022-12-31 19:00:00,932.5000
43820,2022-12-31 20:00:00,889.5000
43821,2022-12-31 21:00:00,846.5000
43822,2022-12-31 22:00:00,817.6000


In [130]:
# Read the data
df_lv = pd.read_csv("latvia2018_2022.csv")
df_lv

Unnamed: 0,DateTime,Consumption
0,1/1/2018 0:00,671.0
1,1/1/2018 1:00,659.0
2,1/1/2018 2:00,631.0
3,1/1/2018 3:00,612.0
4,1/1/2018 4:00,596.0
...,...,...
43635,12/31/2022 19:00,777.0
43636,12/31/2022 20:00,737.0
43637,12/31/2022 21:00,695.0
43638,12/31/2022 22:00,667.0


In [131]:
# Rename "DateTime" column to "Period"
df_lv = df_lv.rename(columns={"DateTime": "Period"})

In [132]:
# print(df_lv.iloc[17336])
# Convert period column values to Pandas datetime
df_lv['Period'] = pd.to_datetime(df_lv['Period'])

In [133]:
# Read the data
df_lt = pd.read_csv("Lithunia2018_2022.csv")
df_lt

Unnamed: 0,Date,Consumption
0,2018-01-01 00:00:00,1139.950
1,2018-01-01 01:00:00,1101.760
2,2018-01-01 02:00:00,1051.430
3,2018-01-01 03:00:00,1012.910
4,2018-01-01 04:00:00,989.410
...,...,...
43819,2022-12-31 19:00:00,1418.083
43820,2022-12-31 20:00:00,1282.263
43821,2022-12-31 21:00:00,1198.688
43822,2022-12-31 22:00:00,1136.912


In [134]:
# Change column "Date" to "Period"
df_lt = df_lt.rename(columns={"Date": "Period"})
# Convert "Period" values to datetime
df_lt['Period'] = pd.to_datetime(df_lt['Period'])
df_lt

Unnamed: 0,Period,Consumption
0,2018-01-01 00:00:00,1139.950
1,2018-01-01 01:00:00,1101.760
2,2018-01-01 02:00:00,1051.430
3,2018-01-01 03:00:00,1012.910
4,2018-01-01 04:00:00,989.410
...,...,...
43819,2022-12-31 19:00:00,1418.083
43820,2022-12-31 20:00:00,1282.263
43821,2022-12-31 21:00:00,1198.688
43822,2022-12-31 22:00:00,1136.912


#### Compare energy consumption dataframes 
#### et vs lv

In [135]:
# Merge DataFrames on the "Period" column with indicator
merged_df = pd.merge(df_et, df_lv, on='Period', how='outer', indicator=True)

# Filter for dates present only in df_et
dates_only_in_et = merged_df[merged_df['_merge'] == 'left_only']['Period']

# Filter for dates present only in df_lv
dates_only_in_lv = merged_df[merged_df['_merge'] == 'right_only']['Period']

# Display the results
print("Dates only in df_et:")
print(len(dates_only_in_et))

print("\nDates only in df_lv:")
print(len(dates_only_in_lv))

Dates only in df_et:
207

Dates only in df_lv:
23


Some dates are only in df_et and some are only in df_lv. Let's find out these differences.

In [136]:
dates_missing = {}
for date in dates_only_in_et:
    if str(date).split()[0] not in dates_missing:
        dates_missing[str(date).split()[0]] = [str(date).split()[1]]
    else:
        dates_missing[str(date).split()[0]].append(str(date).split()[1])
dates_missing

{'2018-01-31': ['00:00:00',
  '01:00:00',
  '02:00:00',
  '03:00:00',
  '04:00:00',
  '05:00:00',
  '06:00:00',
  '07:00:00',
  '08:00:00',
  '09:00:00',
  '10:00:00',
  '11:00:00',
  '12:00:00',
  '13:00:00',
  '14:00:00',
  '15:00:00',
  '16:00:00',
  '17:00:00',
  '18:00:00',
  '19:00:00',
  '20:00:00',
  '21:00:00',
  '22:00:00'],
 '2018-03-31': ['00:00:00',
  '01:00:00',
  '02:00:00',
  '03:00:00',
  '04:00:00',
  '05:00:00',
  '06:00:00',
  '07:00:00',
  '08:00:00',
  '09:00:00',
  '10:00:00',
  '11:00:00',
  '12:00:00',
  '13:00:00',
  '14:00:00',
  '15:00:00',
  '16:00:00',
  '17:00:00',
  '18:00:00',
  '19:00:00',
  '20:00:00',
  '21:00:00',
  '22:00:00'],
 '2018-04-30': ['00:00:00',
  '01:00:00',
  '02:00:00',
  '03:00:00',
  '04:00:00',
  '05:00:00',
  '06:00:00',
  '07:00:00',
  '08:00:00',
  '09:00:00',
  '10:00:00',
  '11:00:00',
  '12:00:00',
  '13:00:00',
  '14:00:00',
  '15:00:00',
  '16:00:00',
  '17:00:00',
  '18:00:00',
  '19:00:00',
  '20:00:00',
  '21:00:00',
  '2

In [137]:
for date, period in dates_missing.items():
    print(len(period))

23
23
23
23
23
23
23
23
23


We discovered that in the Latvian dataset, there are days with missing measurements of energy consumption. Dates with missing measurements are: <br>
`{'2018-01-31': 23,
 '2018-03-31': 23,
 '2018-04-30': 23,
 '2018-05-31': 23,
 '2018-06-30': 23,
 '2018-07-31': 23,
 '2018-08-31': 23,
 '2018-09-30': 23,
 '2022-09-30': 23}`
 <br>
 
We have decided to replace the missing values with the mean value of consumption at the corresponding timestamp 5 days before and after the missing value.

In [138]:
from datetime import datetime, timedelta
new_entries = []  # List to store new entries
for date, periods in dates_missing.items():
    
    for period in periods:
        
        date = pd.to_datetime(str(date)+ " " +period)
        consumption_sum = 0
        for i in range(-5, 6):
            if i == 0:
                continue
            new_date = date + timedelta(days=i)
            consumption_sum += float(df_lv[df_lv['Period'] == new_date]['Consumption'].iloc[0])

        mean_consumption = consumption_sum/10
        new_entry = {'Period': date, 'Consumption': mean_consumption}
        
        # Add new entry
        new_entries.append(new_entry)

df_lv = pd.concat([df_lv, pd.DataFrame(new_entries)], ignore_index=True)
df_lv = df_lv.sort_values('Period').reset_index(drop=True)

Latvian dataset contains 23 empty rows. These rows have to be removed as these rows don't mean anything.

In [139]:
rows_to_remove = []
for i, row in enumerate(df_lv["Period"]):
    if pd.isna(row):
        print(i, row)
        rows_to_remove.append(i)
df_lv = df_lv.drop(index=rows_to_remove)

43824 NaT
43825 NaT
43826 NaT
43827 NaT
43828 NaT
43829 NaT
43830 NaT
43831 NaT
43832 NaT
43833 NaT
43834 NaT
43835 NaT
43836 NaT
43837 NaT
43838 NaT
43839 NaT
43840 NaT
43841 NaT
43842 NaT
43843 NaT
43844 NaT
43845 NaT
43846 NaT


#### et vs lt

In [141]:
# Merge DataFrames on the "Period" column with indicator
merged_df = pd.merge(df_et, df_lt, on='Period', how='outer', indicator=True)

# Filter for dates present only in df_et
dates_only_in_et = merged_df[merged_df['_merge'] == 'left_only']['Period']

# Filter for dates present only in df_lv
dates_only_in_lt = merged_df[merged_df['_merge'] == 'right_only']['Period']

# Display the results
print("Dates only in df_et:")
print(len(dates_only_in_et))

print("\nDates only in df_lt:")
print(len(dates_only_in_lt))

Dates only in df_et:
5

Dates only in df_lt:
5


In [142]:
dates_missing_et = {}
for date in dates_only_in_et:
    if str(date).split()[0] not in dates_missing_et:
        dates_missing_et[str(date).split()[0]] = [str(date).split()[1]]
    else:
        dates_missing_et[str(date).split()[0]].append(str(date).split()[1])
dates_missing_et

{'2018-03-25': ['03:00:00'],
 '2019-03-31': ['03:00:00'],
 '2020-03-29': ['03:00:00'],
 '2021-03-28': ['03:00:00'],
 '2022-03-27': ['03:00:00']}

In [143]:
dates_missing_lt = {}
for date in dates_only_in_lt:
    if str(date).split()[0] not in dates_missing_lt:
        dates_missing_lt[str(date).split()[0]] = [str(date).split()[1]]
    else:
        dates_missing_lt[str(date).split()[0]].append(str(date).split()[1])
dates_missing_lt

{'2018-03-25': ['02:00:00'],
 '2019-03-31': ['02:00:00'],
 '2020-03-29': ['02:00:00'],
 '2021-03-28': ['02:00:00'],
 '2022-03-27': ['02:00:00']}

* We discovered that in the Estonian dataset, there are periods with missing measurements of energy consumption. Dates with missing measurements are:
`{'2018-03-25': ['02:00:00'],
 '2019-03-31': ['02:00:00'],
 '2020-03-29': ['02:00:00'],
 '2021-03-28': ['02:00:00'],
 '2022-03-27': ['02:00:00']}`
 
* We discovered that in the Lithuanian dataset, there are periods with missing measurements of energy consumption. Dates with missing measurements are:
`{'2018-03-25': ['03:00:00'],
 '2019-03-31': ['03:00:00'],
 '2020-03-29': ['03:00:00'],
 '2021-03-28': ['03:00:00'],
 '2022-03-27': ['03:00:00']}`

We have decided to replace the missing values with the mean value of consumption at the corresponding timestamp 5 days before and after the missing value.

#### Add missing values to df_et

In [144]:
new_entries = []  # List to store new entries
for date, periods in dates_missing_lt.items():
    
    for period in periods:
        
        date = pd.to_datetime(str(date)+ " " +period)
        consumption_sum = 0
        for i in range(-5, 6):
            if i == 0:
                continue
            new_date = date + timedelta(days=i)
            consumption_sum += float(df_et[df_et['Period'] == new_date]['Consumption'].iloc[0])

        mean_consumption = consumption_sum/10
        new_entry = {'Period': date, 'Consumption': mean_consumption}
        
        # Add new entry
        new_entries.append(new_entry)

df_et = pd.concat([df_et, pd.DataFrame(new_entries)], ignore_index=True)
df_et = df_et.sort_values('Period').reset_index(drop=True)

#### Add missing values to df_lt

In [145]:
new_entries = []  # List to store new entries
for date, periods in dates_missing_et.items():
    
    for period in periods:
        
        date = pd.to_datetime(str(date)+ " " +period)
        consumption_sum = 0
        for i in range(-5, 6):
            if i == 0:
                continue
            new_date = date + timedelta(days=i)
            consumption_sum += float(df_lt[df_lt['Period'] == new_date]['Consumption'].iloc[0])

        mean_consumption = consumption_sum/10
        new_entry = {'Period': date, 'Consumption': mean_consumption}
        
        # Add new entry
        new_entries.append(new_entry)

df_lt = pd.concat([df_lt, pd.DataFrame(new_entries)], ignore_index=True)
df_lt = df_lt.sort_values('Period').reset_index(drop=True)

#### lt vs lv

In [146]:
# Merge DataFrames on the "Period" column with indicator
merged_df = pd.merge(df_lv, df_lt, on='Period', how='outer', indicator=True)

# Filter for dates present only in df_et
dates_only_in_lv = merged_df[merged_df['_merge'] == 'left_only']['Period']

# Filter for dates present only in df_lv
dates_only_in_lt = merged_df[merged_df['_merge'] == 'right_only']['Period']

# Display the results
print("Dates only in df_lv:")
print(len(dates_only_in_lv))

print("\nDates only in df_lt:")
print(len(dates_only_in_lt))

Dates only in df_lv:
0

Dates only in df_lt:
5


In [147]:
dates_missing_lt = {}
for date in dates_only_in_lt:
    if str(date).split()[0] not in dates_missing_lt:
        dates_missing_lt[str(date).split()[0]] = [str(date).split()[1]]
    else:
        dates_missing_lt[str(date).split()[0]].append(str(date).split()[1])
dates_missing_lt

{'2018-03-25': ['02:00:00'],
 '2019-03-31': ['02:00:00'],
 '2020-03-29': ['02:00:00'],
 '2021-03-28': ['02:00:00'],
 '2022-03-27': ['02:00:00']}

We discovered that in the Latvian dataset, there are periods with missing measurements of energy consumption compared to Lithuanian dataset. Dates with missing measurements are:
`{'2018-03-25': ['02:00:00'],
 '2019-03-31': ['02:00:00'],
 '2020-03-29': ['02:00:00'],
 '2021-03-28': ['02:00:00'],
 '2022-03-27': ['02:00:00']}`
 
We have decided to replace the missing values with the mean value of consumption at the corresponding timestamp 5 days before and after the missing value.

In [148]:
new_entries = []  # List to store new entries
for date, periods in dates_missing_lt.items():
    
    for period in periods:
        
        date = pd.to_datetime(str(date)+ " " +period)
        consumption_sum = 0
        for i in range(-5, 6):
            if i == 0:
                continue
            new_date = date + timedelta(days=i)
            consumption_sum += float(df_lv[df_lv['Period'] == new_date]['Consumption'].iloc[0])

        mean_consumption = consumption_sum/10
        new_entry = {'Period': date, 'Consumption': mean_consumption}
        
        # Add new entry
        new_entries.append(new_entry)

df_lv = pd.concat([df_lv, pd.DataFrame(new_entries)], ignore_index=True)
df_lv = df_lv.sort_values('Period').reset_index(drop=True)

Check if all the energy cunsumption datasets contain the same number of rows. 

In [149]:
print(len(df_lv) == len(df_et) == len(df_lt))

True


Check if all the time periods match in three datasets.

In [150]:
if set(df_lv['Period']) == set(df_et['Period']) == set(df_lt['Period']):
    print("All DataFrames have the same datetimes in the 'Period' column.")
else:
    print("DataFrames have different datetimes in the 'Period' column.")

All DataFrames have the same datetimes in the 'Period' column.


Make sure there is no differences in time periods.

In [151]:
lv_periods = set(df_lv['Period'])
et_periods = set(df_et['Period'])
lt_periods = set(df_lt['Period'])

# Find rows in df_lv that are not in df_et or df_lt
lv_diff = df_lv[~df_lv['Period'].isin(et_periods) | ~df_lv['Period'].isin(lt_periods)]
print("lv_diff", lv_diff)
# Find rows in df_et that are not in df_lv or df_lt
et_diff = df_et[~df_et['Period'].isin(lv_periods) | ~df_et['Period'].isin(lt_periods)]
print("et_diff", et_diff)
# Find rows in df_lt that are not in df_lv or df_et
lt_diff = df_lt[~df_lt['Period'].isin(lv_periods) | ~df_lt['Period'].isin(et_periods)]
print("lt_diff", lt_diff)
# Concatenate the results to get a DataFrame with all differing rows
all_diff = pd.concat([lv_diff, et_diff, lt_diff])

# Sort the DataFrame by 'Period'
all_diff = all_diff.sort_values('Period').reset_index(drop=True)

# Display the DataFrame with differing rows
all_diff

lv_diff Empty DataFrame
Columns: [Period, Consumption]
Index: []
et_diff Empty DataFrame
Columns: [Period, Consumption]
Index: []
lt_diff Empty DataFrame
Columns: [Period, Consumption]
Index: []


Unnamed: 0,Period,Consumption


No differences found - write cleaned data into new files

In [152]:
# Write the DataFrames to CSV files
df_lv.to_csv('df_energy_consumption_lv.csv', index=False)
df_et.to_csv('df_energy_consumption_et.csv', index=False)
df_lt.to_csv('df_energy_consumption_lt.csv', index=False)

## Covid-19 cumulative deaths data

In [153]:
deaths_df = pd.read_csv("Cumulative Count of COVID-19 Deaths Baltics.csv")
deaths_df

Unnamed: 0,date,Estonia CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased,Lithuania CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased,Latvia CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased
0,1/3/2020,0,0,0
1,1/4/2020,0,0,0
2,1/5/2020,0,0,0
3,1/6/2020,0,0,0
4,1/7/2020,0,0,0
...,...,...,...,...
1423,11/26/2023,2906,9761,7412
1424,11/27/2023,2906,9761,7412
1425,11/28/2023,2906,9761,7412
1426,11/29/2023,2906,9761,7412


In [154]:
# Rename columns
deaths_df.rename(columns={'Estonia CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased': 'et', 'Lithuania CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased':'lt', 'Latvia CumulativeCount_MedicalConditionIncident_COVID_19_PatientDeceased':'lv'}, inplace=True)

In [155]:
# Remove 2023 data

# Convert 'date' to pd datetime
deaths_df['date'] = pd.to_datetime(deaths_df['date'])

cutoff_date = pd.to_datetime('2023-01-01')
filtered_deaths_df = deaths_df[deaths_df['date'] < cutoff_date]

In [156]:
# Write the DataFrame to CSV file
filtered_deaths_df.to_csv('df_covid_deaths.csv', index=False)

## GDP data

In [157]:
gdp_et_df = pd.read_csv("Real Gross Domestic Product for Estonia.csv")
gdp_lv_df = pd.read_csv("Real Gross Domestic Product for Latvia.csv")
gdp_lt_df = pd.read_csv("Real Gross Domestic Product for Lithuania.csv")

In [158]:
# Rename columns
gdp_et_df.rename(columns={'DATE':'date', 'CLVMNACNSAB1GQEE':'million'}, inplace=True)
gdp_lv_df.rename(columns={'DATE':'date', 'CLVMNACSCAB1GQLV':'million'}, inplace=True)
gdp_lt_df.rename(columns={'DATE':'date', 'CLVMNACSCAB1GQLT':'million'}, inplace=True)
# Unit - Millions of Chained 2010 Euros

In [159]:
# Convert dates to Pandas DateTime
gdp_et_df['date'] = pd.to_datetime(gdp_et_df['date'])
gdp_lv_df['date'] = pd.to_datetime(gdp_lv_df['date'])
gdp_lt_df['date'] = pd.to_datetime(gdp_lt_df['date'])

In [160]:
# Write the DataFrames to CSV files
gdp_et_df.to_csv('df_gdp_et.csv', index=False)
gdp_lv_df.to_csv('df_gdp_lv.csv', index=False)
gdp_lt_df.to_csv('df_gdp_lt.csv', index=False)

## Temperature data

In [175]:
temp_et_df = pd.read_csv("Tallinn-Harku weather.csv", encoding='latin-1')
temp_lv_df = pd.read_csv("Latvia weather.csv", sep=";")
temp_lt_df_1 = pd.read_csv("Vilnius,Lithuania 2018-01-01 to 2020-08-31.csv")
temp_lt_df_2 = pd.read_csv("Vilnius,Lithuania 2020-09-01 to 2022-12-31.csv")

### Estonian temperature data

In [176]:
column_to_keep = ['Aasta', 'Kuu', 'Päev', 'Kell (UTC)', 'Õhutemperatuur °C']

# Drop columns other than the specified column
temp_et_df = temp_et_df[column_to_keep]

In [177]:
# Combine columns into a datetime column
temp_et_df['date'] = pd.to_datetime(temp_et_df[['Aasta', 'Kuu', 'Päev', 'Kell (UTC)']].astype(str).agg('-'.join, axis=1), format='%Y-%m-%d-%H:%M')

In [178]:
column_to_keep = ['date', 'Õhutemperatuur °C']
# Drop columns other than the specified column
temp_et_df = temp_et_df[column_to_keep]

In [179]:
# Remove data not from 2018 - 2022 range
cutoff_date_1 = pd.to_datetime('2018-01-01')
cutoff_date_2 = pd.to_datetime('2023-01-01')
temp_et_df = temp_et_df[(temp_et_df['date'] > cutoff_date_1) & (temp_et_df['date'] < cutoff_date_2)]

In [180]:
# Rename "Õhutemperatuur °C" column to "temp"
temp_et_df.rename(columns={'Õhutemperatuur °C':'temp'}, inplace=True)

In [181]:
temp_et_df

Unnamed: 0,date,temp
122737,2018-01-01 01:00:00,-0.6
122738,2018-01-01 02:00:00,-0.5
122739,2018-01-01 03:00:00,-0.5
122740,2018-01-01 04:00:00,-0.4
122741,2018-01-01 05:00:00,-0.1
...,...,...
166555,2022-12-31 19:00:00,5.8
166556,2022-12-31 20:00:00,5.9
166557,2022-12-31 21:00:00,5.8
166558,2022-12-31 22:00:00,5.8


There are 24 temperatures per day, which is not needed for this project. We will calculate the mean temperature of each day and create a dataset that has one temperature per day.

In [192]:
# Extract the date without the time
temp_et_df['day'] = temp_et_df['date'].dt.date
# Group by date and calculate the mean temperature for each day
daily_mean_temps = temp_et_df.groupby('day')['temp'].mean().reset_index()
# Round the mean temperatures to two decimal places
daily_mean_temps['temp'] = daily_mean_temps['temp'].round(1)

In [193]:
daily_mean_temps

Unnamed: 0,day,temp
0,2018-01-01,1.4
1,2018-01-02,4.0
2,2018-01-03,2.0
3,2018-01-04,2.2
4,2018-01-05,3.5
...,...,...
1821,2022-12-27,-1.5
1822,2022-12-28,-0.4
1823,2022-12-29,0.8
1824,2022-12-30,3.6


### Lithuanian temperature data

In [194]:
# Concatenate lithuanian temperature dataframes
temp_lt_df = pd.concat([temp_lt_df_1, temp_lt_df_2], ignore_index=True)
temp_lt_df

Unnamed: 0,datetime,temp
0,2018-01-01,3.3
1,2018-01-02,2.8
2,2018-01-03,2.1
3,2018-01-04,2.7
4,2018-01-05,3.3
...,...,...
1821,2022-12-27,1.3
1822,2022-12-28,0.9
1823,2022-12-29,1.1
1824,2022-12-30,2.6


In [195]:
# Rename column "datetime" to "date" and convert to pandas datetime
temp_lt_df.rename(columns={'datetime':'date'}, inplace=True)
temp_lt_df["date"] = pd.to_datetime(temp_lt_df['date'])

### Latvian temperature data

In [None]:
# Write the DataFrames to CSV files
temp_et_df.to_csv('df_temp_et.csv', index=False)
temp_lv_df.to_csv('df_temp_lv.csv', index=False)
temp_lt_df.to_csv('df_temp_lt.csv', index=False)