# Data Merging

In [1]:
import os
import numpy as np
import pandas as pd

RuntimeError: The current Numpy installation ('c:\\users\\guli\\.venvs\\lewagon_project\\lib\\site-packages\\numpy\\__init__.py') fails to pass a sanity check due to a bug in the windows runtime. See this issue for more information: https://tinyurl.com/y3dm3h86

## Fetch data from file path

Get the csv file names from a given directory

In [2]:
def get_file_names(directory):
    """returns the csv files in the given string directory path"""
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_names.append(os.path.join(directory, filename))
    return file_names

Create a dictionary from the files where the keys are the yearly intervals and the values are the respective dataframes.

In [3]:
def get_dataframes(file_names):
    """Takes in a list of csv file paths.
    Returns a dictionary whose keys are the years
    and values are the corresponding dataframes."""
    df_years = {}
    for file in file_names:
        df_years[file[-29:]] = pd.read_csv(file) # [-29:] Indexes the years.csv eg '201501010000-201601010000.csv'
    return df_years
    

Can be used as follows:

In [4]:
directory = r'../raw_data/load/'
file_names = get_file_names(directory)
df_years = get_dataframes(file_names)

print(df_years.keys())

dict_keys(['201501010000-201601010000.csv', '201601010000-201701010000.csv', '201701010000-201801010000.csv', '201801010000-201901010000.csv', '201901010000-202001010000.csv', '202001010000-202101010000.csv'])


In [5]:
df_years['201501010000-201601010000.csv'].head(3)

Unnamed: 0,Time (CET),Day-ahead Total Load Forecast [MW] - BZN|DK1,Actual Total Load [MW] - BZN|DK1
0,01.01.2015 00:00 - 01.01.2015 01:00,1870.0,1877.0
1,01.01.2015 01:00 - 01.01.2015 02:00,1841.0,1843.0
2,01.01.2015 02:00 - 01.01.2015 03:00,1785.0,1795.0


Function to get the path per feature e.g price, load

In [6]:
def get_feature_path(feature_name, main_directory):
    for root, dirs_, files in os.walk(main_directory):
        if feature_name in (root):
            return root

Function to bring it together and return a tuple containing a nested dictionary of the features and the corresponding dictionary of per year dataframes. 

In [7]:
def get_features_df(main_path):
    feats = ['price', 'load']
    feat_dict = {}
    for feat in feats:
        path = get_feature_path(feat, main_path)
        names = get_file_names(path)
        feat_dict[feat] = get_dataframes(names)
        
    return feat_dict[feats[0]], feat_dict[feats[1]]

Used as follows...

In [34]:
path = r'../raw_data/'
price, load = get_features_df(main_path=path) # unpack tuple 

## Create DataFrames from data

### Concatenate dataframes

In [131]:
def concat_dataframes(feat):
    keys = list(feat.keys())
    df = pd.concat([
        feat[keys[0]],
        feat[keys[1]],
        feat[keys[2]],
        feat[keys[3]],
        feat[keys[4]],
        feat[keys[5]]
    ]).reset_index(drop=True)
    return df

In [132]:
prices_data = concat_dataframes(price)
load_data = concat_dataframes(load)

### Convert to datetime

On peeking through the dataframes nested in the dictionaries `price` and `load`, the columns showing the time display them as time ranges e.g `31.12.2020 19:00 - 31.12.2020 20:00`. A function is created to make a new column called _time_ that will strip the string down to the initial timestamp e.g `31.12.2020 19:00` and then convert the series to datetime objects

In [150]:
def get_datetime(df):
    column = df.iloc[:,0] # column with time values
    
    # create new column 'time' by formatting the original time column to get single timepoint instead of a range
    df['time'] = column.apply(lambda _: _[:16])
    # convert new time column from str to timestamp
    df['time'] = pd.to_datetime(df['time'])
        
    return df

In [151]:
load_data = get_datetime(load_data)
prices_data = get_datetime(prices_data)

### Merge datasets

In [152]:
print(len(prices_data))

52614


In [154]:
df = prices_data.merge(load_data, on='time', how='outer').reset_index(drop=True)  # merge both

In [None]:
# total_df['hour'] = total_df['time'].apply(lambda x: x.hour)
# total_df = total_df[total_df['hour']==0].reset_index(drop=True)  # index by 

In [155]:
to_drop = ['MTU (CET)', 'Time (CET)', 'Day-ahead Total Load Forecast [MW] - BZN|DK1']
df.drop(columns=to_drop, inplace=True)

In [173]:
df[(df['time'] < '2020-11-12 19:00:00')]

Unnamed: 0,Day-ahead Price [EUR/MWh],time,Actual Total Load [MW] - BZN|DK1
0,25.02,2015-01-01 00:00:00,1877
1,18.29,2015-01-01 01:00:00,1843
2,16.04,2015-01-01 02:00:00,1795
3,14.6,2015-01-01 03:00:00,1745
4,14.95,2015-01-01 04:00:00,1743
...,...,...,...
52136,-,2020-11-12 14:00:00,-
52137,-,2020-11-12 15:00:00,-
52138,-,2020-11-12 16:00:00,-
52139,-,2020-11-12 17:00:00,-


In [None]:
class load_data():
    # return merged and cleaned dataframe
    
    def __init__(self, main_path):
        pass
        
    
    # get path per feature
    #get_feature_path(feat, main_path)
    
    # get filenames per feature per path

    # get dataframes per feature

    # concatenate dataframes per feature dropping duplicate times

    # merge features on time index or column

    # return merged dataframe