In [1]:
import pandas as pd
import os

def process_files(source_directory, destination_directory):
    # Create the destination directory if it doesn't exist
    if not os.path.exists(destination_directory):
        os.makedirs(destination_directory)

    for file in os.listdir(source_directory):
        if file.startswith('gen_') and file.endswith('.csv'):
            file_path = os.path.join(source_directory, file)
            df = pd.read_csv(file_path)

            # Removing 'Z' and parsing the StartTime and EndTime columns
            df['StartTime'] = df['StartTime'].str[:-1]  # Remove 'Z'
            df['EndTime'] = df['EndTime'].str[:-1]  # Remove 'Z'

            df['StartTime'] = pd.to_datetime(df['StartTime'], format='%Y-%m-%dT%H:%M%z')
            df['EndTime'] = pd.to_datetime(df['EndTime'], format='%Y-%m-%dT%H:%M%z')

            # Calculating the time interval
            df['time_interval'] = df['EndTime'] - df['StartTime']

            # Saving the modified dataframe in the destination directory
            modified_file_name = file.replace('.csv', '_modified.csv')
            df.to_csv(os.path.join(destination_directory, modified_file_name), index=False)


In [2]:
# Define your source and destination directories
source_directory = '/workspaces/SE-Europe-Data_Challenge/data/raw/2021_to_2022'
destination_directory = '/workspaces/SE-Europe-Data_Challenge/data/processed/2021_to_2022'
process_files(source_directory, destination_directory)

In [3]:
# Define your source and destination directories
source_directory = '/workspaces/SE-Europe-Data_Challenge/data/raw/2022_to_2023'
destination_directory = '/workspaces/SE-Europe-Data_Challenge/data/processed/2022_to_2023'
process_files(source_directory, destination_directory)

In [5]:
# Spain, Sweden, Poland, Italy, Denmark have 1 hour intervals already
# UK has 30 mins intervals, so we group 2 rows
# Netherlands, Germany, Hungary have 15 min intervals, so we group by 4 rows

uk_gen = pd.read_csv("/workspaces/SE-Europe-Data_Challenge/data/processed/2022_to_2023/gen_UK_B19_modified.csv", parse_dates=['StartTime', 'EndTime'])

In [6]:
uk_gen.dtypes

StartTime        datetime64[ns, UTC]
EndTime          datetime64[ns, UTC]
AreaID                        object
UnitName                      object
PsrType                       object
quantity                       int64
time_interval                 object
dtype: object

In [45]:
df = uk_gen.copy(deep=True)

In [46]:
df['Duration'] = df['EndTime'] - df['StartTime']

In [48]:
df.set_index('EndTime', inplace=True)

In [49]:
resampled_quantity = df['quantity'].resample('H').sum()


In [51]:
df.head()

Unnamed: 0_level_0,StartTime,AreaID,UnitName,PsrType,quantity,time_interval,Duration
EndTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-27 00:00:00+00:00,2022-01-26 23:30:00+00:00,10Y1001A1001A92E,MAW,B19,433,0 days 00:30:00,0 days 00:30:00
2022-01-27 00:30:00+00:00,2022-01-27 00:00:00+00:00,10Y1001A1001A92E,MAW,B19,472,0 days 00:30:00,0 days 00:30:00
2022-01-27 01:00:00+00:00,2022-01-27 00:30:00+00:00,10Y1001A1001A92E,MAW,B19,549,0 days 00:30:00,0 days 00:30:00
2022-01-27 01:30:00+00:00,2022-01-27 01:00:00+00:00,10Y1001A1001A92E,MAW,B19,520,0 days 00:30:00,0 days 00:30:00
2022-01-27 02:00:00+00:00,2022-01-27 01:30:00+00:00,10Y1001A1001A92E,MAW,B19,407,0 days 00:30:00,0 days 00:30:00


In [52]:
resampled_quantity.sort_values()

EndTime
2022-07-15 11:00:00+00:00       0
2022-07-01 17:00:00+00:00       0
2022-07-01 16:00:00+00:00       0
2022-07-01 15:00:00+00:00       0
2022-07-01 14:00:00+00:00       0
                             ... 
2022-02-05 10:00:00+00:00    4053
2022-02-05 11:00:00+00:00    4070
2022-02-05 12:00:00+00:00    4073
2022-02-06 12:00:00+00:00    4094
2022-02-05 13:00:00+00:00    4180
Name: quantity, Length: 8136, dtype: int64

In [53]:
resampled_start_time = df['StartTime'].resample('H').first()  # or use .last()


In [54]:
resampled_start_time

EndTime
2022-01-27 00:00:00+00:00   2022-01-26 23:30:00+00:00
2022-01-27 01:00:00+00:00   2022-01-27 00:30:00+00:00
2022-01-27 02:00:00+00:00   2022-01-27 01:30:00+00:00
2022-01-27 03:00:00+00:00   2022-01-27 02:30:00+00:00
2022-01-27 04:00:00+00:00   2022-01-27 03:30:00+00:00
                                       ...           
2022-12-31 19:00:00+00:00   2022-12-31 18:30:00+00:00
2022-12-31 20:00:00+00:00   2022-12-31 19:30:00+00:00
2022-12-31 21:00:00+00:00   2022-12-31 20:30:00+00:00
2022-12-31 22:00:00+00:00   2022-12-31 21:30:00+00:00
2022-12-31 23:00:00+00:00   2022-12-31 22:30:00+00:00
Freq: H, Name: StartTime, Length: 8136, dtype: datetime64[ns, UTC]

In [55]:
resampled_df = pd.DataFrame({
    'quantity_sum': resampled_quantity,
    'StartTime_first': resampled_start_time
})


In [56]:
resampled_df

Unnamed: 0_level_0,quantity_sum,StartTime_first
EndTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-27 00:00:00+00:00,1809,2022-01-26 23:30:00+00:00
2022-01-27 01:00:00+00:00,2137,2022-01-27 00:30:00+00:00
2022-01-27 02:00:00+00:00,1636,2022-01-27 01:30:00+00:00
2022-01-27 03:00:00+00:00,851,2022-01-27 02:30:00+00:00
2022-01-27 04:00:00+00:00,1068,2022-01-27 03:30:00+00:00
...,...,...
2022-12-31 19:00:00+00:00,838,2022-12-31 18:30:00+00:00
2022-12-31 20:00:00+00:00,822,2022-12-31 19:30:00+00:00
2022-12-31 21:00:00+00:00,778,2022-12-31 20:30:00+00:00
2022-12-31 22:00:00+00:00,575,2022-12-31 21:30:00+00:00


In [40]:
def basic_exploration(file_path):
    df = pd.read_csv(file_path)
    print(f"""\nThe column names in the data are: \n{df.columns}\n{'x' * 60}
          The datatypes of the columns are: \n{df.dtypes}
          {'x' * 60}
          The minimum value of StartTime: {df['StartTime'].min()}
          The maximum value of StartTime: {df['StartTime'].max()}
          The minimum value of EndTime: {df['EndTime'].min()}
          The maximum value of EndTime: {df['EndTime'].max()}
          The minimum value of energy generated:  {df['quantity'].min()}
          The maximum value of energy generated: {df['quantity'].max()}
          """)

In [39]:
basic_exploration("/workspaces/SE-Europe-Data_Challenge/data/processed/2022_to_2023/gen_UK_B19_modified.csv")


The column names in the data are: 
Index(['StartTime', 'EndTime', 'AreaID', 'UnitName', 'PsrType', 'quantity',
       'time_interval'],
      dtype='object')
    xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
          The datatypes of the columns are: 
StartTime        object
EndTime          object
AreaID           object
UnitName         object
PsrType          object
quantity          int64
time_interval    object
dtype: object
          xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
          The minimum value of StartTime: 2022-01-26 23:30:00+00:00
          The maximum value of StartTime: 2022-12-31 23:00:00+00:00
          The minimum value of EndTime: 2022-01-27 00:00:00+00:00
          The maximum value of EndTime: 2022-12-31 23:30:00+00:00
          The minimum value of energy generated:  0
          The maximum value of energy generated: 1052
          
          
          
