In [1]:
# import libraries
import pandas as pd
import numpy as np
import random
from datetime import datetime
import time

# Read and explore datasets

In [2]:
# read inputs 
meter_list = pd.read_excel('gorilla_test_data.xlsx', sheet_name='meter_list')
meter_list

Unnamed: 0,meter_id,aq_kwh,exit_zone
0,14676236,28978,EA1
1,34509937,78324,SO1
2,50264822,265667,NT1
3,88357331,484399,SE2


In [3]:
forecast_df = pd.read_excel('gorilla_test_data.xlsx', sheet_name='forecast_table')
forecast_df

Unnamed: 0,meter_id,date,kwh
0,14676236,2020-06-01,22.070768
1,14676236,2020-06-02,19.170720
2,14676236,2020-06-03,23.555111
3,14676236,2020-06-04,18.220712
4,14676236,2020-06-05,14.196134
...,...,...,...
3407,88357331,2022-09-27,441.014725
3408,88357331,2022-09-28,441.512055
3409,88357331,2022-09-29,437.240657
3410,88357331,2022-09-30,517.608354


In [4]:
rates_df = pd.read_excel('gorilla_test_data.xlsx', sheet_name='rate_table')
rates_df

Unnamed: 0,date,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
0,2020-04-01,EA1,0,73200.0,0.2652
1,2020-04-01,EA1,73200,732000.0,0.1980
2,2020-04-01,EA1,732000,,0.2875
3,2020-04-01,EA2,0,73200.0,0.2970
4,2020-04-01,EA2,73200,732000.0,0.1524
...,...,...,...,...,...
1135,2024-10-01,WM2,73200,732000.0,0.4537
1136,2024-10-01,WM2,732000,,0.7534
1137,2024-10-01,WM3,0,73200.0,0.7263
1138,2024-10-01,WM3,73200,732000.0,0.6109


# Helper functions

Function to calculate transportation charges using meters, their forecasts and rates for various exit zones:

In [5]:

def calculate_transportation_charges(meters: pd.DataFrame, forecast: pd.DataFrame, rates: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates transportation charges for each meter in meters DataFrame based on their forecasted consumption 
    and consumption rates from the rates DataFrame.
    
    Args:
    meters (pd.DataFrame): DataFrame containing information about the meters, including meter_id.
    forecast (pd.DataFrame): DataFrame containing forecasted consumption for each meter, with meter_id as one of the columns.
    rates (pd.DataFrame): DataFrame containing the consumption rates for each exit zone.
    
    Returns:
    pd.DataFrame: DataFrame with meter_id, total estimated consumption (kWh), and total cost (£) for each meter.
    """
    
    # merge meter info with forecast based on meter_id
    meter_forecast = pd.merge(meters, forecast, on='meter_id')
    
    # replace empty values with infinity (open-ended)
    rates = rates.replace(np.nan, np.inf)
    
    # rename columns to make more sense
    rates = rates.rename(columns={"date": "startdate_rate"})
    
    # merge meter forecast and rates based on the exit zone
    consumption_rates = pd.merge(meter_forecast, rates, on=['exit_zone'], how='inner')

    # only keep valid rates that apply i.e. assuring the AQ is
    # between the minimum AQ (aq_min_kwh, included) and the maximum AQ (aq_max_kwh, excluded)
    consumption_rates = consumption_rates[(consumption_rates['aq_kwh'] >= consumption_rates['aq_min_kwh']) & 
                                          (consumption_rates['aq_kwh'] < consumption_rates['aq_max_kwh'])]
    
    # sort dataframe to be able to calculate end dates
    consumption_rates = consumption_rates.sort_values(by=['exit_zone', 'startdate_rate', 'date'])    
    # calculate end of rate period for each consumption
    consumption_rates['enddate_rate'] = consumption_rates.groupby(['exit_zone', 'meter_id', 'date'])['startdate_rate'].shift(-1)
    
    # only keep those rows where forecast date is between start and end dates for rates
    consumption_rates = consumption_rates.loc[(consumption_rates.date >= consumption_rates.startdate_rate) & 
                                          (
                                              (consumption_rates.enddate_rate.isnull()) | 
                                              (consumption_rates.date < consumption_rates.enddate_rate)
                                          )]
    
    # calculate cost (forecast in kwh * rate per kwh) in pence
    consumption_rates['cost_pence'] = consumption_rates.apply(lambda row: row['kwh'] * row['rate_p_per_kwh'], axis=1)
    
    # group by meter, and sum cost for each forecast to calculate total cost of each meter
    cost_df = consumption_rates.groupby('meter_id')['cost_pence'].sum().reset_index(name='Total Cost (p)')

    # convert cost in pence to pounds
    cost_df['Total Cost (£)'] = cost_df['Total Cost (p)'] / 100
    
    # group by meter, and sum forecasted consumption to calculate total consumption for each column
    meter_consumption = consumption_rates.groupby('meter_id')['kwh'].sum().reset_index(name='Total Estimated Consumption (kWh)')
    
    # return merged dataframe of consumption and cost (only keep total cost in pounds) for each meter
    return pd.merge(meter_consumption, cost_df[['meter_id', 'Total Cost (£)']], on='meter_id').round(2)


Functions to generate sample data:

In [6]:

def generate_meter_list(num_meters: int, valid_zones: list) -> pd.DataFrame:
    """
    Generate a list of meters with randomly generated meter IDs, exit zones using the list of valid zones 
    and forecasted AQ consumption values in kwh.
    
    Args:
        num_meters (int): Number of meters to generate.
        valid_zones (list): List of valid exit zones for meters.
        
    Returns:
        pd.DataFrame: A DataFrame object containing generated meter data with columns: 
                      meter_id, exit_zone, and aq_kwh.
    """
    meters = pd.DataFrame({
        'meter_id': [random.randint(10000000, 99999999) for _ in range(num_meters)],
        'exit_zone': [random.choice(valid_zones) for _ in range(num_meters)],
        'aq_kwh': np.random.randint(low=10000, high=1000000, size=num_meters)
    })
    return meters


In [7]:

def generate_consumption_data(meters: list, start_date: datetime, num_days: int) -> pd.DataFrame:
    """
    Generate forecasted meter consumption data for a given list of meters and from a start date range.
    
    Args:
        meters (pd.DataFrame): A list of meters (meter_ids) to use for consumption data
        start_date (datetime): Start date of the consumption data as a datetime object.
        num_days (int): Number of days for which consumption data should be generated.
        
    Returns:
        pd.DataFrame: A DataFrame object containing generated meter consumption data with columns:
                      date, meter_id, and kwh.
    """
    dates = pd.date_range(start_date, periods=num_days, freq='D')
    consumption = pd.DataFrame({'date': np.repeat(dates, len(meters)),
                                'meter_id': np.tile(meters, num_days),
                                'kwh': np.random.randint(0, 1000, len(meters)*num_days)})
    return consumption


Function to benchmark:

In [8]:
def benchmark_transportation_charges(num_meters: int, valid_zones: list, start_date: datetime, num_days: int, rates: pd.DataFrame):
    """
    This function generates random meter data and consumption data based on the provided parameters, and then calls the 
    `calculate_transportation_charges` function to calculate the transportation charges for the generated data. It also 
    prints the execution time for the calculation.
    
    Parameters:
    -----------
    num_meters: int
        Number of meters to generate data for.
        
    valid_zones: list
        List of valid exit zones for the meters.
        
    start_date: datetime
        Start date for the consumption data.
        
    num_days: int
        Number of days of consumption forecast to generate.
        
    rates: pd.DataFrame
        DataFrame containing rate information for the exit zones.
        
    Returns:
    --------
    transport_cost: pd.DataFrame
        DataFrame containing the total estimated consumption and total cost for each meter.
    """
    # Generate random meter data using helper function
    meters = generate_meter_list(num_meters, valid_zones)
    
    # Generate random consumption data based on the meter data using helper function
    forecast = generate_consumption_data(meters['meter_id'].tolist(), start_date, num_days)
    
    # Calculate transportation charges for the generated data
    start_time = time.time()
    transportation_cost = calculate_transportation_charges(meters, forecast, rates)
    end_time = time.time()
    
    # Print execution time
    elapsed_time = end_time - start_time
    print(f"Execution time for {num_meters} meters and {num_days} days of consumption forecast: {elapsed_time:.4f} seconds")
    
    return transportation_cost


## Task 1: Cost using provided datasets

In [9]:
print(f"Transportation distribution charges for provided dataset: \n{calculate_transportation_charges(meter_list, forecast_df, rates_df)}")

Transportation distribution charges for provided dataset: 
   meter_id  Total Estimated Consumption (kWh)  Total Cost (£)
0  14676236                            28978.0          100.15
1  34509937                            78324.0          275.49
2  50264822                           265667.0          731.24
3  88357331                           484399.0         1433.16


## Task 2: Function that generates a list of random meters of any size. 
Examples of valid exit zones can be found in the rate table. You may randomly generate the annual quantity.

In [10]:
generated_meters = generate_meter_list(10, rates_df.exit_zone.unique().tolist())
generated_meters

Unnamed: 0,meter_id,exit_zone,aq_kwh
0,48423455,WA2,386332
1,90233576,SE2,797430
2,86117822,LS,949751
3,34845937,LC,98978
4,81269975,EA4,136745
5,98812658,SE2,72036
6,21102326,EM3,598856
7,50671192,WM2,546897
8,31821787,WM3,149757
9,66772206,NE2,507492


## Task 3: Function that generates mock consumption data given a list of meters and a start date and duration (number of days in the forecast). 
The data may be completely random and it doesn't have to match with the meters' annual quantities either

In [11]:
generate_consumption_data(generated_meters['meter_id'].tolist(), '01/04/2023', 365)

Unnamed: 0,date,meter_id,kwh
0,2023-01-04,48423455,793
1,2023-01-04,90233576,111
2,2023-01-04,86117822,769
3,2023-01-04,34845937,27
4,2023-01-04,81269975,560
...,...,...,...
3645,2024-01-03,98812658,478
3646,2024-01-03,21102326,578
3647,2024-01-03,50671192,648
3648,2024-01-03,31821787,521


## Benchmarking and Observations

In [12]:
# benchmark_transportation_charges(num_meters: int, valid_zones: list, start_date: datetime, num_days: int, rates: pd.DataFrame)
result = benchmark_transportation_charges(10, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 100, rates_df)

Execution time for 10 meters and 100 days of consumption forecast: 0.0320 seconds


In [13]:
result = benchmark_transportation_charges(100, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 1000, rates_df)

Execution time for 100 meters and 1000 days of consumption forecast: 1.7542 seconds


In [14]:
result = benchmark_transportation_charges(200, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 1000, rates_df)

Execution time for 200 meters and 1000 days of consumption forecast: 3.3313 seconds


In [15]:
result = benchmark_transportation_charges(500, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 1000, rates_df)

Execution time for 500 meters and 1000 days of consumption forecast: 8.1225 seconds


In [16]:
result = benchmark_transportation_charges(30, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 3000, rates_df)

Execution time for 30 meters and 3000 days of consumption forecast: 1.4637 seconds


In [17]:
result = benchmark_transportation_charges(30, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 30000, rates_df)

Execution time for 30 meters and 30000 days of consumption forecast: 15.6904 seconds


In [18]:
result = benchmark_transportation_charges(500, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 500, rates_df)

Execution time for 500 meters and 500 days of consumption forecast: 4.2461 seconds


In [19]:
result = benchmark_transportation_charges(1000, rates_df.exit_zone.unique().tolist(), datetime.now().date(), 1000, rates_df)

Execution time for 1000 meters and 1000 days of consumption forecast: 16.4059 seconds



### Observations:
The function performs well for small sets of data, but the execution time increases significantly for larger sets (i.e. as the number of meters and days of consumption forecast increases). 

It isn't able to perform calculations for huge dataset because of memory issues as size of the dataframe increase more than allocated memory.

The function scales poorly for larger sets of data, as the execution time for 1000 meters and 1000 days of consumption forecast is more than 16 seconds, which is relatively high.

### Improvements:


One way to optimize the performance of the function is to reduce the number of calculations by filtering out irrelevant data or by grouping data to reduce the number of unique combinations that need to be calculated. For example, we can filter out meters that do not have any consumption data.

We can also improve the merging of the consumption, meters, and rates DataFrames, which can be memory-intensive and time-consuming for large sets of data. This can potentially be improved by using more efficient memory usage and operations like merging in chunks or using indexing

An improvement could be to parallelize some of the steps in the calculation using libraries such as Dask or Ray. This could help to distribute the computation across multiple cores or machines and speed up the overall process.

Another possible improvement is to use multi-threading or parallel processing to speed up the execution time, especially for large sets of data.

Finally, optimizing the data types used in the calculation could also improve performance. For example, using integer or categorical data types instead of strings or objects can reduce memory usage and improve processing time.
