In [7]:
import pandas as pd
import datetime
import numpy as np
import networkx as netx

In [13]:
def calculate_distance_matrix_custom():
   
    df_custom = pd.read_csv('dataset-3.csv')

    G_custom = netx.Graph()

    for row_custom in df_custom.itertuples(index=False):
        
        G_custom.add_edge(row_custom.id_start, row_custom.id_end, distance=row_custom.distance)
        G_custom.add_edge(row_custom.id_end, row_custom.id_start, distance=row_custom.distance)  # Bidirectional

    distance_matrix_custom = pd.DataFrame(index=G_custom.nodes, columns=G_custom.nodes, dtype=float)
    distance_matrix_custom = distance_matrix_custom.fillna(0)

    # Calculate cumulative distances between toll locations
    for source_custom in G_custom.nodes:
        
        for destination_custom in G_custom.nodes:
            
            if source_custom != destination_custom:
                if netx.has_path(G_custom, source_custom, destination_custom):
                    distance_matrix_custom.at[source_custom, destination_custom] = netx.shortest_path_length(G_custom, source_custom, destination_custom, weight='distance')

    distance_matrix_custom = (distance_matrix_custom + distance_matrix_custom.T) / 2
    return distance_matrix_custom

In [14]:
def unroll_distance_matrix(distance_matrix):
    
    unrolled_data = []

    for id_start in distance_matrix.index:
        
        for id_end in distance_matrix.columns:
            
            if id_start != id_end:
                distance = distance_matrix.at[id_start, id_end]
                unrolled_data.append({'id_start': id_start, 'id_end': id_end, 'distance': distance})

    unrolled_df = pd.DataFrame(unrolled_data)
    return unrolled_df

In [15]:
def find_ids_within_ten_percentage_threshold(distance_df):
    
    avg_distance = distance_df['distance'].mean()

    lower_threshold = avg_distance - 0.1 * avg_distance
    upper_threshold = avg_distance + 0.1 * avg_distance
    filtered_df = distance_df[(distance_df['distance'] >= lower_threshold) & 
                              (distance_df['distance'] <= upper_threshold)]
    result_ids = sorted(filtered_df['id_start'].unique())
    return result_ids

In [16]:
def calculate_toll_rate(distance_df):
    
    rate_coefficients = {'moto': 0.8, 'car': 1.2, 'rv': 1.5, 'bus': 2.2, 'truck': 3.6}

    for vehicle_type, rate_coefficient in rate_coefficients.items():
        
        distance_df[vehicle_type] = distance_df['distance'] * rate_coefficient
    return distance_df

In [17]:
def calculate_time_based_toll_rates(df):

    weekday_ranges = [
        (datetime.time(0, 0, 0), datetime.time(10, 0, 0), 0.8),
        (datetime.time(10, 0, 0), datetime.time(18, 0, 0), 1.2),
        (datetime.time(18, 0, 0), datetime.time(23, 59, 59), 0.8)
    ]

    weekend_ranges = [(datetime.time(0, 0, 0), datetime.time(23, 59, 59), 0.7)]
    time_based_toll_df = pd.DataFrame(columns=df.columns.tolist() + ['start_day', 'start_time', 'end_day', 'end_time'])


    for (_, group_df), start_day in zip(df.groupby(['id_start', 'id_end']), range(7)):
        
        for start_time, end_time, discount_factor in get_time_ranges(start_day, weekday_ranges, weekend_ranges):
            
            end_day = (start_day + 1) % 7 
            toll_rates = group_df.loc[:, 'moto':'truck'] * discount_factor
            time_based_toll_df = time_based_toll_df.append({
                **group_df.to_dict('records')[0],
                'start_day': get_day_name(start_day),
                'start_time': start_time,
                'end_day': get_day_name(end_day),
                'end_time': end_time,
                **toll_rates.to_dict('records')[0]
            }, ignore_index=True)
    return time_based_toll_df

def get_time_ranges(start_day, weekday_ranges, weekend_ranges):
    
    if start_day < 5:  #Weekdays
        return weekday_ranges
    else:  
        return weekend_ranges

def get_day_name(day_number):

    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    return day_names[day_number]

# Task 2

### Question 1 Distance Matrix Calculation

Create a function named calculate_distance_matrix that takes the dataset-3.csv as input and generates a DataFrame representing distances between IDs.

The resulting DataFrame should have cumulative distances along known routes, with diagonal values set to 0. If distances between toll locations A to B and B to C are known, then the distance from A to C should be the sum of these distances. Ensure the matrix is symmetric, accounting for bidirectional distances between toll locations (i.e. A to B is equal to B to A).

In [18]:
result_matrix_custom = calculate_distance_matrix_custom()
print(result_matrix_custom)

         1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
1001400      0.0      9.7     29.9     45.9     67.6     78.7     94.3   
1001402      9.7      0.0     20.2     36.2     57.9     69.0     84.6   
1001404     29.9     20.2      0.0     16.0     37.7     48.8     64.4   
1001406     45.9     36.2     16.0      0.0     21.7     32.8     48.4   
1001408     67.6     57.9     37.7     21.7      0.0     11.1     26.7   
1001410     78.7     69.0     48.8     32.8     11.1      0.0     15.6   
1001412     94.3     84.6     64.4     48.4     26.7     15.6      0.0   
1001414    112.5    102.8     82.6     66.6     44.9     33.8     18.2   
1001416    125.7    116.0     95.8     79.8     58.1     47.0     31.4   
1001418    139.3    129.6    109.4     93.4     71.7     60.6     45.0   
1001420    152.2    142.5    122.3    106.3     84.6     73.5     57.9   
1001422    161.8    152.1    131.9    115.9     94.2     83.1     67.5   
1001424    173.2    163.5    143.3    

### Question 2 Unroll Distance Matrix

Create a function unroll_distance_matrix that takes the DataFrame created in Question 1. The resulting DataFrame should have three columns: columns id_start, id_end, and distance.

All the combinations except for same id_start to id_end must be present in the rows with their distance values from the input DataFrame.

In [19]:
result_unrolled = unroll_distance_matrix(result_matrix_custom)
print(result_unrolled)

      id_start   id_end  distance
0      1001400  1001402       9.7
1      1001400  1001404      29.9
2      1001400  1001406      45.9
3      1001400  1001408      67.6
4      1001400  1001410      78.7
...        ...      ...       ...
1801   1001472  1001462      72.5
1802   1001472  1001464      45.8
1803   1001472  1001466      37.3
1804   1001472  1001468      26.6
1805   1001472  1001470      16.0

[1806 rows x 3 columns]


### Question 3 Finding IDs within Percentage Threshold

Create a function find_ids_within_ten_percentage_threshold that takes the DataFrame created in Question 2 and a reference value from the id_start column as an integer.

Calculate average distance for the reference value given as an input and return a sorted list of values from id_start column which lie within 10% (including ceiling and floor) of the reference value's average.

In [20]:
result_within_threshold = find_ids_within_ten_percentage_threshold(result_unrolled)
print(result_within_threshold)

[1001400, 1001402, 1001404, 1001406, 1001408, 1001410, 1001412, 1001414, 1001416, 1001418, 1001420, 1001422, 1001424, 1001426, 1001428, 1001430, 1001432, 1001434, 1001436, 1001437, 1001438, 1001440, 1001442, 1001444, 1001446, 1001448, 1001450, 1001452, 1001454, 1001456, 1001458, 1001460, 1001461, 1001462, 1001464, 1001466, 1001468, 1001470, 1001472, 1001488, 1004354, 1004355, 1004356]


### Question 4 Calculate Toll Rate

Create a function calculate_toll_rate that takes the DataFrame created in Question 2 as input and calculates toll rates based on vehicle types.

The resulting DataFrame should add 5 columns to the input DataFrame: moto, car, rv, bus, and truck with their respective rate coefficients. The toll rates should be calculated by multiplying the distance with the given rate coefficients for each vehicle type:

1. 0.8 for moto
2. 1.2 for car
3. 1.5 for rv
4. 2.2 for bus
5. 3.6 for truck

In [21]:
result_with_toll_rates = calculate_toll_rate(result_unrolled)
print(result_with_toll_rates)

      id_start   id_end  distance   moto    car      rv     bus   truck
0      1001400  1001402       9.7   7.76  11.64   14.55   21.34   34.92
1      1001400  1001404      29.9  23.92  35.88   44.85   65.78  107.64
2      1001400  1001406      45.9  36.72  55.08   68.85  100.98  165.24
3      1001400  1001408      67.6  54.08  81.12  101.40  148.72  243.36
4      1001400  1001410      78.7  62.96  94.44  118.05  173.14  283.32
...        ...      ...       ...    ...    ...     ...     ...     ...
1801   1001472  1001462      72.5  58.00  87.00  108.75  159.50  261.00
1802   1001472  1001464      45.8  36.64  54.96   68.70  100.76  164.88
1803   1001472  1001466      37.3  29.84  44.76   55.95   82.06  134.28
1804   1001472  1001468      26.6  21.28  31.92   39.90   58.52   95.76
1805   1001472  1001470      16.0  12.80  19.20   24.00   35.20   57.60

[1806 rows x 8 columns]


### Question 5 Calculate Time-Based Toll Rates

Create a function named calculate_time_based_toll_rates that takes the DataFrame created in Question 3 as input and calculates toll rates for different time intervals within a day.

The resulting DataFrame should have these five columns added to the input: start_day, start_time, end_day, and end_time.

1. start_day, end_day must be strings with day values (from Monday to Sunday in proper case)
2. start_time and end_time must be of type datetime.time() with the values from time range given below.
Modify the values of vehicle columns according to the following time ranges:

Weekdays (Monday - Friday):

1. From 00:00:00 to 10:00:00: Apply a discount factor of 0.8
2. From 10:00:00 to 18:00:00: Apply a discount factor of 1.2
3. From 18:00:00 to 23:59:59: Apply a discount factor of 0.8
Weekends (Saturday and Sunday):

1. Apply a constant discount factor of 0.7 for all times.
For each unique (id_start, id_end) pair, cover a full 24-hour period (from 12:00:00 AM to 11:59:59 PM) and span all 7 days of the week (from Monday to Sunday).

In [22]:
result_time_based_toll = calculate_time_based_toll_rates(result_unrolled)
print(result_time_based_toll)

   id_start   id_end  distance    moto      car       rv      bus    truck  \
0   1001400  1001402       9.7   6.208    9.312   11.640   17.072   27.936   
1   1001400  1001402       9.7   9.312   13.968   17.460   25.608   41.904   
2   1001400  1001402       9.7   6.208    9.312   11.640   17.072   27.936   
3   1001400  1001404      29.9  19.136   28.704   35.880   52.624   86.112   
4   1001400  1001404      29.9  28.704   43.056   53.820   78.936  129.168   
5   1001400  1001404      29.9  19.136   28.704   35.880   52.624   86.112   
6   1001400  1001406      45.9  29.376   44.064   55.080   80.784  132.192   
7   1001400  1001406      45.9  44.064   66.096   82.620  121.176  198.288   
8   1001400  1001406      45.9  29.376   44.064   55.080   80.784  132.192   
9   1001400  1001408      67.6  43.264   64.896   81.120  118.976  194.688   
10  1001400  1001408      67.6  64.896   97.344  121.680  178.464  292.032   
11  1001400  1001408      67.6  43.264   64.896   81.120  118.97

  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
  time_based_toll_df = time_based_toll_df.append({
