In [2]:
import pandas as pd
import numpy as np

def calculate_distance_matrix(df):
    # Get unique IDs
    unique_ids = sorted(set(df['id_start'].unique()) | set(df['id_end'].unique()))
    n = len(unique_ids)
    
    # Create initial distance matrix filled with infinity
    distances = np.full((n, n), np.inf)
    np.fill_diagonal(distances, 0)  # Set diagonal to 0
    
    # Create ID to index mapping
    id_to_index = {id_: idx for idx, id_ in enumerate(unique_ids)}
    
    # Fill known distances
    for _, row in df.iterrows():
        i = id_to_index[row['id_start']]
        j = id_to_index[row['id_end']]
        distances[i, j] = row['distance']
        distances[j, i] = row['distance']  # Make symmetric
    
    # Floyd-Warshall algorithm to find shortest paths
    for k in range(n):
        for i in range(n):
            for j in range(n):
                if distances[i, k] != np.inf and distances[k, j] != np.inf:
                    # Update distance if path through k is shorter
                    distances[i, j] = min(
                        distances[i, j],
                        distances[i, k] + distances[k, j]
                    )
                    # Maintain symmetry
                    distances[j, i] = distances[i, j]
    
    # Create DataFrame with ID labels
    distance_df = pd.DataFrame(
        distances,
        index=unique_ids,
        columns=unique_ids
    )
    
    # Replace any remaining infinity values with NaN
    distance_df = distance_df.replace(np.inf, np.nan)
    
    return distance_df

In [5]:
# Read the dataset
df = pd.read_csv("C:\\Users\\ASUS\\Downloads\\dataset-2.csv")
# Calculate the distance matrix
distance_matrix = calculate_distance_matrix(df)
# Display the result
print(distance_matrix)

         1001400  1001402  1001404  1001406  1001408  1001410  1001412  \
1001400      0.0      9.7     29.9     45.9     67.6     78.7     94.3   
1001402      9.7      0.0     20.2     36.2     57.9     69.0     84.6   
1001404     29.9     20.2      0.0     16.0     37.7     48.8     64.4   
1001406     45.9     36.2     16.0      0.0     21.7     32.8     48.4   
1001408     67.6     57.9     37.7     21.7      0.0     11.1     26.7   
1001410     78.7     69.0     48.8     32.8     11.1      0.0     15.6   
1001412     94.3     84.6     64.4     48.4     26.7     15.6      0.0   
1001414    112.5    102.8     82.6     66.6     44.9     33.8     18.2   
1001416    125.7    116.0     95.8     79.8     58.1     47.0     31.4   
1001418    139.3    129.6    109.4     93.4     71.7     60.6     45.0   
1001420    152.2    142.5    122.3    106.3     84.6     73.5     57.9   
1001422    161.8    152.1    131.9    115.9     94.2     83.1     67.5   
1001424    173.2    163.5    143.3    

In [8]:
import pandas as pd

def unroll_distance_matrix(distance_matrix: pd.DataFrame) -> pd.DataFrame:
    # Create empty lists to store the data
    id_starts = []
    id_ends = []
    distances = []
    
    # Iterate through the matrix
    for i in distance_matrix.columns:
        for j in distance_matrix.columns:
            if i != j:
                id_starts.append(i)
                id_ends.append(j)
                distances.append(distance_matrix.loc[i, j])
    
    # Create the DataFrame from the collected data
    unrolled_df = pd.DataFrame({
        'id_start': id_starts,
        'id_end': id_ends,
        'distance': distances
    })
    
    return unrolled_df

In [9]:
# Calculate the distance matrix
unroll_distance = unroll_distance_matrix(distance_matrix)
# Display the result
print(unroll_distance)

      id_start   id_end  distance
0      1001400  1001402       9.7
1      1001400  1001404      29.9
2      1001400  1001406      45.9
3      1001400  1001408      67.6
4      1001400  1001410      78.7
...        ...      ...       ...
1801   1004356  1001470     159.8
1802   1004356  1001472     175.8
1803   1004356  1001488       4.0
1804   1004356  1004354       2.0
1805   1004356  1004355       4.0

[1806 rows x 3 columns]


In [18]:
unroll_distance

Unnamed: 0,id_start,id_end,distance
0,1001400,1001402,9.7
1,1001400,1001404,29.9
2,1001400,1001406,45.9
3,1001400,1001408,67.6
4,1001400,1001410,78.7
...,...,...,...
1801,1004356,1001470,159.8
1802,1004356,1001472,175.8
1803,1004356,1001488,4.0
1804,1004356,1004354,2.0


In [27]:
# def find_ids_within_ten_percentage_threshold(unroll_distance, reference_id):
    
#     reference_avg_distance = unroll_distance[unroll_distance['id_start'] == reference_id]['distance'].mean()

#     lower_threshold = reference_avg_distance * 0.9
#     upper_threshold = reference_avg_distance * 1.1

#     ids_within_threshold = unroll_distance.groupby('id_start').filter(lambda x: lower_threshold <= x['distance'].mean() <= upper_threshold)

#     return sorted(ids_within_threshold['id_start'].unique())

def find_ids_within_ten_percentage_threshold(unroll_distance: pd.DataFrame, reference_id: int) -> list:
    # Convert reference_id to string (assuming IDs in DataFrame are strings)
    reference_id = str(reference_id)
    
    # Calculate average distance for reference ID
    reference_avg_distance = unroll_distance[unroll_distance['id_start'] == reference_id]['distance'].mean()
    
    # Calculate thresholds (±10%)
    lower_threshold = reference_avg_distance * 0.9  # floor
    upper_threshold = reference_avg_distance * 1.1  # ceiling
    
    # Calculate average distances for all IDs
    avg_distances = unroll_distance.groupby('id_start')['distance'].mean()
    
    # Find IDs within threshold range
    mask = (avg_distances >= lower_threshold) & (avg_distances <= upper_threshold)
    ids_within_threshold = avg_distances[mask].index.tolist()
    
    return sorted(ids_within_threshold)
 

In [28]:
# Your unrolled distance DataFrame
reference_id = 1001400
result = find_ids_within_ten_percentage_threshold(unroll_distance, reference_id)
print(f"IDs within threshold: {result}")

IDs within threshold: []


In [40]:
def calculate_toll_rate(unroll_distance: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the input DataFrame
    df = unroll_distance.copy()
    
    # Define rate coefficients for each vehicle type
    rate_coefficients = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6
    }
    
    # Calculate toll for each vehicle type
    for vehicle_type, rate in rate_coefficients.items():
        df[vehicle_type] = df['distance'] * rate
        
    return df

In [42]:
# Calculate tolls
result = calculate_toll_rate(unroll_distance)
print(result)

      id_start   id_end  distance    moto     car      rv     bus   truck
0      1001400  1001402       9.7    7.76   11.64   14.55   21.34   34.92
1      1001400  1001404      29.9   23.92   35.88   44.85   65.78  107.64
2      1001400  1001406      45.9   36.72   55.08   68.85  100.98  165.24
3      1001400  1001408      67.6   54.08   81.12  101.40  148.72  243.36
4      1001400  1001410      78.7   62.96   94.44  118.05  173.14  283.32
...        ...      ...       ...     ...     ...     ...     ...     ...
1801   1004356  1001470     159.8  127.84  191.76  239.70  351.56  575.28
1802   1004356  1001472     175.8  140.64  210.96  263.70  386.76  632.88
1803   1004356  1001488       4.0    3.20    4.80    6.00    8.80   14.40
1804   1004356  1004354       2.0    1.60    2.40    3.00    4.40    7.20
1805   1004356  1004355       4.0    3.20    4.80    6.00    8.80   14.40

[1806 rows x 8 columns]


In [62]:
import pandas as pd
from datetime import time, timedelta

def calculate_time_based_toll_rates(result):
    # Create empty list to store the new rows
    new_rows = []
    
    # Define time ranges and their discount factors for weekdays
    weekday_time_ranges = [
        (time(0, 0, 0), time(10, 0, 0), 0.8),
        (time(10, 0, 0), time(18, 0, 0), 1.2),
        (time(18, 0, 0), time(23, 59, 59), 0.8)
    ]
    
    # Define days
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Get the vehicle columns (excluding id_start and id_end)
    vehicle_columns = [col for col in result.columns if col not in ['id_start', 'id_end']]
    
    # Iterate through each unique (id_start, id_end) pair
    for _, row in result.iterrows():
        id_start = row['id_start']
        id_end = row['id_end']
        
        # Process each day of the week
        for day in days:
            if day in ['Saturday', 'Sunday']:
                # Weekend - constant discount factor
                new_row = {
                    'id_start': id_start,
                    'id_end': id_end,
                    'start_day': day,
                    'end_day': day,
                    'start_time': time(0, 0, 0),
                    'end_time': time(23, 59, 59)
                }
                
                # Apply weekend discount to vehicle columns
                for col in vehicle_columns:
                    new_row[col] = row[col] * 0.7
                    
                new_rows.append(new_row)
                
            else:
                # Weekday - multiple time ranges
                for start_time, end_time, discount in weekday_time_ranges:
                    new_row = {
                        'id_start': id_start,
                        'id_end': id_end,
                        'start_day': day,
                        'end_day': day,
                        'start_time': start_time,
                        'end_time': end_time
                    }
                    
                    # Apply weekday discount to vehicle columns
                    for col in vehicle_columns:
                        new_row[col] = row[col] * discount
                        
                    new_rows.append(new_row)
    
    # Create new DataFrame from the processed data
    result_df = pd.DataFrame(new_rows)
    
    # Sort the DataFrame by id_start, id_end, start_day, and start_time
    result_df.sort_values(
        by=['id_start', 'id_end', 'start_day', 'start_time'],
        inplace=True,
        key=lambda x: pd.Categorical(x, categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']) if x.name == 'start_day' else x
    )
    
    return result_df

In [63]:
# Calculate tolls
tolls = calculate_time_based_toll_rates(result)
print(tolls)

        id_start     id_end start_day   end_day start_time  end_time  distance   moto     car     rv     bus   truck
0      1001400.0  1001402.0    Monday    Monday   00:00:00  10:00:00      7.76  6.208   9.312  11.64  17.072  27.936
1      1001400.0  1001402.0    Monday    Monday   10:00:00  18:00:00     11.64  9.312  13.968  17.46  25.608  41.904
2      1001400.0  1001402.0    Monday    Monday   18:00:00  23:59:59      7.76  6.208   9.312  11.64  17.072  27.936
3      1001400.0  1001402.0   Tuesday   Tuesday   00:00:00  10:00:00      7.76  6.208   9.312  11.64  17.072  27.936
4      1001400.0  1001402.0   Tuesday   Tuesday   10:00:00  18:00:00     11.64  9.312  13.968  17.46  25.608  41.904
...          ...        ...       ...       ...        ...       ...       ...    ...     ...    ...     ...     ...
30697  1004356.0  1004355.0    Friday    Friday   00:00:00  10:00:00      3.20  2.560   3.840   4.80   7.040  11.520
30698  1004356.0  1004355.0    Friday    Friday   10:00:00  18:0