# In[226]:
import pandas as pd

# Load the dataset
# file_path = './dataset-2.csv'
# df = pd.read_csv(file_path)
# df.head()
# Question 9
def calculate_distance_matrix(df) -> pd.DataFrame:
    """
    Calculate a symmetric shortest-path distance matrix from an edge list.

    Every row of ``df`` is treated as an undirected edge between 'id_start'
    and 'id_end' with length 'distance'. Pairs that are not directly connected
    receive the length of the shortest route through intermediate IDs
    (Floyd-Warshall); pairs with no route at all stay at ``inf``. Distances
    are expected to be non-negative.

    Args:
        df (pandas.DataFrame): Edge list with the columns 'id_start',
            'id_end' and 'distance'.

    Returns:
        pandas.DataFrame: Square float matrix whose index and columns are the
        unique IDs in order of first appearance.
    """
    import numpy as np  # local import: the file-level imports only provide pandas

    unique_ids = pd.unique(df[['id_start', 'id_end']].values.ravel())
    node_index = pd.Index(unique_ids)
    size = len(node_index)

    distances = np.full((size, size), np.inf)
    np.fill_diagonal(distances, 0.0)

    # Translate IDs to matrix positions straight from the columns; going through
    # iterrows() would upcast integer IDs to float alongside the distances.
    start_positions = node_index.get_indexer(df['id_start'])
    end_positions = node_index.get_indexer(df['id_end'])
    for start, end, dist in zip(start_positions, end_positions, df['distance']):
        # When an edge is listed more than once, the last listed value wins.
        distances[start, end] = dist
        distances[end, start] = dist

    # Floyd-Warshall: relax every (i, j) pair through pivot k in one broadcast.
    # Row k and column k do not change during step k for non-negative lengths,
    # so the whole-matrix update matches the element-by-element triple loop.
    for k in range(size):
        via_pivot = distances[:, [k]] + distances[[k], :]
        np.minimum(distances, via_pivot, out=distances)

    return pd.DataFrame(distances, index=unique_ids, columns=unique_ids)


# distance_matrix = calculate_distance_matrix(df)

# # Display the generated distance matrix
# distance_matrix


# In[228]:

# Question 10
def unroll_distance_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten a square distance matrix into long (edge-list) form.

    Args:
        df (pandas.DataFrame): Distance matrix whose index labels are the
            start IDs and whose column labels are the end IDs.

    Returns:
        pandas.DataFrame: One row per (index label, column label) pair whose
        labels differ, with the columns 'id_start', 'id_end' and 'distance'.
        Rows are ordered by index label first, then by column label.
    """
    # Pairs with identical labels (the diagonal) are left out.
    records = [
        [origin, destination, df.at[origin, destination]]
        for origin in df.index
        for destination in df.columns
        if origin != destination
    ]
    return pd.DataFrame(records, columns=['id_start', 'id_end', 'distance'])

# Example usage:
# unrolled_df = unroll_distance_matrix(distance_matrix)
# unrolled_df.head()


# In[240]:
# Question 11
def find_ids_within_ten_percentage_threshold(df, reference_id) -> list:
    """
    Find all IDs whose average distance lies within 10% of the average distance of the reference ID.

    The average for an ID is the mean of 'distance' over every row where that
    ID appears as 'id_start'. The bounds (90% and 110% of the reference
    average) are inclusive, so the reference ID itself is part of the result
    whenever it is present with a valid average.

    Args:
        df (pandas.DataFrame): Unrolled distances with the columns 'id_start'
            and 'distance'.
        reference_id (int): The 'id_start' value to compare against.

    Returns:
        list: Sorted 'id_start' values whose average distance is within the
        threshold. Empty when ``reference_id`` does not occur in 'id_start'.
    """
    # One grouped pass instead of re-filtering the whole frame once per ID.
    average_by_start = df.groupby('id_start')['distance'].mean()
    if reference_id not in average_by_start.index:
        return []

    ref_avg_distance = average_by_start.loc[reference_id]
    # Defining the 10% bounds (lower and upper bounds)
    lower_bound = ref_avg_distance * 0.9
    upper_bound = ref_avg_distance * 1.1

    within_threshold = average_by_start.between(lower_bound, upper_bound)
    return sorted(average_by_start[within_threshold].index.tolist())

# Example usage:
# ids_within_threshold = find_ids_within_ten_percentage_threshold(unrolled_df, 1001400)
# print(ids_within_threshold)


# In[232]:
# Question 12
def calculate_toll_rate(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate toll rates for each vehicle type based on the unrolled DataFrame.

    Each vehicle column is the row's 'distance' multiplied by that vehicle's
    rate coefficient. The input frame is left untouched; the rates are added
    to a copy.

    Args:
        df (pandas.DataFrame): Unrolled distances; must contain a 'distance'
            column.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the added columns 'moto', 'car',
        'rv', 'bus' and 'truck' (in that order). The 'distance' column is kept.
    """
    rates = {
        'moto': 0.8,
        'car': 1.2,
        'rv': 1.5,
        'bus': 2.2,
        'truck': 3.6,
    }
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    result = df.copy()
    for vehicle, rate in rates.items():
        result[vehicle] = result['distance'] * rate
    return result

# Example usage:
# Assuming unrolled_df is the unrolled DataFrame from Question 10
# df_with_toll_rates = calculate_toll_rate(unrolled_df)
# print(df_with_toll_rates.head())


# In[238]:
# Question 13
def calculate_time_based_toll_rates(df) -> pd.DataFrame:
    """
    Calculate time-based toll rates for different time intervals within a day.

    The vehicle rate columns (every column in the label slice 'moto' through
    'truck') of each row are multiplied by a single discount factor:

    * 'start_day' Monday to Friday: 0.8 for 00:00:00-10:00:00,
      1.2 for 10:00:00-18:00:00 and 0.8 for 18:00:00-23:59:59.
    * Any other 'start_day' value: 0.7 for 00:00:00-23:59:59.

    An interval applies to a row when both 'start_time' and 'end_time' fall
    inside it (bounds inclusive); if more than one interval qualifies, the
    first one listed is used. Rows that fit no single interval keep their
    rates unchanged. The input frame is not modified.

    Args:
        df (pandas.DataFrame): Frame with the columns 'start_day',
            'start_time', 'end_time' and the vehicle rate columns
            'moto' ... 'truck'. The time columns must be parseable by
            ``pandas.to_datetime``.

    Returns:
        pandas.DataFrame: Copy of ``df`` with 'start_time' and 'end_time'
        converted to ``datetime.time`` values and the vehicle rates scaled.
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
    weekday_ranges = [('00:00:00', '10:00:00', 0.8), ('10:00:00', '18:00:00', 1.2), ('18:00:00', '23:59:59', 0.8)]
    weekend_range = ('00:00:00', '23:59:59', 0.7)

    # Work on a copy so the caller's frame keeps its original values.
    result = df.copy()
    result['start_time'] = pd.to_datetime(result['start_time']).dt.time
    result['end_time'] = pd.to_datetime(result['end_time']).dt.time

    # Zero-padded HH:MM:SS text orders the same way as the times themselves,
    # so the interval bounds can be compared as strings.
    start_text = result['start_time'].astype(str)
    end_text = result['end_time'].astype(str)
    is_weekday = result['start_day'].isin(weekdays)

    factor = pd.Series(1.0, index=result.index)
    unassigned = pd.Series(True, index=result.index)
    schedule = [(is_weekday, interval) for interval in weekday_ranges]
    schedule.append((~is_weekday, weekend_range))
    for day_mask, (lower, upper, discount) in schedule:
        inside = (
            day_mask
            & unassigned
            & start_text.between(lower, upper)
            & end_text.between(lower, upper)
        )
        # Each row receives exactly one factor; intervals that do not match
        # must leave the row alone rather than multiply its rates by zero.
        factor.loc[inside] = discount
        unassigned &= ~inside

    rate_columns = list(result.loc[:, 'moto':'truck'].columns)
    result[rate_columns] = result[rate_columns].mul(factor, axis=0)

    return result
# Assuming df_with_toll_rates is the DataFrame returned by calculate_toll_rate (Question 12)
# df_with_time_based_rates = calculate_time_based_toll_rates(df_with_toll_rates)
# print(df_with_time_based_rates.head())