In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
import ast  # Library for handling literal_eval
from geopy.distance import geodesic


df = pd.read_csv('merged-unlabeled.csv')
# columns = ['maincategory', 'longitude', 'latitude', 'starttime', 'duration', 'subCategoryA', 'attendinggroups', 'closuretype', 'mainstreet']
# df = df[columns]
# df = df[df['maincategory']=='Crash']
df=df.dropna()
# Print the column names to verify
print(df.columns)

quantile_001 = df['duration'].quantile(0.01)
quantile_099 = df['duration'].quantile(0.99)
df = df[(df['duration'] >= quantile_001) & (df['duration'] <= quantile_099)]

Index(['Main_Category', 'Longitude', 'Latitude', 'Day', 'duration',
       'Primary_Vehicle', 'Secondary_Vehicle', 'Is_Major_Incident', 'Advice_A',
       'Advice_B', 'Closure_Type', 'Direction', 'Main_Street',
       'Affected_Lanes', 'Actual_Number_of_Lanes ', 'Suburb', 'Traffic_Volume',
       'SA2_CODE21', 'SA3_CODE21', 'SA3_NAME21', 'SA4_CODE21', 'SA4_NAME21',
       'AREASQKM21', '1_Area', '2_ML', '3_TRL', '4_PRL', '5_SRL', '6_TrRL',
       '7_RRL', '8_LsRL', '9_URL', '10_ToRL', '11_EoR', '12_NoN', '13_NDEs',
       '14_NNC2L', '15_NNC3L', '16_NNC4L', '17_AND', '18_NE', '19_MCI',
       '20_CoI', '21 NBS', '22_CA', '23_EA', '24_HA', '25_IA', '26_OA',
       '27_PA', '28_PrA', '29_RA', '30_TA', '31_WbA', '32_EoLU', '33_TP',
       '34_PD0MV', '35_PD1MV', '36_PD2MV', '37_PD3MV', '38_PD≥4MV', '39_PUE',
       '40_AMI', '41_NPTtWbyPT', '42_NPTtWbyTx', '43_NPTtWbyCD',
       '44_NPTtWbyCP', '45_NPTtWbyO', '46_NPWfH', '47_PWCJH', '48_PBCJH',
       '49_ANP_FH', 'Tow Truck', 'Motorway C

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import wasserstein_distance

# Function to sort categories numerically if possible
def sort_categories(categories):
    try:
        return sorted(categories, key=lambda x: float(x))
    except ValueError:
        return sorted(categories)

# Dictionary to map categorical variable names to more descriptive labels
descriptive_names = {
    'Main_Category': 'Main Category',
    'Primary_Vehicle': 'Primary Vehicle',
    'Secondary_Vehicle': 'Secondary Vehicle',
    'Is_Major_Incident': 'Major Incident',
    'Closure_Type': 'Closure Type',
    'Direction': 'Direction of Incident',
    'Traffic_Volume': 'Traffic Volume',
    'Incident_Type': 'Incident Type',
    'Num_Vehicles_Involved': 'Number of Vehicles Involved',
    'Month': 'Month of Incident',
    'Hour': 'Hour of Day'
}

categorical_vars = [
    'Main_Category', 'Primary_Vehicle', 'Secondary_Vehicle', 
    'Is_Major_Incident', 'Closure_Type', 'Direction', 'Traffic_Volume',
    'Incident_Type', 'Num_Vehicles_Involved', 'Month', 'Hour'
]

sns.set(font_scale=1.2)
for var in categorical_vars:
    unique_categories = sort_categories(df[var].unique())
    num_categories = len(unique_categories)
    wasserstein_df = pd.DataFrame(index=unique_categories, columns=unique_categories)

    for i in range(num_categories):
        for j in range(i + 1, num_categories):
            dist = wasserstein_distance(
                df[df[var] == unique_categories[i]]['duration'],
                df[df[var] == unique_categories[j]]['duration']
            )
    
            wasserstein_df.loc[unique_categories[i], unique_categories[j]] = dist
            wasserstein_df.loc[unique_categories[j], unique_categories[i]] = dist

    # Adjust font scale for 'Hour' specifically
    if var == 'Hour':
        sns.set(font_scale=0.9)
    else:
        sns.set(font_scale=1.3)
    
    # Adjust figure size if the number of categories is less than 4
    if num_categories < 4:
        plt.figure(figsize=(6, 5))
    else:
        plt.figure(figsize=(11, 9))

    # Create the heatmap with 'minutes' added to the title
    heatmap = sns.heatmap(np.round(wasserstein_df.astype(float), 1), annot=True, fmt=".1f", cmap="rocket_r")

    # Add a title with descriptive names and units in minutes
#     plt.title(f'Heatmap of Average Wasserstein Distances for \n{descriptive_names[var]} [minutes]')
    
    # Set the colorbar label to show units
    colorbar = heatmap.collections[0].colorbar
    colorbar.set_label('Wasserstein Distance [minutes]')

    plt.tight_layout()
    plt.savefig(f'Heatmap_{var}.pdf')  # Keep file name the same
    plt.close()

# Reset font scale for density plots
sns.set(font_scale=1.0)

# Density Plots
def plot_density_with_aggregation(df, var, bin_size=3):
    plt.figure(figsize=(6, 4))
    if var == 'Hour':
        # Aggregate hours into larger bins
        df['Hour_Binned'] = (df['Hour'] // bin_size) * bin_size
        binned_hours = sort_categories(df['Hour_Binned'].unique())
        for binned_hour in binned_hours:
            sns.kdeplot(df[df['Hour_Binned'] == binned_hour]['duration'], linewidth=3, 
                        label=f'{binned_hour}-{binned_hour+bin_size-1}', bw_adjust=0.75)
    else:
        for category in sort_categories(df[var].unique()):
            sns.kdeplot(df[df[var] == category]['duration'], linewidth=3, label=category, bw_adjust=0.8)
    
    # Use descriptive names in the title
#     plt.title(f'Density Plot of Incident Duration by\n{descriptive_names[var]}')
    
    plt.xlabel('Duration [minutes]')
    plt.ylabel('Density [units]')
    plt.xlim(0, 300)
    plt.legend(title=descriptive_names[var])  # Descriptive legend title
    plt.grid(True)  # Display grid

    plt.tight_layout()
    plt.savefig(f'DensityPlot_{var}.pdf')  # Keep file name the same
    plt.close()

# Create density plots for each categorical variable
for var in categorical_vars:
    plot_density_with_aggregation(df, var)


  sns.kdeplot(df[df[var] == category]['duration'], linewidth=3, label=category, bw_adjust=0.8)
  sns.kdeplot(df[df[var] == category]['duration'], linewidth=3, label=category, bw_adjust=0.8)


In [None]:
latex_code = """
The analysis includes density plots and heatmaps to explore the distribution and relationships of various categorical variables with incident durations. Each plot is referred to by its corresponding figure number for easy reference.
"""

categorical_vars = [
    'Main_Category', 'Primary_Vehicle', 'Secondary_Vehicle', 
    'Is_Major_Incident', 'Closure_Type', 'Direction', 'Traffic_Volume',
    'Incident_Type', 'Num_Vehicles_Involved', 'Month', 'Hour'
]

# Generate LaTeX code for density plots first
latex_code += "\\section*{Density Plots}\n"
for var in categorical_vars:
    densityplot_filename = f'DensityPlot_{var}.pdf'
    var_clean = var.replace("_"," ")
    densityplot_title = f'Density Plot of Duration by {var_clean}'

    latex_code += """
\\begin{{figure}}[h]
    \\centering
    \\includegraphics[width=0.6\\textwidth]{{{densityplot_filename}}}
    \\caption{{{densityplot_title}}}
    \\label{{fig:density_{var}}}
\\end{{figure}}
Figure \\ref{{fig:density_{var}}} shows the density plot of incident duration by {var_clean}.
""".format(densityplot_filename=densityplot_filename, densityplot_title=densityplot_title, var=var, var_clean=var_clean)

# Generate LaTeX code for heatmaps
latex_code += "\\section*{Heatmaps}\n"
for var in categorical_vars:
    heatmap_filename = f'Heatmap_{var}.pdf'
    var_clean = var.replace("_"," ")
    heatmap_title = f'Heatmap of Average Wasserstein distances for {var_clean}'

    latex_code += """
\\begin{{figure}}[h]
    \\centering
    \\includegraphics[width=0.9\\textwidth]{{{heatmap_filename}}}
    \\caption{{{heatmap_title}}}
    \\label{{fig:heatmap_{var}}}
\\end{{figure}}
Figure \\ref{{fig:heatmap_{var}}} shows the heatmap of average Wasserstein distances for {var_clean}.
""".format(heatmap_filename=heatmap_filename, heatmap_title=heatmap_title, var=var, var_clean=var_clean)

latex_code += """
\\end{document}
"""

# Save the LaTeX code to a .tex file
with open('figures.tex', 'w') as f:
    f.write(latex_code)

print("LaTeX code has been generated and saved to 'figures.tex'.")
