In [1]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from Functions.prepping import *
from Functions.points import clean_gps_data, classify_route_variant
from Functions.maps import comparing_map
from scipy.stats import percentileofscore

# ================================================================
# Paths to files and initial values
# Change these to fit your needs
# ================================================================
# CSV with the raw GPS records; read with pd.read_csv in the data-loading step below.
file_path = "Data/ruta5.csv"
# NOTE(review): not referenced in the cells visible here; presumably the route geometry. Confirm before removing.
ruta_5_json_path = "Data/ruta_5.json"
# NOTE(review): not referenced in the cells visible here; presumably the output location for the comparison map. Confirm.
map_path = 'Maps/comparing_map.html'
# GeoJSON file with the route branches (loaded with gpd.read_file later in the notebook).
branches_path = "Data/branches_df.geojson"
# Value of the 'Route' column used to filter the formatted GPS records.
route = 'RUTA 5'
# The next three values are forwarded to clean_gps_data(...) in the mapping cell.
# NOTE(review): their exact meaning and units are defined in Functions.points, which is not shown here.
# Presumably: decimal places for coordinate rounding, a proximity threshold, and a time gap. Confirm.
rounding_precision = 4
closer_threshold = 3
time_diff_threshold = 120
# CSV with one predicted branch and deviation per plate/date (loaded with pd.read_csv later in the notebook).
predictions_path = "Data/predictions_df.csv"


# ================================================================
# Load the GPS records and narrow them to the configured route
# ================================================================

# Raw records exactly as stored in the CSV at `file_path`
df = pd.read_csv(file_path)
# format_df (from Functions.prepping) prepares the raw frame for this notebook;
# the callable selector then keeps only the rows whose 'Route' equals `route`
empiric_df = format_df(df).loc[lambda formatted: formatted['Route'] == route]
empiric_df

Unnamed: 0,Time,Plate,Route,Latitude,Longitude,velocidad,contadorpasajeros,tipounidad,Scale
0,2023-02-27 00:00:06,50050,RUTA 5,19.385832,-99.226166,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
1,2023-02-27 00:00:43,50116,RUTA 5,19.385124,-99.198440,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
2,2023-02-27 00:00:45,50050,RUTA 5,19.385832,-99.226166,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
3,2023-02-27 00:01:44,50050,RUTA 5,19.385832,-99.226166,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
4,2023-02-27 00:01:50,50116,RUTA 5,19.385122,-99.198440,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
...,...,...,...,...,...,...,...,...,...
74709,2023-03-05 23:59:39,50336,RUTA 5,19.342667,-99.279760,0,0,VAGONETA,2
74710,2023-03-05 23:59:39,50162,RUTA 5,19.342611,-99.287440,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
74711,2023-03-05 23:59:40,50242,RUTA 5,19.380463,-99.250160,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2
74712,2023-03-05 23:59:47,50116,RUTA 5,19.385002,-99.198310,0,0,AUTOBUS CORTO (ENTRE 7.5 Y 10 M DE LONGITUD),2


In [2]:
# Load the branch geometries and the per-plate/per-day branch predictions.
# Both locations come from the configuration cell (`branches_path`, `predictions_path`)
# instead of repeating the literals, so changing the config actually changes what is read.
branches_df = gpd.read_file(branches_path, driver="GeoJSON")
predictions_df = pd.read_csv(predictions_path)
predictions_df

Unnamed: 0,Plate,Date,Deviation,Predicted_Branch
0,50050,2023-02-27,0.001195,5
1,50050,2023-02-28,0.000594,5
2,50050,2023-03-01,0.000523,1
3,50050,2023-03-02,0.000852,3
4,50050,2023-03-03,0.000860,1
...,...,...,...,...
137,50112,2023-03-03,0.000830,5
138,50112,2023-03-04,0.000643,3
139,50112,2023-03-05,0.003987,13
140,50214,2023-03-04,0.000097,1


In [3]:
import folium

In [6]:
# Build a comparison map (cleaned GPS trace vs. predicted branch) per plate and day.
# NOTE: the [9:10] and [:1] slices limit the run to the 10th plate and its first recorded day;
# widen them to cover more buses/days. Only the last map built is displayed at the end of the cell.

# Start from None so the final expression never raises NameError (or shows a stale map
# from a previous run) when no plate/day pair could be mapped.
m = None
# Reference distribution for the percentile; it does not change inside the loops.
all_deviations = predictions_df["Deviation"]

# Iterate over each unique plate
for plate in empiric_df['Plate'].unique()[9:10]:
    plate_df = empiric_df[empiric_df['Plate'] == plate]
    sorted_dates = sorted(plate_df['Time'].dt.date.unique())
    # Iterate over each unique date for the current plate
    for day in sorted_dates[:1]:
        # Not every plate/day has a prediction row; skip instead of failing on .iloc[0].
        # 'Date' is compared against str(day), i.e. the ISO 'YYYY-MM-DD' form of the date.
        prediction_rows = predictions_df[(predictions_df["Plate"] == plate) & (predictions_df["Date"] == str(day))]
        if prediction_rows.empty:
            print(f'No prediction found for plate {plate} on {day}; skipping')
            continue
        prediction = prediction_rows.iloc[0]
        deviation = prediction["Deviation"]
        predicted_branch = prediction["Predicted_Branch"]

        # Compare branch identifiers as strings on both sides so the lookup works
        # whether the GeoJSON stores 'Branch' as text or as a number.
        branch_rows = branches_df[branches_df["Branch"].astype(str) == str(predicted_branch)]
        if branch_rows.empty:
            print(f'Branch {predicted_branch} not found in branches_df for plate {plate} on {day}; skipping')
            continue
        branch_df = branch_rows.iloc[0]

        # Clean the GPS points only once we know there is something to compare them against
        plate_date_df = plate_df[plate_df['Time'].dt.date == day]
        clean_df = clean_gps_data(plate_date_df, rounding_precision, time_diff_threshold, closer_threshold)

        # Percentile of this deviation among all predicted deviations
        percentile = percentileofscore(all_deviations, deviation)
        print(f'The bus with plate {plate} on the day {day} was on the {percentile}% percentile') # Bigger is worse
        # Map
        m = comparing_map(clean_df, branch_df, "Maps/trash.html", f'Placa: {plate}. Día: {day}. Variante predicha: {predicted_branch}. Discrepancia: {deviation}. Percentil de la discrepancia: {round(percentile,2)}')
m

The bus with plate 50234 on the day 2023-02-27 was on the 28.169014084507044% percentile
