In [2]:
import os
from collections import Counter
from datetime import datetime, timedelta

import folium
import numpy as np
import pandas as pd
import psycopg2
from geopy.distance import geodesic
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances_argmin_min
from sqlalchemy import create_engine

In [None]:
# Bus line identifiers (values of the `linha` column) iterated by the per-line
# loops of this notebook. Kept as strings, in lexicographic order.
USED_BUS_LINES = [
    '100', '108', '232', '2336', '2803', '292', '298', '3',
    '309', '315', '324', '328', '343', '355', '371', '388',
    '397', '399', '415', '422', '457', '483', '497', '550',
    '553', '554', '557', '565', '606', '624', '629', '634',
    '638', '639', '665', '756', '759', '774', '779', '803',
    '838', '852', '864', '867', '878', '905', '917', '918',
]

In [4]:
# Days (formatted YYYYMMDD) whose GPS samples are pulled from the database.
CHOSEN_DATES = [
    '20240426',
    '20240427',
    '20240429',
]

# Line used for the single-line experiments in this notebook.
CHOSEN_TEST_LINE = '3'

In [None]:
# Connection string for the PostgreSQL database. It can be overridden with the
# DATABASE_URI environment variable so credentials do not have to live in the
# notebook; the literal fallback preserves the previous local-development setup.
# TODO: remove the hard-coded fallback once DATABASE_URI is set everywhere.
database_uri = os.environ.get(
    'DATABASE_URI',
    'postgresql://postgres:senha@localhost:5432/postgres',
)
db_engine_alchemy = create_engine(database_uri)

In [7]:
def database_query(linha, dates: list, engine) -> pd.DataFrame:
    """Load the GPS samples of one bus line for the given days.

    One SELECT is generated per date and the SELECTs are combined with
    ``UNION ALL``. Each SELECT keeps rows of ``gps`` that:

    * have no ``garage`` row within ``200 / 111320.0`` of their geometry
      (presumably 200 m expressed in degrees - confirm the SRID of ``geom``),
    * belong to ``linha``,
    * fall on the requested date with an hour between 9 and 18 inclusive.

    ``linha`` and the dates are sent as bound parameters (psycopg2 ``%(name)s``
    style) instead of being interpolated into the SQL text, so unusual values
    cannot alter the statement.

    Args:
        linha: Line identifier matched against ``gps.linha``; coerced to ``str``.
        dates: Iterable of dates accepted by PostgreSQL's cast to ``date``
            (the notebook uses ``'YYYYMMDD'`` strings).
        engine: Connectable handed to ``pandas.read_sql``.

    Returns:
        DataFrame with columns ``ordem``, ``latitude``, ``longitude``,
        ``datahora_ts`` and ``velocidade``. If ``dates`` is empty, an empty
        frame with those columns is returned without querying the database.
    """
    dates = list(dates)
    if not dates:
        return pd.DataFrame(
            columns=['ordem', 'latitude', 'longitude', 'datahora_ts', 'velocidade']
        )

    params = {'linha': str(linha)}
    queries = []
    for position, date in enumerate(dates):
        # One named parameter per date so every SELECT can be bound separately.
        date_key = f'date_{position}'
        params[date_key] = str(date)
        queries.append(
            "SELECT ordem, dg.latitude, dg.longitude, datahora_ts, velocidade "
            "FROM gps dg "
            "LEFT JOIN garage cg "
            "ON ST_DWithin(cg.geom, dg.geom, 200 / 111320.0) "
            "WHERE cg.geom IS NULL "
            "AND dg.linha = %(linha)s "
            f"AND dg.datahora_ts::date = CAST(%({date_key})s AS date) "
            "AND EXTRACT (HOUR FROM dg.datahora_ts) BETWEEN 9 AND 18"
        )

    union_all_query = ' UNION ALL '.join(queries)
    return pd.read_sql(union_all_query, con=engine, params=params)

In [8]:
# Pull the GPS samples of the test line for the chosen days and display them.
df = database_query(
    linha=CHOSEN_TEST_LINE,
    dates=CHOSEN_DATES,
    engine=db_engine_alchemy,
)
df

Unnamed: 0,ordem,latitude,longitude,datahora_ts,velocidade
0,D53514,-22.90228,-43.55031,2024-04-26 09:00:20,24
1,D53514,-22.90046,-43.55028,2024-04-26 09:00:51,37
2,D53515,-22.88349,-43.49175,2024-04-26 09:02:45,0
3,D53515,-22.88348,-43.49168,2024-04-26 09:03:15,9
4,D53515,-22.88346,-43.49065,2024-04-26 09:03:46,0
...,...,...,...,...,...
3730,D53632,-22.91011,-43.56530,2024-04-29 18:58:23,0
3731,D53586,-22.88155,-43.49958,2024-04-29 18:58:43,33
3732,D53632,-22.91010,-43.56530,2024-04-29 18:58:54,0
3733,D53632,-22.91010,-43.56529,2024-04-29 18:59:25,0


In [None]:
def get_garage_points(engine):
    """Return every row of the ``garage`` table as ``(latitude, longitude)`` tuples.

    Args:
        engine: Connectable handed to ``pandas.read_sql``.

    Returns:
        A list with one ``(latitude, longitude)`` tuple per garage row, in the
        order the rows were returned by the query.
    """
    garages = pd.read_sql('SELECT latitude, longitude FROM garage', con=engine)
    coordinates = garages[['latitude', 'longitude']]
    return list(coordinates.itertuples(index=False, name=None))

In [10]:
# Garage coordinates, drawn later as red circles on the per-line stop maps.
garage_points = get_garage_points(engine=db_engine_alchemy)

In [11]:
# Order the samples per vehicle (`ordem`) and then chronologically.
# Re-assignment is used instead of `inplace=True`, which pandas discourages.
df = df.sort_values(by=['ordem', 'datahora_ts'])

In [None]:
def filter_df(df: pd.DataFrame, ordem_rank: int = 14,
              window_minutes: float = 10, max_distance_m: float = 10) -> pd.DataFrame:
    """Flag stationary periods for a single vehicle (first version of the filter).

    Only zero-speed samples are considered. Vehicles are ranked by their number
    of zero-speed samples and the one at position ``ordem_rank`` (0 = most
    samples) is analysed. Its samples are scanned chronologically: for the
    current sample, the window is every sample whose timestamp lies within
    ``window_minutes`` after it (bounds inclusive). If every sample of the
    window is at most ``max_distance_m`` metres (geodesic) from the window's
    first sample, the whole window is flagged and the scan jumps past it;
    otherwise the scan advances by one sample.

    NOTE(review): a window that contains only its own first sample trivially
    satisfies the distance check, so isolated zero-speed samples are flagged
    too. Confirm whether that is intended.

    Args:
        df: Frame with ``ordem``, ``latitude``, ``longitude``, ``datahora_ts``
            and ``velocidade`` columns.
        ordem_rank: Position of the vehicle to analyse in the zero-speed
            sample ranking. Defaults to the previously hard-coded 14.
        window_minutes: Length of the look-ahead window, in minutes.
        max_distance_m: Maximum distance from the window's first sample, in metres.

    Returns:
        The selected vehicle's zero-speed samples with a boolean ``is_stop``
        column. The two ``reset_index()`` calls keep the earlier indexes as the
        ``index`` and ``level_0`` columns.

    Raises:
        IndexError: If fewer than ``ordem_rank + 1`` vehicles have zero-speed samples.
    """
    stopped = df[df['velocidade'] == 0]

    ordem_counts = stopped['ordem'].value_counts()
    try:
        chosen_ordem = ordem_counts.index[ordem_rank]
    except IndexError as exc:
        raise IndexError(
            f"ordem_rank={ordem_rank} is out of range: only {len(ordem_counts)} "
            "vehicle(s) have zero-speed samples"
        ) from exc

    filtered_df = stopped[stopped['ordem'] == chosen_ordem].reset_index()
    filtered_df = filtered_df.sort_values(by=['ordem', 'datahora_ts']).reset_index()
    filtered_df['datahora_ts'] = pd.to_datetime(filtered_df['datahora_ts'])

    # Geodesic distance in metres between two rows.
    def calculate_distance(row1, row2):
        point1 = (row1['latitude'], row1['longitude'])
        point2 = (row2['latitude'], row2['longitude'])
        return geodesic(point1, point2).meters

    filtered_df['is_stop'] = False

    window_size = timedelta(minutes=window_minutes)
    i = 0
    while i < len(filtered_df):
        start_time = filtered_df.loc[i, 'datahora_ts']
        in_window = filtered_df['datahora_ts'].between(start_time, start_time + window_size)
        window = filtered_df[in_window]
        initial_point = window.iloc[0]

        if all(calculate_distance(initial_point, row) <= max_distance_m
               for _, row in window.iterrows()):
            filtered_df.loc[window.index, 'is_stop'] = True
            # Skip the samples that were just flagged.
            i += len(window.index)
        else:
            i += 1

    return filtered_df


filtered_df = filter_df(df)
# Number of rows flagged as part of a stop (same value the previous
# `len((filtered_df[filtered_df['is_stop']]==False).index)` expression produced).
print(int(filtered_df['is_stop'].sum()))
filtered_df

7


Unnamed: 0,level_0,index,ordem,latitude,longitude,datahora_ts,velocidade,is_stop
0,0,2035,D53625,-22.88351,-43.49067,2024-04-29 09:04:28,0,False
1,1,2039,D53625,-22.88086,-43.50895,2024-04-29 09:08:05,0,True
2,2,2045,D53625,-22.90701,-43.56827,2024-04-29 09:24:33,0,False
3,3,2046,D53625,-22.90701,-43.56826,2024-04-29 09:25:04,0,False
4,4,2054,D53625,-22.91056,-43.58365,2024-04-29 09:29:09,0,False
5,5,2058,D53625,-22.91372,-43.59711,2024-04-29 09:31:13,0,False
6,6,2061,D53625,-22.9141,-43.60109,2024-04-29 09:32:46,0,True
7,7,2062,D53625,-22.9141,-43.6011,2024-04-29 09:33:17,0,True
8,8,2063,D53625,-22.9141,-43.60109,2024-04-29 09:33:48,0,True
9,9,2064,D53625,-22.9141,-43.60109,2024-04-29 09:34:18,0,True


In [None]:
def filter_df_v2(df: pd.DataFrame, window_minutes: float = 5,
                 tolerance_minutes: float = 2, max_distance_m: float = 10) -> pd.DataFrame:
    """Keep only the samples that belong to a prolonged stop, for every vehicle.

    Samples are sorted by vehicle (``ordem``) and time. For each vehicle the
    samples are scanned in order: the window of the current sample is every
    sample of that vehicle within ``window_minutes`` after it (bounds
    inclusive). The window is flagged as a stop when

    * the time between its first and last sample is at least
      ``window_minutes - tolerance_minutes``, and
    * its first and last sample are at most ``max_distance_m`` metres apart
      (geodesic distance; intermediate samples are not checked).

    After a flagged window the scan jumps past it; otherwise it advances by one
    sample. The last sample of a vehicle never starts a window.

    Unlike the previous implementation, the caller's frame is not modified:
    the timestamp conversion is applied to a new frame.

    Args:
        df: Frame with ``ordem``, ``latitude``, ``longitude`` and
            ``datahora_ts`` columns.
        window_minutes: Length of the look-ahead window, in minutes.
        tolerance_minutes: How much shorter than the window a stop may be.
        max_distance_m: Maximum first-to-last distance, in metres.

    Returns:
        A new frame with the flagged rows only. It carries the pre-sort index
        in an ``index`` column and an ``is_stop`` column that is always True.
    """
    # `assign` builds a new frame, so the argument is left untouched.
    df = df.assign(datahora_ts=pd.to_datetime(df['datahora_ts']))
    df = df.sort_values(by=['ordem', 'datahora_ts']).reset_index()

    # Geodesic distance in metres between two rows.
    def calculate_distance(row1, row2):
        point1 = (row1['latitude'], row1['longitude'])
        point2 = (row2['latitude'], row2['longitude'])
        return geodesic(point1, point2).meters

    window_size = timedelta(minutes=window_minutes)
    min_duration = window_size - timedelta(minutes=tolerance_minutes)

    # Index labels of every row that falls in a flagged window.
    stop_indices = []

    # After the sort + reset_index each vehicle occupies a contiguous index range.
    for _, df_slice in df.groupby('ordem', sort=False):
        i = df_slice.index[0]
        last_index = df_slice.index[-1]
        while i < last_index:
            start_time = df_slice.loc[i, 'datahora_ts']
            end_time = start_time + window_size
            window = df_slice[(df_slice['datahora_ts'] >= start_time)
                              & (df_slice['datahora_ts'] <= end_time)]
            initial_point = window.iloc[0]
            final_point = window.iloc[-1]

            # Cheap duration test first; the geodesic computation runs only if needed.
            long_enough = final_point['datahora_ts'] - initial_point['datahora_ts'] >= min_duration
            if long_enough and calculate_distance(initial_point, final_point) <= max_distance_m:
                stop_indices.extend(window.index)
                i += len(window.index)
            else:
                i += 1

    df['is_stop'] = df.index.isin(stop_indices)

    # Return an independent copy so callers can add columns without warnings.
    return df[df['is_stop']].copy()


filtered_df = filter_df_v2(df)
filtered_df

Unnamed: 0,index,ordem,latitude,longitude,datahora_ts,velocidade,is_stop
2935,1015,D53748,-22.88030,-43.50119,2024-04-27 10:06:35,0,True
2936,1017,D53748,-22.88031,-43.50119,2024-04-27 10:07:06,0,True
2937,1019,D53748,-22.88031,-43.50119,2024-04-27 10:07:36,0,True
2938,1021,D53748,-22.88031,-43.50119,2024-04-27 10:08:07,0,True
2939,1023,D53748,-22.88031,-43.50119,2024-04-27 10:08:38,0,True
...,...,...,...,...,...,...,...
3560,2504,D53928,-22.88521,-43.62017,2024-04-29 13:20:12,0,True
3561,2505,D53928,-22.88521,-43.62017,2024-04-29 13:20:42,0,True
3562,2508,D53928,-22.88520,-43.62017,2024-04-29 13:21:13,0,True
3563,2537,D53928,-22.88520,-43.62017,2024-04-29 13:21:44,0,True


In [None]:
def plot_trajectories(df: pd.DataFrame):
    """Draw every GPS sample of ``df`` on a folium map and save it as HTML.

    Samples with ``velocidade > 0`` are blue, all others red. Each marker's
    popup shows the speed, the time of day and the coordinates. The map is
    written to ``maps/trajectory_<CHOSEN_TEST_LINE>.html`` (the file name comes
    from the notebook-level ``CHOSEN_TEST_LINE``); the ``maps`` directory is
    created if it does not exist.

    Args:
        df: Frame with ``ordem``, ``latitude``, ``longitude``, ``velocidade``
            and a datetime ``datahora_ts`` column.
    """
    df = df.sort_values(by=['ordem','datahora_ts']).reset_index(drop=True)

    # Create the map centred on the mean location.
    map_center = [df['latitude'].mean(), df['longitude'].mean()]
    m = folium.Map(location=map_center, zoom_start=15)

    # NOTE(review): the 14th vehicle id is only printed, never used to filter
    # the plotted points - confirm whether a per-vehicle plot was intended.
    # The lookup is guarded so frames with fewer vehicles no longer raise.
    ordens = df['ordem'].unique()
    if len(ordens) > 13:
        choosen_order = ordens[13]
        print(choosen_order)

    for i, row in df.iterrows():
        point = [row['latitude'], row['longitude']]
        popup_text = f"Velocidade: {row['velocidade']} km/h<br>Hora: {row['datahora_ts'].strftime('%H:%M:%S')}<br>Coords:{point}"

        color = 'blue' if row['velocidade'] > 0 else 'red'

        folium.CircleMarker(location=point, radius=5, color=color, fill=True, fill_color=color, popup=popup_text).add_to(m)

    os.makedirs('maps', exist_ok=True)
    m.save(f'maps/trajectory_{CHOSEN_TEST_LINE}.html')

In [None]:
def calculate_final_stops_v2(df: pd.DataFrame, radiusInMeters: int = 100,
                             top_n: int = 3, min_samples: int = 10):
    """Cluster stop samples with DBSCAN and return the medoids of the largest clusters.

    Coordinates are converted to radians and clustered with the haversine
    metric; ``eps`` is ``radiusInMeters`` divided by the Earth radius
    (6,371,000 m). Points labelled ``-1`` (noise) are excluded *before* the
    largest clusters are selected, so noise can no longer take one of the
    ``top_n`` slots. The caller's frame is not modified; labels are attached to
    a copy.

    For each selected cluster the medoid is the member with the smallest sum of
    Euclidean distances (in degrees of latitude/longitude) to all other members.
    NOTE(review): the distance matrix is quadratic in the cluster size.

    Args:
        df: Frame with ``latitude`` and ``longitude`` columns in degrees.
        radiusInMeters: DBSCAN neighbourhood radius, in metres.
        top_n: Number of largest clusters to report.
        min_samples: DBSCAN ``min_samples``.

    Returns:
        List of ``(rank, (latitude, longitude), count)`` tuples, rank starting
        at 1 for the largest cluster. Empty if ``df`` is empty or every point
        is noise.
    """
    print(f"Top {top_n} most selected areas:")
    if df.empty:
        # DBSCAN cannot be fitted on zero samples.
        return []

    epsilon = radiusInMeters / 6371000  # Earth radius in metres -> angle in radians

    # DBSCAN clustering
    coords = df[['latitude', 'longitude']].to_numpy()
    db = DBSCAN(eps=epsilon, min_samples=min_samples, metric='haversine').fit(np.radians(coords))

    clustered = df.assign(cluster=db.labels_)

    # Number of points in each real cluster (noise label -1 excluded).
    cluster_counts = Counter(label for label in clustered['cluster'] if label != -1)
    top_clusters = cluster_counts.most_common(top_n)

    centroids_result = []
    for rank, (cluster_id, count) in enumerate(top_clusters, start=1):
        cluster_points = clustered[clustered['cluster'] == cluster_id]
        cluster_coords = cluster_points[['latitude', 'longitude']]

        dist_matrix = cdist(cluster_coords, cluster_coords)
        total_distances = np.sum(dist_matrix, axis=1)
        medoid_point = cluster_points.iloc[np.argmin(total_distances)]

        # Plain floats keep the printed / serialised result free of numpy wrappers.
        medoid_lat = float(medoid_point['latitude'])
        medoid_lon = float(medoid_point['longitude'])
        print(f"Cluster {cluster_id}: Medoid ({medoid_lat}, {medoid_lon}), Count: {count}")
        centroids_result.append((rank, (medoid_lat, medoid_lon), count))
    return centroids_result
    

In [17]:
# Estimate the stops of the test line only, using the function's default radius.
bus_stops_per_line = {CHOSEN_TEST_LINE: calculate_final_stops_v2(filtered_df)}

Top 3 most selected areas:
Cluster 1: Medoid (-22.89736, -43.48696), Count: 60
Cluster 2: Medoid (-22.86142, -43.54884), Count: 20
Cluster 0: Medoid (-22.9009, -43.55199), Count: 10


In [None]:
# Estimated stops per line: {linha: [(cluster_order, (lat, lon), count), ...]}.
bus_stops_per_line = {}
STOP_RADIUS = 30            # clustering radius in metres; also read when drawing the stop maps
MAX_CLUSTER_POINTS = 80000  # cap on points sent to clustering (presumably to bound its cost)
SAMPLE_SEED = 42            # fixed seed so the down-sampling is reproducible

for linha in USED_BUS_LINES:
    print(f'[{linha}]Querying database...')
    df = database_query(linha, CHOSEN_DATES , db_engine_alchemy)
    print(f'[{linha}]Raw dataframe size: {len(df.index)}')

    print(f'[{linha}]Filtering dataframe...')
    filtered_df = filter_df_v2(df)

    if len(filtered_df.index) > MAX_CLUSTER_POINTS:
        filtered_df = filtered_df.sample(n=MAX_CLUSTER_POINTS, random_state=SAMPLE_SEED)
    print(f'[{linha}]Filtered dataframe size: {len(filtered_df.index)}')

    if filtered_df.empty:
        # Nothing to cluster for this line; record it and keep the batch running.
        print(f'[{linha}]No stop points found, skipping clustering')
        bus_stops_per_line[linha] = []
        print('-'*50)
        continue

    print(f'[{linha}]Clustering points...')
    bus_stops_per_line[linha] = calculate_final_stops_v2(filtered_df, radiusInMeters=STOP_RADIUS)
    print('-'*50)

[100]Querying database...
[100]Raw dataframe size: 117366
[100]Filtering dataframe...
[100]Filtered dataframe size: 239
[100]Clustering points...
Top 3 most selected areas:
Cluster 12: Medoid (-22.98314, -43.19232), Count: 34
Cluster 9: Medoid (-22.9497, -43.17788), Count: 21
Cluster 5: Medoid (-22.96391, -43.17844), Count: 20
--------------------------------------------------
[108]Querying database...
[108]Raw dataframe size: 37667
[108]Filtering dataframe...
[108]Filtered dataframe size: 1133
[108]Clustering points...
Top 3 most selected areas:
Cluster 0: Medoid (-22.98162, -43.21336), Count: 1000
Cluster 4: Medoid (-22.91424, -43.19639), Count: 43
Cluster 2: Medoid (-22.98496, -43.21293), Count: 30
--------------------------------------------------
[232]Querying database...
[232]Raw dataframe size: 74959
[232]Filtering dataframe...
[232]Filtered dataframe size: 235
[232]Clustering points...
Top 3 most selected areas:
Cluster 1: Medoid (-22.90792, -43.27829), Count: 156
Cluster 4: Me

In [None]:
# Dump the raw result: {linha: [(cluster_order, (latitude, longitude), cluster_count), ...]}.
print(bus_stops_per_line)

{'774': [(1, (-22.77659, -43.28654), 60), (2, (-22.75386, -43.29518), 50), (3, (-22.80963, -43.30886), 20), (4, (-22.82537, -43.3127), 20)]}


In [21]:
# Flatten {linha: [(cluster_order, (latitude, longitude), cluster_count), ...]}
# into one row per estimated stop, write it to CSV and show it.
result_columns = ['linha', 'cluster_order', 'latitude', 'longitude', 'cluster_count']
result_rows = [
    (linha, cluster_order, latitude, longitude, cluster_count)
    for linha, info_list in bus_stops_per_line.items()
    for cluster_order, (latitude, longitude), cluster_count in info_list
]
# Column-oriented dict, one list per column.
df_results_dict = {
    column: [row[position] for row in result_rows]
    for position, column in enumerate(result_columns)
}
df_results = pd.DataFrame.from_dict(df_results_dict)
df_results.to_csv('calculated_bus_stops.csv', index=False)
display(df_results)

Unnamed: 0,linha,cluster_order,latitude,longitude,cluster_count
0,100,1,-22.98314,-43.19232,34
1,100,2,-22.94970,-43.17788,21
2,100,3,-22.96391,-43.17844,20
3,108,1,-22.98162,-43.21336,1000
4,108,2,-22.91424,-43.19639,43
...,...,...,...,...,...
130,917,2,-22.88654,-43.30307,10
131,917,3,-22.88768,-43.28417,10
132,918,1,-22.86980,-43.25853,51
133,918,2,-22.89677,-43.48756,20


In [None]:
def create_single_bus_stops_map(df: pd.DataFrame, garage_points, line_choosen):
    """Render the estimated stops of one line, plus the garages, to an HTML map.

    Garages are drawn as red circles of radius 100. Each row of ``df`` gets a
    blue marker with a popup describing it and a blue circle whose radius is
    the notebook-level ``STOP_RADIUS``. The map is saved once, after all layers
    are added, to ``maps/bus_stops/bus_stops_map_<line_choosen>.html``; the
    directory is created if it does not exist.

    Args:
        df: Frame with ``linha``, ``cluster_order``, ``cluster_count``,
            ``latitude`` and ``longitude`` columns.
        garage_points: Iterable of sequences whose first two items are the
            garage latitude and longitude.
        line_choosen: Line identifier used in the output file name.
    """
    map_center = [df['latitude'].mean(), df['longitude'].mean()]
    folium_map = folium.Map(location=map_center, zoom_start=12)

    for item in garage_points:
        folium.Circle(
            location=(item[0], item[1]),
            radius=100,
            color='red',
            fill=True,
            fill_color='red',
            fill_opacity=0.2
        ).add_to(folium_map)

    # Add one marker per estimated stop.
    for _, row in df.iterrows():
        stop_location = (row['latitude'], row['longitude'])
        popup_text = f"linha: {row['linha']}<br>cluster_order: {row['cluster_order']}<br>cluster_count: {row['cluster_count']}<br>Location:({row['latitude']}, {row['longitude']})"
        folium.Marker(
            location=stop_location,
            popup=folium.Popup(popup_text, max_width=300),
            icon=folium.Icon(color='blue')
        ).add_to(folium_map)
        # Add a circle representing the clustering radius.
        folium.Circle(
            location=stop_location,
            radius=STOP_RADIUS,
            color='blue',
            fill=True,
            fill_color='blue',
            fill_opacity=0.2
        ).add_to(folium_map)

    # Save to an HTML file once, after every layer has been added
    # (previously the file was rewritten on every row of the loop).
    output_dir = 'maps/bus_stops'
    os.makedirs(output_dir, exist_ok=True)
    folium_map.save(f"{output_dir}/bus_stops_map_{line_choosen}.html")


In [32]:
# Render one map per line, keeping only the two largest clusters of each line.
for linha in USED_BUS_LINES:
    belongs_to_line = df_results['linha'] == linha
    # Keep only rows whose cluster_order is less than or equal to 2.
    is_top_cluster = df_results['cluster_order'] <= 2
    df_sliced = df_results[belongs_to_line & is_top_cluster]

    if not df_sliced.empty:
        create_single_bus_stops_map(df_sliced, garage_points, linha)

In [31]:
# Inspect `df_sliced` as left by the last iteration of the per-line map loop.
print(df_sliced)

    linha  cluster_order  latitude  longitude  cluster_count
132   918              1 -22.86980  -43.25853             51
133   918              2 -22.89677  -43.48756             20
134   918              3 -22.87162  -43.25840             18


In [None]:
# Columns of the `stops` table, in the order they appear in the CSV file.
TABLE_COLUMNS = ['linha', 'cluster_order', 'latitude', 'longitude', 'cluster_count']

# The password can be supplied through the PGPASSWORD environment variable;
# the literal fallback preserves the previous behaviour.
# TODO: remove the hard-coded fallback once PGPASSWORD is set everywhere.
conn = psycopg2.connect(
    host='localhost',
    database='postgres',
    user='postgres',
    password=os.environ.get('PGPASSWORD', 'skyping1'),
)


def copy_data_from_csv(csv_file_path, conn):
    """Bulk-load a CSV file (with header row) into the ``stops`` table via COPY.

    The transaction is committed on success. On any exception the error is
    printed and the transaction is rolled back; the exception is not re-raised.
    The cursor is closed in both cases.

    Args:
        csv_file_path: Path of the CSV file; its columns must follow ``TABLE_COLUMNS``.
        conn: Open psycopg2 connection.
    """
    col_order = ', '.join(map(str, TABLE_COLUMNS))
    copy_sql = f"COPY stops ({col_order}) FROM STDIN DELIMITER ',' CSV HEADER"
    try:
        with open(csv_file_path, 'r') as f, conn.cursor() as cursor:
            cursor.copy_expert(copy_sql, f)
        conn.commit()
        print(f"Data loaded from {csv_file_path} successfully")
    except Exception as e:
        print(f"Error: {e}")
        conn.rollback()


# Load the file this notebook writes with `df_results.to_csv(...)`; the previous
# name ('estimated_bus_stops.csv') is not produced anywhere in the notebook.
copy_data_from_csv('calculated_bus_stops.csv', conn)