In [1]:
import os

# Change directory to your target path
#os.chdir('/Users/carolinarutilidelima/Documents/OD_Sao_Paulo/')

os.chdir('/home/carolima/Documents/GitHub/OD_Sao_Paulo')


In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = './output_data/top_30_station_pairs.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to confirm it was loaded correctly
df.head()

Unnamed: 0,closest_source_station_ID,closest_dest_station_ID,count,source_lat,source_lon,dest_lat,dest_lon
0,157,84,8412,-23.539144,-46.62193,-23.505072,-46.859987
1,84,157,7578,-23.505072,-46.859987,-23.539144,-46.62193
2,120,226,7031,-23.545626,-46.734611,-23.481354,-46.46427
3,241,115,5384,-23.535974,-46.30188,-23.633616,-46.669118
4,226,28,5052,-23.481354,-46.46427,-23.494149,-46.764421


In [3]:
from geopy.distance import geodesic

# Calculate distance between source and destination
def calculate_distance(row):
    source_coords = (row["source_lat"], row["source_lon"])
    dest_coords = (row["dest_lat"], row["dest_lon"])
    return geodesic(source_coords, dest_coords).kilometers

# Add a new column for distance
df["distance_km"] = df.apply(calculate_distance, axis=1)

# Filter out rows where the distance is less than 30km
filtered_df = df[df["distance_km"] >= 30]

# Display the filtered dataframe
filtered_df

Unnamed: 0,closest_source_station_ID,closest_dest_station_ID,count,source_lat,source_lon,dest_lat,dest_lon,distance_km
3,241,115,5384,-23.535974,-46.30188,-23.633616,-46.669118,39.014723
4,226,28,5052,-23.481354,-46.46427,-23.494149,-46.764421,30.693345
5,110,195,4638,-23.478594,-46.51092,-23.536484,-46.8178,31.992267
6,157,232,4311,-23.539144,-46.62193,-23.447804,-46.3274,31.74027
8,195,110,4004,-23.536484,-46.8178,-23.478594,-46.51092,31.992267
11,191,110,3575,-23.500174,-46.82034,-23.478594,-46.51092,31.697237
13,110,191,3566,-23.478594,-46.51092,-23.500174,-46.82034,31.697237
14,157,233,3566,-23.539144,-46.62193,-23.442774,-46.29218,35.333814
15,157,241,3477,-23.539144,-46.62193,-23.535974,-46.30188,32.682845
16,232,157,3368,-23.447804,-46.3274,-23.539144,-46.62193,31.74027


In [4]:
import googlemaps
import populartimes


API_KEY = 'APIKEY'
gmaps = googlemaps.Client(key=API_KEY)


In [5]:
def get_popular_times_for_location(lat, lon):
    # Use the Places API to find nearby places with a larger radius
    places_result = gmaps.places_nearby(location=(lat, lon), radius=2000, type='establishment')
    
    # Iterate through the results to find a place with Popular Times data
    for place in places_result['results']:
        place_id = place['place_id']
        print(f"Checking Place ID: {place_id}")
        
        # Use populartimes library to get the popular times
        try:
            popular_times_data = populartimes.get_id(API_KEY, place_id)
            # Extract popular times information if available
            if 'populartimes' in popular_times_data:
                # Aggregate popularity data across all days
                hourly_popularity = [0] * 24
                for day_data in popular_times_data['populartimes']:
                    for hour, popularity in enumerate(day_data['data']):
                        hourly_popularity[hour] += popularity
                
                # Sort the hours based on aggregated popularity in descending order
                top_5_hours = sorted(range(24), key=lambda h: hourly_popularity[h], reverse=True)[:5]
                return top_5_hours
        except Exception as e:
            print(f"An error occurred while retrieving popular times data for place ID {place_id}: {e}")
    
    return None  # If no popular times data found



In [6]:
# Assuming df is the DataFrame loaded with the top 5 pairs data
output_data = []

for _, row in df.iterrows():  # limit to top 5 rows as per your requirement
    source_name = row['closest_source_station_ID']
    source_lat = row['source_lat']
    source_lon = row['source_lon']

    dest_name = row['closest_dest_station_ID']
    dest_lat = row['dest_lat']
    dest_lon = row['dest_lon']
    
    # Get the top 5 popular hours for this source
    top_5_source_hours = get_popular_times_for_location(source_lat, source_lon)

    # Get the top 5 popular hours for this destination
    top_5_dest_hours = get_popular_times_for_location(dest_lat, dest_lon)
    
    record = {
        'source': source_name,
        'source_lat': source_lat,
        'source_lon': source_lon,
        'destination': dest_name,
        'dest_lat': dest_lat,
        'dest_lon': dest_lon
    }

    if top_5_source_hours:
        for i, hour in enumerate(top_5_source_hours):
            record[f'source_hour{i+1}'] = hour
    
    if top_5_dest_hours:
        for i, hour in enumerate(top_5_dest_hours):
            record[f'dest_hour{i+1}'] = hour
    
    output_data.append(record)

# Create a DataFrame from the collected data
output_df_time = pd.DataFrame(output_data)

Checking Place ID: ChIJ0WGkg4FEzpQRrlsz_whLqZs
Checking Place ID: ChIJT7dJ01BYzpQRsuRSVDrKxPE
Checking Place ID: ChIJ7QKEpuNYzpQRz_I-_KF58iI
Checking Place ID: ChIJVdaCo1BYzpQR0nq7mStbVQ8
Checking Place ID: ChIJowq2_VtYzpQRMtacqu64abI
Checking Place ID: ChIJT7dJ01BYzpQRlGT63UWOzvs
Checking Place ID: ChIJVSdGDVBYzpQRktMjaPyYOS4
Checking Place ID: ChIJr5ZxTFdYzpQRpdx-V05M5-8
Checking Place ID: ChIJL3X1BVFYzpQRUbqb77lGQIo
Checking Place ID: ChIJk9QTEFBYzpQRZy2-yLs6DKE
Checking Place ID: ChIJ58ycDVBYzpQRS0OS9a9Q5Bc
Checking Place ID: ChIJAT0Lb6tZzpQRokvHm9Fi93U
Checking Place ID: ChIJiYgGyFBYzpQRsIuA6NJTUGc
Checking Place ID: ChIJlTMYmFBYzpQRWLWsNQ8fAoQ
Checking Place ID: ChIJych0KVxYzpQRu8lLqxf95i4
Checking Place ID: ChIJLbPnS-RYzpQR8bJP7OjVzMo
Checking Place ID: ChIJlbkdat0Dz5QRoqnk1jaIVHA
Checking Place ID: ChIJB-sBuDwCz5QRPHlWuocSWms
Checking Place ID: ChIJNytetjsCz5QRuVhQxq7aJZ8
Checking Place ID: ChIJV-0aZDwCz5QR_XJdF_e4wV4
Checking Place ID: ChIJDTYQ2tIDz5QR2VmNZYVH3ZM
Checking Plac

In [7]:
output_df_time

Unnamed: 0,source,source_lat,source_lon,destination,dest_lat,dest_lon,source_hour1,source_hour2,source_hour3,source_hour4,source_hour5,dest_hour1,dest_hour2,dest_hour3,dest_hour4,dest_hour5
0,157.0,-23.539144,-46.62193,84.0,-23.505072,-46.859987,12,13,11,10,14,14,15,16,13,12
1,84.0,-23.505072,-46.859987,157.0,-23.539144,-46.62193,14,15,16,13,12,12,13,11,10,14
2,120.0,-23.545626,-46.734611,226.0,-23.481354,-46.46427,11,10,9,8,7,11,12,10,13,14
3,241.0,-23.535974,-46.30188,115.0,-23.633616,-46.669118,18,19,14,17,16,11,12,13,10,14
4,226.0,-23.481354,-46.46427,28.0,-23.494149,-46.764421,11,12,10,13,14,13,12,11,14,10
5,110.0,-23.478594,-46.51092,195.0,-23.536484,-46.8178,16,15,17,14,13,15,16,17,8,9
6,157.0,-23.539144,-46.62193,232.0,-23.447804,-46.3274,12,13,11,10,14,14,11,13,10,15
7,91.0,-23.557378,-46.682599,84.0,-23.505072,-46.859987,13,14,15,12,16,14,15,16,13,12
8,195.0,-23.536484,-46.8178,110.0,-23.478594,-46.51092,15,16,17,8,9,16,15,17,14,13
9,134.0,-23.713856,-46.414621,115.0,-23.633616,-46.669118,13,14,12,15,16,11,12,13,10,14


In [8]:
# Optionally, save to a CSV file
output_df_time.to_csv('output_data/output_df_time.csv', index=False)