In [1]:
import os

# Change directory to your target path
os.chdir('/Users/carolinarutilidelima/Documents/OD_DC/')

#os.chdir('/home/carolima/Documents/GitHub/OD_Sao_Paulo')


In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
file_path = './output_data/top_50_station_pairs.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to confirm it was loaded correctly
df.head()

Unnamed: 0,closest_source_station_Index,closest_dest_station_Index,count,PULat,PULon,DOLat,DOLon
0,5,44,10388,40.677833,-73.785667,40.694833,-74.117167
1,23,17,5449,40.764667,-74.000833,40.576167,-74.207167
2,5,49,5128,40.677833,-73.785667,40.612333,-74.153833
3,5,49,5128,40.677833,-73.785667,40.612333,-74.153833
4,46,44,5119,40.891833,-73.895333,40.694833,-74.117167


In [3]:
from geopy.distance import geodesic

# Calculate distance between source and destination
def calculate_distance(row):
    source_coords = (row["PULat"], row["PULon"])
    dest_coords = (row["DOLat"], row["DOLon"])
    return geodesic(source_coords, dest_coords).kilometers

# Add a new column for distance
df["distance_km"] = df.apply(calculate_distance, axis=1)

# Filter out rows where the distance is less than 30km
filtered_df = df[df["distance_km"] >= 30]

# Display the filtered dataframe
filtered_df

Unnamed: 0,closest_source_station_Index,closest_dest_station_Index,count,PULat,PULon,DOLat,DOLon,distance_km
2,5,49,5128,40.677833,-73.785667,40.612333,-74.153833,31.979459
3,5,49,5128,40.677833,-73.785667,40.612333,-74.153833,31.979459
5,49,5,4413,40.612333,-74.153833,40.677833,-73.785667,31.979459
6,49,5,4413,40.612333,-74.153833,40.677833,-73.785667,31.979459
9,13,31,2653,40.877333,-73.828167,40.575667,-73.944,34.899708
10,31,13,2496,40.575667,-73.944,40.877333,-73.828167,34.899708
11,5,17,2408,40.677833,-73.785667,40.576167,-74.207167,37.406464
12,13,44,2174,40.877333,-73.828167,40.694833,-74.117167,31.713966
13,31,46,2136,40.575667,-73.944,40.891833,-73.895333,35.349775
14,1,49,2096,40.822667,-73.887,40.612333,-74.153833,32.463249


In [4]:
import googlemaps
import populartimes


API_KEY = 'APYKEY'
gmaps = googlemaps.Client(key=API_KEY)


In [5]:
def get_popular_times_for_location(lat, lon):
    # Use the Places API to find nearby places with a larger radius
    places_result = gmaps.places_nearby(location=(lat, lon), radius=2000, type='establishment')
    
    # Iterate through the results to find a place with Popular Times data
    for place in places_result['results']:
        place_id = place['place_id']
        print(f"Checking Place ID: {place_id}")
        
        # Use populartimes library to get the popular times
        try:
            popular_times_data = populartimes.get_id(API_KEY, place_id)
            # Extract popular times information if available
            if 'populartimes' in popular_times_data:
                # Aggregate popularity data across all days
                hourly_popularity = [0] * 24
                for day_data in popular_times_data['populartimes']:
                    for hour, popularity in enumerate(day_data['data']):
                        hourly_popularity[hour] += popularity
                
                # Sort the hours based on aggregated popularity in descending order
                top_5_hours = sorted(range(24), key=lambda h: hourly_popularity[h], reverse=True)[:5]
                return top_5_hours
        except Exception as e:
            print(f"An error occurred while retrieving popular times data for place ID {place_id}: {e}")
    
    return None  # If no popular times data found



In [8]:
# Assuming df is the DataFrame loaded with the top 5 pairs data
output_data = []

for _, row in df.iterrows():  # limit to top 5 rows as per your requirement
    source_name = row['closest_source_station_Index']
    source_lat = row['PULat']
    source_lon = row['PULon']

    dest_name = row['closest_dest_station_Index']
    dest_lat = row['DOLat']
    dest_lon = row['DOLon']
    
    # Get the top 5 popular hours for this source
    top_5_source_hours = get_popular_times_for_location(source_lat, source_lon)

    # Get the top 5 popular hours for this destination
    top_5_dest_hours = get_popular_times_for_location(dest_lat, dest_lon)
    
    record = {
        'source': source_name,
        'source_lat': source_lat,
        'source_lon': source_lon,
        'destination': dest_name,
        'dest_lat': dest_lat,
        'dest_lon': dest_lon
    }

    if top_5_source_hours:
        for i, hour in enumerate(top_5_source_hours):
            record[f'source_hour{i+1}'] = hour
    
    if top_5_dest_hours:
        for i, hour in enumerate(top_5_dest_hours):
            record[f'dest_hour{i+1}'] = hour
    
    output_data.append(record)

# Create a DataFrame from the collected data
output_df_time = pd.DataFrame(output_data)


Checking Place ID: ChIJOwg_06VPwokRYv534QaPC8g
Checking Place ID: ChIJ_XHigN1mwokRK7RneOx0C_4
Checking Place ID: ChIJYWxh7-dmwokRNre_Fuocl_8
Checking Place ID: ChIJZaYNEehmwokR-BbeDiBKeo8
Checking Place ID: ChIJi8RA1OlmwokRorQgjXISzck
Checking Place ID: ChIJYVL1t-dmwokRJfvPlVd3ZXY
Checking Place ID: ChIJ0UPl6edmwokR6d5rYomFoEo
Checking Place ID: ChIJn3dIO-BmwokRx73btKlDuZk
Checking Place ID: ChIJi5pe3x9nwokR2pTa8hZw0_Y
Checking Place ID: ChIJzWrnsedmwokRn-Ty0VpKtfE
Checking Place ID: ChIJ0X2w58JmwokR9Zj7xGM0lAs
Checking Place ID: ChIJv1dW_SdnwokRZOt9sVf0D0Y
Checking Place ID: ChIJ01SeWNxmwokRTLvpAzQZYUA
Checking Place ID: ChIJq-msZyNnwokR-r517BiAy-M
Checking Place ID: ChIJHQ6aMnBTwokRc-T-3CrcvOE
Checking Place ID: ChIJ8yxbuJ1RwokR8Ni3sUYZOaQ
Checking Place ID: ChIJNz2gaHVRwokRLqGLxeRkZVo
Checking Place ID: ChIJOwg_06VPwokRYv534QaPC8g
Checking Place ID: ChIJf5OYzE9YwokREJzjUSSeMaI
Checking Place ID: ChIJcfmS1FFYwokR98RN442V41c
Checking Place ID: ChIJF7GMl01YwokRmRIs_AxP9r0
Checking Plac

In [9]:
output_df_time

Unnamed: 0,source,source_lat,source_lon,destination,dest_lat,dest_lon,source_hour1,source_hour2,source_hour3,source_hour4,source_hour5,dest_hour1,dest_hour2,dest_hour3,dest_hour4,dest_hour5
0,5.0,40.677833,-73.785667,44.0,40.694833,-74.117167,12.0,13.0,11.0,10.0,14.0,11.0,10.0,12.0,14.0,13.0
1,23.0,40.764667,-74.000833,17.0,40.576167,-74.207167,13.0,14.0,12.0,15.0,11.0,16.0,15.0,14.0,17.0,11.0
2,5.0,40.677833,-73.785667,49.0,40.612333,-74.153833,12.0,13.0,11.0,10.0,14.0,15.0,14.0,16.0,13.0,17.0
3,5.0,40.677833,-73.785667,49.0,40.612333,-74.153833,12.0,13.0,11.0,10.0,14.0,15.0,14.0,16.0,13.0,17.0
4,46.0,40.891833,-73.895333,44.0,40.694833,-74.117167,17.0,16.0,18.0,15.0,14.0,11.0,10.0,12.0,14.0,13.0
5,49.0,40.612333,-74.153833,5.0,40.677833,-73.785667,15.0,14.0,16.0,13.0,17.0,12.0,13.0,11.0,10.0,14.0
6,49.0,40.612333,-74.153833,5.0,40.677833,-73.785667,15.0,14.0,16.0,13.0,17.0,12.0,13.0,11.0,10.0,14.0
7,17.0,40.576167,-74.207167,23.0,40.764667,-74.000833,16.0,15.0,14.0,17.0,11.0,13.0,14.0,12.0,15.0,11.0
8,14.0,40.737,-73.770833,44.0,40.694833,-74.117167,14.0,15.0,13.0,16.0,17.0,11.0,10.0,12.0,14.0,13.0
9,13.0,40.877333,-73.828167,31.0,40.575667,-73.944,15.0,14.0,16.0,13.0,17.0,21.0,20.0,19.0,22.0,18.0


In [10]:
# Optionally, save to a CSV file
output_df_time.to_csv('output_data/output_df_time.csv', index=False)