In [None]:
import os

# Every dataset location is built relative to the parent of the current
# working directory.
data_path = os.path.join(os.pardir, "data")

# "bronze" layer: its directory and the ride-bookings CSV stored inside it.
bronze_data_path = os.path.join(data_path, "bronze")
bronze = os.path.join(bronze_data_path, "ncr_ride_bookings.csv")

# "silver" layer: the CSV file this notebook loads.
silver_data_file_path = os.path.join(
    data_path,
    "silver",
    "uber-silver-data.csv",
)

In [None]:
import pandas as pd

# Read the silver-layer CSV into memory; the trailing expression renders a
# preview of its first five rows as the cell output.
silver_df = pd.read_csv(filepath_or_buffer=silver_data_file_path)
silver_df.head(n=5)

# Geolocation

In [None]:
# Collect every distinct place name that appears as a pickup or a drop point.
# Concatenating the two columns first and de-duplicating once keeps a place
# that occurs in both columns from being listed twice, and dropna() stops
# missing values (NaN) from being handed to the geocoder as place names.
unique_locations = list(
    pd.concat(
        [silver_df["pickup_location"], silver_df["drop_location"]],
        ignore_index=True,
    )
    .dropna()
    .unique()
)

# Location name -> coordinates. None marks an entry that has not been
# geocoded yet.
loc_dict = dict.fromkeys(unique_locations)

In [None]:
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim
import time

# Shared Nominatim geocoder used by get_loc(); the request timeout is raised
# to 10 seconds here.
loc = Nominatim(timeout=10, user_agent="GetLoc")


def get_loc(location_string: str, max_retries: int = 3, delay: float = 1.5) -> tuple:
    """Geocode a place name with the module-level ``loc`` geocoder.

    The lookup is retried whenever the geocoder call raises an exception.
    The per-request timeout is not set here; it is configured where ``loc``
    is created.

    Args:
        location_string: Place name to look up.
        max_retries: Maximum number of lookup attempts.
        delay: Seconds to sleep after a failed attempt before the next one.

    Returns:
        A ``(latitude, longitude)`` pair for the match, or ``(None, None)``
        when the geocoder returns no match, when every attempt raises, or
        when ``max_retries`` is less than 1.
    """
    for attempt in range(max_retries):
        try:
            match = loc.geocode(location_string)
            if match is None:
                # A clean "no result" answer is final; retrying would not help.
                return None, None
            return match.latitude, match.longitude
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for '{location_string}': {e}")
            # Only pause when another attempt is still to come.
            if attempt < max_retries - 1:
                time.sleep(delay)

    # Reached when all attempts failed or when no attempt was allowed at all;
    # callers always receive a two-element pair, never a bare None.
    return None, None


# The public Nominatim service caps clients at one request per second, so
# consecutive lookups are spaced out by at least this many seconds.
NOMINATIM_MIN_INTERVAL_SECONDS = 1.0

# Only entries that are still None need a lookup, so re-running the cell
# resumes where it stopped and the progress bar reflects the remaining work.
pending_locations = [name for name, coords in loc_dict.items() if coords is None]

for location in tqdm(pending_locations):
    loc_dict[location] = get_loc(location)
    time.sleep(NOMINATIM_MIN_INTERVAL_SECONDS)

In [None]:
# Convert the {location: (latitude, longitude)} mapping into a three-column
# table. Entries that are still None (never geocoded, e.g. after an
# interrupted run) are expanded to a pair of missing values so that every row
# has the same two-element shape.
coords_by_location = {
    name: coords if coords is not None else (None, None)
    for name, coords in loc_dict.items()
}

loc_df = (
    pd.DataFrame.from_dict(coords_by_location, orient="index", columns=["latitude", "longitude"])
    .rename_axis("location")  # name the index so reset_index() yields a "location" column
    .reset_index()
)

# Last expression of the cell: shown with the notebook's rich table display
# rather than a plain-text print.
loc_df.head()

# Weather

In [None]:
import requests_cache
import openmeteo_requests
from retry_requests import retry
import numpy as np
import time


def get_weather_info(
    long,
    lat,
    start_date="2024-01-01",
    end_date="2024-12-30",
    max_retries=3,
    delay=60,
    hourly_variables=("temperature_2m", "rain", "snowfall", "wind_speed_10m", "wind_speed_100m"),
):
    """Fetch hourly weather for one coordinate from the Open-Meteo archive API.

    Args:
        long: Longitude sent to the API.
        lat: Latitude sent to the API.
        start_date: First day of the requested range, passed through as-is.
        end_date: Last day of the requested range, passed through as-is.
        max_retries: Maximum number of attempts made by this function (the
            underlying session additionally retries on its own).
        delay: Seconds to sleep between failed attempts.
        hourly_variables: Names of the hourly variables to request. Each name
            becomes a key of the returned dict; values are read from the
            response by position, in the same order as requested.

    Returns:
        A dict with a ``"date"`` entry (UTC ``DatetimeIndex`` covering the
        response's time range) plus one entry per requested variable, or
        ``None`` when every attempt failed.
    """
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    url = "https://archive-api.open-meteo.com/v1/archive"
    # A single list drives both the request and the extraction below, so the
    # positional lookup can never drift out of sync with the requested names.
    variable_names = list(hourly_variables)
    params = {
        "latitude": lat,
        "longitude": long,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": variable_names,
    }

    for attempt in range(max_retries):
        try:
            responses = openmeteo.weather_api(url, params=params)
            response = responses[0]
            hourly = response.Hourly()

            # Rebuild the timestamp axis from the start/end/step reported by
            # the response; the end bound is exclusive.
            hourly_data = {
                "date": pd.date_range(
                    start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                    end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                    freq=pd.Timedelta(seconds=hourly.Interval()),
                    inclusive="left",
                )
            }

            for position, name in enumerate(variable_names):
                hourly_data[name] = hourly.Variables(position).ValuesAsNumpy()

            return hourly_data
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for weather API at ({lat}, {long}): {e}")
            if attempt < max_retries - 1:
                # Report the actual wait instead of a hard-coded duration.
                print(f"Waiting {delay} seconds before retrying...")
                time.sleep(delay)
            else:
                print("All retries failed.")
                return None

    # Only reached when max_retries is less than 1.
    return None

In [None]:
# Request weather for the full date span covered by the ride data.
min_date = silver_df["date"].min()
max_date = silver_df["date"].max()

weather_dfs = []

for _idx, row in tqdm(loc_df.iterrows(), total=len(loc_df)):
    location = row["location"]
    # NaN never compares equal to anything (including itself), so missing
    # coordinates have to be detected with pd.isna() rather than `== np.nan`.
    if pd.isna(row["latitude"]) or pd.isna(row["longitude"]):
        print(f"Skipping location {location} due to missing coordinates.")
        continue
    weather_data = get_weather_info(
        long=row["longitude"],
        lat=row["latitude"],
        start_date=min_date,
        end_date=max_date,
    )
    if weather_data is None:
        # get_weather_info() returns None once all of its retries have failed;
        # skip the location instead of appending an empty frame.
        print(f"Skipping location {location}: weather request failed.")
        continue
    weather_dfs.append(pd.DataFrame(weather_data).assign(location=location))

if not weather_dfs:
    raise ValueError("No weather data was retrieved for any location.")

all_weather_df = pd.concat(weather_dfs, ignore_index=True)
# Convert date to format 2024-03-23 12:29:38
# NOTE(review): get_weather_info() builds these timestamps in UTC and the
# formatted string drops the timezone - confirm this matches the timezone of
# the ride timestamps before joining on it.
all_weather_df["date"] = pd.to_datetime(all_weather_df["date"]).dt.strftime("%Y-%m-%d %H:%M:%S")

pd.set_option("display.float_format", "{:.4f}".format)
print(all_weather_df.head())
all_weather_df.describe()