In [14]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Explicitly set the correct path to the .env file
env_path = Path('/Users/farshid/taxi_demand_predictor/src/.env')

# Debugging step to verify the path
print(f"Checking if .env exists at: {env_path}")
print(f"Exists: {env_path.exists()}")

# Load the .env file
load_dotenv(env_path)

# Check if the variable is in the environment
if 'HOPSWORKS_API_KEY' in os.environ:
    print("HOPSWORKS_API_KEY is set.")
else:
    print("HOPSWORKS_API_KEY is not found in environment.")

HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']




Checking if .env exists at: /Users/farshid/taxi_demand_predictor/src/.env
Exists: True
HOPSWORKS_API_KEY is set.


In [15]:
from datetime import datetime
import pandas as pd
import sys
from pathlib import Path
import os

# Explicitly set the working directory to the project root (taxi_demand_predictor)
project_root = Path.cwd().parent  # Adjust if needed
os.chdir(project_root)

# Check if the src directory is added to the sys.path
src_path = project_root / 'src'

# Debugging step to verify the correct path
print(f"Project root: {project_root}")
print(f"Checking if src exists at: {src_path}")
print(f"Exists: {src_path.exists()}")

# Add the `src` folder to Python path if it exists
if src_path.exists():
    if str(src_path) not in sys.path:
        sys.path.append(str(src_path))
    print(f"Added {src_path} to sys.path")
else:
    print(f"Error: The path {src_path} does not exist. Check your project structure.")

# Verify if sys.path includes the 'src' directory
print("Current sys.path:", sys.path)

# Try importing the module again
try:
    from src.data import load_raw_data
except ModuleNotFoundError as e:
    print("Error importing module:", e)

# Download raw data
from_year = 2024
to_year = datetime.now().year
print(f'Downloading raw data from {from_year} to {to_year}')

rides = pd.DataFrame()
for year in range(from_year, to_year + 1):
    # Download data for the whole year
    rides_one_year = load_raw_data(year)

    # Append rows
    rides = pd.concat([rides, rides_one_year])

print(f"Data downloaded: {len(rides)} rows")



Project root: /Users/farshid
Checking if src exists at: /Users/farshid/src
Exists: False
Error: The path /Users/farshid/src does not exist. Check your project structure.
Current sys.path: ['/Library/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/farshid/taxi_demand_predictor/.venv/lib/python3.12/site-packages', '/Users/farshid/taxi_demand_predictor/src']
Downloading raw data from 2024 to 2025
File 2024-01 was already in local storage
File 2024-02 was already in local storage
File 2024-03 was already in local storage
File 2024-04 was already in local storage
File 2024-05 was already in local storage
File 2024-06 was already in local storage
File 2024-07 was already in local storage
File 2024-08 was already in local storage
File 2024-09 was already in local storage
File 2024-10 was already in local storage
File 20

In [16]:
print(f'{len(rides)=:,}')


len(rides)=37,500,963


In [17]:
from src.data import transform_raw_data_into_ts_data

ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 265/265 [00:01<00:00, 166.33it/s]


In [18]:
# string to datetime
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# add column with Unix epoch milliseconds
ts_data['pickup_ts'] = ts_data['pickup_hour'].astype(int) // 10**6

In [19]:
import hopsworks


In [20]:
HOPSWORKS_PROJECT_NAME = 'Taxi_demand_predictor'


In [21]:
# connect to the project
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

2025-02-03 21:22:52,479 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-02-03 21:22:52,483 INFO: Initializing external client
2025-02-03 21:22:52,483 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-03 21:22:52,960 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211556


In [22]:
feature_store = project.get_feature_store()


In [23]:
FEATURE_GROUP_NAME = 'time_series_hourly_feature_group'
FEATURE_GROUP_VERSION = 3

In [24]:
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key = ['pickup_location_id', 'pickup_ts'],
    event_time='pickup_ts',
)

In [25]:
# Check the first few rows and data types
print(ts_data.head())
print(ts_data.dtypes)


                pickup_hour  rides  pickup_location_id      pickup_ts
0 2024-01-01 00:00:00+00:00      0                   1  1704067200000
1 2024-01-01 01:00:00+00:00      0                   1  1704070800000
2 2024-01-01 02:00:00+00:00      0                   1  1704074400000
3 2024-01-01 03:00:00+00:00      0                   1  1704078000000
4 2024-01-01 04:00:00+00:00      0                   1  1704081600000
pickup_hour           datetime64[ns, UTC]
rides                               int64
pickup_location_id                  int64
pickup_ts                           int64
dtype: object


In [26]:
feature_group.insert(ts_data, write_options={"wait_for_job": False})


Uploading Dataframe: 100.00% |██████████| Rows 2130600/2130600 | Elapsed Time: 01:39 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_3_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1211556/jobs/named/time_series_hourly_feature_group_3_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_3_offline_fg_materialization', 'SPARK'),
 None)