In [1]:
import os, pathlib
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from pathlib import Path
import sys

import pyarrow.parquet as pq
import pyarrow.dataset as ds

# Put the parent of the current working directory on the import path so that
# the `src` package can be imported below.
# NOTE(review): this assumes the kernel's working directory is a direct
# subfolder of the project root (e.g. a notebooks/ folder) — confirm.
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))
# NOTE(review): the wildcard import hides which helpers from src.cleaning this
# notebook relies on; consider importing the needed names explicitly.
from src.cleaning import *


# Data folders: read from the environment (after loading .env), falling back
# to locations relative to this notebook.
load_dotenv()
RAW = Path(os.environ.get('DATA_DIR_RAW', '../data/raw'))
PROC = Path(os.environ.get('DATA_DIR_PROC', '../data/processed'))

# Create both folders up front so later reads/writes never hit a missing directory.
for data_dir in (RAW, PROC):
    data_dir.mkdir(parents=True, exist_ok=True)



## Study Raw Dataset

In [2]:
files = sorted(RAW.glob("*-citi-bike-nyc-stats.parquet"))
if not files:
    # An empty match would otherwise make the whole cell print nothing at all.
    print(f"No '*-citi-bike-nyc-stats.parquet' files found in {RAW.resolve()}")

# Print each monthly file's column list (schema only, no data is loaded) and
# remember it so the files can be compared programmatically below.
columns_by_file = {}
for f in files:
    schema = pq.read_schema(f)
    columns_by_file[f.name] = schema.names
    print(f"{f.name}:")
    print(schema.names)
    print("-" * 60)

# Actually verify that every file has the same columns instead of relying on
# eyeballing the printed lists.
if files:
    reference_name = files[0].name
    reference_cols = columns_by_file[reference_name]
    mismatched = [
        name for name, cols in columns_by_file.items() if cols != reference_cols
    ]
    if mismatched:
        print(f"Column mismatch relative to {reference_name}: {mismatched}")
    else:
        print(f"All {len(files)} files share the same columns.")

# Read a 5-row preview of each file. The previews are not displayed; after the
# loop `df_head` holds the preview of the last file for ad-hoc inspection.
for f in files:
    dataset = ds.dataset(f, format="parquet")
    head_table = dataset.head(5)   # returns Arrow table
    df_head = head_table.to_pandas()

202411-citi-bike-nyc-stats.parquet:
['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']
------------------------------------------------------------
202412-citi-bike-nyc-stats.parquet:
['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']
------------------------------------------------------------
202501-citi-bike-nyc-stats.parquet:
['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']
------------------------------------------------------------
202502-citi-bike-nyc-stats.parquet:
['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']
------------------------------------------------------------
202503-citi-bike-nyc-stats.parquet:
['tag', 'id', 'nuid', 'name', 'latitude', 'longitude', 'bikes', 'free', 'extra', 'timestamp']
------------------------------------------------------------
202504-citi-bike-nyc-stats.parquet:
['tag', '

## Cleaning

### Assumptions
- `tag` is the same for every entry (it only says that the station is in New York), so we can safely drop it.

- `id`, `nuid`, and `name` are unique per station, so I decided to only keep the `name`

- I won't be using the location, so I will also drop `latitude` and `longitude`.

- I will also drop `extra`, because it does not provide any useful information that I can't derive myself (e.g. `total_slot` = `bikes` + `free`).

In [3]:
# Load only the columns that are kept for the analysis. The merged file is
# read from RAW (rather than a hard-coded "../data/raw") so that a
# DATA_DIR_RAW override applies here as it does for the rest of the notebook.
use_cols = ["name", "bikes", "free", "timestamp"]
df = pd.read_parquet(RAW / "citibike_merged.parquet", columns=use_cols)

In [None]:
# Parse the timestamps and derive the calendar day of every reading.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.date

# Order readings chronologically within each (name, date) group.
day_keys = ['name', 'date']
df = df.sort_values(day_keys + ['timestamp'])

g = df.groupby(day_keys)

# cumcount() numbers rows from the start of each group, and with
# ascending=False from the end, so a zero marks the first / last reading.
is_first = g.cumcount().eq(0)
is_last = g.cumcount(ascending=False).eq(0)

# Keep only the first and last reading of each station-day.
mask = is_first | is_last
df = df.loc[mask]

print(df.head(20))

In [None]:
# Write the reduced frame to the processed-data folder, without the index.
cleaned_path = PROC / "citibike_cleaned.parquet"
df.to_parquet(cleaned_path, index=False)

print("Saved to:", cleaned_path)