# Stage 04: Data Acquisition and Ingestion

In [1]:
import os, pathlib, datetime as dt
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from dotenv import load_dotenv

## API Pull

In [3]:
# Pull the current snapshot of the Citi Bike NYC network from the CityBikes API.
url = "https://api.citybik.es/v2/networks/citi-bike-nyc"
# A timeout keeps the cell from hanging forever if the API is unreachable.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error body as JSON.
response.raise_for_status()
data = response.json()

# `or` fallbacks also cover keys that are present but null, not only missing keys.
stations = (data.get("network") or {}).get("stations") or []
print("num stations:", len(stations))

for s in stations:
    name = s.get("name")
    free = s.get("free_bikes")             # CityBikes key
    empty = s.get("empty_slots")           # CityBikes key
    # Fall back to the station uid for partial/malformed items; `extra` may be
    # missing or null, so normalise it to an empty dict before the lookup.
    if name is None:
        name = (s.get("extra") or {}).get("uid", "<no name>")
    # Uncomment to inspect every station (prints one line per station):
    #print(f"{name}: free={free}, empty={empty}")




num stations: 2194


## Historical data

I downloaded the historical data from this website:
https://data.citybik.es/

In [4]:
# Resolve the raw-data directory from the environment (.env is loaded first),
# falling back to ../data/raw, and create it if it does not exist yet.
load_dotenv()
RAW = pathlib.Path(os.getenv('DATA_DIR_RAW', '../data/raw'))
RAW.mkdir(parents=True, exist_ok=True)


# List all parquet files matching the pattern (sorted for a deterministic order)
pattern = "*-citi-bike-nyc-stats.parquet"
files = sorted(RAW.glob(pattern))

# pd.concat raises an opaque "No objects to concatenate" error on an empty list,
# so stop early with a message that says where the files were expected.
if not files:
    raise FileNotFoundError(
        f"No files matching '{pattern}' found in {RAW.resolve()}"
    )

# Read and concatenate
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

# Make sure timestamp is datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])


print(df.shape)
print(df.head())


(46124185, 10)
             tag                                id  \
0  citi-bike-nyc  0007398f39bd26118c9fdfd5197ccada   
1  citi-bike-nyc  0007398f39bd26118c9fdfd5197ccada   
2  citi-bike-nyc  0007398f39bd26118c9fdfd5197ccada   
3  citi-bike-nyc  0007398f39bd26118c9fdfd5197ccada   
4  citi-bike-nyc  0007398f39bd26118c9fdfd5197ccada   

                                   nuid                  name   latitude  \
0  7aa86431-bbc5-4d9c-8198-9630c9d32a22  Chester Ave & 12 Ave  40.644367   
1  7aa86431-bbc5-4d9c-8198-9630c9d32a22  Chester Ave & 12 Ave  40.644367   
2  7aa86431-bbc5-4d9c-8198-9630c9d32a22  Chester Ave & 12 Ave  40.644367   
3  7aa86431-bbc5-4d9c-8198-9630c9d32a22  Chester Ave & 12 Ave  40.644367   
4  7aa86431-bbc5-4d9c-8198-9630c9d32a22  Chester Ave & 12 Ave  40.644367   

   longitude  bikes  free                                              extra  \
0 -73.984276     19     0  {"uid":"7aa86431-bbc5-4d9c-8198-9630c9d32a22",...   
1 -73.984276     18     1  {"uid":"7aa86431

In [5]:
# Write the combined frame to a single parquet file inside the raw-data
# directory; the DataFrame index is not written.
merged_path = RAW / "citibike_merged.parquet"
df.to_parquet(merged_path, index=False)

## Documentation
### API Source: 

https://api.citybik.es/v2/networks/citi-bike-nyc

The API exposes only recent/live data and no historical values. It is therefore a good way to obtain the current station status, but training the model requires historical data.

### Historical Data Source:

https://data.citybik.es/

In this notebook, I merged the downloaded historical files into a single parquet file, `citibike_merged.parquet`, stored in `data/raw/`.