In [1]:
# Cell 1: Imports and folder setup
import os
import pandas as pd
import numpy as np

# Input folder holding the extracted CitiBike 2022 CSVs, and the output folder for derived files
DATA_RAW = "data/raw"         # adjust if your CSVs are in a different folder
PROCESSED = "data/processed"
os.makedirs(PROCESSED, exist_ok=True)  # no error if the folder already exists

# Gather every entry directly under DATA_RAW whose name ends in ".csv"
# (case-insensitive), joined with the folder and sorted by full path.
filepaths = sorted(
    os.path.join(DATA_RAW, entry)
    for entry in os.listdir(DATA_RAW)
    if entry.lower().endswith(".csv")
)
# Preview: number of files found and the first five paths
(len(filepaths), filepaths[:5])

(12,
 ['data/raw\\JC-202201-citibike-tripdata.csv',
  'data/raw\\JC-202202-citibike-tripdata.csv',
  'data/raw\\JC-202203-citibike-tripdata.csv',
  'data/raw\\JC-202204-citibike-tripdata.csv',
  'data/raw\\JC-202205-citibike-tripdata.csv'])

In [2]:
# Cell 2: Read and concatenate all CSVs into one DataFrame
# NOTE: pd.concat materializes the whole iterable before concatenating, so every
# per-file DataFrame is held in memory together (plus the combined result). The generator
# expression is only a compact way to write the loop -- it does NOT lower peak memory.
# For data that does not fit in RAM, process files one at a time (or use chunksize=).
if not filepaths:
    # Fail with an actionable message instead of pandas' generic "No objects to concatenate".
    raise ValueError(f"No CSV files found in {DATA_RAW!r}; check the DATA_RAW path")
# ignore_index=True gives the combined frame a fresh, continuous RangeIndex.
df = pd.concat((pd.read_csv(f) for f in filepaths), ignore_index=True)

# Quick check
print("Rows,Cols:", df.shape)
df.head(3)

Rows,Cols: (895485, 13)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member


In [4]:
# Cell 3: Explanation (comment for graders/mentor)
# The code above does the following:
# 1. Collects all CSV file paths stored in "data/raw" into the `filepaths` list.
# 2. Uses a generator expression (pd.read_csv(f) for f in filepaths) that yields one DataFrame per CSV file.
# 3. pd.concat(...) concatenates these yielded DataFrames vertically (stack rows), creating a single combined DataFrame 'df'.
#    - ignore_index=True resets the index so the combined DataFrame has a continuous range index.
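# 4. The generator is a concise way to express the loop, but it does not reduce peak memory:
#    pd.concat collects every yielded DataFrame before concatenating, so all per-file frames
#    (plus the combined result) are in RAM at once. For datasets too large for memory,
#    process one file at a time or read with chunksize instead.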

In [5]:
# Cell 4: Sanity checks and save a small sample if you want
print("Columns:", df.columns.tolist()[:30])  # list at most the first 30 column names
# Save a small sample (safe to push) for demonstration purposes
sample_path = os.path.join(PROCESSED, "citibike_2022_sample_1000rows.csv")
# DataFrame.sample raises ValueError when n exceeds the row count (sampling without
# replacement), so clamp the request. With >= 1000 rows the result is unchanged.
sample_size = min(1000, len(df))
# random_state=42 makes the sample reproducible; index=False omits the row index column.
df.sample(sample_size, random_state=42).to_csv(sample_path, index=False)
print("Sample saved:", sample_path)

Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']
Sample saved: data/processed\citibike_2022_sample_1000rows.csv
