In [None]:
import sys
from pathlib import Path

# Project root = parent of the current working directory.
# This relies on the kernel having been started from the notebooks/ folder.
PROJECT_ROOT = Path.cwd().parents[0]

# Make <project>/src importable. The membership check keeps the cell idempotent:
# re-running it must not keep appending the same entry to sys.path.
SRC_DIR = str(PROJECT_ROOT / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Echo the resolved root so a wrong working directory is noticed right away.
PROJECT_ROOT


In [None]:
import pandas as pd
from pathlib import Path

# Load the processed daily mobility table and order it chronologically.
p = Path("../data/processed/mobility_nyc_daily.csv")
df = pd.read_csv(p, parse_dates=["date"])
df = df.sort_values("date")

# Coverage summary: number of rows and the first/last date present.
first_day = df["date"].min()
last_day = df["date"].max()
print("rows:", len(df))
print("min date:", first_day)
print("max date:", last_day)

# Calendar days between the first and last date that have no row in the table.
full = pd.date_range(start=first_day, end=last_day, freq="D")
missing = full.difference(df["date"])
print("missing day count:", len(missing))
print("first 10 missing:", missing[:10])


In [None]:
import pandas as pd
from pathlib import Path

# Raw export of daily counts (cases / hospitalizations / deaths, per the file name).
cases_path = Path(
    "../data/raw/COVID-19_Daily_Counts_of_Cases,_Hospitalizations,_and_Deaths_20260213.csv"
)
cases_raw = pd.read_csv(cases_path)

# Schema peek: (rows, columns), the column names, then the first records.
print(cases_raw.shape)
print(list(cases_raw.columns))
cases_raw.head()


In [None]:
# Print every column of the raw cases frame whose name mentions
# "date" or "time" (case-insensitive match on the column label).
for col in cases_raw.columns:
    lowered = col.lower()
    if "date" in lowered or "time" in lowered:
        print(col)


In [None]:
# Raw year-over-year seated-diner file (per the file name); loaded untouched for inspection.
diner_path = Path("../data/raw/YoY_Seated_Diner_Data.csv")
diner_raw = pd.read_csv(diner_path)

# Schema peek: (rows, columns), the column names, then the first records.
print(diner_raw.shape)
print(list(diner_raw.columns))
diner_raw.head()


In [None]:
# Report any column of the raw diner frame whose label contains "date"
# (case-insensitive); columns without it are skipped.
for col in diner_raw.columns:
    if "date" not in col.lower():
        continue
    print("date col:", col)


In [None]:
# Raw time-series export for US-NY (per the file name); loaded as-is for inspection.
ts_path = Path("../data/raw/time_series_US-NY_20191231-1900_20260213-1130.csv")
ts_raw = pd.read_csv(ts_path)

# Schema peek: (rows, columns), the column names, then the first records.
print(ts_raw.shape)
print(list(ts_raw.columns))
ts_raw.head()


In [None]:
# Raw "searched with top queries" export for US-NY (per the file name); loaded as-is.
top_path = Path("../data/raw/searched_with_top-queries_US-NY_20191231-1900_20260213-1134.csv")
top_raw = pd.read_csv(top_path)

# Schema peek: (rows, columns), the column names, then the first records.
print(top_raw.shape)
print(list(top_raw.columns))
top_raw.head()


In [None]:
import pandas as pd
from pathlib import Path

# Reshape the wide YoY seated-diner file into a tidy daily series for the
# New York city row and write it to data/processed/opentable_yoy_daily.csv.
diner_path = Path("../data/raw/YoY_Seated_Diner_Data.csv")
diner = pd.read_csv(diner_path)

# Filter ONLY NYC city-level row
diner_nyc = diner[
    (diner["Type"] == "city") &
    (diner["Name"] == "New York")
].copy()

# Exactly one row must match, otherwise the melt below would interleave
# several series under the same dates.
assert len(diner_nyc) == 1, "Expected exactly one NYC city row"

# Identify date columns: every column whose label contains "/" is treated as
# a month/day label.
date_cols = [c for c in diner_nyc.columns if "/" in c]

# Wide -> long: one output row per date column.
diner_long = diner_nyc.melt(
    id_vars=["Type", "Name"],
    value_vars=date_cols,
    var_name="md",
    value_name="yoy_seated_diner"
)

# Add year (this dataset is 2020).
# The string is built with a single separator and parsed with an explicit
# format, so the result does not depend on pandas/dateutil guessing how to
# read a mixed "2020-m/d" value.
# NOTE(review): if the file ever spans more than one calendar year, the
# hard-coded 2020 mislabels the later columns - confirm against the raw file.
diner_long["date"] = pd.to_datetime(
    "2020/" + diner_long["md"].str.strip(),
    format="%Y/%m/%d",
    errors="coerce",
)

# Surface labels that could not be parsed instead of dropping them silently.
unparsed = diner_long.loc[diner_long["date"].isna(), "md"].tolist()
if unparsed:
    print("WARNING: dropping", len(unparsed), "unparseable date columns, e.g.", unparsed[:5])
diner_long = diner_long.dropna(subset=["date"]).sort_values("date")

opentable_daily = diner_long[["date", "yoy_seated_diner"]].copy()

print(opentable_daily.head())
print(opentable_daily.tail())
print("rows:", len(opentable_daily))

out = Path("../data/processed/opentable_yoy_daily.csv")
# A fresh checkout may not have data/processed yet; create it before writing.
out.parent.mkdir(parents=True, exist_ok=True)
opentable_daily.to_csv(out, index=False)
print("Wrote:", out)


In [None]:
import pandas as pd
from pathlib import Path

# Sanity check of the processed daily cases table: row count and date span.
df = pd.read_csv("../data/processed/cases_nyc_daily.csv", parse_dates=["date"])

date_col = df["date"]
print("rows:", len(df))
print("min:", date_col.min())
print("max:", date_col.max())

# Show the first records as the cell output.
df.head()


In [None]:
import pandas as pd

# Quality checks on the merged daily master table.
df = pd.read_csv("../data/processed/nyc_master_daily.csv", parse_dates=["date"])
dates = df["date"]

# Size and date span.
n_rows = len(df)
print(n_rows)
print(dates.min(), dates.max())

# Null count per column.
nulls_per_column = df.isna().sum()
print(nulls_per_column)

# Number of rows whose date repeats an earlier row (0 means dates are unique).
n_dupes = dates.duplicated().sum()
print("duplicate dates:", n_dupes)
