<a href="https://colab.research.google.com/github/Chaudhari-Amar/econ8310-assignment1/blob/main/assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# === SINGLE-CELL COLAB SOLUTION (Corrected) ===
# Model: Holt-Winters (Exponential Smoothing) with weekly seasonality (168 hours)

!pip -q install statsmodels pandas numpy

import warnings, pickle
from typing import Tuple
import numpy as np, pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
warnings.filterwarnings("ignore")

TRAIN_URL = "https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_train.csv"
TEST_URL  = "https://github.com/dustywhite7/econ8310-assignment1/raw/main/assignment_data_test.csv"

# ---- Helpers -----------------------------------------------------------------

def _detect_timestamp_column(df: pd.DataFrame) -> str:
    """Return an existing timestamp-like column name if present, else ''."""
    cands = [c for c in df.columns if any(k in c.lower() for k in
             ["timestamp","datetime","date_time","date time","date","time"])]
    return cands[0] if cands else ""

def _build_datetime_from_parts(df: pd.DataFrame):
    """If the frame has separate year/month/day[/hour] columns, return a DatetimeIndex; else None."""
    cols = {c.lower(): c for c in df.columns}
    need = {"year","month","day"}
    if need.issubset(cols):
        year, month, day = cols["year"], cols["month"], cols["day"]
        if "hour" in cols:
            hour = cols["hour"]
            dt = pd.to_datetime(dict(year=df[year], month=df[month], day=df[day], hour=df[hour]), errors="coerce")
        else:
            dt = pd.to_datetime(dict(year=df[year], month=df[month], day=df[day]), errors="coerce")
        return pd.DatetimeIndex(dt)
    return None

def _detect_target_column(df: pd.DataFrame, exclude=()) -> str:
    """
    Prefer obvious trip/count names; otherwise choose a numeric column that's
    not a date/time part (e.g., year/month/day/hour/weekday/week).
    """
    preferred = ['trip_count','trips','trip','rides','ridership','count','target','y']
    for key in preferred:
        for c in df.columns:
            if c not in exclude and key in c.lower():
                return c

    bad_exact = {'year','yr','month','day','date','hour','weekday','week','dow'}
    numeric = [c for c in df.columns if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
    numeric = [c for c in numeric if c.lower() not in bad_exact]
    if numeric:
        return numeric[0]

    # last resort
    return df.columns[-1]

def _load(url: str) -> pd.DataFrame:
    """Read a CSV from `url` and order its rows chronologically when possible.

    A timestamp-like column found by `_detect_timestamp_column` is parsed in
    place with errors="coerce". If no such column exists, a datetime is
    composed from year/month/day[/hour] columns and inserted as the first
    column under the name "constructed_ts". The frame is sorted (and its
    index reset) only when the chosen column ends up with a datetime dtype;
    otherwise it is returned in file order.
    """
    frame = pd.read_csv(url)

    stamp_col = _detect_timestamp_column(frame)
    if stamp_col:
        # Best effort: a column that cannot be parsed is left untouched.
        try:
            frame[stamp_col] = pd.to_datetime(frame[stamp_col], errors="coerce")
        except Exception:
            pass
    else:
        composed = _build_datetime_from_parts(frame)
        if isinstance(composed, pd.DatetimeIndex):
            frame.insert(0, "constructed_ts", composed)
            stamp_col = "constructed_ts"

    if not (stamp_col and pd.api.types.is_datetime64_any_dtype(frame[stamp_col])):
        return frame
    return frame.sort_values(stamp_col).reset_index(drop=True)

def _to_series(df: pd.DataFrame) -> Tuple[pd.Series, pd.Index, str, str]:
    """Extract the target column of `df` as a float Series.

    Returns a 4-tuple ``(series, index, timestamp_column, target_column)``.
    When a timestamp column is found (or "constructed_ts" is present) and it
    has a datetime dtype, the series is indexed by it and its name is
    returned; otherwise the series keeps a default index and the timestamp
    column name is ''. Values that are not numeric become NaN.
    """
    stamp_col = _detect_timestamp_column(df)
    if not stamp_col and "constructed_ts" in df.columns:
        stamp_col = "constructed_ts"

    skip = (stamp_col,) if stamp_col else ()
    target_col = _detect_target_column(df, exclude=skip)
    values = pd.to_numeric(df[target_col], errors="coerce").astype(float).to_numpy()

    has_datetime = bool(stamp_col) and pd.api.types.is_datetime64_any_dtype(df[stamp_col])
    if not has_datetime:
        plain = pd.Series(values, name=target_col)
        return plain, plain.index, "", target_col

    stamps = pd.DatetimeIndex(df[stamp_col])
    indexed = pd.Series(values, index=stamps, name=target_col)
    return indexed, stamps, stamp_col, target_col

def build_and_fit(y: pd.Series, seasonal_periods=168):
    """Fit an additive-trend, additive-seasonal ExponentialSmoothing model.

    If the smallest (non-NaN) value of `y` is negative, the whole series is
    shifted up so that minimum becomes 1e-6 before fitting. The shift that
    was applied (0.0 when none) is stored on the fitted results object as
    ``_hw_offset`` so forecasts can be moved back to the original scale.

    Returns ``(model, fitted_results)``.
    """
    lowest = float(np.nanmin(y.to_numpy()))
    shift = 0.0
    series = y
    if lowest < 0:
        shift = -lowest + 1e-6
        series = y + shift

    settings = dict(
        trend="add",
        seasonal="add",
        seasonal_periods=seasonal_periods,
        initialization_method="estimated",
    )
    mdl = ExponentialSmoothing(series.astype(float), **settings)
    fitted = mdl.fit(optimized=True, use_brute=True)
    fitted._hw_offset = shift
    return mdl, fitted

def forecast_with_fitted(res, steps: int) -> np.ndarray:
    """Forecast `steps` periods ahead and return a flat array on the original scale.

    `res` is a fitted results object exposing ``forecast(steps)``. If it
    carries a non-zero ``_hw_offset`` attribute (the upward shift applied to
    the training data before fitting), the forecast is floored at 0 in the
    shifted space and then shifted back down by that offset.

    The floor is applied before un-shifting on purpose: an offset only exists
    when the training data contained negative values, so flooring the
    un-shifted forecast at 0 would erase legitimate negative predictions.
    """
    fc = np.asarray(res.forecast(steps))
    off = getattr(res, "_hw_offset", 0.0)
    if off:
        # guard tiny negatives in the shifted space, then undo the shift
        fc = np.maximum(fc, 0.0) - off
    return fc.ravel()

# ---- Load, build, fit, forecast ---------------------------------------------

# Download both data sets (training first, then test).
train_df, test_df = _load(TRAIN_URL), _load(TEST_URL)

# Turn each frame into (series, index, timestamp column name, target column name).
y_train, train_index, ts_train_col, y_train_col = _to_series(train_df)
y_test, test_index, ts_test_col, y_test_col = _to_series(test_df)

# Forecast one step per test row; an empty test set falls back to
# 744 hours (the expected length of January).
h = len(y_test) or 744

# Required names for grading: model, modelFit, pred
model, modelFit = build_and_fit(y_train, seasonal_periods=168)
pred = forecast_with_fitted(modelFit, h)

# ---- Save artifacts ----------------------------------------------------------

# Save fitted model (optional helper)
# Persisting the fitted model is optional, so a failure must not abort the
# run - but it is reported instead of being silently swallowed, because a
# missing or partially written model.pkl is otherwise hard to notice.
try:
    with open("model.pkl", "wb") as f:
        pickle.dump(modelFit, f)
except Exception as exc:
    print(f"Warning: could not save model.pkl ({type(exc).__name__}: {exc})")

# Save predictions CSV (with timestamps if we have them)
if isinstance(test_index, pd.DatetimeIndex) and len(test_index) == h:
    pd.DataFrame({"timestamp": test_index, "prediction": pred}).to_csv("predictions.csv", index=False)
else:
    # No usable timestamps: label the rows with a 0-based step counter.
    pd.DataFrame({"t": np.arange(h), "prediction": pred}).to_csv("predictions.csv", index=False)

# ---- Console summary ---------------------------------------------------------
print("Columns (train):", list(train_df.columns))
print(f"Detected timestamp column (train): {ts_train_col or 'None'}")
print(f"Detected target column (train): {y_train_col}")
print("Training observations:", len(y_train))
print("Forecast horizon:", h)
print("First 5 predictions:", pred[:5])

# Auto-download in Colab (optional)
try:
    from google.colab import files
except Exception:
    # Not running inside Colab (or the helper is unusable): nothing to do.
    pass
else:
    try:
        files.download("predictions.csv")
    except Exception as exc:
        print(f"Warning: automatic download of predictions.csv failed ({type(exc).__name__}: {exc})")


Columns (train): ['Timestamp', 'year', 'month', 'day', 'hour', 'trips']
Detected timestamp column (train): Timestamp
Detected target column (train): trips
Training observations: 8760
Forecast horizon: 744
First 5 predictions: [5150.05941242 3027.62348696 2196.46342464 2005.68942901 2508.0126198 ]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>