# Homework Starter — Stage 10b: Time Series & Classification
Fill in the TODOs. Use your own dataset or adapt the synthetic generator below.

In [1]:
# Imports
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, TimeSeriesSplit
np.random.seed(7); sns.set(); plt.rcParams['figure.figsize']=(9,4)

## Option B: Synthetic Generator (Use if you don't have data ready)

In [3]:
# Synthetic series with regimes & jumps
n=500
dates=pd.bdate_range('2021-01-01', periods=n)
mu = np.where(np.arange(n)<n//2, 0.0003, -0.0001)
sigma = np.where(np.arange(n)<n//2, 0.01, 0.015)
eps = np.random.normal(mu, sigma)
jumps = np.zeros(n); jump_days = np.random.choice(np.arange(20,n-20), size=5, replace=False)
jumps[jump_days] = np.random.normal(0,0.05,size=len(jump_days))
rets = eps + jumps
price = 100*np.exp(np.cumsum(rets))
df = pd.DataFrame({'price':price}, index=dates)
df['ret'] = df['price'].pct_change().fillna(0.0)
df['log_ret'] = np.log1p(df['ret'])
df.head()

Unnamed: 0,price,ret,log_ret
2021-01-01,101.735412,0.0,0.0
2021-01-04,101.292875,-0.00435,-0.004359
2021-01-05,101.356527,0.000628,0.000628
2021-01-06,101.80095,0.004385,0.004375
2021-01-07,101.031283,-0.007561,-0.007589


## Feature Engineering

In [13]:
# TODO: create at least two features
df['lag_1'] = df['ret'].shift(1)
df['roll_mean_5'] = df['ret'].rolling(5).mean().shift(1)

# Add your own:
# 1. Rolling volatility (21-day standard deviation)
df['roll_vol_21'] = df['ret'].rolling(21).std().shift(1)
# 2. Momentum (sum of last 10 returns)
df['momentum_10'] = df['ret'].rolling(10).sum().shift(1)


# df['roll_vol_20'] = df['ret'].rolling(20).std().shift(1)
df['y_next_ret'] = df['ret'].shift(-1)
df['y_up'] = (df['y_next_ret']>0).astype(int)
df_feat = df.dropna().copy()
df_feat.head()

Unnamed: 0,price,ret,log_ret,lag_1,roll_mean_5,y_next_ret,y_up,roll_vol_21,momentum_10
2021-02-01,100.383751,0.001845,0.001843,0.016949,0.002509,-0.003565,0,0.008242,-0.002068
2021-02-02,100.02588,-0.003565,-0.003571,0.001845,0.001706,0.020804,1,0.00826,0.001192
2021-02-03,102.106835,0.020804,0.020591,-0.003565,0.000685,-0.000154,0,0.008244,-0.00774
2021-02-04,102.091126,-0.000154,-0.000154,0.020804,0.004236,-0.014106,0,0.009474,0.015375
2021-02-05,100.650994,-0.014106,-0.014207,-0.000154,0.007176,-0.003745,0,0.00943,0.017346


## Split

In [15]:
# Time-aware split
cut=int(len(df_feat)*0.8)
train, test = df_feat.iloc[:cut], df_feat.iloc[cut:]
features=['lag_1','roll_mean_5']  # extend as you add features
X_tr, X_te = train[features], test[features]
y_tr_reg, y_te_reg = train['y_next_ret'], test['y_next_ret']
y_tr_clf, y_te_clf = train['y_up'], test['y_up']

## Pipeline + Model (Choose one track below)

In [17]:
# Track 1: Forecasting returns
reg = Pipeline([('scaler', StandardScaler()), ('linreg', LinearRegression())])
reg.fit(X_tr, y_tr_reg)
pred = reg.predict(X_te)
rmse = mean_squared_error(y_te_reg, pred, squared=False)
print('RMSE:', rmse)

RMSE: 0.014484789970194234




## Interpretation (Markdown)
- What worked?
  - The time-aware split ensured no lookahead bias, which is critical for financial time series. The lag and rolling features did provide a predictive signal, showing that short-term memory of returns helps forecasting.
- Where might assumptions fail?
    - Returns distributions can shift over time.
    - Return residuals often have fat tails.
- How would you extend features or model?
    - extend to the GARCH or VAR model for further analysis of the time-series case.