# Stock Price Modeling: Linear Regression + MAE

In [None]:
# Setup: imports (modelling + matplotlib for visualization), display options, parameters
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# Widen pandas' console output so the many feature columns stay readable.
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 150)

# --- Run parameters ---
TICKER = 'AAPL'          # symbol to download
PERIOD = '5y'            # lookback window passed to the downloader
INTERVAL = '1d'          # bar size passed to the downloader
TARGET_COL = 'Close'     # price column the model forecasts one day ahead
TEST_SPLIT_RATIO = 0.2   # trailing share of samples kept as the final holdout
N_FEATURES = 15          # upper bound on features kept by selection


In [None]:
# Data: download daily bars for TICKER and normalise them into a flat table
# with a 'Date' column plus one column per price/volume field.
raw = yf.download(TICKER, period=PERIOD, interval=INTERVAL, auto_adjust=False, progress=False)
if raw.empty:
    raise RuntimeError('No data returned. Check ticker or network access.')
df = raw.copy()
if isinstance(df.columns, pd.MultiIndex):
    # Recent yfinance releases return (field, ticker) MultiIndex columns even
    # for a single ticker. The feature code indexes by plain field name
    # ('Close', 'High', ...), so keep only the field level.
    # NOTE(review): assumes the field name is level 0 (yfinance's default
    # column grouping) - confirm if group_by is ever changed.
    df.columns = df.columns.get_level_values(0)
    df.columns.name = None
df = df.dropna().rename_axis('Date').reset_index()
df.head()

## Features

In [None]:
def rsi(series, window=14):
    """Return the Relative Strength Index of ``series`` on a 0-100 scale.

    Upward and downward moves are each smoothed with an exponentially
    weighted mean (``alpha = 1 / window``, ``adjust=False``). The first
    element is NaN because no prior value exists to difference against.
    """
    smoothing = {'alpha': 1 / window, 'adjust': False}
    change = series.diff()
    avg_gain = change.clip(lower=0).ewm(**smoothing).mean()
    avg_loss = (-1 * change.clip(upper=0)).ewm(**smoothing).mean()
    # The small constant keeps the ratio finite when there were no losses.
    strength = avg_gain / (avg_loss + 1e-9)
    return 100 - (100 / (1 + strength))

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add technical-indicator features and the next-day target to ``df``.

    Works on a copy; the input frame is left untouched. Price-based features
    are derived from the module-level ``TARGET_COL`` column; ``High``,
    ``Low``, ``Close`` and ``Volume`` columns are also required.

    No rows are dropped here. Leading rows hold NaN until the longest rolling
    window (200 rows) has filled, and the final row has a NaN ``y`` because
    its next-day close does not exist yet. Callers decide what to drop.

    Args:
        df: Price/volume table, one row per bar in chronological order.

    Returns:
        A copy of ``df`` with the feature columns and the target ``y`` added.
    """
    out = df.copy()
    price = out[TARGET_COL]

    # basic returns
    out['ret_1d'] = price.pct_change()
    out['ret_5d'] = price.pct_change(5)
    out['ret_10d'] = price.pct_change(10)

    # moving averages / momentum
    for w in [5, 10, 20, 50, 100, 200]:
        out[f'sma_{w}'] = price.rolling(w).mean()
        out[f'ema_{w}'] = price.ewm(span=w, adjust=False).mean()
        # epsilon guards against division by zero
        out[f'sma_ratio_{w}'] = price / (out[f'sma_{w}'] + 1e-9)

    # volatility proxies
    out['hl_range'] = (out['High'] - out['Low']) / (out['Close'] + 1e-9)
    # True range: the largest of the bar's own range and its two gaps to the
    # previous close. On the first row only the bar's own range is defined.
    prev_close = out['Close'].shift(1)
    true_range = pd.concat([
        (out['High'] - out['Low']).abs(),
        (out['High'] - prev_close).abs(),
        (out['Low'] - prev_close).abs(),
    ], axis=1).max(axis=1)
    out['atr_14'] = true_range.rolling(14).mean()

    # RSI + rolling std
    out['rsi_14'] = rsi(price, 14)
    out['std_10'] = price.rolling(10).std()
    out['std_20'] = price.rolling(20).std()

    # volume-related
    # A zero-volume bar makes the next percentage change +/-inf. dropna()
    # does not remove inf, and scikit-learn rejects it, so map it to NaN.
    out['vol_chg'] = out['Volume'].pct_change().replace([np.inf, -np.inf], np.nan)
    out['vwap_proxy'] = (out['High'] + out['Low'] + out['Close']) / 3.0

    # lags
    for lag in [1, 2, 3, 5, 10]:
        out[f'lag_{lag}'] = price.shift(lag)

    # target: next day close
    out['y'] = price.shift(-1)
    return out

# Build the modelling table: compute features, discard every row that still
# has a missing value, then renumber the rows from zero.
feat = make_features(df)
feat = feat.dropna()
feat = feat.reset_index(drop=True)
feat.head()

## Train / Validate

In [None]:
# Every column except the date and the target is a candidate predictor.
all_features = [c for c in feat.columns if c not in ['Date', 'y'] and not c.endswith('_y')]
X = feat[all_features].values
y = feat['y'].values

# chronological split: the earliest (1 - TEST_SPLIT_RATIO) share of rows
# trains the model, the remainder is the holdout. No shuffling.
n = len(feat)
split_idx = int(n * (1 - TEST_SPLIT_RATIO))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
# .loc slices by label; feat has a fresh 0..n-1 index, so labels match positions.
dates_test = feat.loc[split_idx:, 'Date']

# feature selection + linear model
tscv = TimeSeriesSplit(n_splits=5)
k_best = min(N_FEATURES, X_train.shape[1])
selector = SelectKBest(score_func=f_regression, k=k_best)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('select', selector),
    ('lr', LinearRegression())
])

# Sanity-check with expanding-window CV on the training span (no
# hyperparameters are tuned). Each fold gets its own pipeline so scaling and
# feature selection never see that fold's validation rows.
# TimeSeriesSplit needs more samples than splits; skip the check otherwise.
if len(X_train) > tscv.n_splits:
    cv_maes = []
    for fit_idx, val_idx in tscv.split(X_train):
        fold_pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('select', SelectKBest(score_func=f_regression, k=k_best)),
            ('lr', LinearRegression())
        ])
        fold_pipe.fit(X_train[fit_idx], y_train[fit_idx])
        fold_pred = fold_pipe.predict(X_train[val_idx])
        cv_maes.append(mean_absolute_error(y_train[val_idx], fold_pred))
    print(f"CV MAE over {len(cv_maes)} expanding-window folds: "
          f"{np.mean(cv_maes):,.4f} (+/- {np.std(cv_maes):,.4f})")

# fit on the full training span and score on the untouched holdout
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, pred)
print(f"Holdout MAE: {mae:,.4f}")

# chosen features, read from the fitted pipeline step
mask = pipe.named_steps['select'].get_support(indices=True)
selected_cols = [all_features[i] for i in mask]
print('Selected features:', selected_cols)


## Diagnostics

In [None]:
# Holdout diagnostics: overlay of actual vs predicted, then the residuals.
holdout_dates = dates_test.values

plt.figure()
for series, label in ((y_test, 'Actual'), (pred, 'Predicted')):
    plt.plot(holdout_dates, series, label=label)
plt.title(f'{TICKER} – Actual vs Predicted (Holdout)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# residuals = actual minus predicted, shown as a 50-bin histogram
err = y_test - pred
plt.figure()
plt.title('Residuals (Holdout)')
plt.hist(err, bins=50)
plt.tight_layout()
plt.show()

## One-step-ahead forecast

In [None]:
# Refit on every labelled row, then predict the close that follows the most
# recent bar.
X_all = feat[all_features].values
y_all = feat['y'].values
pipe.fit(X_all, y_all)

# `feat` excludes the most recent bar: its target (tomorrow's close) is NaN,
# so it was dropped with the other incomplete rows. Predicting from
# X_all[-1:] would re-predict the last close that is already known, from a
# row the model was just trained on. Rebuild the features and filter on
# feature completeness only, so the latest bar is kept.
latest = make_features(df).dropna(subset=all_features)
last_row = latest[all_features].values[-1:]
next_close_pred = pipe.predict(last_row)[0]
print(f"Next-day predicted close for {TICKER}: {next_close_pred:,.4f}")

### Notes
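- No look-ahead leakage in the holdout evaluation: every feature uses data up to day *t*, the target is the close on day *t+1*, and scaling and feature selection are fit inside the pipeline on training rows only.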
- `SelectKBest` (scored with `f_regression`) caps the model at `N_FEATURES` inputs; swap in `mutual_info_regression` if nonlinearity dominates.
- Real trading needs slippage/transaction costs and robust backtesting; this is a point-forecast demo.
- If you want more stability, consider regularized linear models (Ridge/Lasso) with `TimeSeriesSplit` grid search.