# Day 4: Feature Selection Methods

## 🎯 Learning Objectives
- Filter, wrapper, embedded methods
- Recursive Feature Elimination
- Importance-based selection

---

In [16]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Notebook-wide display setup. NOTE: the filter below hides *every* warning,
# including deprecation notices from pandas / scikit-learn / yfinance.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Download window: roughly the last five years, ending at run time. Because
# `end_date` is "now", the row count and all downstream numbers change from
# one run to the next.
ticker = 'SPY'
end_date = datetime.now()
start_date = end_date - timedelta(days=5*365)

print("ðŸ“¥ Downloading data...")
data = yf.download(ticker, start=start_date, end=end_date, progress=False)

# A failed download (network error, bad ticker, rate limit) can come back as
# an empty frame rather than an exception; stop here with a clear message
# instead of failing obscurely in a later cell.
if data.empty:
    raise RuntimeError(
        f"No price data returned for {ticker} between "
        f"{start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}"
    )

# Handle multi-level columns from yfinance
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# The feature-engineering cell reads these two columns directly.
missing_cols = {'Close', 'Volume'} - set(data.columns)
if missing_cols:
    raise RuntimeError(
        f"Downloaded data for {ticker} is missing columns: {sorted(missing_cols)}"
    )

print(f"âœ… Data: {len(data)} days")

ðŸ“¥ Downloading data...
âœ… Data: 1254 days


In [17]:
# Create many features (potential overfitting)
# Works on a copy of the downloaded frame, so re-running this cell always
# starts from the raw `data`.
df = data.copy()
close = df['Close']

# SMA windows are defined once: the same list drives feature creation and the
# later exclusion of the raw SMA level columns, so the two cannot drift apart.
SMA_WINDOWS = [5, 10, 20, 50, 100, 200]

# Returns at various lags
for lag in [1, 2, 3, 5, 10, 20, 40, 60]:
    df[f'ret_{lag}'] = close.pct_change(lag)

# Moving averages: keep the raw level (excluded from the feature list below)
# and the relative distance of price from it.
for w in SMA_WINDOWS:
    sma = close.rolling(w).mean()
    df[f'sma_{w}'] = sma
    df[f'sma_{w}_dist'] = (close - sma) / sma

# Volatility: rolling standard deviation of 1-day returns
for w in [5, 10, 20, 60]:
    df[f'vol_{w}'] = df['ret_1'].rolling(w).std()

# Volume relative to its 20-day average
df['vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()

# RSI variations (simple-moving-average form). The daily change and its
# up/down parts do not depend on the window, so compute them once.
delta = close.diff()
up_moves = delta.where(delta > 0, 0)
down_moves = -delta.where(delta < 0, 0)
for w in [7, 14, 21]:
    gain = up_moves.rolling(w).mean()
    loss = down_moves.rolling(w).mean()
    df[f'rsi_{w}'] = 100 - 100 / (1 + gain / loss)

# Target: 1 if the next close is >= today's close (flat days count as up),
# 0 if it is lower. The newest row has no next close, so its target is NaN.
df['target'] = np.sign(close.shift(-1) - close)
df['target'] = df['target'].map({1: 1, -1: 0, 0: 1})

# Drops the indicator warm-up rows AND the newest row (NaN target), so the
# last row of `df` is the most recent day whose outcome is already known.
df = df.dropna()
if df.empty:
    raise ValueError(
        "No complete rows left after dropping NaNs; the longest window "
        f"({max(SMA_WINDOWS)} days) needs more price history than was downloaded"
    )

# Everything except raw OHLCV columns, the target and the raw SMA levels is a
# model feature. Column order of `df` is preserved.
non_feature_cols = {'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'target'}
non_feature_cols.update(f'sma_{w}' for w in SMA_WINDOWS)
feature_cols = [c for c in df.columns if c not in non_feature_cols]

print(f"\nðŸ“Š Total features: {len(feature_cols)}")


ðŸ“Š Total features: 22


In [18]:
# Chronological 80/20 split: the earliest rows train, the most recent rows
# test. No shuffling is applied.
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size], df.iloc[train_size:]

X_train, y_train = train[feature_cols], train['target']
X_test, y_test = test[feature_cols], test['target']

# Standardise with statistics learned from the training rows only, then apply
# that same transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Method 1: Filter - SelectKBest with F-score
# Scores every feature independently against the target on the training set.
selector_f = SelectKBest(score_func=f_classif, k=10).fit(X_train_scaled, y_train)

# One row per feature, highest F-score first.
f_scores = pd.DataFrame({'feature': feature_cols, 'f_score': selector_f.scores_})
f_scores = f_scores.sort_values(by='f_score', ascending=False)

print("\n" + "="*60, "METHOD 1: F-SCORE (Filter)", "="*60, sep="\n")
print(f_scores.head(10))


METHOD 1: F-SCORE (Filter)
         feature   f_score
17        vol_60  6.237606
16        vol_20  4.818270
14         vol_5  4.468947
13  sma_200_dist  3.861844
15        vol_10  3.746219
21        rsi_21  1.725685
2          ret_3  1.000293
12  sma_100_dist  0.828660
0          ret_1  0.759033
7         ret_60  0.564060


In [20]:
# Method 2: Mutual Information
# Seeded so the estimate is repeatable on the same data.
mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)

# One row per feature, highest mutual-information score first.
mi_df = pd.DataFrame({'feature': feature_cols, 'mi_score': mi_scores})
mi_df = mi_df.sort_values(by='mi_score', ascending=False)

print("\n" + "="*60, "METHOD 2: MUTUAL INFORMATION (Filter)", "="*60, sep="\n")
print(mi_df.head(10))


METHOD 2: MUTUAL INFORMATION (Filter)
         feature  mi_score
15        vol_10  0.038710
1          ret_2  0.030402
5         ret_20  0.028798
6         ret_40  0.022774
18     vol_ratio  0.017205
19         rsi_7  0.014355
11   sma_50_dist  0.012155
12  sma_100_dist  0.011718
17        vol_60  0.010504
20        rsi_14  0.009474


In [21]:
# Method 3: RFE (Wrapper)
# A small, shallow random forest supplies the importances that drive the
# elimination; up to 5 features are removed per round until 10 remain.
base_model = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42)
rfe = RFE(estimator=base_model, n_features_to_select=10, step=5).fit(X_train_scaled, y_train)

# Names of the surviving features, in their original column order.
rfe_selected = [feature_cols[i] for i, keep in enumerate(rfe.support_) if keep]

print("\n" + "="*60, "METHOD 3: RFE (Wrapper)", "="*60, sep="\n")
print(f"Selected features: {rfe_selected}")


METHOD 3: RFE (Wrapper)
Selected features: ['ret_1', 'ret_2', 'ret_3', 'ret_20', 'sma_20_dist', 'sma_50_dist', 'vol_5', 'vol_60', 'rsi_14', 'rsi_21']


In [22]:
# Method 4: Feature Importance (Embedded)
# Fit one forest on all features and read its built-in importances.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).fit(
    X_train_scaled, y_train
)

# One row per feature, most important first; the comparison cell takes its
# top-10 list from this ordering.
importance_df = pd.DataFrame({'feature': feature_cols, 'importance': rf.feature_importances_})
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("\n" + "="*60, "METHOD 4: RF IMPORTANCE (Embedded)", "="*60, sep="\n")
print(importance_df.head(10))


METHOD 4: RF IMPORTANCE (Embedded)
         feature  importance
5         ret_20    0.062828
21        rsi_21    0.058400
11   sma_50_dist    0.055252
14         vol_5    0.052836
4         ret_10    0.052026
2          ret_3    0.049790
10   sma_20_dist    0.048202
8     sma_5_dist    0.046454
12  sma_100_dist    0.045545
9    sma_10_dist    0.045344


In [23]:
# Compare all vs selected features
# Same forest hyperparameters and seed for both models, so the feature set is
# the only thing that differs between the two accuracy numbers.
print("\n" + "="*60, "MODEL COMPARISON", "="*60, sep="\n")

# --- Baseline: every engineered feature ---
rf_all = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).fit(
    X_train_scaled, y_train
)
acc_all = accuracy_score(y_test, rf_all.predict(X_test_scaled))

# --- Reduced model: the 10 highest-importance features ---
top_features = importance_df.head(10)['feature'].tolist()
X_train_top, X_test_top = train[top_features], test[top_features]

# Dedicated scaler for the reduced feature set, fitted on training rows only.
# `scaler_top` and `rf_top` are reused by the signal cell.
scaler_top = StandardScaler().fit(X_train_top)
X_train_top_scaled = scaler_top.transform(X_train_top)
X_test_top_scaled = scaler_top.transform(X_test_top)

rf_top = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42).fit(
    X_train_top_scaled, y_train
)
acc_top = accuracy_score(y_test, rf_top.predict(X_test_top_scaled))

print(f"\nAll {len(feature_cols)} features: {acc_all:.1%}")
print(f"Top 10 features: {acc_top:.1%}")


MODEL COMPARISON

All 22 features: 50.7%
Top 10 features: 51.2%


In [24]:
# Today's prediction with selected features
# NOTE(review): `df` was built with `dropna()` after the target was computed
# from `close.shift(-1)`, so the newest downloaded bar (NaN target) is not in
# `df`. The row used here is therefore the last *labelled* day, not the most
# recent bar - confirm this is the intended "today".
latest = df[top_features].tail(1)
latest_scaled = scaler_top.transform(latest)
pred = rf_top.predict(latest_scaled)[0]
prob = rf_top.predict_proba(latest_scaled)[0]

print("\n" + "="*60, "ðŸ“Š SIGNAL WITH SELECTED FEATURES", "="*60, sep="\n")
print(f"\nDate: {df.index[-1].strftime('%Y-%m-%d')}")

# Echo the feature values that fed the prediction.
print("\nTop 10 Features Used:")
for f in top_features:
    print(f"  {f}: {df[f].iloc[-1]:.4f}")

# Class 1 is "next close >= today's close"; confidence is the larger of the
# two class probabilities.
signal_label = 'ðŸ“ˆ BULLISH' if pred == 1 else 'ðŸ“‰ BEARISH'
print(f"\nðŸŽ¯ Signal: {signal_label}")
print(f"   Confidence: {max(prob):.1%}")


ðŸ“Š SIGNAL WITH SELECTED FEATURES

Date: 2026-01-20

Top 10 Features Used:
  ret_20: 0.0046
  rsi_21: 55.9463
  sma_50_dist: -0.0034
  vol_5: 0.0090
  ret_10: -0.0147
  ret_3: -0.0185
  sma_20_dist: -0.0157
  sma_5_dist: -0.0167
  sma_100_dist: 0.0101
  sma_10_dist: -0.0188

ðŸŽ¯ Signal: ðŸ“ˆ BULLISH
   Confidence: 51.7%


---
## 📅 Tomorrow: Feature Transformations