In [4]:
# === LEAK CHECK HELPERS ===
import numpy as np
import pandas as pd

def show_df_info(df, name="df", nhead=5):
    print(f"== {name} info ==")
    print("type:", type(df))
    try:
        print("shape:", df.shape)
    except:
        pass
    print("columns (count):", len(df.columns))
    print(df.columns.tolist()[:50])
    print("\ndtypes:")
    display(df.dtypes)
    print("\nhead:")
    display(df.head(nhead))
    print("\ntail:")
    display(df.tail(nhead))
    print("\nindex sample:", df.index[:5])

def check_column_duplicates(df):
    cols = list(df.columns)
    dup = [c for c in set(cols) if cols.count(c) > 1]
    print("Duplicate column names:", dup)
    return dup

def near_equal_prop(df, col, ref_col, tol=1e-8):
    a = np.asarray(df[col], dtype=float)
    b = np.asarray(df[ref_col], dtype=float)
    mask = np.isfinite(a) & np.isfinite(b)
    if mask.sum() == 0:
        return 0.0
    return float(np.isclose(a[mask], b[mask], atol=tol, rtol=0).mean())

def leak_smoke_tests(model_df):
    print("**Leak smoke tests**")
    # must be numeric-only check for correlation
    num_df = model_df.select_dtypes(include=[np.number]).copy()
    print("Numeric cols count:", len(num_df.columns))
    # top correlations with target if available
    if 'target_thresholded' in num_df.columns:
        corr = num_df.corr()['target_thresholded'].abs().sort_values(ascending=False)
        print("Top correlations with target:")
        display(corr.head(20))
    else:
        print("No 'target_thresholded' in numeric columns.")
    # check near-equality to future columns if present
    for ref in ['close_future_h','future_return_h','close_next','future_return_1']:
        if ref in model_df.columns:
            print(f"\nChecking near-equality to {ref}:")
            suspicious = []
            for c in model_df.select_dtypes(include=[np.number]).columns:
                if c in [ref,'target_thresholded','close_future_h','future_return_h','close_next','future_return_1']:
                    continue
                p = near_equal_prop(model_df, c, ref)
                if p > 0.01:
                    suspicious.append((c, p))
            if suspicious:
                print("Suspicious columns (prop > 0.01):", suspicious)
            else:
                print("No columns nearly equal to", ref)
    print("Done smoke tests.")


In [5]:
# === CELL 1: Load numerical CSV and run diagnostics ===
import os
import pandas as pd

DATA_DIR = r"C:\Users\amanb\OneDrive\Desktop\SWM Project\CHARTS"
FILENAME = "AMAZON30.csv"
filepath = os.path.join(DATA_DIR, FILENAME)

print("Attempting to load:", filepath)
if not os.path.exists(filepath):
    raise FileNotFoundError(f"File not found: {filepath}")

# Try to read with tab/whitespace separator first (sample looked tab-separated).
# If that fails, fall back to comma.
read_errors = []
for sep in ["\t", r"\s+", ","]:
    try:
        df_raw = pd.read_csv(filepath, sep=sep, header=None, engine="python")
        # Heuristic: if we read expected 7 columns, accept it
        if df_raw.shape[1] >= 7:
            break
    except Exception as e:
        read_errors.append((sep, str(e)))
else:
    # if loop didn't break, raise with debug info
    raise RuntimeError("Failed to read CSV with common separators. Attempts: " + str(read_errors))

# Assign expected column names (date, time, open, high, low, close, volume)
expected_cols = ['date','time','open','high','low','close','volume']
# If df_raw has more than 7 cols, take the first 7 as those values
if df_raw.shape[1] < 7:
    raise RuntimeError(f"File read produced {df_raw.shape[1]} columns (<7). Please inspect the file.")
df_raw = df_raw.iloc[:, :7]
df_raw.columns = expected_cols

# create df copy for downstream operations
df = df_raw.copy()

# Diagnostics:
print("\n=== Basic load diagnostics ===")
print("Raw shape:", df_raw.shape)
print("\nFirst 12 rows:")
display(df.head(12))

print("\nDtypes (before any coercion):")
display(df.dtypes)

print("\nCheck for obvious parsing issues in 'date'/'time' columns (show unique samples):")
print("date sample unique values (up to 10):", df['date'].unique()[:10])
print("time sample unique values (up to 10):", df['time'].unique()[:10])

# Check for missing values
print("\nMissing values per column (raw):")
display(df.isna().sum())

# Quick sanity: try to coerce numeric columns and show stats (without overwriting df yet)
for c in ['open','high','low','close','volume']:
    coerced = pd.to_numeric(df[c], errors='coerce')
    n_missing = coerced.isna().sum()
    print(f"Column '{c}': parsed numeric missing after coercion = {n_missing} / {len(df)}")
print("\nIf any of the numeric columns show many missing values above, paste the output and STOP.")

# Print a small sample of rows where numeric coercion failed (if any)
mask_any_na = False
for c in ['open','high','low','close','volume']:
    coerced = pd.to_numeric(df[c], errors='coerce')
    if coerced.isna().any():
        mask_any_na = True
        print(f"\nRows with non-numeric '{c}':")
        display(df[coerced.isna()].head(10))

if not mask_any_na:
    print("\nNo non-numeric values detected in the 5 numerical columns (good).")

print("\n=== End of Cell 1 diagnostics ===")


Attempting to load: C:\Users\amanb\OneDrive\Desktop\SWM Project\CHARTS\AMAZON30.csv

=== Basic load diagnostics ===
Raw shape: (7155, 7)

First 12 rows:


Unnamed: 0,date,time,open,high,low,close,volume
0,2017.05.12,18:00,960.01,961.24,960.0,960.66,978
1,2017.05.12,18:30,960.65,962.14,960.47,961.89,1017
2,2017.05.12,19:00,961.91,962.76,960.63,961.74,1384
3,2017.05.12,19:30,961.77,962.27,961.13,961.34,2110
4,2017.05.15,12:30,962.39,962.5,959.97,960.2,121
5,2017.05.15,13:00,960.1,960.7,958.09,958.68,372
6,2017.05.15,13:30,958.66,963.0,957.0,959.97,2512
7,2017.05.15,14:00,959.98,960.64,956.05,957.54,1887
8,2017.05.15,14:30,957.56,961.01,957.09,960.16,1727
9,2017.05.15,15:00,960.15,961.05,958.47,959.44,1424



Dtypes (before any coercion):


date       object
time       object
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object


Check for obvious parsing issues in 'date'/'time' columns (show unique samples):
date sample unique values (up to 10): ['2017.05.12' '2017.05.15' '2017.05.16' '2017.05.17' '2017.05.18'
 '2017.05.19' '2017.05.22' '2017.05.23' '2017.05.24' '2017.05.25']
time sample unique values (up to 10): ['18:00' '18:30' '19:00' '19:30' '12:30' '13:00' '13:30' '14:00' '14:30'
 '15:00']

Missing values per column (raw):


date      0
time      0
open      0
high      0
low       0
close     0
volume    0
dtype: int64

Column 'open': parsed numeric missing after coercion = 0 / 7155
Column 'high': parsed numeric missing after coercion = 0 / 7155
Column 'low': parsed numeric missing after coercion = 0 / 7155
Column 'close': parsed numeric missing after coercion = 0 / 7155
Column 'volume': parsed numeric missing after coercion = 0 / 7155

If any of the numeric columns show many missing values above, paste the output and STOP.

No non-numeric values detected in the 5 numerical columns (good).

=== End of Cell 1 diagnostics ===


In [6]:
# === CELL 2: parse date+time into datetime index, set index, and diagnostics ===
import pandas as pd
import numpy as np

# df should exist from CELL 1
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Run CELL 1 first.")

# Create a combined timestamp string and try parsing with known format
ts_str = df['date'].astype(str).str.strip() + " " + df['time'].astype(str).str.strip()

# Try parsing with a couple of formats; fall back to pandas' parser if needed
parse_errors = 0
try:
    ts = pd.to_datetime(ts_str, format="%Y.%m.%d %H:%M", errors='coerce')
    parse_errors = ts.isna().sum()
    if parse_errors > 0:
        # try a more flexible parse if exact format didn't work for some rows
        ts2 = pd.to_datetime(ts_str, errors='coerce', infer_datetime_format=True)
        parse_errors = ts2.isna().sum()
        ts = ts2
except Exception as e:
    # fallback
    ts = pd.to_datetime(ts_str, errors='coerce', infer_datetime_format=True)
    parse_errors = ts.isna().sum()

# attach ts to df
df = df.copy()
df['ts'] = ts
df['ts_str'] = ts_str  # keep original string for debugging if needed

print("Parsed timestamps. Unparsable count:", parse_errors)

# Drop rows with unparsable timestamps (should be zero). If non-zero, show samples.
if parse_errors > 0:
    print("\nSample rows with unparsable timestamps:")
    display(df[df['ts'].isna()].head(20))
    raise RuntimeError("Some timestamps could not be parsed. Fix the source or adjust parsing logic.")

# Set datetime index and sort
df.index = df['ts']
df.index.name = 'ts'
df = df.sort_index()

# Quick checks
print("\nMissing values per column after conversion:")
display(df[['open','high','low','close','volume']].isna().sum())

print("\nIndex range: {} to {}".format(df.index.min(), df.index.max()))
print("Total rows:", len(df))

# Show small sample with index as datetime
display(df.loc[df.index[:15], ['date','time','open','high','low','close','volume','ts_str']])

# Duplicate timestamp check
dup_count = df.index.duplicated().sum()
print("\nDuplicate timestamps:", dup_count)

# Largest gaps between consecutive timestamps (minutes)
diffs_min = df.index.to_series().diff().dt.total_seconds().div(60)
top_gaps = diffs_min.nlargest(20)
print("\nTop gaps (minutes) between consecutive rows (largest first):")
display(top_gaps)

# Volume percentiles (sanity)
print("\nVolume percentiles:")
display(df['volume'].quantile([0,0.01,0.05,0.25,0.5,0.75,0.95,0.99,1.0]))

# keep df back in globals
globals()['df'] = df

print("\n=== End of CELL 2 diagnostics ===")


Parsed timestamps. Unparsable count: 0

Missing values per column after conversion:


open      0
high      0
low       0
close     0
volume    0
dtype: int64


Index range: 2017-05-12 18:00:00 to 2019-02-01 20:00:00
Total rows: 7155


Unnamed: 0_level_0,date,time,open,high,low,close,volume,ts_str
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-05-12 18:00:00,2017.05.12,18:00,960.01,961.24,960.0,960.66,978,2017.05.12 18:00
2017-05-12 18:30:00,2017.05.12,18:30,960.65,962.14,960.47,961.89,1017,2017.05.12 18:30
2017-05-12 19:00:00,2017.05.12,19:00,961.91,962.76,960.63,961.74,1384,2017.05.12 19:00
2017-05-12 19:30:00,2017.05.12,19:30,961.77,962.27,961.13,961.34,2110,2017.05.12 19:30
2017-05-15 12:30:00,2017.05.15,12:30,962.39,962.5,959.97,960.2,121,2017.05.15 12:30
2017-05-15 13:00:00,2017.05.15,13:00,960.1,960.7,958.09,958.68,372,2017.05.15 13:00
2017-05-15 13:30:00,2017.05.15,13:30,958.66,963.0,957.0,959.97,2512,2017.05.15 13:30
2017-05-15 14:00:00,2017.05.15,14:00,959.98,960.64,956.05,957.54,1887,2017.05.15 14:00
2017-05-15 14:30:00,2017.05.15,14:30,957.56,961.01,957.09,960.16,1727,2017.05.15 14:30
2017-05-15 15:00:00,2017.05.15,15:00,960.15,961.05,958.47,959.44,1424,2017.05.15 15:00



Duplicate timestamps: 0

Top gaps (minutes) between consecutive rows (largest first):


ts
2017-05-30 12:30:00    5340.0
2017-09-05 12:30:00    5340.0
2017-12-26 13:30:00    5340.0
2018-01-02 13:30:00    5340.0
2018-01-16 13:30:00    5340.0
2018-02-20 13:30:00    5340.0
2018-05-29 12:30:00    5340.0
2018-09-04 12:30:00    5340.0
2019-01-22 13:30:00    5340.0
2018-04-02 12:30:00    5280.0
2017-11-27 13:30:00    4080.0
2018-11-26 13:30:00    4050.0
2017-11-06 13:30:00    3960.0
2018-11-05 13:30:00    3960.0
2017-05-15 12:30:00    3900.0
2017-05-22 12:30:00    3900.0
2017-06-05 12:30:00    3900.0
2017-06-12 12:30:00    3900.0
2017-06-19 12:30:00    3900.0
2017-06-26 12:30:00    3900.0
Name: ts, dtype: float64


Volume percentiles:


0.00       1.00
0.01      20.54
0.05      63.00
0.25     856.50
0.50    1578.00
0.75    2464.50
0.95    3753.00
0.99    4488.84
1.00    5639.00
Name: volume, dtype: float64


=== End of CELL 2 diagnostics ===


In [7]:
# === CELL 3: Safe feature engineering (lagged/rolling only, no future info) ===
import numpy as np
import pandas as pd

# Work on a copy of df
df_feat = df.copy()

# Basic lag features
df_feat['return_1'] = df_feat['close'].pct_change()               # current return (still safe)
df_feat['log_return_1'] = np.log(df_feat['close']).diff()

# Rolling statistics — all shifted by 1 to use only past data
wins = [3, 6, 12]
for w in wins:
    df_feat[f'roll_mean_{w}'] = df_feat['close'].shift(1).rolling(window=w, min_periods=1).mean()
    df_feat[f'roll_std_{w}']  = df_feat['close'].shift(1).rolling(window=w, min_periods=1).std()
    df_feat[f'roll_max_{w}']  = df_feat['close'].shift(1).rolling(window=w, min_periods=1).max()
    df_feat[f'roll_min_{w}']  = df_feat['close'].shift(1).rolling(window=w, min_periods=1).min()

# Momentum / volatility ratio features
df_feat['mom_3'] = df_feat['close'].shift(1) - df_feat['roll_mean_3']
df_feat['vol_ratio_3_12'] = df_feat['roll_std_3'] / (df_feat['roll_std_12'] + 1e-9)

# Volume dynamics
df_feat['vol_change'] = df_feat['volume'].pct_change()
df_feat['vol_roll_mean_6'] = df_feat['volume'].shift(1).rolling(6, min_periods=1).mean()

# Diagnostics
print("Feature-engineering complete.")
print("Shape before dropna:", df_feat.shape)
nan_summary = df_feat.isna().sum()
print("\nTop columns by missing values (should mainly be from first few rows):")
display(nan_summary[nan_summary > 0].sort_values(ascending=False).head(15))

# Drop NaNs produced by rolling (safe — only first few rows)
df_feat = df_feat.dropna().copy()
print("Shape after dropna:", df_feat.shape)

# Show sample of engineered features
print("\nSample of engineered columns (tail):")
cols_to_show = ['close','return_1','log_return_1','roll_mean_3','roll_std_3','mom_3','vol_change','vol_roll_mean_6']
display(df_feat[cols_to_show].tail(10))

print("\n=== End of CELL 3 diagnostics ===")


Feature-engineering complete.
Shape before dropna: (7155, 27)

Top columns by missing values (should mainly be from first few rows):


roll_std_6        2
roll_std_3        2
vol_ratio_3_12    2
roll_std_12       2
log_return_1      1
return_1          1
roll_min_3        1
roll_mean_6       1
roll_max_3        1
roll_mean_3       1
roll_min_6        1
roll_max_6        1
roll_max_12       1
roll_mean_12      1
roll_min_12       1
dtype: int64

Shape after dropna: (7153, 27)

Sample of engineered columns (tail):


Unnamed: 0_level_0,close,return_1,log_return_1,roll_mean_3,roll_std_3,mom_3,vol_change,vol_roll_mean_6
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-02-01 15:30:00,1643.65,-0.003232,-0.003238,1643.16,6.029959,5.82,-0.231334,2940.833333
2019-02-01 16:00:00,1646.63,0.001813,0.001811,1645.396667,3.103584,-1.746667,-0.021132,2753.5
2019-02-01 16:30:00,1648.77,0.0013,0.001299,1646.42,2.671198,0.21,-0.045961,2854.333333
2019-02-01 17:00:00,1649.28,0.000309,0.000309,1646.35,2.571459,2.42,-0.154015,3152.5
2019-02-01 17:30:00,1646.83,-0.001485,-0.001487,1648.226667,1.40607,1.053333,0.066437,3350.0
2019-02-01 18:00:00,1642.0,-0.002933,-0.002937,1648.293333,1.292685,-1.463333,0.139563,2858.833333
2019-02-01 18:30:00,1632.75,-0.005633,-0.005649,1646.036667,3.704272,-4.036667,0.347178,2692.166667
2019-02-01 19:00:00,1629.19,-0.00218,-0.002183,1640.526667,7.154693,-7.776667,0.144137,2835.666667
2019-02-01 19:30:00,1627.16,-0.001246,-0.001247,1634.646667,6.612264,-5.456667,-0.092584,3080.666667
2019-02-01 20:00:00,1624.95,-0.001358,-0.001359,1629.7,2.829682,-2.54,-0.647716,3280.666667



=== End of CELL 3 diagnostics ===


In [8]:
# === CELL 4 (fixed) ===
import numpy as np
import pandas as pd

# Use df_feat from previous cell
if 'df_feat' not in globals():
    raise RuntimeError("df_feat not found. Run CELL 3 first.")

df_label = df_feat.copy()

# Parameters
HORIZON = globals().get('HORIZON', 1)
THRESHOLD = globals().get('THRESHOLD', 0.0005)

# Create future close and return (safe)
df_label['close_future_h'] = df_label['close'].shift(-HORIZON)
df_label['future_return_h'] = (df_label['close_future_h'] / df_label['close']) - 1

# Binary target
df_label['target_thresholded'] = (df_label['future_return_h'] > THRESHOLD).astype(int)

# Drop rows where future_return_h is NaN (last horizon rows)
df_label = df_label.dropna(subset=['future_return_h']).copy()

# Basic prints
print("Rows after adding target and dropping NaNs:", len(df_label))
print("\nTarget class distribution (counts):")
print(df_label['target_thresholded'].value_counts())
print("\nTarget class distribution (proportions):")
print(df_label['target_thresholded'].value_counts(normalize=True))

# Show a sample of last few rows for verification
display(df_label[['close','close_future_h','future_return_h','target_thresholded']].tail(10))

# Ensure target is 1-D numeric Series
t = df_label['target_thresholded']
print("\nTarget type:", type(t), " dtype:", getattr(t, 'dtype', None))
# If by any chance it's a DataFrame-like, coerce to Series
if isinstance(t, pd.DataFrame):
    print("WARNING: target is a DataFrame — selecting the rightmost column as the target.")
    df_label['target_thresholded'] = t.iloc[:, -1].astype(int)

# Check duplicate column names
cols = list(df_label.columns)
dups = [c for c in set(cols) if cols.count(c) > 1]
print("\nDuplicate column names (if any):", dups)

# Correlations: only numeric columns (safe)
num_df = df_label.select_dtypes(include=[np.number]).copy()
if 'target_thresholded' not in num_df.columns:
    raise RuntimeError("'target_thresholded' not in numeric columns — unexpected.")

corr_with_target = num_df.corr()['target_thresholded'].abs().sort_values(ascending=False)
print("\nTop 20 numeric features by absolute correlation with target_thresholded:")
display(corr_with_target.head(20))

# Simple near-equality leak smoke tests (checks if any feature equals the future close/return often)
def near_equal_prop(arr_a, arr_b, tol=1e-8):
    a = np.asarray(arr_a, dtype=float)
    b = np.asarray(arr_b, dtype=float)
    mask = np.isfinite(a) & np.isfinite(b)
    if mask.sum() == 0:
        return 0.0
    return float(np.isclose(a[mask], b[mask], atol=tol, rtol=0).mean())

print("\nNear-equality checks (proportion of rows where feature ≈ future):")
for ref in ['close_future_h', 'future_return_h']:
    if ref not in num_df.columns:
        print(f"  {ref}: (not present in numeric columns)")
        continue
    suspicious = []
    for c in num_df.columns:
        if c in [ref, 'target_thresholded']:
            continue
        p = near_equal_prop(num_df[c], num_df[ref], tol=1e-8)
        if p > 0.01:  # more than 1% equal -> suspicious
            suspicious.append((c, p))
    if suspicious:
        print(f"  Suspicious near-equalities to {ref}:")
        for c,p in suspicious:
            print(f"    {c}: {p:.4f}")
    else:
        print(f"  No suspicious columns nearly equal to {ref} (>1%).")

print("\n=== End of CELL 4 (fixed) diagnostics ===")

# Keep df_label in globals for next cells
globals()['df_label'] = df_label


Rows after adding target and dropping NaNs: 7152

Target class distribution (counts):
target_thresholded
0    4236
1    2916
Name: count, dtype: int64

Target class distribution (proportions):
target_thresholded
0    0.592282
1    0.407718
Name: proportion, dtype: float64


Unnamed: 0_level_0,close,close_future_h,future_return_h,target_thresholded
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-02-01 15:00:00,1648.98,1643.65,-0.003232,0
2019-02-01 15:30:00,1643.65,1646.63,0.001813,1
2019-02-01 16:00:00,1646.63,1648.77,0.0013,1
2019-02-01 16:30:00,1648.77,1649.28,0.000309,0
2019-02-01 17:00:00,1649.28,1646.83,-0.001485,0
2019-02-01 17:30:00,1646.83,1642.0,-0.002933,0
2019-02-01 18:00:00,1642.0,1632.75,-0.005633,0
2019-02-01 18:30:00,1632.75,1629.19,-0.00218,0
2019-02-01 19:00:00,1629.19,1627.16,-0.001246,0
2019-02-01 19:30:00,1627.16,1624.95,-0.001358,0



Target type: <class 'pandas.core.series.Series'>  dtype: int64

Duplicate column names (if any): []

Top 20 numeric features by absolute correlation with target_thresholded:


target_thresholded    1.000000
future_return_h       0.592738
roll_std_3            0.046107
roll_std_6            0.043691
roll_std_12           0.041818
close_future_h        0.032048
vol_ratio_3_12        0.030196
volume                0.029969
high                  0.020800
roll_max_6            0.020717
roll_max_3            0.020704
roll_max_12           0.020682
open                  0.020433
roll_mean_3           0.020270
close                 0.020164
roll_mean_6           0.020139
low                   0.019945
roll_mean_12          0.019790
roll_min_3            0.019741
roll_min_6            0.019152
Name: target_thresholded, dtype: float64


Near-equality checks (proportion of rows where feature ≈ future):
  No suspicious columns nearly equal to close_future_h (>1%).
  No suspicious columns nearly equal to future_return_h (>1%).

=== End of CELL 4 (fixed) diagnostics ===


In [9]:
# CELL 5 — Parse datetime, convert dtypes, set index and show basic stats
# Replace previous CELL 5 with this cell.

df = df_raw.copy()

# Clean strings and combine
df['date'] = df['date'].astype(str).str.strip()
df['time'] = df['time'].astype(str).str.strip()
df['ts_str'] = df['date'] + ' ' + df['time']

# Parse timestamps using the sample format: "2017.05.12 18:00"
df['ts'] = pd.to_datetime(df['ts_str'], format="%Y.%m.%d %H:%M", errors='coerce')
n_bad = df['ts'].isna().sum()
print("Parsed timestamps. Unparsable count:", n_bad)
if n_bad > 0:
    display(df[df['ts'].isna()].head())

# Convert numeric columns (coerce errors to NaN so we can inspect)
for col in ['open', 'high', 'low', 'close', 'volume']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Quick checks
print("\nMissing values per column after conversion:")
print(df[['open','high','low','close','volume']].isna().sum())

# Set index and sort
df = df.set_index('ts').sort_index()
print("\nIndex range:", df.index.min(), "to", df.index.max())
print("Total rows:", len(df))
display(df.head(10))
display(df[['open','high','low','close','volume']].describe().T)


Parsed timestamps. Unparsable count: 0

Missing values per column after conversion:
open      0
high      0
low       0
close     0
volume    0
dtype: int64

Index range: 2017-05-12 18:00:00 to 2019-02-01 20:00:00
Total rows: 7155


Unnamed: 0_level_0,date,time,open,high,low,close,volume,ts_str
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-05-12 18:00:00,2017.05.12,18:00,960.01,961.24,960.0,960.66,978,2017.05.12 18:00
2017-05-12 18:30:00,2017.05.12,18:30,960.65,962.14,960.47,961.89,1017,2017.05.12 18:30
2017-05-12 19:00:00,2017.05.12,19:00,961.91,962.76,960.63,961.74,1384,2017.05.12 19:00
2017-05-12 19:30:00,2017.05.12,19:30,961.77,962.27,961.13,961.34,2110,2017.05.12 19:30
2017-05-15 12:30:00,2017.05.15,12:30,962.39,962.5,959.97,960.2,121,2017.05.15 12:30
2017-05-15 13:00:00,2017.05.15,13:00,960.1,960.7,958.09,958.68,372,2017.05.15 13:00
2017-05-15 13:30:00,2017.05.15,13:30,958.66,963.0,957.0,959.97,2512,2017.05.15 13:30
2017-05-15 14:00:00,2017.05.15,14:00,959.98,960.64,956.05,957.54,1887,2017.05.15 14:00
2017-05-15 14:30:00,2017.05.15,14:30,957.56,961.01,957.09,960.16,1727,2017.05.15 14:30
2017-05-15 15:00:00,2017.05.15,15:00,960.15,961.05,958.47,959.44,1424,2017.05.15 15:00


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
open,7155.0,1415.360063,335.949233,932.45,1015.645,1486.0,1682.965,2047.08
high,7155.0,1418.858921,337.081782,934.99,1018.335,1490.99,1686.36,2050.17
low,7155.0,1411.589758,334.653916,927.89,1012.18,1481.0,1677.97,2041.75
close,7155.0,1415.261593,335.846736,932.47,1016.265,1486.15,1682.96,2047.08
volume,7155.0,1683.95891,1151.533643,1.0,856.5,1578.0,2464.5,5639.0


In [10]:
# === RUN THIS: build modeling dataset and chronological train/test split ===
import pandas as pd
import numpy as np

# Prefer df_label (has target); otherwise use df_feat and build minimal label
if 'df_label' in globals():
    model_src = df_label.copy()
    print("Using df_label as source (has target).")
elif 'df_feat' in globals():
    print("df_label not found; building model_src from df_feat.")
    tmp = df_feat.copy()
    HORIZON = globals().get('HORIZON', 1)
    THRESHOLD = globals().get('THRESHOLD', 0.0005)
    tmp['close_future_h'] = tmp['close'].shift(-HORIZON)
    tmp['future_return_h'] = (tmp['close_future_h'] / tmp['close']) - 1
    tmp = tmp.dropna(subset=['future_return_h'])
    tmp['target_thresholded'] = (tmp['future_return_h'] > THRESHOLD).astype(int)
    model_src = tmp
else:
    raise RuntimeError("Neither 'df_label' nor 'df_feat' found. Run parsing and feature cells first.")

model_df = model_src.copy()

# Columns to exclude from features (identifiers and any future/leaky columns)
exclude_cols = ['date','time','ts_str','close_future_h','future_return_h','close_next','future_return_1']

# Build feature list (numeric features only)
features = [c for c in model_df.columns if c not in exclude_cols + ['target_thresholded']]
print("Total candidate features:", len(features))
print("Example feature names (first 20):", features[:20])

# Ensure selected features are numeric
non_numeric = model_df[features].select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric:
    print("WARNING: Non-numeric features detected and will be removed:", non_numeric)
    features = [c for c in features if c not in non_numeric]

# Prepare X and y
X = model_df[features].copy()
y = model_df['target_thresholded'].copy()

# Chronological 80/20 split (no shuffling)
split_idx = int(len(model_df) * 0.8)
X_train, X_test = X.iloc[:split_idx].copy(), X.iloc[split_idx:].copy()
y_train, y_test = y.iloc[:split_idx].copy(), y.iloc[split_idx:].copy()

print("\nTrain shape:", X_train.shape, " Test shape:", X_test.shape)
print("Train time range:", X_train.index.min(), "→", X_train.index.max())
print("Test  time range:", X_test.index.min(),  "→", X_test.index.max())

print("\nLabel distribution (train counts):")
print(y_train.value_counts())
print("\nLabel distribution (test counts):")
print(y_test.value_counts())

# Export to globals for the training cell
globals().update({
    'model_df': model_df,
    'X': X, 'y': y,
    'X_train': X_train, 'X_test': X_test,
    'y_train': y_train, 'y_test': y_test
})


Using df_label as source (has target).
Total candidate features: 24
Example feature names (first 20): ['open', 'high', 'low', 'close', 'volume', 'ts', 'return_1', 'log_return_1', 'roll_mean_3', 'roll_std_3', 'roll_max_3', 'roll_min_3', 'roll_mean_6', 'roll_std_6', 'roll_max_6', 'roll_min_6', 'roll_mean_12', 'roll_std_12', 'roll_max_12', 'roll_min_12']

Train shape: (5721, 23)  Test shape: (1431, 23)
Train time range: 2017-05-12 19:00:00 → 2018-09-26 17:00:00
Test  time range: 2018-09-26 17:30:00 → 2019-02-01 19:30:00

Label distribution (train counts):
target_thresholded
0    3407
1    2314
Name: count, dtype: int64

Label distribution (test counts):
target_thresholded
0    829
1    602
Name: count, dtype: int64


In [11]:
# === CELL 6: Train baseline models (LogisticRegression, SVM, XGBoost) with a progress bar ===
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- Safety check: required variables
for name in ['X_train','X_test','y_train','y_test']:
    if name not in globals():
        raise RuntimeError(f"Required variable '{name}' not found in globals. Run the train/test split cell first.")

# --- Model pipelines to train
models = [
    ("LogisticRegression", Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=2000, random_state=42))])),
    ("SVM_RBF",            Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, kernel='rbf', random_state=42))])),
    ("XGBoost",            Pipeline([("scaler", StandardScaler()), ("clf", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))]))
]

results = {}

print(f"Training {len(models)} models on {len(X_train)} training samples ...")

# train each model and report metrics
for name, pipe in tqdm(models, desc="Models", file=sys.stdout):
    print(f"\n--- Training {name} ---")
    # Fit
    pipe.fit(X_train, y_train)

    # Predict
    y_pred = pipe.predict(X_test)
    y_proba = None
    try:
        y_proba = pipe.predict_proba(X_test)[:, 1]
    except Exception:
        y_proba = None

    # Metrics
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_proba) if (y_proba is not None) else None

    # Print metrics
    print(f"Model: {name}")
    print("-" * max(len(name), 6))
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1-score:  {f1:.3f}")
    if auc is not None:
        print(f"ROC-AUC:   {auc:.3f}")
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Save results
    results[name] = {'model': pipe, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc}

    # Show top coefficients / feature importances where possible
    try:
        clf = pipe.named_steps['clf']
        feat_names = X_train.columns if hasattr(X_train, "columns") else None
        if hasattr(clf, 'coef_') and feat_names is not None:
            coefs = clf.coef_.ravel()
            top_coef = pd.Series(coefs, index=feat_names).abs().sort_values(ascending=False).head(10)
            print("\nTop absolute LR coefficients:")
            display(top_coef)
        elif hasattr(clf, 'feature_importances_') and feat_names is not None:
            fi = clf.feature_importances_
            top_fi = pd.Series(fi, index=feat_names).sort_values(ascending=False).head(10)
            print("\nTop feature importances (by XGBoost):")
            display(top_fi)
    except Exception as e:
        print("Could not extract model coefficients/importances:", repr(e))

# Summary print (safe formatting)
print("\nAll baseline models trained. Summary:")
for name, stats in results.items():
    auc_display = "None" if stats["auc"] is None else f"{stats['auc']:.3f}"
    print(f"{name}:  F1={stats['f1']:.3f}  Acc={stats['acc']:.3f}  AUC={auc_display}")

# Save results for next steps
globals()['baseline_results'] = results


Training 3 models on 5721 training samples ...
Models:   0%|          | 0/3 [00:00<?, ?it/s]
--- Training LogisticRegression ---
Model: LogisticRegression
------------------
Accuracy:  0.581
Precision: 0.506
Recall:    0.151
F1-score:  0.233
ROC-AUC:   0.527
Confusion matrix:
[[740  89]
 [511  91]]

Top absolute LR coefficients:


high              0.428117
roll_max_12       0.415224
roll_mean_12      0.290595
open              0.281830
roll_min_12       0.243565
low               0.162222
roll_min_6        0.124023
roll_std_12       0.119404
close             0.109290
vol_ratio_3_12    0.084081
dtype: float64


--- Training SVM_RBF ---
Model: SVM_RBF
-------
Accuracy:  0.568
Precision: 0.367
Recall:    0.037
F1-score:  0.066
ROC-AUC:   0.498
Confusion matrix:
[[791  38]
 [580  22]]
Models:  67%|██████▋   | 2/3 [00:07<00:03,  3.76s/it]
--- Training XGBoost ---
Model: XGBoost
-------
Accuracy:  0.521
Precision: 0.398
Recall:    0.269
F1-score:  0.321
ROC-AUC:   0.490
Confusion matrix:
[[584 245]
 [440 162]]

Top feature importances (by XGBoost):


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


roll_max_3     0.060421
roll_min_6     0.054324
vol_change     0.048849
roll_min_12    0.048371
roll_max_6     0.047675
high           0.046231
volume         0.046171
roll_std_3     0.045901
return_1       0.044619
roll_std_6     0.044503
dtype: float32

Models: 100%|██████████| 3/3 [00:07<00:00,  2.64s/it]

All baseline models trained. Summary:
LogisticRegression:  F1=0.233  Acc=0.581  AUC=0.527
SVM_RBF:  F1=0.066  Acc=0.568  AUC=0.498
XGBoost:  F1=0.321  Acc=0.521  AUC=0.490


In [12]:
# === CELL 7: TimeSeries-aware RandomizedSearchCV tuning for LR, SVM, XGBoost ===
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score
import warnings
warnings.filterwarnings("ignore")

# Safety checks
for name in ['X_train','y_train']:
    if name not in globals():
        raise RuntimeError(f"Required variable '{name}' not found in globals. Run the train/test split cell first.")

# CV setup
tscv = TimeSeriesSplit(n_splits=5)
f1_scorer = make_scorer(f1_score)

# Pipelines
pipe_lr  = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=2000, random_state=42))])
pipe_svm = Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, random_state=42))])
pipe_xgb = Pipeline([("scaler", StandardScaler()), ("clf", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))])

# Parameter distributions (use modest sizes for speed; increase n_iter if you want more thorough search)
param_lr = {
    "clf__C": np.logspace(-4, 3, 12),
    "clf__penalty": ["l2"],
    # consider 'class_weight': ['balanced'] as an alternative experiment
}

param_svm = {
    "clf__C": [0.01, 0.1, 1, 5, 10],
    "clf__gamma": ["scale", 0.01, 0.001],
    "clf__kernel": ["rbf"]
}

param_xgb = {
    "clf__n_estimators": [100, 200, 400],
    "clf__max_depth": [3, 5, 8],
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__subsample": [0.7, 0.9, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0]
}

# Helper to run RandomizedSearchCV
def run_rand_search(name, pipeline, param_dist, n_iter=12):
    print(f"\nRunning RandomizedSearchCV for {name} (n_iter={n_iter}) ...")
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=f1_scorer,
        cv=tscv,
        random_state=42,
        verbose=2,
        n_jobs=-1,
        refit=True
    )
    search.fit(X_train, y_train)
    print(f"-> {name} best CV F1: {search.best_score_:.4f}")
    print("-> Best params:", search.best_params_)
    return search

# Run searches (adjust n_iter to taste; start with 12 then increase for final runs)
search_lr  = run_rand_search("Logistic Regression", pipe_lr, param_lr, n_iter=12)
search_svm = run_rand_search("SVM (RBF)", pipe_svm, param_svm, n_iter=12)
search_xgb = run_rand_search("XGBoost", pipe_xgb, param_xgb, n_iter=12)

# Save to globals
globals().update({
    'search_lr': search_lr, 'search_svm': search_svm, 'search_xgb': search_xgb,
    'best_lr': search_lr.best_estimator_, 'best_svm': search_svm.best_estimator_, 'best_xgb': search_xgb.best_estimator_
})
print("\nTuning finished. Best estimators stored as: best_lr, best_svm, best_xgb")



Running RandomizedSearchCV for Logistic Regression (n_iter=12) ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
-> Logistic Regression best CV F1: 0.1626
-> Best params: {'clf__penalty': 'l2', 'clf__C': np.float64(231.0129700083158)}

Running RandomizedSearchCV for SVM (RBF) (n_iter=12) ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
-> SVM (RBF) best CV F1: 0.2999
-> Best params: {'clf__kernel': 'rbf', 'clf__gamma': 'scale', 'clf__C': 5}

Running RandomizedSearchCV for XGBoost (n_iter=12) ...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
-> XGBoost best CV F1: 0.3432
-> Best params: {'clf__subsample': 1.0, 'clf__n_estimators': 400, 'clf__max_depth': 8, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 1.0}

Tuning finished. Best estimators stored as: best_lr, best_svm, best_xgb


In [13]:
# === CELL 8: Evaluate tuned models on test set ===
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import numpy as np

# Required globals check
for v in ['best_lr','best_svm','best_xgb','X_test','y_test']:
    if v not in globals():
        raise RuntimeError(f"Required variable '{v}' missing. Run CELL 7 first.")

models = [
    ("Tuned LogisticRegression", globals()['best_lr']),
    ("Tuned SVM (RBF)", globals()['best_svm']),
    ("Tuned XGBoost", globals()['best_xgb'])
]

def evaluate_and_report(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    try:
        y_proba = model.predict_proba(X_test)[:,1]
    except Exception:
        y_proba = None
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    print(f"\n{name}")
    print("-"*len(name))
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1-score:  {f1:.3f}")
    if auc is not None: print(f"ROC-AUC:   {auc:.3f}")
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # show top features if available
    try:
        clf = model.named_steps['clf']
        feat_names = X_train.columns
        if hasattr(clf, 'coef_'):
            coefs = clf.coef_.ravel()
            display(pd.Series(coefs, index=feat_names).abs().sort_values(ascending=False).head(20))
        elif hasattr(clf, 'feature_importances_'):
            display(pd.Series(clf.feature_importances_, index=feat_names).sort_values(ascending=False).head(20))
    except Exception:
        pass

for name, m in models:
    evaluate_and_report(name, m, X_test, y_test)



Tuned LogisticRegression
------------------------
Accuracy:  0.584
Precision: 0.517
Recall:    0.176
F1-score:  0.263
ROC-AUC:   0.533
Confusion matrix:
[[730  99]
 [496 106]]


high               0.805324
roll_max_12        0.785361
roll_mean_12       0.547652
open               0.528647
roll_min_12        0.456421
low                0.302476
roll_min_6         0.236303
close              0.202550
return_1           0.129628
roll_min_3         0.126420
roll_mean_3        0.123471
roll_max_6         0.121680
roll_std_12        0.121226
roll_max_3         0.119111
log_return_1       0.089563
vol_ratio_3_12     0.084365
vol_roll_mean_6    0.077049
mom_3              0.060250
vol_change         0.043246
volume             0.028066
dtype: float64


Tuned SVM (RBF)
---------------
Accuracy:  0.544
Precision: 0.367
Recall:    0.115
F1-score:  0.175
ROC-AUC:   0.485
Confusion matrix:
[[710 119]
 [533  69]]

Tuned XGBoost
-------------
Accuracy:  0.527
Precision: 0.409
Recall:    0.279
F1-score:  0.332
ROC-AUC:   0.497
Confusion matrix:
[[586 243]
 [434 168]]


roll_max_3         0.050773
roll_min_12        0.049184
roll_mean_12       0.048989
roll_mean_6        0.048494
high               0.048387
roll_min_6         0.047054
low                0.046676
vol_change         0.046378
roll_mean_3        0.046340
roll_max_12        0.045800
roll_max_6         0.045758
volume             0.045121
vol_ratio_3_12     0.044807
roll_min_3         0.044519
roll_std_3         0.044425
roll_std_6         0.043819
vol_roll_mean_6    0.043704
roll_std_12        0.043513
return_1           0.043318
mom_3              0.042934
dtype: float32

In [14]:
# === CELL 9 — Class imbalance experiments ===
# Run this cell after you have X_train, X_test, y_train, y_test in globals.

import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Safety check
for name in ['X_train','X_test','y_train','y_test']:
    if name not in globals():
        raise RuntimeError(f"Required variable '{name}' not found - run the modeling split cell first.")

# Compute scale_pos_weight for XGBoost from training set
n_pos = int(y_train.sum())
n_neg = int(len(y_train) - n_pos)
scale_pos_weight = (n_neg / n_pos) if n_pos > 0 else 1.0
print(f"Train samples: {len(y_train)}  positives: {n_pos}  negatives: {n_neg}  scale_pos_weight: {scale_pos_weight:.3f}\n")

# Models with imbalance handling
models = [
    ("LR_balanced", Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))])),
    ("SVM_balanced", Pipeline([("scaler", StandardScaler()), ("clf", SVC(probability=True, class_weight='balanced', kernel='rbf', random_state=42))])),
    ("XGB_spw", Pipeline([("scaler", StandardScaler()), ("clf", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42,
                                                                               scale_pos_weight=scale_pos_weight))]))
]

results_imb = {}

print("Training with class-imbalance adjustments...\n")

for name, pipe in tqdm(models, desc="Models", file=sys.stdout):
    print(f"\n--- Training {name} ---")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = None
    try:
        y_proba = pipe.predict_proba(X_test)[:,1]
    except Exception:
        y_proba = None

    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    print(f"Model: {name}")
    print("-"*max(len(name),6))
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1-score:  {f1:.3f}")
    if auc is not None:
        print(f"ROC-AUC:   {auc:.3f}")
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    # store
    results_imb[name] = {'model': pipe, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc}

print("\nSummary (imbalance-handling):")
for n, s in results_imb.items():
    auc_display = "None" if s['auc'] is None else f"{s['auc']:.3f}"
    print(f"{n}: F1={s['f1']:.3f} Acc={s['acc']:.3f} AUC={auc_display}  Prec={s['prec']:.3f} Rec={s['rec']:.3f}")

# Save for later comparisons
globals()['results_imbalance'] = results_imb


Train samples: 5721  positives: 2314  negatives: 3407  scale_pos_weight: 1.472

Training with class-imbalance adjustments...

Models:   0%|          | 0/3 [00:00<?, ?it/s]
--- Training LR_balanced ---
Model: LR_balanced
-----------
Accuracy:  0.481
Precision: 0.429
Recall:    0.704
F1-score:  0.533
ROC-AUC:   0.527
Confusion matrix:
[[265 564]
 [178 424]]

--- Training SVM_balanced ---
Model: SVM_balanced
------------
Accuracy:  0.501
Precision: 0.412
Recall:    0.437
F1-score:  0.424
ROC-AUC:   0.500
Confusion matrix:
[[454 375]
 [339 263]]
Models:  67%|██████▋   | 2/3 [00:08<00:04,  4.17s/it]
--- Training XGB_spw ---
Model: XGB_spw
-------
Accuracy:  0.493
Precision: 0.390
Recall:    0.364
F1-score:  0.377
ROC-AUC:   0.480
Confusion matrix:
[[487 342]
 [383 219]]
Models: 100%|██████████| 3/3 [00:08<00:00,  2.83s/it]

Summary (imbalance-handling):
LR_balanced: F1=0.533 Acc=0.481 AUC=0.527  Prec=0.429 Rec=0.704
SVM_balanced: F1=0.424 Acc=0.501 AUC=0.500  Prec=0.412 Rec=0.437
XGB_spw: F

In [15]:
# === CELL 10: Corrected Purged / Embargoed Time-Series CV with diagnostics ===
import numpy as np
from sklearn.model_selection import BaseCrossValidator
from sklearn.metrics import f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

class PurgedTimeSeriesSplit(BaseCrossValidator):
    """
    Purged time-series split where the training set grows each fold and test folds are contiguous.
    Optionally apply an embargo (number of samples) to remove immediately preceding indices from training.
    """
    def __init__(self, n_splits=5, embargo=0):
        assert n_splits >= 2, "n_splits must be >=2"
        self.n_splits = n_splits
        self.embargo = int(embargo)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        # choose test_size so that we have n_splits test blocks
        test_size = n_samples // (self.n_splits + 1)
        if test_size < 1:
            raise ValueError("Not enough samples for the requested number of splits.")
        indices = np.arange(n_samples)
        # For i-th split: train_end = (i+1)*test_size, test_start = train_end, test_stop = test_start+test_size
        for i in range(self.n_splits):
            train_end = (i + 1) * test_size
            test_start = train_end
            test_stop = test_start + test_size if (test_start + test_size) <= n_samples else n_samples
            # apply embargo: remove last `embargo` samples ending at train_end from training
            embargo_start = max(0, train_end - self.embargo)
            train_indices = indices[:embargo_start]
            test_indices = indices[test_start:test_stop]
            yield train_indices, test_indices

# Diagnostics run
purged_cv = PurgedTimeSeriesSplit(n_splits=5, embargo=1)  # adjust embargo if you want
f1_scorer = make_scorer(f1_score)

pipe_lr_bal = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))
])

print("Using PurgedTimeSeriesSplit(n_splits=5, embargo=1) on X_train/y_train")

fold_f1s = []
fold_info = []
for fold, (train_idx, test_idx) in enumerate(purged_cv.split(X_train, y_train)):
    y_tr = y_train.iloc[train_idx]
    y_te = y_train.iloc[test_idx]
    # basic checks
    n_tr, n_te = len(train_idx), len(test_idx)
    pos_tr, pos_te = int(y_tr.sum()) if n_tr>0 else 0, int(y_te.sum()) if n_te>0 else 0
    fold_info.append((fold, n_tr, n_te, pos_tr, pos_te))
    print(f"\nFold {fold}: train_size={n_tr}, test_size={n_te}, pos_train={pos_tr}, pos_test={pos_te}")

    # If train is empty, skip (shouldn't happen with this splitter)
    if n_tr == 0:
        print("  -> Skipping fold because training set is empty.")
        fold_f1s.append(np.nan)
        continue

    # Fit & evaluate on this fold (using pipeline) with safe zero_division
    pipe_lr_bal.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    ypred = pipe_lr_bal.predict(X_train.iloc[test_idx])
    f1 = f1_score(y_train.iloc[test_idx], ypred, zero_division=0)
    fold_f1s.append(f1)
    print(f"  -> Fold {fold} F1: {f1:.4f}")

# Summary
fold_f1s_arr = np.array(fold_f1s, dtype=float)
valid = ~np.isnan(fold_f1s_arr)
if valid.sum() == 0:
    print("\nAll folds produced NaN F1 — check data and splitter settings.")
else:
    print("\nFold F1s:", np.round(fold_f1s_arr, 4))
    print(f"Mean F1 (valid folds): {np.nanmean(fold_f1s_arr):.4f}  Std: {np.nanstd(fold_f1s_arr):.4f}")


Using PurgedTimeSeriesSplit(n_splits=5, embargo=1) on X_train/y_train

Fold 0: train_size=952, test_size=953, pos_train=384, pos_test=348
  -> Fold 0 F1: 0.4730

Fold 1: train_size=1905, test_size=953, pos_train=732, pos_test=377
  -> Fold 1 F1: 0.0732

Fold 2: train_size=2858, test_size=953, pos_train=1108, pos_test=402
  -> Fold 2 F1: 0.5637

Fold 3: train_size=3811, test_size=953, pos_train=1510, pos_test=390
  -> Fold 3 F1: 0.5134

Fold 4: train_size=4764, test_size=953, pos_train=1901, pos_test=413
  -> Fold 4 F1: 0.5176

Fold F1s: [0.473  0.0732 0.5637 0.5134 0.5176]
Mean F1 (valid folds): 0.4282  Std: 0.1798


In [16]:
# CELL DIAG_1 — per-fold diagnostics (run immediately)
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Ensure required objects exist
for var in ['X_train','y_train']:
    if var not in globals():
        raise RuntimeError(f"Required variable '{var}' not found. Re-run prior cells to produce it.")

# Use the same purged CV you used before (5 splits, embargo=1)
class PurgedTimeSeriesSplit:
    def __init__(self, n_splits=5, embargo=1):
        self.n_splits = n_splits
        self.embargo = int(embargo)
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        test_size = n_samples // (self.n_splits + 1)
        indices = np.arange(n_samples)
        for i in range(self.n_splits):
            train_end = (i + 1) * test_size
            test_start = train_end
            test_stop = test_start + test_size if (test_start + test_size) <= n_samples else n_samples
            embargo_start = max(0, train_end - self.embargo)
            train_indices = indices[:embargo_start]
            test_indices = indices[test_start:test_stop]
            yield train_indices, test_indices

purged_cv = PurgedTimeSeriesSplit(n_splits=5, embargo=1)

# Choose the pipeline to inspect (use the same variant you used in purged CV earlier)
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))
])

# Run per-fold diagnostics
fold_reports = []
for fold, (train_idx, test_idx) in enumerate(purged_cv.split(X_train, y_train)):
    print(f"\n=== Fold {fold} ===")
    print(f"train_size={len(train_idx)}, test_size={len(test_idx)}")
    # class counts
    if len(train_idx)>0:
        print("pos_train:", int(y_train.iloc[train_idx].sum()), "neg_train:", len(train_idx)-int(y_train.iloc[train_idx].sum()))
    print("pos_test:", int(y_train.iloc[test_idx].sum()), "neg_test:", len(test_idx)-int(y_train.iloc[test_idx].sum()))
    if len(train_idx) == 0:
        print("Skipping: empty train set")
        continue
    # fit & eval
    pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    ypred = pipe.predict(X_train.iloc[test_idx])
    cm = confusion_matrix(y_train.iloc[test_idx], ypred)
    prec = precision_score(y_train.iloc[test_idx], ypred, zero_division=0)
    rec  = recall_score(y_train.iloc[test_idx], ypred, zero_division=0)
    f1   = f1_score(y_train.iloc[test_idx], ypred, zero_division=0)
    print("confusion_matrix:\n", cm)
    print(f"precision={prec:.3f}  recall={rec:.3f}  f1={f1:.3f}")
    fold_reports.append((fold, len(train_idx), len(test_idx), int(y_train.iloc[train_idx].sum()), int(y_train.iloc[test_idx].sum()), cm.tolist(), prec, rec, f1))

    # quick feature drift check: compare means of top 10 numeric features between train and test
    try:
        feat_means_tr = X_train.iloc[train_idx].mean().sort_values(ascending=False)
        feat_means_te = X_train.iloc[test_idx].mean()
        top_feats = feat_means_tr.index[:10].tolist()
        df_comp = pd.DataFrame({
            'feat': top_feats,
            'train_mean': X_train.iloc[train_idx][top_feats].mean().values,
            'test_mean' : X_train.iloc[test_idx][top_feats].mean().values
        })
        print("\nTop feature means (train vs test) for top 10 train-means:")
        display(df_comp)
    except Exception as e:
        print("Feature drift check failed:", e)



=== Fold 0 ===
train_size=952, test_size=953
pos_train: 384 neg_train: 568
pos_test: 348 neg_test: 605
confusion_matrix:
 [[288 317]
 [142 206]]
precision=0.394  recall=0.592  f1=0.473

Top feature means (train vs test) for top 10 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,vol_roll_mean_6,1171.320816,1063.697971
1,volume,1170.585084,1064.801679
2,roll_max_12,999.213971,978.250514
3,roll_max_6,996.922805,976.43829
4,high,995.505861,975.332025
5,roll_max_3,995.365158,975.179119
6,open,993.792626,973.907009
7,close,993.734979,973.836159
8,roll_mean_3,993.684207,973.853204
9,roll_mean_6,993.643502,973.871221



=== Fold 1 ===
train_size=1905, test_size=953
pos_train: 732 neg_train: 1173
pos_test: 377 neg_test: 576
confusion_matrix:
 [[558  18]
 [362  15]]
precision=0.455  recall=0.040  f1=0.073

Top feature means (train vs test) for top 10 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1117.777953,1473.750262
1,vol_roll_mean_6,1117.011942,1474.888073
2,roll_max_12,988.732031,1173.75553
3,roll_max_6,986.680913,1171.844827
4,high,985.417696,1171.054008
5,roll_max_3,985.272583,1170.387775
6,open,983.847459,1168.904281
7,close,983.784924,1169.060546
8,roll_mean_3,983.768279,1168.390423
9,roll_mean_6,983.758603,1167.885853



=== Fold 2 ===
train_size=2858, test_size=953
pos_train: 1108 neg_train: 1750
pos_test: 402 neg_test: 551
confusion_matrix:
 [[107 444]
 [ 70 332]]
precision=0.428  recall=0.826  f1=0.564

Top feature means (train vs test) for top 10 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1236.606368,1956.527807
1,vol_roll_mean_6,1236.39576,1957.56978
2,roll_max_12,1050.312446,1479.845782
3,roll_max_6,1048.31204,1475.122833
4,high,1047.206679,1471.956537
5,roll_max_3,1046.887551,1471.480965
6,close,1045.453341,1467.446139
7,open,1045.443027,1467.513924
8,roll_mean_3,1045.21879,1467.129091
9,roll_mean_6,1045.043199,1466.889934



=== Fold 3 ===
train_size=3811, test_size=953
pos_train: 1510 neg_train: 2301
pos_test: 390 neg_test: 563
confusion_matrix:
 [[258 305]
 [150 240]]
precision=0.440  recall=0.615  f1=0.513

Top feature means (train vs test) for top 10 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,vol_roll_mean_6,1416.856752,1434.562434
1,volume,1416.850171,1436.405037
2,roll_max_12,1157.687326,1627.590178
3,roll_max_6,1155.002781,1624.117492
4,high,1153.382401,1622.209609
5,roll_max_3,1153.023737,1621.617119
6,open,1150.948843,1619.007188
7,close,1150.939567,1619.037427
8,roll_mean_3,1150.684229,1618.509645
9,roll_mean_6,1150.492994,1618.112086



=== Fold 4 ===
train_size=4764, test_size=953
pos_train: 1901 neg_train: 2863
pos_test: 413 neg_test: 540
confusion_matrix:
 [[257 283]
 [170 243]]
precision=0.462  recall=0.588  f1=0.518

Top feature means (train vs test) for top 10 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1420.766793,1712.741868
1,vol_roll_mean_6,1420.298261,1707.812697
2,roll_max_12,1251.634446,1893.840262
3,roll_max_6,1248.79224,1889.710986
4,high,1247.114691,1887.421364
5,roll_max_3,1246.709216,1886.73978
6,open,1244.52721,1883.7234
7,close,1244.526194,1883.747146
8,roll_mean_3,1244.215985,1883.145792
9,roll_mean_6,1243.983166,1882.715953


In [17]:
# CELL DIAG_2 — increase embargo and re-run per-fold diagnostics
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# safety checks
for name in ['X_train','y_train']:
    if name not in globals():
        raise RuntimeError(f"Required variable '{name}' not found. Re-run prior parsing/feature cells first.")

# PurgedTimeSeriesSplit but with larger embargo
class PurgedTimeSeriesSplit:
    def __init__(self, n_splits=5, embargo=3):
        self.n_splits = n_splits
        self.embargo = int(embargo)
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        test_size = n_samples // (self.n_splits + 1)
        indices = np.arange(n_samples)
        for i in range(self.n_splits):
            train_end = (i + 1) * test_size
            test_start = train_end
            test_stop = min(n_samples, test_start + test_size)
            # apply embargo by removing last `embargo` indices before test from train
            embargo_start = max(0, train_end - self.embargo)
            train_indices = indices[:embargo_start]
            test_indices = indices[test_start:test_stop]
            yield train_indices, test_indices

purged_cv = PurgedTimeSeriesSplit(n_splits=5, embargo=3)

# Use a robust pipeline for diagnostics
pipe = Pipeline([
    ("scaler", RobustScaler()), 
    ("clf", LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))
])

print("Running per-fold diagnostics with embargo=3 and RobustScaler...\n")
for fold, (train_idx, test_idx) in enumerate(purged_cv.split(X_train, y_train)):
    print(f"=== Fold {fold} ===")
    print(f"train_size={len(train_idx)}, test_size={len(test_idx)}")
    if len(train_idx) == 0:
        print("Empty train set for this fold — adjust n_splits/embargo.")
        continue
    print("pos_train:", int(y_train.iloc[train_idx].sum()), "neg_train:", len(train_idx)-int(y_train.iloc[train_idx].sum()))
    print("pos_test:", int(y_train.iloc[test_idx].sum()), "neg_test:", len(test_idx)-int(y_train.iloc[test_idx].sum()))
    # fit & evaluate
    pipe.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    ypred = pipe.predict(X_train.iloc[test_idx])
    cm = confusion_matrix(y_train.iloc[test_idx], ypred)
    prec = precision_score(y_train.iloc[test_idx], ypred, zero_division=0)
    rec  = recall_score(y_train.iloc[test_idx], ypred, zero_division=0)
    f1   = f1_score(y_train.iloc[test_idx], ypred, zero_division=0)
    acc  = accuracy_score(y_train.iloc[test_idx], ypred)
    print("confusion_matrix:\n", cm)
    print(f"precision={prec:.3f}  recall={rec:.3f}  f1={f1:.3f}  acc={acc:.3f}")

    # small drift check: top 6 numeric cols by train mean, show train vs test mean
    try:
        feat_means_tr = X_train.iloc[train_idx].mean().sort_values(ascending=False)
        top_feats = feat_means_tr.index[:6].tolist()
        df_comp = pd.DataFrame({
            'feat': top_feats,
            'train_mean': X_train.iloc[train_idx][top_feats].mean().values,
            'test_mean' : X_train.iloc[test_idx][top_feats].mean().values
        })
        print("\nTop feature means (train vs test) for top 6 train-means:")
        display(df_comp)
    except Exception as e:
        print("Feature-drift check failed:", e)
    print("\n")


Running per-fold diagnostics with embargo=3 and RobustScaler...

=== Fold 0 ===
train_size=950, test_size=953
pos_train: 383 neg_train: 567
pos_test: 348 neg_test: 605
confusion_matrix:
 [[284 321]
 [144 204]]
precision=0.389  recall=0.586  f1=0.467  acc=0.512

Top feature means (train vs test) for top 6 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,vol_roll_mean_6,1172.308509,1063.697971
1,volume,1170.477895,1064.801679
2,roll_max_12,999.235095,978.250514
3,roll_max_6,996.939105,976.43829
4,high,995.515895,975.332025
5,roll_max_3,995.380895,975.179119




=== Fold 1 ===
train_size=1903, test_size=953
pos_train: 730 neg_train: 1173
pos_test: 377 neg_test: 576
confusion_matrix:
 [[559  17]
 [362  15]]
precision=0.469  recall=0.040  f1=0.073  acc=0.602

Top feature means (train vs test) for top 6 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1117.790331,1473.750262
1,vol_roll_mean_6,1116.264976,1474.888073
2,roll_max_12,988.742154,1173.75553
3,roll_max_6,986.689932,1171.844827
4,high,985.424241,1171.054008
5,roll_max_3,985.281172,1170.387775




=== Fold 2 ===
train_size=2856, test_size=953
pos_train: 1107 neg_train: 1749
pos_test: 402 neg_test: 551
confusion_matrix:
 [[110 441]
 [ 75 327]]
precision=0.426  recall=0.813  f1=0.559  acc=0.459

Top feature means (train vs test) for top 6 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1236.840336,1956.527807
1,vol_roll_mean_6,1235.704278,1957.56978
2,roll_max_12,1050.130823,1479.845782
3,roll_max_6,1048.13534,1475.122833
4,high,1047.031404,1471.956537
5,roll_max_3,1046.712489,1471.480965




=== Fold 3 ===
train_size=3809, test_size=953
pos_train: 1509 neg_train: 2300
pos_test: 390 neg_test: 563
confusion_matrix:
 [[255 308]
 [144 246]]
precision=0.444  recall=0.631  f1=0.521  acc=0.526

Top feature means (train vs test) for top 6 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1417.000525,1436.405037
1,vol_roll_mean_6,1416.974162,1434.562434
2,roll_max_12,1157.534224,1627.590178
3,roll_max_6,1154.84827,1624.117492
4,high,1153.226556,1622.209609
5,roll_max_3,1152.868186,1621.617119




=== Fold 4 ===
train_size=4762, test_size=953
pos_train: 1901 neg_train: 2861
pos_test: 413 neg_test: 540
confusion_matrix:
 [[233 307]
 [161 252]]
precision=0.451  recall=0.610  f1=0.519  acc=0.509

Top feature means (train vs test) for top 6 train-means:


Unnamed: 0,feat,train_mean,test_mean
0,volume,1420.561319,1712.741868
1,vol_roll_mean_6,1420.287187,1707.812697
2,roll_max_12,1251.444469,1893.840262
3,roll_max_6,1248.60164,1889.710986
4,high,1246.922979,1887.421364
5,roll_max_3,1246.518359,1886.73978






In [18]:
# === CELL B — Leak-safe hyperparameter tuning with PurgedTimeSeriesSplit (embargo=3) ===
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Define purged CV (embargo=3)
class PurgedTimeSeriesSplit:
    def __init__(self, n_splits=5, embargo=3):
        self.n_splits = n_splits
        self.embargo = int(embargo)
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        test_size = n_samples // (self.n_splits + 1)
        idx = np.arange(n_samples)
        for i in range(self.n_splits):
            train_end = (i + 1) * test_size
            test_start = train_end
            test_stop = min(n_samples, test_start + test_size)
            embargo_start = max(0, train_end - self.embargo)
            train_idx = idx[:embargo_start]
            test_idx  = idx[test_start:test_stop]
            yield train_idx, test_idx

purged_cv = PurgedTimeSeriesSplit(n_splits=5, embargo=3)
f1_scorer = make_scorer(f1_score)

# Pipelines
pipe_lr  = Pipeline([("scaler", RobustScaler()),
                     ("clf", LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42))])
pipe_svm = Pipeline([("scaler", RobustScaler()),
                     ("clf", SVC(probability=True, class_weight='balanced', random_state=42))])
pipe_xgb = Pipeline([("scaler", RobustScaler()),
                     ("clf", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42,
                                           scale_pos_weight=int((y_train==0).sum()/(y_train==1).sum()))) ])

# Parameter grids
param_lr  = {"clf__C": np.logspace(-4, 2, 10)}
param_svm = {"clf__C": [0.01, 0.1, 1, 5, 10], "clf__gamma": ["scale", 0.01, 0.001], "clf__kernel": ["rbf"]}
param_xgb = {"clf__n_estimators": [100,200,400],
             "clf__max_depth": [3,5,8],
             "clf__learning_rate": [0.01,0.05,0.1],
             "clf__subsample": [0.7,0.9],
             "clf__colsample_bytree": [0.7,0.9]}

def tune_model(name, pipe, param_dist, n_iter=10):
    print(f"\nTuning {name} with purged CV (embargo=3)...")
    search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=n_iter,
                                scoring=f1_scorer, cv=purged_cv, n_jobs=-1,
                                random_state=42, verbose=2, refit=True)
    search.fit(X_train, y_train)
    print(f"Best CV F1: {search.best_score_:.4f}")
    print("Best params:", search.best_params_)
    return search.best_estimator_

best_lr_purged  = tune_model("LogisticRegression", pipe_lr, param_lr)
best_svm_purged = tune_model("SVM", pipe_svm, param_svm)
best_xgb_purged = tune_model("XGBoost", pipe_xgb, param_xgb)

# Evaluate tuned models on the holdout test set
# === corrected evaluation print block ===
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

for name, model in [("LR_purged", best_lr_purged),
                    ("SVM_purged", best_svm_purged),
                    ("XGB_purged", best_xgb_purged)]:
    y_pred = model.predict(X_test)
    try:
        y_proba = model.predict_proba(X_test)[:, 1]
    except Exception:
        y_proba = None

    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    auc  = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    auc_str = f"{auc:.3f}" if auc is not None else "None"
    print(f"\n{name}:  Acc={acc:.3f}  Prec={prec:.3f}  Rec={rec:.3f}  F1={f1:.3f}  AUC={auc_str}")
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))



Tuning LogisticRegression with purged CV (embargo=3)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best CV F1: 0.4422
Best params: {'clf__C': np.float64(0.00046415888336127773)}

Tuning SVM with purged CV (embargo=3)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best CV F1: 0.5174
Best params: {'clf__kernel': 'rbf', 'clf__gamma': 0.01, 'clf__C': 0.1}

Tuning XGBoost with purged CV (embargo=3)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best CV F1: 0.3311
Best params: {'clf__subsample': 0.9, 'clf__n_estimators': 400, 'clf__max_depth': 3, 'clf__learning_rate': 0.1, 'clf__colsample_bytree': 0.9}

LR_purged:  Acc=0.449  Prec=0.425  Rec=0.874  F1=0.571  AUC=0.511
Confusion matrix:
 [[116 713]
 [ 76 526]]

SVM_purged:  Acc=0.454  Prec=0.426  Rec=0.855  F1=0.569  AUC=0.524
Confusion matrix:
 [[135 694]
 [ 87 515]]

XGB_purged:  Acc=0.538  Prec=0.407  Rec=0.214  F1=0.281  AUC=0.493
Confusion matrix:
 [[641 188]
 [473 129]]


In [19]:
# === Cell News_2: Parse all JSON news files ===
import os, json
import pandas as pd
from tqdm import tqdm

NEWS_DIR = r"C:\Users\amanb\OneDrive\Desktop\SWM Project\Extracted_News"

rows = []
for root, _, files in os.walk(NEWS_DIR):
    for f in files:
        if f.lower().endswith(".json"):
            path = os.path.join(root, f)
            try:
                with open(path, "r", encoding="utf-8") as fh:
                    data = json.load(fh)
                published = data.get("published") or data.get("thread", {}).get("published")
                title = data.get("title") or ""
                text  = data.get("text") or ""
                site  = data.get("thread", {}).get("site_full") or data.get("thread", {}).get("site") or ""
                url   = data.get("url") or data.get("thread", {}).get("url") or ""
                lang  = data.get("language") or data.get("thread", {}).get("language") or "unknown"

                # combine title+text for sentiment
                full_text = (title + " " + text).strip()

                rows.append({
                    "published": published,
                    "title": title,
                    "text": text,
                    "site": site,
                    "url": url,
                    "language": lang,
                    "full_text": full_text
                })
            except Exception as e:
                print("Error reading", f, ":", e)

news_df = pd.DataFrame(rows)

# --- Basic cleaning ---
# Parse datetime with timezone awareness; drop unparsable
news_df['published'] = pd.to_datetime(news_df['published'], errors='coerce', utc=True)
news_df = news_df.dropna(subset=['published'])

# Keep only English
news_df = news_df[news_df['language'].str.lower() == 'english']

# Drop empty texts
news_df = news_df[news_df['full_text'].str.strip().astype(bool)]

print("Parsed articles:", len(news_df))
print("Date range:", news_df['published'].min(), "→", news_df['published'].max())
print("Unique sites:", news_df['site'].nunique())

display(news_df[['published','site','title']].head(10))


Parsed articles: 70731
Date range: 2017-12-07 20:00:00+00:00 → 2019-02-07 23:10:00+00:00
Unique sites: 1727


Unnamed: 0,published,site,title
0,2018-01-03 20:33:00+00:00,www.nephrologynews.com,Annual RPA meeting emphasizes physician leader...
1,2018-01-01 12:36:00+00:00,www.thelincolnianonline.com,Somewhat Positive News Coverage Somewhat Unlik...
2,2018-01-01 08:47:00+00:00,finance.yahoo.com,There is a 40% chance Apple will acquire Netfl...
3,2018-01-01 12:08:00+00:00,ledgergazette.com,Apple Inc. (AAPL) Short Interest Down 8.1% in ...
4,2018-01-01 20:02:00+00:00,www.fool.com,2 Warren Buffett Stocks to Consider Buying Now...
5,2018-01-01 14:20:00+00:00,www.dailypolitical.com,Apple Inc. (AAPL) Shares Sold by Peoples Bank OH
6,2018-01-01 21:46:00+00:00,ledgergazette.com,"Sumitomo Life Insurance Co. Sells 5,920 Shares..."
7,2018-01-01 10:07:00+00:00,heraldks.com,Centre Asset Management Has Increased By $1.69...
8,2018-01-01 01:29:00+00:00,www.financialnewsusa.com,Here’s How Much The FANG+ Stocks Gained In 201...
9,2018-01-03 00:00:00+00:00,www.bnn.ca,Intel chips design flaw fix causes them to slo...


In [20]:
# === Cell News_3 — Compute VADER sentiment on all articles ===
%pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
import numpy as np
import pandas as pd

analyzer = SentimentIntensityAnalyzer()

# Compute sentiment for each article
scores = []
for text in tqdm(news_df['full_text'], desc="VADER sentiment"):
    if not isinstance(text, str) or len(text.strip()) == 0:
        scores.append({'vader_neg': np.nan, 'vader_neu': np.nan,
                       'vader_pos': np.nan, 'vader_compound': np.nan})
        continue
    s = analyzer.polarity_scores(text)
    scores.append({'vader_neg': s['neg'], 'vader_neu': s['neu'],
                   'vader_pos': s['pos'], 'vader_compound': s['compound']})

# Merge scores into dataframe
sent_df = pd.DataFrame(scores)
news_df = pd.concat([news_df.reset_index(drop=True), sent_df], axis=1)

print("VADER sentiment computed for", len(news_df), "articles")
display(news_df[['published','site','vader_compound','vader_pos','vader_neg','title']].head(10))

# quick stats
print("\nSummary of compound scores:")
display(news_df['vader_compound'].describe())
print("Proportion positive (>0.05):", (news_df['vader_compound']>0.05).mean())
print("Proportion negative (<-0.05):", (news_df['vader_compound']<-0.05).mean())





VADER sentiment: 100%|██████████| 70731/70731 [28:04<00:00, 42.00it/s]  

VADER sentiment computed for 70731 articles





Unnamed: 0,published,site,vader_compound,vader_pos,vader_neg,title
0,2018-01-03 20:33:00+00:00,www.nephrologynews.com,0.9983,0.137,0.006,Annual RPA meeting emphasizes physician leader...
1,2018-01-01 12:36:00+00:00,www.thelincolnianonline.com,0.9751,0.076,0.029,Somewhat Positive News Coverage Somewhat Unlik...
2,2018-01-01 08:47:00+00:00,finance.yahoo.com,0.8625,0.066,0.038,There is a 40% chance Apple will acquire Netfl...
3,2018-01-01 12:08:00+00:00,ledgergazette.com,0.9959,0.096,0.018,Apple Inc. (AAPL) Short Interest Down 8.1% in ...
4,2018-01-01 20:02:00+00:00,www.fool.com,0.9986,0.136,0.03,2 Warren Buffett Stocks to Consider Buying Now...
5,2018-01-01 14:20:00+00:00,www.dailypolitical.com,0.9979,0.114,0.005,Apple Inc. (AAPL) Shares Sold by Peoples Bank OH
6,2018-01-01 21:46:00+00:00,ledgergazette.com,0.9978,0.115,0.018,"Sumitomo Life Insurance Co. Sells 5,920 Shares..."
7,2018-01-01 10:07:00+00:00,heraldks.com,0.999,0.149,0.026,Centre Asset Management Has Increased By $1.69...
8,2018-01-01 01:29:00+00:00,www.financialnewsusa.com,0.9964,0.173,0.028,Here’s How Much The FANG+ Stocks Gained In 201...
9,2018-01-03 00:00:00+00:00,www.bnn.ca,0.6803,0.048,0.032,Intel chips design flaw fix causes them to slo...



Summary of compound scores:


count    70731.000000
mean         0.702274
std          0.605657
min         -1.000000
25%          0.867900
50%          0.990200
75%          0.997800
max          1.000000
Name: vader_compound, dtype: float64

Proportion positive (>0.05): 0.8580678910237378
Proportion negative (<-0.05): 0.13373202697544218


In [21]:
# === Cell News_4 — aggregate news sentiment into 30-minute bins ===
import pandas as pd
import numpy as np

articles_df = news_df.copy()
articles_df['published_ts'] = pd.to_datetime(articles_df['published']).dt.tz_convert(None)

# Round down to 30-minute bins
articles_df['bin_ts'] = articles_df['published_ts'].dt.floor('30T')

# Define helpers
def is_pos(x): return (x > 0.05)
def is_neg(x): return (x < -0.05)

# Aggregate per 30-min bin
agg = articles_df.groupby('bin_ts').agg(
    news_count=('vader_compound','count'),
    vader_mean =('vader_compound','mean'),
    vader_sum  =('vader_compound','sum'),
    vader_std  =('vader_compound','std'),
    pos_count  =('vader_compound',lambda s:is_pos(s).sum()),
    neg_count  =('vader_compound',lambda s:is_neg(s).sum())
)
agg['pos_prop'] = agg['pos_count']/agg['news_count']
agg['neg_prop'] = agg['neg_count']/agg['news_count']

# Fill missing std with 0
agg['vader_std'] = agg['vader_std'].fillna(0)

# Rolling features (2 h = 4 bins, 6 h = 12 bins)
agg['vader_mean_roll_4']  = agg['vader_mean'].rolling(4,  min_periods=1).mean()
agg['vader_mean_roll_12'] = agg['vader_mean'].rolling(12, min_periods=1).mean()
agg['news_count_roll_4']  = agg['news_count'].rolling(4,  min_periods=1).sum()
agg['news_count_roll_12'] = agg['news_count'].rolling(12, min_periods=1).sum()

print("Aggregated bins:", len(agg))
display(agg.head(8))


Aggregated bins: 14759


Unnamed: 0_level_0,news_count,vader_mean,vader_sum,vader_std,pos_count,neg_count,pos_prop,neg_prop,vader_mean_roll_4,vader_mean_roll_12,news_count_roll_4,news_count_roll_12
bin_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-12-07 20:00:00,1,0.9898,0.9898,0.0,1,0,1.0,0.0,0.9898,0.9898,1.0,1.0
2017-12-08 21:30:00,1,0.9998,0.9998,0.0,1,0,1.0,0.0,0.9948,0.9948,2.0,2.0
2017-12-12 01:30:00,1,-0.9976,-0.9976,0.0,0,1,0.0,1.0,0.330667,0.330667,3.0,3.0
2017-12-12 22:00:00,1,0.9988,0.9988,0.0,1,0,1.0,0.0,0.4977,0.4977,4.0,4.0
2017-12-14 11:30:00,1,0.9277,0.9277,0.0,1,0,1.0,0.0,0.482175,0.5837,4.0,5.0
2017-12-14 12:30:00,1,0.9552,0.9552,0.0,1,0,1.0,0.0,0.471025,0.645617,4.0,6.0
2017-12-15 02:30:00,1,0.9989,0.9989,0.0,1,0,1.0,0.0,0.97015,0.696086,4.0,7.0
2017-12-15 12:00:00,1,0.8531,0.8531,0.0,1,0,1.0,0.0,0.933725,0.715712,4.0,8.0


In [22]:
# === REBUILD_MODEL_DF_SAFE ===
import pandas as pd
import numpy as np

# Safety checks
if 'df' not in globals():
    raise RuntimeError("Original parsed DataFrame 'df' not found. Run parsing cell 1-5 first.")
required_cols = {'open','high','low','close','volume'}
if not required_cols.issubset(set(df.columns)):
    raise RuntimeError(f"'df' missing required cols. Required: {required_cols}. Found: {list(df.columns)}")

# parameters (use existing if defined)
HORIZON = globals().get('HORIZON', 1)
THRESHOLD = globals().get('THRESHOLD', 0.0005)

def build_clean_model_df(prices_df, horizon=1, threshold=0.0005):
    p = prices_df.copy().sort_index()

    # create shifted historical series (values used at time t are from <= t-1)
    close_prev = p['close'].shift(1)
    high_prev = p['high'].shift(1)
    low_prev  = p['low'].shift(1)
    vol_prev  = p['volume'].shift(1)

    # Basic lagged returns (based on close_prev)
    p['return_1'] = close_prev.pct_change().fillna(0)
    p['log_return_1'] = np.log(close_prev).diff().fillna(0)

    # EMAs on shifted close
    p['ema_8']  = close_prev.ewm(span=8, adjust=False).mean()
    p['ema_21'] = close_prev.ewm(span=21, adjust=False).mean()
    p['ema_ratio_8_21'] = p['ema_8'] / (p['ema_21'] + 1e-9)

    # MACD on shifted close
    ema12 = close_prev.ewm(span=12, adjust=False).mean()
    ema26 = close_prev.ewm(span=26, adjust=False).mean()
    macd = ema12 - ema26
    macd_sig = macd.ewm(span=9, adjust=False).mean()
    p['macd'] = macd
    p['macd_sig'] = macd_sig
    p['macd_hist'] = macd - macd_sig

    # RSI (14) on shifted close
    delta = close_prev.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    roll_gain = gain.rolling(14, min_periods=1).mean()
    roll_loss = loss.rolling(14, min_periods=1).mean()
    rs = roll_gain / (roll_loss + 1e-9)
    p['rsi_14'] = 100 - (100 / (1 + rs))

    # Bollinger bands (20) on shifted close
    ma20 = close_prev.rolling(20, min_periods=1).mean()
    std20 = close_prev.rolling(20, min_periods=1).std().fillna(0)
    p['bb_upper'] = ma20 + 2 * std20
    p['bb_lower'] = ma20 - 2 * std20
    p['bb_width'] = (p['bb_upper'] - p['bb_lower']) / (ma20 + 1e-9)

    # ATR using shifted highs/lows and an earlier previous close for TR
    prev_close_for_tr = p['close'].shift(2)
    tr1 = high_prev - low_prev
    tr2 = (high_prev - prev_close_for_tr).abs()
    tr3 = (low_prev - prev_close_for_tr).abs()
    tr = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
    p['atr_14'] = tr.rolling(14, min_periods=1).mean()

    # Rolling features (example windows)
    wins = [3,6,12]
    for w in wins:
        p[f'roll_mean_{w}'] = close_prev.rolling(window=w, min_periods=1).mean()
        p[f'roll_std_{w}'] = close_prev.rolling(window=w, min_periods=1).std().fillna(0)
        p[f'roll_max_{w}'] = close_prev.rolling(window=w, min_periods=1).max()
        p[f'roll_min_{w}'] = close_prev.rolling(window=w, min_periods=1).min()

    # Momentum & volume features (lagged)
    p['mom_3'] = close_prev - p['roll_mean_3']
    p['vol_change_1'] = vol_prev.pct_change().fillna(0)
    p['vol_roll_mean_6'] = vol_prev.rolling(6, min_periods=1).mean()

    # Build target (future close at horizon) — NOT to be used as features
    p['close_future_h'] = p['close'].shift(-horizon)
    p['future_return_h'] = (p['close_future_h'] / p['close']) - 1

    # Create clean 1-D integer target series and assign
    tgt_ser = (p['future_return_h'] > threshold).astype(int)
    tgt_ser = pd.Series(tgt_ser.values, index=p.index, name='target_thresholded', dtype='int64')
    p['target_thresholded'] = tgt_ser

    # select numeric columns (keep target)
    numeric_cols = p.select_dtypes(include=[np.number]).columns.tolist()
    if 'target_thresholded' not in numeric_cols:
        numeric_cols.append('target_thresholded')
    numeric_cols = list(dict.fromkeys(numeric_cols))  # order-preserving unique

    model_safe = p[numeric_cols].copy()
    model_safe = model_safe.dropna().copy()

    # final safety: ensure single-column int Series for target
    if isinstance(model_safe['target_thresholded'], pd.DataFrame):
        model_safe['target_thresholded'] = model_safe['target_thresholded'].iloc[:, -1]
    # Convert with a safe path (work with Series)
    tgt_series = pd.Series(model_safe['target_thresholded'].values, index=model_safe.index)
    tgt_series = pd.to_numeric(tgt_series, errors='coerce').fillna(0).astype(int)
    model_safe['target_thresholded'] = tgt_series

    return model_safe

# Rebuild
model_df_safe = build_clean_model_df(df, horizon=HORIZON, threshold=THRESHOLD)

# Verify and print
print("Built model_df_safe with shape:", model_df_safe.shape)
vals, counts = np.unique(model_df_safe['target_thresholded'].to_numpy(), return_counts=True)
print("Target distribution (value:count):", dict(zip(vals.tolist(), counts.tolist())))
print("Target proportions:", dict(zip(vals.tolist(), (counts/counts.sum()).tolist())))
display(model_df_safe.head())


Built model_df_safe with shape: (7152, 36)
Target distribution (value:count): {0: 4236, 1: 2916}
Target proportions: {0: 0.5922818791946308, 1: 0.4077181208053691}


Unnamed: 0_level_0,open,high,low,close,volume,return_1,log_return_1,ema_8,ema_21,ema_ratio_8_21,...,roll_mean_12,roll_std_12,roll_max_12,roll_min_12,mom_3,vol_change_1,vol_roll_mean_6,close_future_h,future_return_h,target_thresholded
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-12 19:00:00,961.91,962.76,960.63,961.74,1384,0.00128,0.00128,960.933333,960.771818,1.000168,...,961.275,0.869741,961.89,960.66,0.615,0.039877,997.5,961.34,-0.000416,0
2017-05-12 19:30:00,961.77,962.27,961.13,961.34,2110,-0.000156,-0.000156,961.112593,960.859835,1.000263,...,961.43,0.671044,961.89,960.66,0.31,0.360865,1126.333333,960.2,-0.001186,0
2017-05-15 12:30:00,962.39,962.5,959.97,960.2,121,-0.000416,-0.000416,961.163128,960.903486,1.00027,...,961.4075,0.54975,961.89,960.66,-0.316667,0.524566,1372.25,958.68,-0.001583,0
2017-05-15 13:00:00,960.1,960.7,958.09,958.68,372,-0.001186,-0.001187,960.949099,960.839533,1.000114,...,961.166,0.719917,961.89,960.2,-0.893333,-0.942654,1122.0,959.97,0.001346,1
2017-05-15 13:30:00,958.66,963.0,957.0,959.97,2512,-0.001583,-0.001584,960.444855,960.643212,0.999794,...,960.751667,1.201939,961.89,958.68,-1.393333,2.07438,997.0,957.54,-0.002531,0


In [23]:
# === NEWS_MERGE: align aggregated news 'agg' with price model_df_safe and join ===
import pandas as pd
import numpy as np

if 'agg' not in globals():
    raise RuntimeError("News aggregate variable 'agg' not found. Run the news-aggregation cell first.")
if 'model_df_safe' not in globals():
    raise RuntimeError("model_df_safe not found. Run the REBUILD_MODEL_DF_SAFE cell first.")

news_agg_bins = agg.copy()
price_df = model_df_safe.copy()

print("Price index range:", price_df.index.min(), "→", price_df.index.max())
print("News bins range :", news_agg_bins.index.min(), "→", news_agg_bins.index.max())

# Reindex news to price index. Use forward-fill to carry last news state to subsequent price bars
# (alternative strategies: nearest, backfill, zero-fill — here we forward-fill then fillna(0)).
news_aligned = news_agg_bins.reindex(price_df.index, method='ffill')
news_aligned = news_aligned.fillna(0)  # bars with no previous news become zeros

# Join
model_df_news = price_df.join(news_aligned, how='left').fillna(0)

print("\nJoined model_df_news shape:", model_df_news.shape)
print("News columns added:", [c for c in news_aligned.columns])
print("\nSample head (news cols):")
display(model_df_news[['news_count','vader_mean','vader_sum','vader_std','pos_prop','neg_prop']].head(10))

print("\nNews_count describe:")
display(model_df_news['news_count'].describe())
print("Proportion of bars with any news:", (model_df_news['news_count']>0).mean())

# Keep globally
model_df_news = model_df_news.copy()
print("\nSaved model_df_news in namespace.")


Price index range: 2017-05-12 19:00:00 → 2019-02-01 19:30:00
News bins range : 2017-12-07 20:00:00 → 2019-02-07 23:00:00

Joined model_df_news shape: (7152, 48)
News columns added: ['news_count', 'vader_mean', 'vader_sum', 'vader_std', 'pos_count', 'neg_count', 'pos_prop', 'neg_prop', 'vader_mean_roll_4', 'vader_mean_roll_12', 'news_count_roll_4', 'news_count_roll_12']

Sample head (news cols):


Unnamed: 0_level_0,news_count,vader_mean,vader_sum,vader_std,pos_prop,neg_prop
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-05-12 19:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-12 19:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 12:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 13:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 13:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 14:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 15:30:00,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-15 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0



News_count describe:


count    7152.000000
mean        3.261326
std         3.746225
min         0.000000
25%         0.000000
50%         2.000000
75%         5.000000
max        39.000000
Name: news_count, dtype: float64

Proportion of bars with any news: 0.6648489932885906

Saved model_df_news in namespace.


In [24]:
# === NEWS_LEAK_CHECK ===
import numpy as np
import pandas as pd

if 'model_df_news' not in globals():
    raise RuntimeError("model_df_news not found. Run NEWS_MERGE cell first.")

dfn = model_df_news.copy()

print("model_df_news shape:", dfn.shape)
print("Columns (news subset):", [c for c in dfn.columns if 'vader' in c or 'news' in c or 'pos_prop' in c or 'neg_prop' in c])

# 1) Simple correlation of news features with future_return_h and close_future_h
num = dfn.select_dtypes(include=[np.number]).copy()
corrs = num.corr()
print("\nCorrelation of top news features with future_return_h (abs sorted):")
news_feats = [c for c in num.columns if c.startswith('vader') or c.startswith('news_count') or c.endswith('_prop') or c.endswith('_roll_4') or c.endswith('_roll_12')]
c_with_future = corrs.loc[news_feats, ['future_return_h','close_future_h']].abs().sort_values(by='future_return_h', ascending=False)
display(c_with_future)

# 2) Check near-equality: any news column nearly equal to close_future_h or future_return_h (>1% of rows)
def near_prop(a,b,rtol=1e-6,atol=1e-9):
    match = np.isclose(a,b,rtol=rtol,atol=atol)
    return float(match.mean())

print("\nNear-equality proportions (proportion of rows nearly equal to future features):")
for feat in news_feats:
    p_close = near_prop(num[feat].values, num['close_future_h'].values, rtol=1e-4, atol=1e-6)
    p_ret   = near_prop(num[feat].values, num['future_return_h'].values, rtol=1e-4, atol=1e-9)
    if p_close > 0.01 or p_ret > 0.01:
        print(f"  -> SUSPICIOUS: {feat}: close_eq={p_close:.4f}, ret_eq={p_ret:.4f}")
    else:
        # show small ones optionally
        print(f"   {feat}: close_eq={p_close:.4f}, ret_eq={p_ret:.4f}")

# 3) Time alignment sanity: earliest news timestamp vs first price bar, fraction of news before first price row
if hasattr(dfn.index, 'min'):
    print("\nPrice index min:", dfn.index.min())
if 'agg' in globals():
    print("News bins min:", agg.index.min(), "  News bins max:", agg.index.max())

# 4) Quick distribution checks for news columns
print("\nNews columns summary (describe):")
display(dfn[news_feats].describe().T)

# 5) Show last 12 rows where news_count>0 alongside future_return_h to eyeball alignment
print("\nSample tail where news_count>0 (last 12 rows):")
display(dfn[dfn['news_count']>0][['news_count','vader_mean','vader_sum','vader_mean_roll_4','future_return_h','close_future_h']].tail(12))

print("\nIf nothing above looks suspicious (no near-equality with future, correlations small), proceed to training.")


model_df_news shape: (7152, 48)
Columns (news subset): ['news_count', 'vader_mean', 'vader_sum', 'vader_std', 'pos_prop', 'neg_prop', 'vader_mean_roll_4', 'vader_mean_roll_12', 'news_count_roll_4', 'news_count_roll_12']

Correlation of top news features with future_return_h (abs sorted):


Unnamed: 0,future_return_h,close_future_h
vader_std,0.026325,0.407263
neg_prop,0.015121,0.260249
vader_mean,0.011717,0.629371
news_count_roll_12,0.009526,0.612386
vader_sum,0.008044,0.48109
vader_mean_roll_12,0.005413,0.809177
news_count_roll_4,0.005395,0.589795
pos_prop,0.004824,0.779908
news_count,0.002988,0.524124
vader_mean_roll_4,0.001204,0.759538



Near-equality proportions (proportion of rows nearly equal to future features):
   news_count: close_eq=0.0000, ret_eq=0.0018
   vader_mean: close_eq=0.0000, ret_eq=0.0018
   vader_sum: close_eq=0.0000, ret_eq=0.0018
   vader_std: close_eq=0.0000, ret_eq=0.0022
   pos_prop: close_eq=0.0000, ret_eq=0.0018
   neg_prop: close_eq=0.0000, ret_eq=0.0034
   vader_mean_roll_4: close_eq=0.0000, ret_eq=0.0018
   vader_mean_roll_12: close_eq=0.0000, ret_eq=0.0018
   news_count_roll_4: close_eq=0.0000, ret_eq=0.0018
   news_count_roll_12: close_eq=0.0000, ret_eq=0.0018

Price index min: 2017-05-12 19:00:00
News bins min: 2017-12-07 20:00:00   News bins max: 2019-02-07 23:00:00

News columns summary (describe):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
news_count,7152.0,3.261326,3.746225,0.0,0.0,2.0,5.0,39.0
vader_mean,7152.0,0.47569,0.459892,-0.9976,0.0,0.568705,0.956725,1.0
vader_sum,7152.0,2.311547,2.92974,-8.7226,0.0,1.17865,3.904125,28.7824
vader_std,7152.0,0.222318,0.347233,0.0,0.0,0.004035,0.396081,1.414001
pos_prop,7152.0,0.575868,0.446341,0.0,0.0,0.8,1.0,1.0
neg_prop,7152.0,0.081893,0.182129,0.0,0.0,0.0,0.0,1.0
vader_mean_roll_4,7152.0,0.479143,0.382893,-0.231869,0.0,0.596329,0.831317,0.998962
vader_mean_roll_12,7152.0,0.483985,0.368222,-0.03588,0.0,0.639314,0.800805,0.997839
news_count_roll_4,7152.0,12.955537,13.041031,0.0,0.0,12.0,20.0,120.0
news_count_roll_12,7152.0,36.00769,35.104874,0.0,0.0,36.0,56.0,341.0



Sample tail where news_count>0 (last 12 rows):


Unnamed: 0_level_0,news_count,vader_mean,vader_sum,vader_mean_roll_4,future_return_h,close_future_h
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-02-01 14:00:00,7.0,0.942543,6.5978,0.660967,0.004044,1643.56
2019-02-01 14:30:00,9.0,0.769722,6.9275,0.907756,0.003298,1648.98
2019-02-01 15:00:00,6.0,0.903833,5.423,0.90188,-0.003232,1643.65
2019-02-01 15:30:00,7.0,0.972386,6.8067,0.897121,0.001813,1646.63
2019-02-01 16:00:00,5.0,0.88116,4.4058,0.881775,0.0013,1648.77
2019-02-01 16:30:00,3.0,0.3378,1.0134,0.773795,0.000309,1649.28
2019-02-01 17:00:00,9.0,0.972256,8.7503,0.7909,-0.001485,1646.83
2019-02-01 17:30:00,2.0,0.92035,1.8407,0.777891,-0.002933,1642.0
2019-02-01 18:00:00,10.0,0.68844,6.8844,0.729711,-0.005633,1632.75
2019-02-01 18:30:00,10.0,0.7284,7.284,0.827361,-0.00218,1629.19



If nothing above looks suspicious (no near-equality with future, correlations small), proceed to training.


In [25]:
# === NEWS_MODEL_TRAIN ===
# Train & evaluate models on numeric + news using purged time-series CV with embargo.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

if 'model_df_news' not in globals():
    raise RuntimeError("model_df_news not found. Run NEWS_MERGE first.")

# configuration
EMBARGO = 3                # number of bars to embargo on each side of a test block
N_SPLITS = 5
USE_ROBUST_SCALER = True   # robust scaler helps when news_count distributions are skewed
RANDOM_STATE = 42

dfm = model_df_news.copy()

# Prepare features and target (drop leakage cols)
drop_cols = ['close_future_h', 'future_return_h']  # not used as features
X = dfm.drop(columns=drop_cols + ['target_thresholded'])
y = dfm['target_thresholded']

# keep only numeric features (safe)
X = X.select_dtypes(include=[np.number]).copy()
feature_names = X.columns.tolist()
print("Feature count:", len(feature_names))

# helper: purged contiguous splits
def purged_time_series_splits(n_splits, n_samples, embargo):
    """
    Yields (train_idx, test_idx) where data is split into n_splits contiguous blocks.
    For each fold, we remove `embargo` bars on either side of test block from the train indices.
    """
    indices = np.arange(n_samples)
    fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
    fold_sizes[: n_samples % n_splits] += 1
    current = 0
    blocks = []
    for fs in fold_sizes:
        block = indices[current: current+fs]
        blocks.append(block)
        current += fs

    for i in range(n_splits):
        test_idx = blocks[i]
        # build train blocks excluding embargo windows around test block
        train_idx = np.hstack([b for j,b in enumerate(blocks) if j!=i])
        # apply embargo: remove indices within `embargo` bars before/after test block
        start_emb = test_idx[0] - embargo
        end_emb = test_idx[-1] + embargo
        if embargo>0:
            train_idx = train_idx[(train_idx < start_emb) | (train_idx > end_emb)]
        yield train_idx.astype(int), test_idx.astype(int)

# Models to run (class-imbalance handled)
models = {
    'LR_balanced': Pipeline([
        ('scaler', RobustScaler() if USE_ROBUST_SCALER else StandardScaler()),
        ('clf', LogisticRegression(max_iter=2000, class_weight='balanced', random_state=RANDOM_STATE))
    ]),
    'SVM_balanced': Pipeline([
        ('scaler', RobustScaler() if USE_ROBUST_SCALER else StandardScaler()),
        ('clf', SVC(probability=True, class_weight='balanced', random_state=RANDOM_STATE))
    ]),
    'XGB_spw': Pipeline([
        ('scaler', RobustScaler() if USE_ROBUST_SCALER else StandardScaler()),
        ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE, n_jobs=-1))
    ])
}

# compute scale_pos_weight for XGB (train global ratio)
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = float(neg / (pos + 1e-9))
print(f"Train samples: {len(y)}  positives: {int(pos)}  negatives: {int(neg)}  scale_pos_weight: {scale_pos_weight:.3f}")

# storage
fold_results = {name: [] for name in models.keys()}

# run purged CV
n_samples = len(X)
splitter = purged_time_series_splits(N_SPLITS, n_samples, EMBARGO)

for fold, (train_idx, test_idx) in enumerate(splitter):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    print(f"\n=== Fold {fold}  train_size={len(train_idx)}, test_size={len(test_idx)}  pos_train={int(y_train.sum())} pos_test={int(y_test.sum())} ===")

    for name, pipe in models.items():
        # set XGB scale_pos_weight if needed
        if 'XGB' in name:
            # set internal param
            pipe.named_steps['clf'].set_params(scale_pos_weight= ( (len(y_train)-y_train.sum()) / (y_train.sum()+1e-9) ) )

        # fit
        pipe.fit(X_train, y_train)

        # predict
        y_pred = pipe.predict(X_test)
        try:
            y_proba = pipe.predict_proba(X_test)[:,1]
        except Exception:
            y_proba = None

        # metrics
        acc  = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec  = recall_score(y_test, y_pred, zero_division=0)
        f1   = f1_score(y_test, y_pred, zero_division=0)
        auc  = roc_auc_score(y_test, y_proba) if y_proba is not None else None

        print(f"{name}:  F1={f1:.3f}  Prec={prec:.3f}  Rec={rec:.3f}  Acc={acc:.3f}  AUC={'None' if auc is None else f'{auc:.3f}'}")
        # optional: confusion matrix
        print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

        # save
        fold_results[name].append({'fold':fold,'f1':f1,'prec':prec,'rec':rec,'acc':acc,'auc':auc})

# Summarize
print("\n=== Summary across folds ===")
for name, entries in fold_results.items():
    # remove any NaNs
    vals = np.array([e['f1'] for e in entries], dtype=float)
    vals = vals[~np.isnan(vals)]
    if len(vals)==0:
        print(f"{name}: no valid folds")
        continue
    print(f"{name}: mean F1 = {vals.mean():.4f}  std = {vals.std():.4f}  (n_folds={len(vals)})")


Feature count: 45
Train samples: 7152  positives: 2916  negatives: 4236  scale_pos_weight: 1.453

=== Fold 0  train_size=5718, test_size=1431  pos_train=2354 pos_test=560 ===
LR_balanced:  F1=0.227  Prec=0.437  Rec=0.154  Acc=0.591  AUC=0.519
Confusion matrix:
 [[760 111]
 [474  86]]
SVM_balanced:  F1=0.004  Prec=1.000  Rec=0.002  Acc=0.609  AUC=0.470
Confusion matrix:
 [[871   0]
 [559   1]]
XGB_spw:  F1=0.374  Prec=0.428  Rec=0.332  Acc=0.565  AUC=0.541
Confusion matrix:
 [[622 249]
 [374 186]]

=== Fold 1  train_size=5715, test_size=1431  pos_train=2364 pos_test=549 ===
LR_balanced:  F1=0.399  Prec=0.410  Rec=0.388  Acc=0.551  AUC=0.520
Confusion matrix:
 [[576 306]
 [336 213]]
SVM_balanced:  F1=0.102  Prec=0.421  Rec=0.058  Acc=0.608  AUC=0.494
Confusion matrix:
 [[838  44]
 [517  32]]
XGB_spw:  F1=0.442  Prec=0.414  Rec=0.474  Acc=0.541  AUC=0.546
Confusion matrix:
 [[514 368]
 [289 260]]

=== Fold 2  train_size=5716, test_size=1430  pos_train=2309 pos_test=607 ===
LR_balanced:  F

In [26]:
# === FINBERT (robust) — uses GPU if available, otherwise CPU ===
# Paste & run in your notebook. Assumes your news DataFrame is present (news_df / news / articles).
import sys, math, time, importlib, subprocess
import pandas as pd, numpy as np, torch
from tqdm import tqdm

def maybe_install(pkg):
    try:
        importlib.import_module(pkg)
    except Exception:
        print(f"Installing missing package: {pkg}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# Try ensure transformers/tqdm present (torch likely already installed by you)
for pkg in ("transformers","tqdm"):
    maybe_install(pkg)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 0) Find articles DataFrame
_possible_names = ["news_df", "news", "articles", "articles_df", "parsed_articles", "df_articles"]
articles_df = None
for nm in _possible_names:
    if nm in globals() and isinstance(globals()[nm], pd.DataFrame):
        articles_df = globals()[nm]
        print("Using articles DataFrame variable:", nm, "shape:", articles_df.shape)
        break
if articles_df is None:
    raise RuntimeError("Could not find your articles DataFrame in variables: " + ", ".join(_possible_names))

# 1) Build texts (title + text or fallback)
text_cols_try = ["title","text","summary","description","content"]
texts = []
used_idxs = []
for idx, row in articles_df.iterrows():
    parts = []
    for c in text_cols_try:
        if c in articles_df.columns and pd.notna(row.get(c, None)):
            parts.append(str(row.get(c, "")))
    txt = " ".join(parts).strip()
    if not txt:
        txt = str(row.get("title", "") or row.get("url", "") or "")
    texts.append(txt)
    used_idxs.append(idx)
n = len(texts)
print("Number of articles to score:", n)

# 2) Device selection
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print("Device:", device, "| CUDA available:", use_cuda, "| torch version:", torch.__version__)

# 3) Load FinBERT model (try safetensors first)
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = None
# Try load with safetensors to avoid older-torch torch.load requirement
try:
    print("Attempting to load model with use_safetensors=True (preferred when available)...")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, use_safetensors=True)
    print("Loaded model with safetensors.")
except Exception as e_s:
    print("safetensors load failed:", repr(e_s))
    print("Attempting normal from_pretrained (this may require torch>=2.6).")
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        print("Loaded model with normal weights.")
    except Exception as e2:
        # Surface friendly error and re-raise
        msg = (
            "Failed to load model. If the error mentions 'torch.load' or 'upgrade torch >=2.6',\n"
            "please upgrade PyTorch to >=2.6 (conda install pytorch pytorch-cuda -c pytorch -c nvidia) "
            "or ensure the model has safetensors weights. Original error:\n" + repr(e2)
        )
        raise RuntimeError(msg) from e2

# send to device
model.to(device)
model.eval()
id2label = getattr(model.config, "id2label", None)
print("Model id2label:", id2label)

# 4) Batch inference — adapt batch size to device
batch_size = 64 if use_cuda else 8   # smaller on CPU to avoid memory/time issues
print("Using batch_size:", batch_size, "(reduce if you hit OOM)")

all_probs = []
start = time.time()
n_batches = math.ceil(n / batch_size)

for i in tqdm(range(n_batches), desc="FinBERT batches"):
    batch_texts = texts[i*batch_size:(i+1)*batch_size]
    enc = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    # move tensors to device
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model(**enc)
        logits = out.logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()  # move to CPU for stacking
        all_probs.append(probs)

# stack
if len(all_probs):
    all_probs = np.vstack(all_probs)
else:
    all_probs = np.zeros((0, model.config.num_labels))

elapsed = time.time() - start
print(f"Done: {n} articles in {elapsed:.1f}s ({(n/elapsed) if elapsed>0 else 0:.1f} articles/s)")

# 5) Build per-article DataFrame
if id2label:
    id2label_norm = {int(k): v for k, v in id2label.items()}
    labels = [id2label_norm[i] for i in sorted(id2label_norm.keys())]
else:
    # fallback labeling order (common for FinBERT)
    labels = ["Neutral","Positive","Negative"]

cols = [f"finbert_{lab.lower()}" for lab in labels]
prob_df = pd.DataFrame(all_probs, index=used_idxs, columns=cols)

# compound: positive - negative if available else max-min
if "finbert_positive" in prob_df.columns and "finbert_negative" in prob_df.columns:
    prob_df["finbert_compound"] = prob_df["finbert_positive"] - prob_df["finbert_negative"]
else:
    prob_df["finbert_compound"] = prob_df.max(axis=1) - prob_df.min(axis=1)

# attach published timestamp (if present)
if "published" in articles_df.columns:
    prob_df["published"] = pd.to_datetime(articles_df["published"], utc=True).loc[prob_df.index]
else:
    for col in ("date","ts","timestamp"):
        if col in articles_df.columns:
            prob_df["published"] = pd.to_datetime(articles_df[col], utc=True).loc[prob_df.index]
            break
    else:
        prob_df["published"] = pd.NaT
        print("Warning: no published/date column found; published will be NaT in the results.")

# 6) Aggregate into 30-minute bins (adjust '30T' if your bars use other frequency)
prob_df["bin_ts"] = prob_df["published"].dt.tz_convert(None).dt.floor("30T")
agg = prob_df.groupby("bin_ts").agg(
    news_fin_count = ("finbert_compound", "size"),
    finbert_pos_mean = ("finbert_positive", "mean") if "finbert_positive" in prob_df.columns else ("finbert_compound","mean"),
    finbert_neg_mean = ("finbert_negative", "mean") if "finbert_negative" in prob_df.columns else ("finbert_compound","mean"),
    finbert_neu_mean = ("finbert_neutral", "mean") if "finbert_neutral" in prob_df.columns else ("finbert_compound","mean"),
    finbert_compound_mean = ("finbert_compound", "mean"),
    finbert_compound_sum = ("finbert_compound", "sum"),
    finbert_compound_std = ("finbert_compound", "std")
)
agg["finbert_compound_roll_4"] = agg["finbert_compound_mean"].rolling(4, min_periods=1).mean()
agg["finbert_compound_roll_12"] = agg["finbert_compound_mean"].rolling(12, min_periods=1).mean()
agg["news_fin_count_roll_4"] = agg["news_fin_count"].rolling(4, min_periods=1).sum()
agg["news_fin_count_roll_12"] = agg["news_fin_count"].rolling(12, min_periods=1).sum()

# 7) Merge into price dataset if available (model_df_news preferred)
if "model_df_news" in globals():
    target_df = globals()["model_df_news"].copy()
    idx = pd.to_datetime(target_df.index).tz_localize(None)
    agg_reindexed = agg.reindex(idx, fill_value=0)
    merged = target_df.join(agg_reindexed, how="left").fillna(0)
    globals()["model_df_news_with_finbert"] = merged
    print("Saved merged price+finbert as 'model_df_news_with_finbert' shape:", merged.shape)
elif "model_df_safe" in globals():
    target_df = globals()["model_df_safe"].copy()
    idx = pd.to_datetime(target_df.index).tz_localize(None)
    agg_reindexed = agg.reindex(idx, fill_value=0)
    merged = target_df.join(agg_reindexed, how="left").fillna(0)
    globals()["model_df_safe_with_finbert"] = merged
    print("Saved merged price+finbert as 'model_df_safe_with_finbert' shape:", merged.shape)
else:
    globals()["news_finbert_agg"] = agg
    print("Saved finbert aggregation only as 'news_finbert_agg' rows:", len(agg))

# 8) Save per-article scores
globals()["finbert_article_scores"] = prob_df
print("Saved per-article FinBERT scores as 'finbert_article_scores' rows:", len(prob_df))

# show tiny samples
print("\nAggregated sample (tail):")
display(agg.tail(6))
print("\nPer-article sample (tail):")
display(prob_df[["finbert_compound"] + [c for c in prob_df.columns if c.startswith("finbert_")][:3]].tail(6))


Using articles DataFrame variable: news_df shape: (70731, 11)
Number of articles to score: 70731
Device: cpu | CUDA available: False | torch version: 2.6.0+cpu
Attempting to load model with use_safetensors=True (preferred when available)...
Loaded model with safetensors.
Model id2label: {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
Using batch_size: 8 (reduce if you hit OOM)


FinBERT batches: 100%|██████████| 8842/8842 [5:20:02<00:00,  2.17s/it]       

Done: 70731 articles in 19202.6s (3.7 articles/s)
Saved merged price+finbert as 'model_df_news_with_finbert' shape: (7152, 59)
Saved per-article FinBERT scores as 'finbert_article_scores' rows: 70731

Aggregated sample (tail):





Unnamed: 0_level_0,news_fin_count,finbert_pos_mean,finbert_neg_mean,finbert_neu_mean,finbert_compound_mean,finbert_compound_sum,finbert_compound_std,finbert_compound_roll_4,finbert_compound_roll_12,news_fin_count_roll_4,news_fin_count_roll_12
bin_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-02-07 19:30:00,3,0.024459,0.242214,0.733327,-0.217755,-0.653264,0.4019,0.130156,0.166452,9.0,42.0
2019-02-07 20:00:00,1,8.5e-05,0.073334,0.926581,-0.073249,-0.073249,,0.033218,0.174137,9.0,41.0
2019-02-07 20:30:00,1,1.2e-05,1.4e-05,0.999974,-2e-06,-2e-06,,-0.072751,0.151897,8.0,34.0
2019-02-07 21:00:00,2,0.000324,0.004938,0.994738,-0.004614,-0.009229,0.0,-0.073905,0.151568,7.0,32.0
2019-02-07 22:00:00,1,1.5e-05,2e-06,0.999983,1.3e-05,1.3e-05,,-0.019463,0.138806,5.0,25.0
2019-02-07 23:00:00,1,0.000148,0.034831,0.965021,-0.034683,-0.034683,,-0.009822,0.056228,5.0,25.0



Per-article sample (tail):


Unnamed: 0,finbert_compound,finbert_neutral,finbert_positive,finbert_negative
70725,0.000298,0.999665,0.000317,1.83366e-05
70726,0.990019,0.004194,0.992912,0.002893622
70727,-0.010311,0.986855,0.001417,0.01172769
70728,0.999933,6.6e-05,0.999933,1.602052e-07
70729,0.682281,0.000953,0.840664,0.1583828
70730,-0.073249,0.926581,8.5e-05,0.07333373


In [27]:
# === SAVE RESULTS: run this cell to persist FinBERT outputs ===
import os, sys, pathlib, pickle
import pandas as pd

out_dir = pathlib.Path("saved_outputs")
out_dir.mkdir(exist_ok=True)

saved = []

# helper to print size nicely
def sizeof(path):
    try:
        s = path.stat().st_size
        for unit in ["B","KB","MB","GB"]:
            if s < 1024.0:
                return f"{s:.1f}{unit}"
            s /= 1024.0
        return f"{s:.1f}TB"
    except Exception:
        return "n/a"

# 1) per-article scores (prob_df)
if "finbert_article_scores" in globals():
    p = out_dir / "finbert_article_scores.pkl"
    fin = globals()["finbert_article_scores"]
    fin.to_pickle(p, protocol=pickle.HIGHEST_PROTOCOL)
    saved.append(("finbert_article_scores", p))
else:
    print("finbert_article_scores not found in globals(), skipping.")

# 2) aggregated bins
if "news_finbert_agg" in globals():
    p = out_dir / "news_finbert_agg.parquet"
    globals()["news_finbert_agg"].to_parquet(p, index=True)
    saved.append(("news_finbert_agg", p))
elif "news_finbert_agg" not in globals():
    # maybe you saved as 'news_finbert_agg' or as 'news_finbert_agg' inside another name
    print("No 'news_finbert_agg' found in globals(), skipping parquet save for that.")

# 3) merged model df (price + news)
for cand in ("model_df_news_with_finbert","model_df_safe_with_finbert","model_df_news"):
    if cand in globals():
        p = out_dir / f"{cand}.parquet"
        # convert index to datetimelike if needed
        df = globals()[cand].copy()
        try:
            df.to_parquet(p, index=True)
            saved.append((cand, p))
        except Exception as e:
            # fallback to pickle
            p2 = out_dir / f"{cand}.pkl"
            df.to_pickle(p2, protocol=pickle.HIGHEST_PROTOCOL)
            saved.append((cand, p2))
        break

# 4) small metadata summary
meta_p = out_dir / "save_manifest.txt"
with open(meta_p, "w", encoding="utf8") as f:
    f.write("Saved files:\n")
    for name, path in saved:
        f.write(f"{name}\t{path}\t{sizeof(path)}\n")

print("Saved files summary:")
for name, path in saved:
    print(f" - {name} -> {path}  ({sizeof(path)})")

print("\nManifest written to", meta_p)
print("You can move the 'saved_outputs' folder to another machine and reload with Cell B below.")


No 'news_finbert_agg' found in globals(), skipping parquet save for that.
Saved files summary:
 - finbert_article_scores -> saved_outputs\finbert_article_scores.pkl  (2.7MB)
 - model_df_news_with_finbert -> saved_outputs\model_df_news_with_finbert.parquet  (2.2MB)

Manifest written to saved_outputs\save_manifest.txt
You can move the 'saved_outputs' folder to another machine and reload with Cell B below.


In [28]:
# === LOAD RESULTS: run this to restore saved outputs ===
import pathlib, pickle
import pandas as pd

out_dir = pathlib.Path("saved_outputs")
if not out_dir.exists():
    raise RuntimeError(f"saved_outputs folder not found at {out_dir.resolve()}")

# try load per-article
p1 = out_dir / "finbert_article_scores.pkl"
if p1.exists():
    finbert_article_scores = pd.read_pickle(p1)
    globals()["finbert_article_scores"] = finbert_article_scores
    print("Loaded finbert_article_scores:", finbert_article_scores.shape)
else:
    print("finbert_article_scores.pkl not found")

# load aggregates
p2 = out_dir / "news_finbert_agg.parquet"
if p2.exists():
    news_finbert_agg = pd.read_parquet(p2)
    globals()["news_finbert_agg"] = news_finbert_agg
    print("Loaded news_finbert_agg:", news_finbert_agg.shape)
else:
    print("news_finbert_agg.parquet not found")

# load merged model df (try parquet then pickle)
for cand in ("model_df_news_with_finbert.parquet","model_df_safe_with_finbert.parquet","model_df_news.parquet"):
    p = out_dir / cand
    if p.exists():
        df = pd.read_parquet(p)
        varname = p.stem
        globals()[varname] = df
        print("Loaded", varname, df.shape)
        break
else:
    # try pickle fallback
    for cand in ("model_df_news_with_finbert.pkl","model_df_safe_with_finbert.pkl","model_df_news.pkl"):
        p = out_dir / cand
        if p.exists():
            df = pd.read_pickle(p)
            globals()[p.stem] = df
            print("Loaded", p.stem, df.shape)
            break
    else:
        print("No merged price+finbert dataset found in saved_outputs.")

print("\nDone. Variables available in your notebook: finbert_article_scores, news_finbert_agg (if present), plus any merged dataset.")


Loaded finbert_article_scores: (70731, 6)
news_finbert_agg.parquet not found
Loaded model_df_news_with_finbert (7152, 59)

Done. Variables available in your notebook: finbert_article_scores, news_finbert_agg (if present), plus any merged dataset.


In [29]:

# Cell: verify saved files
import pathlib, os
out_dir = pathlib.Path("saved_outputs")
print("saved_outputs exists:", out_dir.exists())
print()

if out_dir.exists():
    for p in sorted(out_dir.rglob("*")):
        if p.is_file():
            s = p.stat().st_size
            human = f"{s/1024**2:.2f} MB" if s>1024**2 else f"{s/1024:.1f} KB"
            print(p.relative_to(out_dir.parent), "-", human)
else:
    print("No saved_outputs folder found. Did the save cell run successfully?")


saved_outputs exists: True

saved_outputs\finbert_article_scores.pkl - 2.70 MB
saved_outputs\model_df_news_with_finbert.parquet - 2.25 MB
saved_outputs\save_manifest.txt - 0.2 KB


In [30]:
# Cell: quick numeric vs numeric+finbert comparison (time-ordered split)
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# choose dataset
if 'model_df_news_with_finbert' in globals():
    df = globals()['model_df_news_with_finbert'].copy()
    print("Using model_df_news_with_finbert")
elif 'model_df_news' in globals():
    df = globals()['model_df_news'].copy()
    print("Using model_df_news")
else:
    raise RuntimeError("No merged dataset found in globals (model_df_news_with_finbert or model_df_news).")

# ensure ts index and sorted
df.index = pd.to_datetime(df.index)
df = df.sort_index()

# target: use previously built column if present, else build same as before:
if 'target_thresholded' in df.columns:
    y = df['target_thresholded'].astype(int).copy()
elif 'future_return_h' in df.columns:
    y = (df['future_return_h'] > 0).astype(int)
else:
    raise RuntimeError("No target column found (target_thresholded or future_return_h).")

# select numeric features (drop leakage / future columns)
drop_like = ['future','close_future','close_next','target','target_thresholded']
cols = [c for c in df.columns if (np.issubdtype(df[c].dtype, np.number) and not any(dl in c for dl in drop_like))]
print("Total numeric columns available:", len(cols))

# pick baseline numeric features (all numeric except FinBERT ones)
fin_cols = [c for c in cols if c.startswith("finbert_") or c.startswith("news_fin_") or c.startswith("vader_")]
num_cols = [c for c in cols if c not in fin_cols]

print("Numeric-only feature count:", len(num_cols))
print("FinBERT/news feature count:", len(fin_cols))

# trim to rows without NaN in features/target
use_df = df[num_cols + fin_cols].copy()
mask = y.notna() & use_df.notna().all(axis=1)
use_df = use_df.loc[mask]
y = y.loc[mask]

# time-split: last 20% as test
n = len(use_df)
split = int(n * 0.8)
X_train = use_df.iloc[:split]
X_test  = use_df.iloc[split:]
y_train = y.iloc[:split]
y_test  = y.iloc[split:]

print("Train size:", len(X_train), "Test size:", len(X_test))

def train_and_eval(X_tr, X_te, y_tr, y_te, desc="Model"):
    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr)
    X_te_s = scaler.transform(X_te)

    # LR with balanced class weight
    lr = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42)
    lr.fit(X_tr_s, y_tr)
    p = lr.predict(X_te_s)
    try:
        proba = lr.predict_proba(X_te_s)[:,1]
    except Exception:
        proba = None
    metrics = {
        "acc": accuracy_score(y_te, p),
        "prec": precision_score(y_te, p, zero_division=0),
        "rec": recall_score(y_te, p, zero_division=0),
        "f1": f1_score(y_te, p, zero_division=0),
        "auc": roc_auc_score(y_te, proba) if proba is not None else None
    }
    print(f"\n{desc} - LogisticRegression (balanced)")
    print(metrics)

    # XGBoost (scale_pos_weight)
    spw = (y_tr==0).sum() / max(1, (y_tr==1).sum())
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=spw)
    xgb.fit(X_tr_s, y_tr)
    p2 = xgb.predict(X_te_s)
    proba2 = xgb.predict_proba(X_te_s)[:,1] if hasattr(xgb, "predict_proba") else None
    metrics2 = {
        "acc": accuracy_score(y_te, p2),
        "prec": precision_score(y_te, p2, zero_division=0),
        "rec": recall_score(y_te, p2, zero_division=0),
        "f1": f1_score(y_te, p2, zero_division=0),
        "auc": roc_auc_score(y_te, proba2) if proba2 is not None else None
    }
    print(f"\n{desc} - XGBoost")
    print(metrics2)
    return metrics, metrics2

# 1) numeric-only
if len(num_cols) == 0:
    print("No numeric columns found for baseline — aborting numeric-only test.")
else:
    print("\n=== Baseline: numeric-only ===")
    _ = train_and_eval(X_train[num_cols], X_test[num_cols], y_train, y_test, desc="Numeric-only")

# 2) numeric + FinBERT/news
if len(fin_cols) == 0:
    print("\nNo FinBERT/news columns found — skipping combined test.")
else:
    print("\n=== Combined: numeric + FinBERT/news ===")
    _ = train_and_eval(X_train[num_cols + fin_cols], X_test[num_cols + fin_cols], y_train, y_test, desc="Numeric+FinBERT")


Using model_df_news_with_finbert
Total numeric columns available: 56
Numeric-only feature count: 40
FinBERT/news feature count: 16
Train size: 5721 Test size: 1431

=== Baseline: numeric-only ===

Numeric-only - LogisticRegression (balanced)
{'acc': 0.48846960167714887, 'prec': 0.41854636591478694, 'rec': 0.5548172757475083, 'f1': 0.47714285714285715, 'auc': 0.5124714962990273}

Numeric-only - XGBoost
{'acc': 0.5345911949685535, 'prec': 0.43073593073593075, 'rec': 0.33056478405315615, 'f1': 0.37406015037593987, 'auc': 0.5027491794540915}

=== Combined: numeric + FinBERT/news ===

Numeric+FinBERT - LogisticRegression (balanced)
{'acc': 0.5024458420684835, 'prec': 0.4260752688172043, 'rec': 0.526578073089701, 'f1': 0.4710252600297177, 'auc': 0.515214664427782}

Numeric+FinBERT - XGBoost
{'acc': 0.5129280223619846, 'prec': 0.4, 'rec': 0.31561461794019935, 'f1': 0.3528319405756732, 'auc': 0.514469260086002}


In [31]:
# === CELL: Purged-time-series hyperparameter tuning (run this) ===
import math, time
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# --- PurgedTimeSeriesSplit (simple, embargo in number of rows) ---
class PurgedTimeSeriesSplit:
    def __init__(self, n_splits=5, embargo=1):
        if n_splits < 2:
            raise ValueError("n_splits must be >=2")
        self.n_splits = n_splits
        self.embargo = int(embargo)

    def split(self, X):
        n = len(X)
        fold_size = n // self.n_splits
        indices = np.arange(n)
        for i in range(self.n_splits):
            test_start = i * fold_size
            # last fold takes remainder
            test_end = ((i + 1) * fold_size) if i < (self.n_splits - 1) else n
            test_idx = indices[test_start:test_end]
            # train indices are all indices before test_start and after test_end
            # apply embargo: remove up to `embargo` rows following test_end and preceding test_start
            train_left_end = test_start - self.embargo
            train_right_start = test_end + self.embargo
            left = indices[:max(0, train_left_end)]
            right = indices[train_right_start:] if train_right_start < n else np.array([], dtype=int)
            train_idx = np.concatenate([left, right]) if len(left) or len(right) else np.array([], dtype=int)
            yield train_idx, test_idx

    def get_n_splits(self, X=None):
        return self.n_splits

# --- 0) Basic checks and prepare dataset ---
if "model_df_news_with_finbert" not in globals():
    raise RuntimeError("model_df_news_with_finbert not found in globals(). Make sure you ran the merge step.")

df = model_df_news_with_finbert.copy()
label_col = "target_thresholded"
if label_col not in df.columns:
    raise RuntimeError(f"Label column '{label_col}' not found in dataframe.")

# Select numeric features and drop leakage columns if present
drop_cols = {label_col, "future_return_h", "close_future_h", "close_next", "ts_str", "date", "time"}
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
features = [c for c in num_cols if c not in drop_cols]
print(f"Using {len(features)} numeric features for tuning (example: {features[:8]})")

X = df[features].copy().reset_index(drop=True)
y = df[label_col].astype(int).reset_index(drop=True)

# Train/test split (time-based): keep same split approach you used before (last ~20% as test)
n = len(X)
test_size = int(math.ceil(n * 0.20))
train_size = n - test_size
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
print(f"Train size: {len(X_train)}  Test size: {len(X_test)}")

# --- 1) Setup Purged CV and param grids ---
n_splits = 5
embargo = 3   # adjust as needed (in rows); you used 1 or 3 earlier
ptscv = PurgedTimeSeriesSplit(n_splits=n_splits, embargo=embargo)

# small, sensible param grids (keeps run-time reasonable)
lr_grid = [
    {"clf__penalty": "l2", "clf__C": 0.0005, "clf__solver": "saga"},
    {"clf__penalty": "l2", "clf__C": 0.01, "clf__solver": "saga"},
    {"clf__penalty": "l2", "clf__C": 0.1, "clf__solver": "saga"},
    {"clf__penalty": "l2", "clf__C": 1.0, "clf__solver": "saga"},
    {"clf__penalty": "l2", "clf__C": 10.0, "clf__solver": "saga"},
]

svm_grid = [
    {"clf__C": 0.1, "clf__gamma": 0.01},
    {"clf__C": 1.0, "clf__gamma": 0.01},
    {"clf__C": 5.0, "clf__gamma": 0.01},
    {"clf__C": 1.0, "clf__gamma": "scale"},
]

xgb_grid = [
    {"clf__n_estimators": 200, "clf__max_depth": 3, "clf__learning_rate": 0.05, "clf__subsample": 0.7, "clf__colsample_bytree": 0.7},
    {"clf__n_estimators": 400, "clf__max_depth": 3, "clf__learning_rate": 0.05, "clf__subsample": 0.9, "clf__colsample_bytree": 0.9},
    {"clf__n_estimators": 200, "clf__max_depth": 6, "clf__learning_rate": 0.05, "clf__subsample": 0.8, "clf__colsample_bytree": 0.8},
]

# pipelines
pipe_lr = Pipeline([("scaler", RobustScaler()), ("clf", LogisticRegression(class_weight="balanced", max_iter=5000, random_state=42))])
pipe_svm = Pipeline([("scaler", RobustScaler()), ("clf", SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42))])
pipe_xgb = Pipeline([("scaler", StandardScaler()), ("clf", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))])

# Helper to evaluate a pipeline on purged CV folds
def evaluate_params(pipe, param_dict, X_train, y_train, ptscv):
    pipe = Pipeline(pipe.steps)  # copy
    pipe.set_params(**param_dict)
    f1s = []
    aucs = []
    folds_info = []
    for fold_i, (tr_idx, te_idx) in enumerate(ptscv.split(X_train)):
        if len(tr_idx) == 0 or len(te_idx) == 0:
            f1s.append(np.nan)
            aucs.append(np.nan)
            folds_info.append(None)
            continue
        Xtr, Xte = X_train.iloc[tr_idx], X_train.iloc[te_idx]
        ytr, yte = y_train.iloc[tr_idx], y_train.iloc[te_idx]
        try:
            pipe.fit(Xtr, ytr)
            ypred = pipe.predict(Xte)
            yproba = None
            try:
                yproba = pipe.predict_proba(Xte)[:,1]
            except Exception:
                yproba = None
            f1 = f1_score(yte, ypred, zero_division=0)
            auc = roc_auc_score(yte, yproba) if yproba is not None else np.nan
            f1s.append(f1); aucs.append(auc)
            folds_info.append((fold_i, len(tr_idx), len(te_idx), f1))
        except Exception as e:
            # on error mark NaN and continue
            f1s.append(np.nan); aucs.append(np.nan)
            folds_info.append(("error", str(e)))
    # compute mean across valid folds
    valid = [v for v in f1s if not (isinstance(v, float) and np.isnan(v))]
    mean_f1 = np.nan if len(valid) == 0 else float(np.nanmean(valid))
    std_f1 = np.nanstd(valid) if len(valid) else np.nan
    return {"mean_f1": mean_f1, "std_f1": std_f1, "folds": folds_info, "f1s": f1s, "aucs": aucs}

# --- 2) Run grid search (simple brute-force over small grids) ---
results_summary = {"lr": [], "svm": [], "xgb": []}

start_all = time.time()
print("Starting purged-CV tuning: n_splits=", n_splits, "embargo=", embargo)
# Logistic Regression
print("\nTuning LogisticRegression (balanced) ...")
for params in lr_grid:
    res = evaluate_params(pipe_lr, params, X_train.reset_index(drop=True), y_train.reset_index(drop=True), ptscv)
    results_summary["lr"].append((params, res))
    print(f"Params: {params} -> mean_f1={res['mean_f1']:.4f} std={res['std_f1']:.4f}")

# SVM
print("\nTuning SVM (RBF, class_weight=balanced) ...")
for params in svm_grid:
    # set 'clf__kernel' already in pipeline; pass C and gamma
    p = {"clf__C": params["clf__C"], "clf__gamma": params["clf__gamma"]}
    res = evaluate_params(pipe_svm, p, X_train.reset_index(drop=True), y_train.reset_index(drop=True), ptscv)
    results_summary["svm"].append((p, res))
    print(f"Params: {p} -> mean_f1={res['mean_f1']:.4f} std={res['std_f1']:.4f}")

# XGBoost (use scale_pos_weight computed from train)
print("\nTuning XGBoost ...")
# compute scale_pos_weight from training set
pos = int(y_train.sum())
neg = len(y_train) - pos
spw = max(1.0, float(neg) / max(1, pos))
print("scale_pos_weight (train):", round(spw,3))
for params in xgb_grid:
    p = params.copy()
    p["clf__scale_pos_weight"] = spw
    res = evaluate_params(pipe_xgb, p, X_train.reset_index(drop=True), y_train.reset_index(drop=True), ptscv)
    results_summary["xgb"].append((p, res))
    print(f"Params: {p} -> mean_f1={res['mean_f1']:.4f} std={res['std_f1']:.4f}")

elapsed_all = time.time() - start_all
print(f"\nTuning finished in {elapsed_all:.1f}s")

# --- 3) Pick best for each estimator and fit on full training data ---
def pick_best(results_list):
    best = None
    best_score = -np.inf
    for params, res in results_list:
        score = res["mean_f1"] if res["mean_f1"] is not None else -np.inf
        if score > best_score:
            best_score = score
            best = (params, res)
    return best

best_lr_params, best_lr_res = pick_best(results_summary["lr"])
best_svm_params, best_svm_res = pick_best(results_summary["svm"])
best_xgb_params, best_xgb_res = pick_best(results_summary["xgb"])

print("\nBest (LR):", best_lr_params, " -> mean_f1:", best_lr_res["mean_f1"])
print("Best (SVM):", best_svm_params, " -> mean_f1:", best_svm_res["mean_f1"])
print("Best (XGB):", best_xgb_params, " -> mean_f1:", best_xgb_res["mean_f1"])

# Fit final estimators on the entire train set using best params
final_lr = Pipeline([("scaler", RobustScaler()), ("clf", LogisticRegression(class_weight="balanced", max_iter=5000, random_state=42))])
final_lr.set_params(**best_lr_params)
final_lr.fit(X_train, y_train)

final_svm = Pipeline([("scaler", RobustScaler()), ("clf", SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42))])
final_svm.set_params(**best_svm_params)
final_svm.fit(X_train, y_train)

final_xgb = Pipeline([("scaler", StandardScaler()), ("clf", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))])
final_xgb.set_params(**best_xgb_params)
final_xgb.fit(X_train, y_train)

# Save to globals
globals()["best_lr_purged"] = final_lr
globals()["best_svm_purged"] = final_svm
globals()["best_xgb_purged"] = final_xgb
globals()["purged_tuning_results"] = results_summary

# --- 4) Quick evaluation on holdout test set ---
def eval_on_test(pipe, X_test, y_test):
    ypred = pipe.predict(X_test)
    try:
        yproba = pipe.predict_proba(X_test)[:,1]
    except Exception:
        yproba = None
    out = {
        "acc": accuracy_score(y_test, ypred),
        "prec": precision_score(y_test, ypred, zero_division=0),
        "rec": recall_score(y_test, ypred, zero_division=0),
        "f1": f1_score(y_test, ypred, zero_division=0),
        "auc": (roc_auc_score(y_test, yproba) if yproba is not None else None),
        "confusion": confusion_matrix(y_test, ypred)
    }
    return out

print("\nEvaluating chosen best models on holdout test set:")
print("LR_purged:", eval_on_test(final_lr, X_test, y_test))
print("SVM_purged:", eval_on_test(final_svm, X_test, y_test))
print("XGB_purged:", eval_on_test(final_xgb, X_test, y_test))

print("\nDone. Best estimators stored as globals: best_lr_purged, best_svm_purged, best_xgb_purged")


Using 56 numeric features for tuning (example: ['open', 'high', 'low', 'close', 'volume', 'return_1', 'log_return_1', 'ema_8'])
Train size: 5721  Test size: 1431
Starting purged-CV tuning: n_splits= 5 embargo= 3

Tuning LogisticRegression (balanced) ...
Params: {'clf__penalty': 'l2', 'clf__C': 0.0005, 'clf__solver': 'saga'} -> mean_f1=0.3510 std=0.2108
Params: {'clf__penalty': 'l2', 'clf__C': 0.01, 'clf__solver': 'saga'} -> mean_f1=0.3985 std=0.1396
Params: {'clf__penalty': 'l2', 'clf__C': 0.1, 'clf__solver': 'saga'} -> mean_f1=0.4120 std=0.1266
Params: {'clf__penalty': 'l2', 'clf__C': 1.0, 'clf__solver': 'saga'} -> mean_f1=0.4140 std=0.1265
Params: {'clf__penalty': 'l2', 'clf__C': 10.0, 'clf__solver': 'saga'} -> mean_f1=0.4145 std=0.1266

Tuning SVM (RBF, class_weight=balanced) ...
Params: {'clf__C': 0.1, 'clf__gamma': 0.01} -> mean_f1=0.4224 std=0.1744
Params: {'clf__C': 1.0, 'clf__gamma': 0.01} -> mean_f1=0.3427 std=0.0660
Params: {'clf__C': 5.0, 'clf__gamma': 0.01} -> mean_f1=0.318

In [32]:
# Save tuned purged models to disk (pickles)
import joblib, pathlib
save_dir = pathlib.Path("saved_models")
save_dir.mkdir(exist_ok=True)

for name in ["best_lr_purged", "best_svm_purged", "best_xgb_purged"]:
    if name in globals():
        path = save_dir / f"{name}.pkl"
        joblib.dump(globals()[name], path)
        print(f"Saved {name} -> {path}")


Saved best_lr_purged -> saved_models\best_lr_purged.pkl
Saved best_svm_purged -> saved_models\best_svm_purged.pkl
Saved best_xgb_purged -> saved_models\best_xgb_purged.pkl


In [None]:
# import joblib
# best_lr_purged = joblib.load("saved_models/best_lr_purged.pkl")
# import joblib
# best_lr_purged = joblib.load("saved_models/best_lr_purged.pkl")
