In [1]:
import pandas as pd
import numpy as np
import pandas_ta as ta
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")


plt.style.use('fivethirtyeight')

In [2]:
try:
    full_df = pd.read_csv('your_data.csv')
except FileNotFoundError:
    print("Error: 'your_data.csv' not found")
    # As a fallback for testing, create a dummy dataframe
    print("Creating dummy data to proceed with the example...")
    num_rows = 5000
    data = {'open': np.random.uniform(98, 102, num_rows).cumsum(), 'high': 100, 'low': 100, 'close': 100, 'volume': 1000}
    full_df = pd.DataFrame(data)
    full_df['high'] = full_df['open'] + np.random.uniform(0, 2, num_rows)
    full_df['low'] = full_df['open'] - np.random.uniform(0, 2, num_rows)
    full_df['close'] = (full_df['open'] + full_df['high'] + full_df['low']) / 3
    full_df['volume'] = np.random.randint(10000, 50000, num_rows)



ohlcv_df = full_df[['open', 'high', 'low', 'close', 'volume']]


print("Data loaded successfully. Here are the first 5 rows:")
print(ohlcv_df.head())

Data loaded successfully. Here are the first 5 rows:
     open    high     low   close  volume
0  646.90  662.00  646.90  662.00  271571
1  661.70  662.30  653.60  653.90  193089
2  654.20  658.00  651.05  657.95   89069
3  657.00  657.50  655.40  656.80   68028
4  656.95  656.95  647.00  647.00  105605


In [3]:
class CtrlAlpha:
    
    def __init__(self):
        # --- Hyperparameters ---
        self.h = 8
        self.upper_threshold = 0.002
        self.lower_threshold = -0.002
        self.execution_delay = 1

        # --- Model & Scaler ---
        self.model = RandomForestClassifier(
            n_estimators=150, max_depth=10, min_samples_leaf=10,
            class_weight="balanced",random_state=42, n_jobs=-1
        )
        self.scaler = StandardScaler()

        # --- State Management ---
        self.history_df = pd.DataFrame()
        self.features_list = None
        self.min_history_size = 50


# OLD CODE
    
    # def _create_features(self, df: pd.DataFrame) -> pd.DataFrame:
    #     df_copy = df.copy()
    #     if not isinstance(df_copy.index, pd.DatetimeIndex):
    #          df_copy.index = pd.to_datetime(df_copy.index, unit='s')
    #     df_copy.ta.strategy("common", append=True)
    #     df_copy.fillna(method='ffill', inplace=True)
    #     return df_copy


    # NEW code
    def _create_features(self, df: pd.DataFrame) -> pd.DataFrame:
        df_copy = df.copy()
        if not isinstance(df_copy.index, pd.DatetimeIndex):
                df_copy.index = pd.to_datetime(df_copy.index, unit='s')
    
        # Manually add the indicators from the "common" strategy
        df_copy.ta.rsi(append=True)
        df_copy.ta.macd(append=True)
        df_copy.ta.bbands(append=True)
        df_copy.ta.obv(append=True)
        
        df_copy.fillna(method='ffill', inplace=True)
        return df_copy




    
    def _create_labels(self, df: pd.DataFrame) -> pd.Series:
        forward_returns = (df['close'].shift(-self.h) / df['close']) - 1
        conditions = [
            (forward_returns > self.upper_threshold),
            (forward_returns < self.lower_threshold),
        ]
        choices = [1, -1]
        labels = np.select(conditions, choices, default=0)
        labels = pd.Series(labels, index=df.index)
    
        # Drop last h rows (to avoid peeking into the future)
        labels.iloc[-self.h:] = np.nan
        return labels


    def train(self, ohlcv_data: pd.DataFrame):
        print("Starting model training...")
        df = ohlcv_data.copy()
        df.columns = [col.lower() for col in df.columns] # Ensure lowercase columns

        df_features = self._create_features(df)
        df_labels = self._create_labels(df_features)

        df_combined = df_features.join(df_labels.rename('signal'))
        df_combined.dropna(inplace=True)

        X = df_combined.drop(columns=['signal'])
        y = df_combined['signal']

        self.features_list = [col for col in X.columns if col in df_features.columns]
        X = X[self.features_list]

        if len(X) < self.min_history_size:
            print("Not enough data to train the model.")
            return

        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)
        self.history_df = df.tail(self.min_history_size * 2)
        print("Training complete.")

    def predict(self, timestamp_data: pd.DataFrame) -> int:
        df = timestamp_data.copy()
        df.columns = [col.lower() for col in df.columns]
    
        self.history_df = pd.concat([self.history_df, df], ignore_index=True)
    
        if len(self.history_df) < self.min_history_size or self.features_list is None:
            return 0
    
        # Only recompute features for last min_history_size rows
        features_df = self._create_features(self.history_df.tail(self.min_history_size).copy())
    
        latest_features = features_df[self.features_list].iloc[-self.execution_delay]
    
        if latest_features.isnull().any():
            return 0
    
        scaled_features = self.scaler.transform(latest_features.values.reshape(1, -1))
        prediction = self.model.predict(scaled_features)
        return int(prediction[0])


In [4]:
# Step 1: First split (train+val vs test)
split_point = int(len(ohlcv_df) * 0.8)
trainval_df = ohlcv_df.iloc[:split_point]   # 80% for training+validation
test_df = ohlcv_df.iloc[split_point:]       # 20% final test (kept untouched)

# Step 2: Now split trainval into train_sub and val_sub
split_val = int(len(trainval_df) * 0.8)
train_sub = trainval_df.iloc[:split_val]    # ~64% of total → training
val_sub = trainval_df.iloc[split_val:]      # ~16% of total → validation

print(f"Training size: {len(train_sub)} rows")
print(f"Validation size: {len(val_sub)} rows")
print(f"Test size: {len(test_df)} rows")

# Train only on training subset
agent = CtrlAlpha()
agent.train(train_sub)


Training size: 384000 rows
Validation size: 96000 rows
Test size: 120000 rows
Starting model training...
Training complete.


In [5]:
# --- The Fast, Vectorized Backtest ---

print("Starting vectorized backtest...")

# 1. Get the last part of the training data to use as a "lookback" history for indicators
lookback_period = 50 # Should be at least the longest period of your indicators
history = train_sub.tail(lookback_period)

# 2. Combine the history with the test data
combined_df = pd.concat([history, test_df])

# 3. Calculate features for the combined dataframe all at once
# This is much faster than doing it one row at a time in a loop
features_df = agent._create_features(combined_df.copy())

# 4. Select only the features for the test period
test_features_df = features_df.iloc[lookback_period:]

# 5. Scale all the test features at once
# Ensure we only use the columns the model was trained on
X_test = test_features_df[agent.features_list]
X_test_scaled = agent.scaler.transform(X_test)

# 6. Predict on the entire scaled test set in one go
# The model's predict method is vectorized and highly efficient
all_predictions = agent.model.predict(X_test_scaled)

# 7. Assign the signals to the original test dataframe
test_df['signal'] = all_predictions

print("Backtest predictions complete. Here's a sample of the results:")
print(test_df.tail(100))






Starting vectorized backtest...
Backtest predictions complete. Here's a sample of the results:
           open     high      low    close  volume  signal
599900  1434.95  1436.25  1434.10  1436.25    2893     0.0
599901  1436.25  1437.35  1435.30  1436.25    1741     0.0
599902  1436.25  1437.35  1435.10  1435.95    4490     0.0
599903  1435.60  1437.60  1435.60  1437.60    5004     0.0
599904  1437.60  1437.60  1436.00  1436.50    1328     0.0
...         ...      ...      ...      ...     ...     ...
599995  1427.00  1429.50  1427.00  1429.25    5649     0.0
599996  1429.35  1429.35  1427.20  1428.10    5394     0.0
599997  1427.40  1428.00  1426.20  1426.75    5012     0.0
599998  1426.75  1426.85  1425.20  1425.80    2440     0.0
599999  1425.25  1425.80  1424.55  1424.90    4139     0.0

[100 rows x 6 columns]


In [6]:
# --- In a new cell ---

# Check the value counts of all signals in the test set
print("Signal Distribution in the Entire Test Set:")
print(test_df['signal'].value_counts())

# Check the distribution as a percentage
print("\nSignal Distribution (%):")
print(test_df['signal'].value_counts(normalize=True) * 100)

Signal Distribution in the Entire Test Set:
signal
 0.0    61774
-1.0    33260
 1.0    24966
Name: count, dtype: int64

Signal Distribution (%):
signal
 0.0    51.478333
-1.0    27.716667
 1.0    20.805000
Name: proportion, dtype: float64


## OPTIMIZING HYPERPARAMETERS THRU GRID SEARCH

In [7]:
import time


h_values_coarse = [10, 30, 50]
threshold_values_coarse = [0.002, 0.006, 0.010]

results_list_coarse = []
print("--- Starting Coarse Grid Search ---")

# Looping through every combination
for h in h_values_coarse:
    for threshold in threshold_values_coarse:
        start_time = time.time()
        print(f"--- Testing h={h}, threshold={threshold:.4f} ---")

        # Configure and train the agent for this specific loop
        agent = CtrlAlpha()
        agent.h = h
        agent.upper_threshold = threshold
        agent.lower_threshold = -threshold
        agent.train(train_sub) # Train only on the training subset

        # Backtest on the validation subset
        history = train_sub.tail(50)
        combined_df = pd.concat([history, val_sub])
        features_df = agent._create_features(combined_df.copy())
        test_features_df = features_df.iloc[50:]
        
        if agent.features_list is None:
            print("Agent not trained properly, skipping...")
            continue
            
        X_val = test_features_df[agent.features_list]
        X_val_scaled = agent.scaler.transform(X_val)
        predictions = agent.model.predict(X_val_scaled)
        
        val_df = val_sub.copy()
        val_df['signal'] = predictions
        
        # Calculate metrics
        val_df['daily_return'] = val_df['close'].pct_change()
        val_df['strategy_return'] = val_df['daily_return'] * val_df['signal'].shift(1)
        
        if val_df['strategy_return'].std() == 0:
            sharpe_ratio = 0
        else:
            sharpe_ratio = (val_df['strategy_return'].mean() / val_df['strategy_return'].std()) * np.sqrt(252)
        
        cumulative_returns = (1 + val_df['strategy_return']).cumprod()
        peak = cumulative_returns.expanding(min_periods=1).max()
        drawdown = (cumulative_returns / peak) - 1
        max_drawdown = drawdown.min()

        # Store the results
        results_list_coarse.append({
            'h': h,
            'threshold': threshold,
            'sharpe_ratio': sharpe_ratio,
            'max_drawdown': max_drawdown
        })
        
        end_time = time.time()
        print(f"Done. Sharpe: {sharpe_ratio:.2f}, Max Drawdown: {max_drawdown:.2%}. Took {end_time - start_time:.2f}s")

--- Starting Coarse Grid Search ---
--- Testing h=10, threshold=0.0020 ---
Starting model training...
Training complete.
Done. Sharpe: 0.34, Max Drawdown: -35.61%. Took 12.61s
--- Testing h=10, threshold=0.0060 ---
Starting model training...
Training complete.
Done. Sharpe: 0.04, Max Drawdown: -35.70%. Took 13.02s
--- Testing h=10, threshold=0.0100 ---
Starting model training...
Training complete.
Done. Sharpe: -0.14, Max Drawdown: -79.75%. Took 13.47s
--- Testing h=30, threshold=0.0020 ---
Starting model training...
Training complete.
Done. Sharpe: 0.35, Max Drawdown: -32.23%. Took 11.67s
--- Testing h=30, threshold=0.0060 ---
Starting model training...
Training complete.
Done. Sharpe: 0.08, Max Drawdown: -51.65%. Took 12.78s
--- Testing h=30, threshold=0.0100 ---
Starting model training...
Training complete.
Done. Sharpe: -0.08, Max Drawdown: -66.89%. Took 12.29s
--- Testing h=50, threshold=0.0020 ---
Starting model training...
Training complete.
Done. Sharpe: 0.18, Max Drawdown: -30

In [8]:
results_df_coarse = pd.DataFrame(results_list_coarse)

print("\n--- Coarse Grid Search Results ---")
print(results_df_coarse.sort_values(by='sharpe_ratio', ascending=False))


--- Coarse Grid Search Results ---
    h  threshold  sharpe_ratio  max_drawdown
3  30      0.002      0.348246     -0.322272
0  10      0.002      0.344580     -0.356060
6  50      0.002      0.180634     -0.302063
4  30      0.006      0.082864     -0.516548
7  50      0.006      0.063012     -0.439951
1  10      0.006      0.043635     -0.356975
5  30      0.010     -0.080290     -0.668892
8  50      0.010     -0.092336     -0.715120
2  10      0.010     -0.139960     -0.797539


In [9]:
import time

# Define a TIGHT grid around the best coarse results (h=30, threshold=0.002)
h_values_fine = [20, 30, 40]
threshold_values_fine = [0.001, 0.002, 0.003]

results_list_fine = []
print("--- Starting Fine-Grained Grid Search ---")

# The rest of the loop is identical to the one above
for h in h_values_fine:
    for threshold in threshold_values_fine:
        start_time = time.time()
        print(f"--- Testing h={h}, threshold={threshold:.4f} ---")

        agent = CtrlAlpha()
        agent.h = h
        agent.upper_threshold = threshold
        agent.lower_threshold = -threshold
        agent.train(train_sub)

        history = train_sub.tail(50)
        combined_df = pd.concat([history, val_sub])
        features_df = agent._create_features(combined_df.copy())
        test_features_df = features_df.iloc[50:]
        
        if agent.features_list is None:
            print("Agent not trained properly, skipping...")
            continue
            
        X_val = test_features_df[agent.features_list]
        X_val_scaled = agent.scaler.transform(X_val)
        predictions = agent.model.predict(X_val_scaled)
        
        val_df = val_sub.copy()
        val_df['signal'] = predictions
        
        val_df['daily_return'] = val_df['close'].pct_change()
        val_df['strategy_return'] = val_df['daily_return'] * val_df['signal'].shift(1)
        
        if val_df['strategy_return'].std() == 0:
            sharpe_ratio = 0
        else:
            sharpe_ratio = (val_df['strategy_return'].mean() / val_df['strategy_return'].std()) * np.sqrt(252)

        cumulative_returns = (1 + val_df['strategy_return']).cumprod()
        peak = cumulative_returns.expanding(min_periods=1).max()
        drawdown = (cumulative_returns / peak) - 1
        max_drawdown = drawdown.min()

        results_list_fine.append({
            'h': h,
            'threshold': threshold,
            'sharpe_ratio': sharpe_ratio,
            'max_drawdown': max_drawdown
        })
        
        end_time = time.time()
        print(f"Done. Sharpe: {sharpe_ratio:.2f}, Max Drawdown: {max_drawdown:.2%}. Took {end_time - start_time:.2f}s")

--- Starting Fine-Grained Grid Search ---
--- Testing h=20, threshold=0.0010 ---
Starting model training...
Training complete.
Done. Sharpe: 0.35, Max Drawdown: -33.76%. Took 11.74s
--- Testing h=20, threshold=0.0020 ---
Starting model training...
Training complete.
Done. Sharpe: 0.33, Max Drawdown: -30.44%. Took 11.46s
--- Testing h=20, threshold=0.0030 ---
Starting model training...
Training complete.
Done. Sharpe: 0.35, Max Drawdown: -29.15%. Took 11.55s
--- Testing h=30, threshold=0.0010 ---
Starting model training...
Training complete.
Done. Sharpe: 0.34, Max Drawdown: -34.13%. Took 11.71s
--- Testing h=30, threshold=0.0020 ---
Starting model training...
Training complete.
Done. Sharpe: 0.35, Max Drawdown: -32.23%. Took 11.55s
--- Testing h=30, threshold=0.0030 ---
Starting model training...
Training complete.
Done. Sharpe: 0.34, Max Drawdown: -29.11%. Took 11.89s
--- Testing h=40, threshold=0.0010 ---
Starting model training...
Training complete.
Done. Sharpe: 0.30, Max Drawdown:

In [10]:
results_df_fine = pd.DataFrame(results_list_fine)

print("\n Fine-Grained Search Results: ")
print(results_df_fine.sort_values(by='sharpe_ratio', ascending=False))


--- Fine-Grained Search Results ---
    h  threshold  sharpe_ratio  max_drawdown
0  20      0.001      0.349375     -0.337570
2  20      0.003      0.348494     -0.291536
4  30      0.002      0.348246     -0.322272
3  30      0.001      0.344823     -0.341333
5  30      0.003      0.335467     -0.291136
1  20      0.002      0.325798     -0.304360
7  40      0.002      0.305865     -0.268636
6  40      0.001      0.298608     -0.287118
8  40      0.003      0.259260     -0.293106


In [11]:
-


print("Signal Distribution in the Entire Test Set:")
print(test_df['signal'].value_counts())


print("\nSignal Distribution (%):")
print(test_df['signal'].value_counts(normalize=True) * 100)

Signal Distribution in the Entire Test Set:
signal
 0.0    61774
-1.0    33260
 1.0    24966
Name: count, dtype: int64

Signal Distribution (%):
signal
 0.0    51.478333
-1.0    27.716667
 1.0    20.805000
Name: proportion, dtype: float64


In [13]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Generate the true labels for the test set using the same method as in training
# We use the agent's configured h and thresholds
y_true_series = agent._create_labels(test_df.copy())

# 2. Get the predictions your model made
y_pred_series = test_df['signal']

# 3. Align the true labels and predictions
# We create a temporary DataFrame and drop rows where the true label is NaN 
# (the last h rows of the test set)
report_df = pd.DataFrame({'y_true': y_true_series, 'y_pred': y_pred_series})
report_df.dropna(inplace=True)

y_true = report_df['y_true']
y_pred = report_df['y_pred']

# 4. Calculate and print all the metrics
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
matrix = confusion_matrix(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(matrix)

Accuracy: 0.3981

Classification Report:
              precision    recall  f1-score   support

        -1.0       0.34      0.31      0.32     36423
         0.0       0.44      0.61      0.51     44220
         1.0       0.39      0.25      0.30     39317

    accuracy                           0.40    119960
   macro avg       0.39      0.39      0.38    119960
weighted avg       0.39      0.40      0.38    119960

Confusion Matrix:
[[11243 17553  7627]
 [ 9637 26881  7702]
 [12376 17305  9636]]
