In [78]:
import pandas as pd
import random
import yfinance as yf
import numpy as np
from statsmodels.tsa.stattools import coint, adfuller
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [79]:
# Get S&P 500 tickers
def get_sp500_tickers(sample_size=250):
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    table = pd.read_html(url)[0]
    tickers = table['Symbol'].tolist()
    sample_tickers = random.sample(tickers, sample_size)
    return sample_tickers

In [80]:
# Fetch historical data
def get_historical_data(tickers):
    data = pd.DataFrame()
    for i in tickers:
        stock_data = yf.download(i, start="2022-01-01", end="2024-09-10")
        data[i] = stock_data['Adj Close']
    return data

In [81]:
# Perform the ADF test for stationarity
def adf_test(series):
    result = adfuller(series)
    return result[1]  # Return p-val

In [82]:
# Cointegration test between pairs of stocks
def cointegration_test(data):
    pairs = []
    results = []
    
    tickers = data.columns
    for i in range(len(tickers)):
        for j in range(i + 1, len(tickers)):
            x = data[tickers[i]].dropna()
            y = data[tickers[j]].dropna()
            
            x, y = x.align(y, join='inner')  # Align x and y by matching labels (dates)
            if x.empty or y.empty:
                continue
            
            try:
                score, p_value, _ = coint(x, y)
            except ValueError:
                continue
            
            ratio = x / y
            adf_p_value = adf_test(ratio)
            
            pairs.append((tickers[i], tickers[j]))
            results.append((tickers[i], tickers[j], score, p_value, adf_p_value))
    
    results_df = pd.DataFrame(results, columns=['Stock 1', 'Stock 2', 'Cointegration Score', 'Cointegration p-value', 'ADF p-value'])
    return results_df

In [83]:
# Calculate z-score for the ratio
def calculate_zscore(ratio):
    mean = ratio.rolling(window=30).mean()
    std = ratio.rolling(window=30).std()
    zscore = (ratio - mean) / std
    return zscore

In [84]:
# Feature engineering: calculate moving averages and z-scores
def generate_features(ratio):
    ratio = ratio.dropna()
    features = pd.DataFrame()
    
    features['30d_ma'] = ratio.rolling(window=30).mean()
    features['5d_ma'] = ratio.rolling(window=5).mean()
    features['z_score'] = calculate_zscore(ratio)
    
    # Drop rows with NaN values after rolling windows
    features.dropna(inplace=True)
    
    return features

In [85]:
# Get S&P 500 tickers and historical data
ticks = get_sp500_tickers()
historical_data = get_historical_data(ticks)

# Cointegration test
results_df = cointegration_test(historical_data)

# Filter pairs with the lowest ADF p-values
filtered_results = results_df[results_df['ADF p-value'] < 0.05]

# Sort pairs by cointegration score to find the most cointegrated pair
sorted_results = filtered_results.sort_values(by='Cointegration Score', ascending=True)

# Get the top cointegrated pair
top_pair = sorted_results.iloc[0]
stock1, stock2 = top_pair['Stock 1'], top_pair['Stock 2']

# Calculate the ratio for the top pair
ratio = historical_data[stock1] / historical_data[stock2]

# Generate features for the ratio
features = generate_features(ratio)

# Define target signals based on z-score thresholds
entry_threshold = 1
exit_threshold = 0

features['target'] = np.where(features['z_score'] > entry_threshold, -1,
                              np.where(features['z_score'] < -entry_threshold, 1, 0))

# Split data into train and test sets
X = features[['30d_ma', '5d_ma', 'z_score']]
y = features['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict signals on the test set
y_pred = model.predict(X_test)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [86]:
# Backtest using ML signals
def backtest_ml_strategy(ratio, ml_predictions, zscore):
    positions = []
    returns = []
    
    position = None
    entry_ratio = None

    # Make sure both arrays are aligned in size
    ratio = ratio[-len(ml_predictions):]
    zscore = zscore[-len(ml_predictions):]  # Ensure zscore is aligned as well

    for i in range(len(ratio)):
        if ml_predictions[i] == 1 and position is None:
            # Enter long
            entry_ratio = ratio.iloc[i]
            position = 'long'
            positions.append(('long', i))

        elif ml_predictions[i] == -1 and position is None:
            # Enter short
            entry_ratio = ratio.iloc[i]
            position = 'short'
            positions.append(('short', i))

        # Exit on model prediction or if zscore comes close to 0 (another exit rule)
        elif (ml_predictions[i] == 0) and position is not None:
            # Exit position
            exit_ratio = ratio.iloc[i]
            if position == 'long':
                returns.append(exit_ratio - entry_ratio)
            elif position == 'short':
                returns.append(entry_ratio - exit_ratio)
            
            # Reset position
            position = None

    return positions, returns

# Adjust ratio and zscore for the test period
ratio_test = ratio[-len(y_test):]
zscore_test = features['z_score'].iloc[-len(y_test):]

# Call the backtest function with appropriate inputs
positions, returns = backtest_ml_strategy(ratio_test, y_pred, zscore_test)

# Print positions and returns
print("Positions:", positions)
print("Returns:", returns)
print("Total return:", sum(returns))


Positions: [('short', 0), ('long', 12), ('short', 14), ('short', 16), ('long', 18), ('long', 20), ('short', 24), ('long', 30), ('long', 35), ('long', 41), ('short', 44), ('long', 48), ('short', 52), ('long', 55), ('long', 58), ('long', 65), ('long', 72), ('short', 77), ('long', 83), ('long', 88), ('short', 90), ('long', 92), ('long', 95), ('short', 102), ('long', 105), ('long', 110), ('short', 112), ('long', 116), ('long', 120), ('long', 123), ('short', 125)]
Returns: [np.float64(0.009094028723591552), np.float64(0.028920454685533725), np.float64(0.05637635291780718), np.float64(0.01654084100808051), np.float64(0.027645876269662084), np.float64(0.02037596850064749), np.float64(0.01682834418327306), np.float64(-0.00893214329771208), np.float64(0.04391303390410939), np.float64(-0.017368996354835797), np.float64(0.0018310302055861705), np.float64(-0.030070087978768978), np.float64(0.01860315639511212), np.float64(0.024144128256416986), np.float64(-0.011910529777234613), np.float64(-0.1058