In [19]:
import pandas as pd
import random
import yfinance as yf
import numpy as np
from statsmodels.tsa.stattools import coint, adfuller
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Get S&P 500 tickers
def get_sp500_tickers(sample_size=250):
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    table = pd.read_html(url)[0]
    tickers = table['Symbol'].tolist()
    sample_tickers = random.sample(tickers, sample_size)
    return sample_tickers

In [21]:
# Fetch historical data
def get_historical_data(tickers):
    data = pd.DataFrame()
    for i in tickers:
        stock_data = yf.download(i, start="2022-01-01", end="2024-09-10")
        data[i] = stock_data['Adj Close']
    return data

In [22]:
# Perform the ADF test for stationarity
def adf_test(series):
    result = adfuller(series)
    return result[1]  # Return p-val

In [23]:
# Cointegration test between pairs of stocks
def cointegration_test(data):
    pairs = []
    results = []
    
    tickers = data.columns
    for i in range(len(tickers)):
        for j in range(i + 1, len(tickers)):
            x = data[tickers[i]].dropna()
            y = data[tickers[j]].dropna()
            
            x, y = x.align(y, join='inner')  # Align x and y by matching labels (dates)
            if x.empty or y.empty:
                continue
            
            try:
                score, p_value, _ = coint(x, y)
            except ValueError:
                continue
            
            ratio = x / y
            adf_p_value = adf_test(ratio)
            
            pairs.append((tickers[i], tickers[j]))
            results.append((tickers[i], tickers[j], score, p_value, adf_p_value))
    
    results_df = pd.DataFrame(results, columns=['Stock 1', 'Stock 2', 'Cointegration Score', 'Cointegration p-value', 'ADF p-value'])
    return results_df

In [24]:
# Calculate z-score for the ratio
def calculate_zscore(ratio):
    mean = ratio.rolling(window=30).mean()
    std = ratio.rolling(window=30).std()
    zscore = (ratio - mean) / std
    return zscore

In [25]:
# Feature engineering: calculate moving averages and z-scores
def generate_features(ratio):
    ratio = ratio.dropna()
    features = pd.DataFrame()
    
    features['30d_ma'] = ratio.rolling(window=30).mean()
    features['5d_ma'] = ratio.rolling(window=5).mean()
    features['z_score'] = calculate_zscore(ratio)
    
    # Drop rows with NaN values after rolling windows
    features.dropna(inplace=True)
    
    return features

In [26]:
# Get S&P 500 tickers and historical data
ticks = get_sp500_tickers()
historical_data = get_historical_data(ticks)

# Cointegration test
results_df = cointegration_test(historical_data)

# Filter pairs with the lowest ADF p-values
filtered_results = results_df[results_df['ADF p-value'] < 0.05]

# Sort pairs by cointegration score to find the most cointegrated pair
sorted_results = filtered_results.sort_values(by='Cointegration Score', ascending=True)

# Get the top cointegrated pair
top_pair = sorted_results.iloc[0]
stock1, stock2 = top_pair['Stock 1'], top_pair['Stock 2']

# Calculate the ratio for the top pair
ratio = historical_data[stock1] / historical_data[stock2]

# Generate features for the ratio
features = generate_features(ratio)

# Define target signals based on z-score thresholds
entry_threshold = 1.5
exit_threshold = 0

features['target'] = np.where(features['z_score'] > entry_threshold, -1,
                              np.where(features['z_score'] < -entry_threshold, 1, 0))

# Split data into train and test sets
X = features[['30d_ma', '5d_ma']]
y = features['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict signals on the test set
y_pred = model.predict(X_test)

print(historical_data.head())
print(results_df.head())
print(filtered_results.head())
print(sorted_results.head())

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

KeyboardInterrupt: 

In [10]:
# Backtest using ML signals
def backtest_ml_strategy(stock1_prices, stock2_prices, ml_predictions):
    positions = []
    returns = []
    
    position = None
    entry_stock1_price = None
    entry_stock2_price = None

    for i in range(len(ml_predictions)):
        if ml_predictions[i] == 1 and position is None:
            # Enter long (Long Stock 1, Short Stock 2)
            entry_stock1_price = stock1_prices.iloc[i]
            entry_stock2_price = stock2_prices.iloc[i]
            position = 'long'
            positions.append(('long', i))

        elif ml_predictions[i] == -1 and position is None:
            # Enter short (Short Stock 1, Long Stock 2)
            entry_stock1_price = stock1_prices.iloc[i]
            entry_stock2_price = stock2_prices.iloc[i]
            position = 'short'
            positions.append(('short', i))

        # Exit on model prediction or if zscore comes close to 0 (another exit rule)
        elif (ml_predictions[i] == 0 or abs(features['z_score'].iloc[i]) < 0.1) and position is not None:
            # Exit position
            exit_stock1_price = stock1_prices.iloc[i]
            exit_stock2_price = stock2_prices.iloc[i]
            
            if position == 'long':
                # Return for a long position
                returns.append(((exit_stock1_price - entry_stock1_price)/entry_stock1_price) + ((entry_stock2_price - exit_stock2_price)/entry_stock2_price))
            elif position == 'short':
                # Return for a short position
                returns.append(((entry_stock1_price - exit_stock1_price)/entry_stock1_price) - ((exit_stock2_price - entry_stock2_price)/entry_stock2_price))
            
            # Reset position
            position = None

    return positions, returns

# Adjust stock prices for the test period
stock1_prices_test = historical_data[stock1][-len(y_test):]
stock2_prices_test = historical_data[stock2][-len(y_test):]

# Call the backtest function with appropriate inputs
positions, returns = backtest_ml_strategy(stock1_prices_test, stock2_prices_test, y_pred)

# Print positions and returns
print("Positions:", positions)
print("Returns:", returns)
print("Total return:", sum(returns))


Positions: [('long', 7), ('short', 14), ('short', 21), ('short', 23), ('long', 26), ('long', 29), ('short', 36), ('short', 42), ('short', 46), ('long', 54), ('long', 63), ('short', 69), ('short', 74), ('long', 81), ('long', 91), ('short', 95), ('long', 107), ('long', 111), ('long', 120)]
Returns: [np.float64(0.001822717694703074), np.float64(-0.030034985439328185), np.float64(0.006470850775002449), np.float64(0.04494473687901777), np.float64(-0.023164861043366715), np.float64(-0.04856754942090276), np.float64(-0.0035960056747558556), np.float64(-0.012837268012460521), np.float64(0.023759256147496827), np.float64(0.007274921314645688), np.float64(-0.014596674297340271), np.float64(0.016639477921134332), np.float64(0.027327566780605574), np.float64(0.019309596035311623), np.float64(0.008752660616492883), np.float64(0.04171466600076015), np.float64(0.008132286402347831), np.float64(0.0031250817665545213), np.float64(-0.0029076205768389964)]
Total return: 0.07356885386907944
