In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Step 1: Generate synthetic stock price data
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', periods=100)
n_days = len(dates)


def _daily_shocks():
    """Draw one standard-normal shock per date from the seeded global generator."""
    return np.random.normal(0, 1, n_days)


# The draw order (A, B, C, D) determines the generated values, so keep it fixed.
stock_A = _daily_shocks().cumsum() + 100
stock_B = stock_A + _daily_shocks()  # Stock A plus unit-variance noise
stock_C = _daily_shocks().cumsum() + 150
stock_D = _daily_shocks().cumsum() + 200

# Assemble the four series into one date-indexed DataFrame
prices = pd.DataFrame(
    np.column_stack([stock_A, stock_B, stock_C, stock_D]),
    index=dates,
    columns=['Stock_A', 'Stock_B', 'Stock_C', 'Stock_D'],
)
prices.head()


In [None]:

# Step 2: Standardize the data
# Exclude a 'Cluster' column left over from a previous run of this cell, so that
# re-running it does not feed the old labels back into the scaler and PCA.
feature_columns = prices.columns.drop('Cluster', errors='ignore')
scaler = StandardScaler()
scaled_prices = scaler.fit_transform(prices[feature_columns])

# Step 3: Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_prices)

# Step 4: Apply KMeans clustering
# NOTE(review): each row of `prices` is one date, so the samples being clustered
# are dates (described by the four price levels), not the stocks themselves.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the labels stable across versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(reduced_data)

# Add clusters to the DataFrame (one label per row)
prices['Cluster'] = clusters
prices.head()


In [None]:

# Step 5: Visualize clusters
# Scatter the two PCA components against each other, coloured by KMeans label.
fig, ax = plt.subplots(figsize=(10, 6))
component_1 = reduced_data[:, 0]
component_2 = reduced_data[:, 1]
ax.scatter(component_1, component_2, c=clusters, cmap='viridis', marker='o')
ax.set_title('Stock Clusters using KMeans')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.grid()
plt.show()


In [None]:

# Step 6: Identify pairs within clusters
CORRELATION_THRESHOLD = 0.9  # a pair is reported only if its correlation exceeds this


def rank_correlated_pairs(frame):
    """Return the correlation of every distinct column pair, highest first.

    Parameters
    ----------
    frame : pd.DataFrame
        Numeric columns to correlate.

    Returns
    -------
    pd.Series
        Correlations indexed by a two-level (column, column) MultiIndex with
        one entry per unordered pair. Self-correlations and pairs whose
        correlation is NaN are left out.
    """
    corr = frame.corr()
    # Keep only the strict lower triangle: this drops the diagonal and the
    # mirrored duplicates by position rather than by comparing float values.
    below_diagonal = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    return corr.where(below_diagonal).unstack().dropna().sort_values(ascending=False)


# Split the rows by cluster label and remove the label column by name.
cluster_0 = prices[prices['Cluster'] == 0].drop(columns='Cluster')
cluster_1 = prices[prices['Cluster'] == 1].drop(columns='Cluster')

pairs_0 = rank_correlated_pairs(cluster_0)
pairs_1 = rank_correlated_pairs(cluster_1)

# Filter pairs with high correlation (above CORRELATION_THRESHOLD).
# The mask is built from the same Series it indexes, so lengths always match.
pairs_cluster_0 = pairs_0[pairs_0 > CORRELATION_THRESHOLD].index
pairs_cluster_1 = pairs_1[pairs_1 > CORRELATION_THRESHOLD].index

print("Highly correlated pairs in Cluster 0:", pairs_cluster_0)
print("Highly correlated pairs in Cluster 1:", pairs_cluster_1)


In [None]:

# Step 7: Implement pairs trading strategy
def pairs_trading(prices: pd.DataFrame, stock_1: str, stock_2: str) -> pd.Series:
    """Accumulate the price gap of a pair on days when its spread is stretched.

    The spread is ``prices[stock_1] - prices[stock_2]``. A day is a buy signal
    when the spread is more than one standard deviation below its mean, and a
    sell signal when it is more than one standard deviation above it.

    Parameters
    ----------
    prices : pd.DataFrame
        Frame containing at least the two named columns.
    stock_1, stock_2 : str
        Column labels of the two legs of the pair.

    Returns
    -------
    pd.Series
        Float series on ``prices.index`` holding the running sum of the
        per-day values: ``stock_2 - stock_1`` on buy days,
        ``stock_1 - stock_2`` on sell days and 0 otherwise.

    Notes
    -----
    The mean and standard deviation are taken over the whole series, so each
    day's signal uses data from later dates. The per-day values are price
    differences, not percentage returns.
    """
    spread = prices[stock_1] - prices[stock_2]
    mean_spread = spread.mean()
    std_spread = spread.std()

    buy_signal = spread < mean_spread - std_spread
    sell_signal = spread > mean_spread + std_spread

    # Start from float zeros: the assigned values are floats, and writing them
    # into an integer Series relies on deprecated implicit upcasting.
    returns = pd.Series(0.0, index=prices.index)
    returns[buy_signal] = -spread[buy_signal]    # stock_2 - stock_1
    returns[sell_signal] = spread[sell_signal]   # stock_1 - stock_2

    return returns.cumsum()

# Example pair: Stock_A and Stock_B (highly correlated pair)
cumulative_returns = pairs_trading(prices, stock_1='Stock_A', stock_2='Stock_B')

# Step 8: Plot the cumulative returns
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cumulative_returns, marker='o', linestyle='-')
ax.set_title("Cumulative Returns of Pairs Trading Strategy (Stock_A and Stock_B)")
ax.set_xlabel("Date")
ax.set_ylabel("Cumulative Return")
ax.grid()
plt.show()
