In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Step 1: Generate synthetic stock price data for 10 stocks
np.random.seed(42)  # fixed seed so every Restart & Run All reproduces the same prices

# freq='B' restricts the index to business days (Mon-Fri). The default daily
# frequency would also generate Saturdays and Sundays. Exchange holidays are
# not excluded by 'B'.
dates = pd.date_range(start='2024-01-01', periods=252, freq='B')  # 252 trading days

# Each stock is an independent Gaussian random walk (mean 0, std 1 daily steps)
# shifted to its own base level: Stock_A -> 100, Stock_B -> 120, ..., Stock_J -> 280.
# The comprehension draws in A..J order, so with the seed above the values are
# identical to writing the ten entries out by hand.
stocks = {
    f'Stock_{letter}': np.cumsum(np.random.normal(0, 1, len(dates))) + (100 + 20 * i)
    for i, letter in enumerate('ABCDEFGHIJ')
}

# Create a DataFrame (rows: dates, columns: stocks)
prices = pd.DataFrame(stocks, index=dates)
prices.head()


In [None]:

# Step 2: Calculate daily returns
# pct_change() gives the fractional change of every column from one row to the
# next; the first row has no predecessor and comes back as NaN, so dropna()
# removes it before the data is used further.
daily_change = prices.pct_change()
returns = daily_change.dropna()
returns.head()


In [None]:

# Steps 3-5: standardise the returns, project them onto two principal
# components, and score candidate cluster counts with the silhouette coefficient.
#
# NOTE(review): every row of `returns` is one date and every column one stock,
# so the samples that are scaled, projected and clustered below are trading
# days, not stocks. If the goal is to group stocks, the matrix would have to be
# transposed (and the steps consuming the labels adjusted) - confirm intent.

# Step 3: Standardize the returns data
# A later cell writes a 'Cluster' label column into `returns`. Drop it when it
# is present so re-running this cell never feeds old labels in as a feature.
feature_returns = returns.drop(columns='Cluster', errors='ignore')
scaler = StandardScaler()
scaled_returns = scaler.fit_transform(feature_returns)

# Step 4: Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(scaled_returns)

# Step 5: Determine optimal number of clusters using the silhouette score
candidate_ks = range(2, 11)  # silhouette is undefined for a single cluster, so start at 2
silhouette_scores = []
for k in candidate_ks:
    # n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10
    # to 'auto', so leaving it unset gives version-dependent results (and a
    # FutureWarning on 1.2/1.3).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(reduced_data)
    silhouette_scores.append(silhouette_score(reduced_data, labels))

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(list(candidate_ks), silhouette_scores, marker='o')
plt.title('Silhouette Score for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid()
plt.show()


In [None]:

# Step 6: Apply KMeans clustering with optimal clusters (based on silhouette score)
# silhouette_scores[i] holds the score for k = i + 2 because the search started
# at k = 2, so the best list position is shifted by 2 to recover k. int() turns
# NumPy's integer result into a plain Python int.
optimal_clusters = int(np.argmax(silhouette_scores)) + 2

# n_init is pinned explicitly: scikit-learn 1.4 changed the default from 10 to
# 'auto', so leaving it unset gives version-dependent labels (and a
# FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(reduced_data)

# Add clusters to the DataFrame
# One label per row of `returns` (i.e. per date). This writes into the existing
# frame; the later steps read the labels back through returns['Cluster'].
returns['Cluster'] = clusters
returns.head()


In [None]:

# Step 7: Visualize the clusters using PCA components
# Every point is one row of `reduced_data` (one date of returns projected onto
# the two principal components), coloured by its KMeans label.
fig, ax = plt.subplots(figsize=(10, 6))
first_component = reduced_data[:, 0]
second_component = reduced_data[:, 1]
sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=clusters,
    palette='viridis',
    marker='o',
    ax=ax,
)
ax.set_title('Stock Clusters using KMeans and PCA')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.grid()
plt.show()


In [None]:

# Step 8: Analyze portfolio composition based on clusters
# Average every stock column within each cluster label, then transpose so the
# heatmap shows stocks as rows and clusters as columns.
per_cluster_mean = returns.groupby('Cluster').mean()
cluster_summary = per_cluster_mean.T

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cluster_summary, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Average Returns for Each Cluster')
plt.show()


In [None]:

# Step 9: Construct equal-weighted portfolio for each cluster
# NOTE(review): 'Cluster' labels the rows of `returns` (dates), so Portfolio_k
# is the equal-weighted average of all stock columns on the dates assigned to
# cluster k, not a basket holding a subset of the stocks - confirm this is the
# intended construction.

# Pick the stock columns by name instead of by position, so the result does not
# depend on 'Cluster' happening to be the last column.
stock_returns = returns.drop(columns='Cluster')
equal_weight_return = stock_returns.mean(axis=1)

portfolio_returns = {}
for cluster in range(optimal_clusters):
    in_cluster = returns['Cluster'] == cluster
    # Keep the return on this cluster's dates; every other date becomes NaN.
    portfolio_returns[f'Portfolio_{cluster}'] = equal_weight_return.where(in_cluster)

# Create a DataFrame for portfolio returns
portfolio_df = pd.DataFrame(portfolio_returns)
# mean(axis=1) skips NaN, so this averages whichever portfolios have a value on
# each date. It must be computed before the NaNs are filled below.
portfolio_df['Total_Portfolio'] = portfolio_df.mean(axis=1)

# Calculate cumulative returns
# Dates on which a portfolio has no value are treated as a 0% return, so each
# curve stays flat across them. Compounding the NaNs directly would leave NaN on
# those dates and break the plotted lines into disconnected fragments.
cumulative_returns = (1 + portfolio_df.fillna(0.0)).cumprod()

# Step 10: Plot cumulative returns of each portfolio
fig, ax = plt.subplots(figsize=(12, 8))
for column in cumulative_returns.columns:
    ax.plot(cumulative_returns.index, cumulative_returns[column], label=column)
ax.set_title('Cumulative Returns of Cluster-Based Portfolios')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Return')
ax.legend()
ax.grid()
plt.show()
