In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist
from matplotlib.gridspec import GridSpec
import matplotlib.cm as cm
from sklearn.decomposition import PCA

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:

# Directory that holds the input workbooks. Keeping it in one constant means
# the notebook can be pointed at another location by editing a single line.
DATA_DIR = "C:/Users/wkaco/Desktop"

# Load both workbooks with pandas' default read_excel settings.
X = pd.read_excel(f"{DATA_DIR}/Assets_data.xlsx")
macro_data = pd.read_excel(f"{DATA_DIR}/CPIGDP.xlsx")

In [12]:
def prepare_macro_data(df):
    """
    Prepare macro data by ensuring correct types and handling categorical variables.

    The conversions are applied to a copy, so the frame passed in is never
    modified and the function can safely be called repeatedly on the same
    input.

    Parameters:
    df (pd.DataFrame): Raw macro data

    Returns:
    pd.DataFrame: Copy of ``df`` in which every expected numeric column that
        is present has been passed through ``pd.to_numeric`` (unparseable
        entries become NaN) and every expected categorical column that is
        present has been cast to ``str``
    list: Numeric feature names (the full expected list, whether or not each
        column exists in ``df``)
    list: Categorical feature names (likewise the full expected list)
    """
    # Define expected features and their types
    numeric_features = [
        'GDP', 'Inflation', 'Unemployment', 'Debt_Change',
        'Credit_Spread', 'M2', '2y10y', '10y30y',
        'Duration_Premium', '3m30y', 'Banking_Reserves',
        'Feds_Total_Assets'
    ]

    categorical_features = ['Yield_Curve_Regime']

    # Work on a copy: assigning converted columns onto the argument itself
    # would silently rewrite the caller's DataFrame.
    prepared = df.copy()

    # Coerce numeric features; errors='coerce' turns bad entries into NaN
    for feature in numeric_features:
        if feature in prepared.columns:
            prepared[feature] = pd.to_numeric(prepared[feature], errors='coerce')

    # Ensure categorical features are string type
    for feature in categorical_features:
        if feature in prepared.columns:
            prepared[feature] = prepared[feature].astype(str)

    return prepared, numeric_features, categorical_features

def plot_gmm_analysis(macro_data, X, cluster_labels, gmm_model, scaler):
    """Create comprehensive visualizations for GMM clustering analysis using matplotlib.

    Draws, in one figure: a 2-component PCA scatter of the scaled macro
    features coloured by cluster, the cluster label plotted against the
    frame's index, a bar chart of per-cluster feature means, and one annotated
    correlation heat-map per cluster. It then calls
    ``plot_feature_distributions`` and ``plot_regime_analysis`` for the
    follow-up figures.

    Parameters:
    macro_data (pd.DataFrame): Macro data; passed through ``prepare_macro_data``
    X: Accepted for interface compatibility; not used by this function
    cluster_labels (array-like): One cluster id per row of ``macro_data``
    gmm_model: Accepted for interface compatibility; not used by this function
    scaler: Object exposing ``transform`` (presumably already fitted on the
        same numeric features - confirm against the caller)

    Returns:
    None
    """
    # Prepare data
    macro_data, numeric_features, _ = prepare_macro_data(macro_data)

    # A plain array makes the boolean masks below positional, and the set of
    # ids actually present is iterated directly instead of assuming the
    # labels are exactly 0..k-1.
    cluster_labels = np.asarray(cluster_labels)
    cluster_ids = np.unique(cluster_labels)
    n_features = len(numeric_features)

    # Three fixed panels plus one heat-map row per cluster. Never fewer than
    # the original 8 rows, so layouts with up to 5 clusters are unchanged,
    # while larger cluster counts no longer index past the grid.
    n_rows = max(8, 3 + len(cluster_ids))
    fig = plt.figure(figsize=(20, 35 * n_rows / 8))
    gs = GridSpec(n_rows, 2, figure=fig)

    # 1. Plot first two principal components of macro features
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(scaler.transform(macro_data[numeric_features]))

    ax1 = fig.add_subplot(gs[0, :])
    scatter = ax1.scatter(pca_result[:, 0], pca_result[:, 1],
                          c=cluster_labels + 1,
                          cmap='viridis',
                          s=100)
    ax1.set_xlabel('First Principal Component', fontsize=12)
    ax1.set_ylabel('Second Principal Component', fontsize=12)
    ax1.set_title('Economic Regimes: PCA Visualization', fontsize=14)
    ax1.grid(True)
    plt.colorbar(scatter, ax=ax1, label='Cluster')

    # 2. Plot clusters over time
    ax2 = fig.add_subplot(gs[1, :])
    ax2.plot(macro_data.index, cluster_labels + 1, 'o-', linewidth=2)
    ax2.set_xlabel('Date', fontsize=12)
    ax2.set_ylabel('Cluster', fontsize=12)
    ax2.set_title('Economic Regime Evolution Over Time', fontsize=14)
    ax2.grid(True)

    # 3. Plot mean macro features by cluster
    cluster_stats = pd.DataFrame({
        f'Cluster {cluster+1}': macro_data.loc[cluster_labels == cluster, numeric_features].mean()
        for cluster in cluster_ids
    })

    ax3 = fig.add_subplot(gs[2, :])
    cluster_stats.T.plot(kind='bar', ax=ax3)
    ax3.set_title('Mean Macro Features by Cluster', fontsize=14)
    ax3.set_xlabel('Cluster', fontsize=12)
    ax3.set_ylabel('Value', fontsize=12)
    ax3.tick_params(axis='x', rotation=45)
    ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 4. Plot feature correlations within clusters
    for row_offset, cluster in enumerate(cluster_ids):
        ax = fig.add_subplot(gs[3 + row_offset, :])
        corr_matrix = macro_data.loc[cluster_labels == cluster, numeric_features].corr()

        im = ax.imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
        plt.colorbar(im, ax=ax)

        ax.set_xticks(np.arange(n_features))
        ax.set_yticks(np.arange(n_features))
        ax.set_xticklabels(numeric_features, rotation=45, ha='right')
        ax.set_yticklabels(numeric_features)

        # Annotate every cell; the indices are named so they cannot clobber
        # the row counter of the enclosing loop.
        for r in range(n_features):
            for c in range(n_features):
                ax.text(c, r, f'{corr_matrix.iloc[r, c]:.2f}',
                        ha="center", va="center", color="black")

        ax.set_title(f'Feature Correlations - Cluster {cluster+1}', fontsize=14)

    plt.tight_layout()
    plt.show()

    # 5. Feature distributions by cluster
    plot_feature_distributions(macro_data, numeric_features, cluster_labels)

    # 6. New: Yield Curve Regime analysis by cluster
    plot_regime_analysis(macro_data, cluster_labels)

def plot_feature_distributions(macro_data, numeric_features, cluster_labels):
    """Plot distributions for all numeric features by cluster.

    One stacked panel is drawn per feature; within each panel a
    density-normalised histogram is overlaid for every cluster id present in
    ``cluster_labels``.

    Parameters:
    macro_data (pd.DataFrame): Frame containing the columns in ``numeric_features``
    numeric_features (sequence of str): Columns to plot, one panel each
    cluster_labels (array-like): One cluster id per row of ``macro_data``

    Returns:
    None
    """
    n_features = len(numeric_features)
    if n_features == 0:
        # plt.subplots rejects a grid with zero rows; there is nothing to draw.
        return

    # Positional boolean masks; iterate the ids that actually occur rather
    # than assuming they are exactly 0..k-1.
    cluster_labels = np.asarray(cluster_labels)
    cluster_ids = np.unique(cluster_labels)

    # squeeze=False always yields a 2-D array of Axes, so a single feature
    # no longer returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(n_features, 1, figsize=(15, 5*n_features), squeeze=False)
    fig.suptitle('Feature Distributions by Cluster', fontsize=16)

    for ax, feature in zip(axes[:, 0], numeric_features):
        for position, cluster in enumerate(cluster_ids):
            # Numeric coercion upstream can leave NaN; drop them explicitly
            # rather than relying on the plotting backend's NaN handling.
            feature_data = macro_data.loc[cluster_labels == cluster, feature].dropna()
            if feature_data.empty:
                continue
            # Explicit cycle colour keeps each cluster's colour stable across
            # panels even when a cluster is skipped for one feature.
            ax.hist(feature_data, bins=20, alpha=0.5,
                    label=f'Cluster {cluster+1}', density=True,
                    color=f'C{position % 10}')
        ax.set_title(f'{feature} Distribution')
        ax.set_xlabel(feature)
        ax.set_ylabel('Density')
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

def plot_regime_analysis(macro_data, cluster_labels):
    """Plot analysis of Yield Curve Regime distribution within each cluster.

    Draws a grouped bar chart with one group per cluster and one bar per
    distinct ``Yield_Curve_Regime`` value, showing how many rows of that
    cluster carry each regime.

    Parameters:
    macro_data (pd.DataFrame): Frame containing a ``Yield_Curve_Regime`` column
    cluster_labels (array-like): One cluster id per row of ``macro_data``

    Returns:
    None
    """
    # Positional boolean masks; iterate the ids that actually occur rather
    # than assuming they are exactly 0..k-1.
    cluster_labels = np.asarray(cluster_labels)

    # Key each value_counts Series by its cluster name. Building the frame
    # from a bare list would label every row with the Series' own name, so
    # all bar groups would share one tick label.
    regime_counts = {
        f'Cluster {cluster+1}':
            macro_data.loc[cluster_labels == cluster, 'Yield_Curve_Regime'].value_counts()
        for cluster in np.unique(cluster_labels)
    }

    # Rows = clusters, columns = regimes; a regime absent from a cluster is 0.
    regime_df = pd.DataFrame(regime_counts).T.fillna(0)

    fig, ax = plt.subplots(figsize=(15, 8))
    regime_df.plot(kind='bar', ax=ax)

    ax.set_title('Yield Curve Regime Distribution by Cluster', fontsize=14)
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Count')
    ax.legend(title='Regime', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

def create_cluster_summary(macro_data, cluster_labels):
    """Create a comprehensive summary of each cluster's characteristics.

    Parameters:
    macro_data (pd.DataFrame): Macro data; passed through ``prepare_macro_data``
    cluster_labels (array-like): One cluster id per row of ``macro_data``

    Returns:
    pd.DataFrame: One column per cluster id present in ``cluster_labels``
        (named ``Cluster_<id+1>``). Rows are ``<feature>_Mean/_Std/_Min/_Max``
        for each numeric feature, followed by ``<feature>_Mode`` and
        ``<feature>_Mode_Freq`` for each categorical feature.
    """
    macro_data, numeric_features, categorical_features = prepare_macro_data(macro_data)

    # Positional boolean masks; iterate the ids that actually occur. Assuming
    # 0..k-1 produces an empty slice whenever an id is skipped, and the mode
    # lookup below then fails on it.
    cluster_labels = np.asarray(cluster_labels)

    summary_columns = {}
    for cluster in np.unique(cluster_labels):
        cluster_data = macro_data.loc[cluster_labels == cluster]

        # Numeric features summary
        stats = {}
        for feature in numeric_features:
            values = cluster_data[feature]
            stats.update({
                f'{feature}_Mean': values.mean(),
                f'{feature}_Std': values.std(),
                f'{feature}_Min': values.min(),
                f'{feature}_Max': values.max()
            })

        # Categorical features summary
        for feature in categorical_features:
            modes = cluster_data[feature].mode()
            if modes.empty:
                # No usable values in this slice: report missing, don't raise.
                mode_value, mode_freq = np.nan, np.nan
            else:
                mode_value = modes.iloc[0]
                mode_freq = (cluster_data[feature] == mode_value).mean()
            stats.update({
                f'{feature}_Mode': mode_value,
                f'{feature}_Mode_Freq': mode_freq
            })

        summary_columns[f'Cluster_{cluster+1}'] = pd.Series(stats)

    # Assemble once instead of inserting columns into an empty frame.
    return pd.DataFrame(summary_columns)

def analyze_gmm_clusters(macro_data, asset_returns, gmm_model, cluster_labels, scaler):
    """Analyze asset returns across different GMM clusters.

    Parameters:
    macro_data (pd.DataFrame): Macro data; passed through ``prepare_macro_data``
    asset_returns (pd.DataFrame): Asset return series, one column per asset.
        Not modified; a copy is re-indexed internally.
    gmm_model: Object exposing ``n_components``, the number of clusters to report
    cluster_labels (array-like): One cluster id per row of ``macro_data``
    scaler: Accepted for interface compatibility; not used by this function

    Returns:
    pd.DataFrame: Per-cluster statistics with columns
        ``Cluster_<k>_Mean``, ``Cluster_<k>_Vol`` and ``Cluster_<k>_Sharpe``.
        Rows are the asset columns followed by the numeric macro features;
        macro rows carry a value only in the ``_Mean`` columns.
    pd.DataFrame: ``asset_returns`` left-joined with the labelled macro data
        on their datetime-converted indexes
    """
    macro_data, numeric_features, _ = prepare_macro_data(macro_data)

    # Add cluster labels
    macro_data = macro_data.copy()
    macro_data['Cluster'] = cluster_labels

    # Ensure dates are datetime. The asset frame is copied first so the
    # caller's index is not overwritten as a side effect.
    # NOTE(review): both indexes are assumed to already hold date-like values;
    # confirm how the frames are loaded upstream.
    macro_data.index = pd.to_datetime(macro_data.index)
    asset_returns = asset_returns.copy()
    asset_returns.index = pd.to_datetime(asset_returns.index)
    asset_columns = list(asset_returns.columns)

    # Merge data
    combined_data = asset_returns.merge(
        macro_data,
        left_index=True,
        right_index=True,
        how='left'
    )

    # Calculate statistics. Columns are collected first and assembled once
    # at the end rather than enlarging the frame cell by cell.
    stat_columns = {}
    for cluster in range(gmm_model.n_components):
        cluster_data = combined_data[combined_data['Cluster'] == cluster]
        prefix = f'Cluster_{cluster+1}'

        # Asset returns statistics
        mean_returns = cluster_data[asset_columns].mean()
        volatility = cluster_data[asset_columns].std()
        macro_means = cluster_data[numeric_features].mean()

        # The _Mean column carries both asset means and macro feature means.
        stat_columns[f'{prefix}_Mean'] = pd.concat([mean_returns, macro_means])
        stat_columns[f'{prefix}_Vol'] = volatility
        # Mean over volatility, with no risk-free adjustment.
        stat_columns[f'{prefix}_Sharpe'] = mean_returns / volatility

    # Explicit row order: assets first, then macro features.
    row_order = asset_columns + list(numeric_features)
    cluster_stats = pd.DataFrame(stat_columns, index=row_order)

    return cluster_stats, combined_data

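# Example usage (kept as an inert string; it relies on a `fit_optimal_gmm` helper that must be defined elsewhere before it can run):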
"""
# Prepare data
macro_data, numeric_features, categorical_features = prepare_macro_data(macro_data)

# Fit GMM with optimal number of clusters (e.g., 3)
gmm_model, cluster_labels, scaler = fit_optimal_gmm(macro_data[numeric_features], 3)

# Create visualizations and analysis
plot_gmm_analysis(macro_data, X, cluster_labels, gmm_model, scaler)
cluster_stats, combined_data = analyze_gmm_clusters(macro_data, X, gmm_model, cluster_labels, scaler)
cluster_summary = create_cluster_summary(macro_data, cluster_labels)

# Print results
print("\nCluster Summary:")
print(cluster_summary)
print("\nCluster Statistics:")
print(cluster_stats)
"""

'\n# Prepare data\nmacro_data, numeric_features, categorical_features = prepare_macro_data(macro_data)\n\n# Fit GMM with optimal number of clusters (e.g., 3)\ngmm_model, cluster_labels, scaler = fit_optimal_gmm(macro_data[numeric_features], 3)\n\n# Create visualizations and analysis\nplot_gmm_analysis(macro_data, X, cluster_labels, gmm_model, scaler)\ncluster_stats, combined_data = analyze_gmm_clusters(macro_data, X, gmm_model, cluster_labels, scaler)\ncluster_summary = create_cluster_summary(macro_data, cluster_labels)\n\n# Print results\nprint("\nCluster Summary:")\nprint(cluster_summary)\nprint("\nCluster Statistics:")\nprint(cluster_stats)\n'

In [13]:
# Run the type-preparation step on the loaded macro frame and keep the
# returned frame together with the numeric and categorical feature-name lists.
macro_data, numeric_features, categorical_features = prepare_macro_data(macro_data)

In [14]:
# Rebind numeric_features to the labels of every numeric-dtype column in
# macro_data. The result is a pandas Index, not a list.
numeric_features = macro_data.select_dtypes(include=[np.number]).columns

In [15]:
# NOTE(review): analyze_clustering is not defined in any cell shown in this
# notebook, and the recorded output for this cell is a NameError. Define or
# import it before running this cell.
analyze_clustering(macro_data)

NameError: name 'analyze_clustering' is not defined

NameError: name 'fit_optimal_gmm' is not defined