In [3]:
import pandas as pd
import numpy as np
from scipy import stats

# Define parameters
BREAK_DATE = '2023-10-23'
cryptos = ['BTC', 'ETH', 'LTC']

def calculate_statistics(returns_series_before, returns_series_after):
    """Calculate all required statistics for returns series including statistical tests."""
    # Clean data by removing NaN values
    clean_before = returns_series_before.dropna()
    clean_after = returns_series_after.dropna()
    
    # Get number of observations
    n_before = len(clean_before)
    n_after = len(clean_after)
    
    # Calculate basic statistics for before period
    before_stats = {
        'Number of Observations': n_before,
        'Mean Return (%)': clean_before.mean() * 100,
        'Median Return (%)': clean_before.median() * 100,
        'Mode Return (%)': stats.mode(clean_before, keepdims=True)[0][0] * 100,
        'Std Dev (%)': clean_before.std() * 100,
        'Variance (%)': clean_before.var() * 100,
        'Skewness': clean_before.skew(),
        'Kurtosis': clean_before.kurtosis(),
        'Q1 (%)': clean_before.quantile(0.25) * 100,
        'Q3 (%)': clean_before.quantile(0.75) * 100,
        'IQR (%)': (clean_before.quantile(0.75) - clean_before.quantile(0.25)) * 100
    }
    
    # Calculate basic statistics for after period
    after_stats = {
        'Number of Observations': n_after,
        'Mean Return (%)': clean_after.mean() * 100,
        'Median Return (%)': clean_after.median() * 100,
        'Mode Return (%)': stats.mode(clean_after, keepdims=True)[0][0] * 100,
        'Std Dev (%)': clean_after.std() * 100,
        'Variance (%)': clean_after.var() * 100,
        'Skewness': clean_after.skew(),
        'Kurtosis': clean_after.kurtosis(),
        'Q1 (%)': clean_after.quantile(0.25) * 100,
        'Q3 (%)': clean_after.quantile(0.75) * 100,
        'IQR (%)': (clean_after.quantile(0.75) - clean_after.quantile(0.25)) * 100
    }
    
    # Perform t-test for means on cleaned data
    t_stat, t_pvalue = stats.ttest_ind(clean_before, clean_after, nan_policy='omit')
    
    # Perform KS test on cleaned data
    ks_stat, ks_pvalue = stats.ks_2samp(clean_before, clean_after)
    
    # Add test results to statistics
    before_stats['T-test Statistic'] = t_stat
    before_stats['T-test p-value'] = t_pvalue
    before_stats['KS-test Statistic'] = ks_stat
    before_stats['KS-test p-value'] = ks_pvalue
    
    after_stats['T-test Statistic'] = t_stat
    after_stats['T-test p-value'] = t_pvalue
    after_stats['KS-test Statistic'] = ks_stat
    after_stats['KS-test p-value'] = ks_pvalue
    
    # Print date ranges
    print(f"\nDate Ranges:")
    print(f"Before period: {returns_series_before.index[0].strftime('%Y-%m-%d')} to {returns_series_before.index[-1].strftime('%Y-%m-%d')}")
    print(f"After period: {returns_series_after.index[0].strftime('%Y-%m-%d')} to {returns_series_after.index[-1].strftime('%Y-%m-%d')}")
    
    return before_stats, after_stats

def analyze_crypto(crypto):
    """Analyze a single cryptocurrency and return its statistics."""
    # Read the CSV file
    filename = f"{crypto}_data.csv"
    df = pd.read_csv(filename)
    
    # Convert Date column to datetime
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    
    # Split data at break point
    before_data = df[df.index < BREAK_DATE]['Returns']
    after_data = df[df.index >= BREAK_DATE]['Returns']
    
    # Calculate statistics for both periods including tests
    before_stats, after_stats = calculate_statistics(before_data, after_data)
    
    # Create DataFrame with the statistics
    stats_df = pd.DataFrame({
        'Statistic': list(before_stats.keys()),
        'Before': list(before_stats.values()),
        'After': list(after_stats.values())
    })
    
    stats_df.set_index('Statistic', inplace=True)
    
    return stats_df

def format_statistics(df):
    """Format the statistics for better readability."""
    # Define which rows should be formatted as percentages
    pct_rows = ['Mean Return (%)', 'Median Return (%)', 'Mode Return (%)', 
                'Std Dev (%)', 'Variance (%)', 'Q1 (%)', 'Q3 (%)', 'IQR (%)']
    
    # Define which rows are test statistics (4 decimal places, no %)
    test_rows = ['T-test Statistic', 'T-test p-value', 'KS-test Statistic', 'KS-test p-value']
    
    # Define integer rows
    int_rows = ['Number of Observations']
    
    # Create a formatted DataFrame
    formatted_df = df.copy()
    
    # Format numbers
    for col in ['Before', 'After']:
        for idx in formatted_df.index:
            value = formatted_df.loc[idx, col]
            if idx in int_rows:
                formatted_df.loc[idx, col] = f"{int(value)}"
            elif idx in pct_rows:
                formatted_df.loc[idx, col] = f"{value:.4f}%"
            elif idx in test_rows:
                formatted_df.loc[idx, col] = f"{value:.4f}"
            else:
                formatted_df.loc[idx, col] = f"{value:.4f}"
    
    return formatted_df

def main():
    for crypto in cryptos:
        print(f"\nAnalyzing {crypto}...")
        
        # Get statistics
        stats_df = analyze_crypto(crypto)
        
        # Format statistics
        formatted_stats = format_statistics(stats_df)
        
        # Save to Excel with proper formatting
        excel_file = f"{crypto}_statistics.xlsx"
        with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
            formatted_stats.to_excel(writer, sheet_name='Statistics')
            
            # Get the workbook and worksheet
            workbook = writer.book
            worksheet = writer.sheets['Statistics']
            
            # Format columns
            worksheet.column_dimensions['A'].width = 20
            worksheet.column_dimensions['B'].width = 15
            worksheet.column_dimensions['C'].width = 15
            
            # Add borders and formatting
            from openpyxl.styles import Border, Side, PatternFill, Font
            
            border = Border(left=Side(style='thin'), right=Side(style='thin'),
                          top=Side(style='thin'), bottom=Side(style='thin'))
            
            for row in worksheet.iter_rows():
                for cell in row:
                    cell.border = border
                    if cell.row == 1:
                        cell.font = Font(bold=True)
        
        # Print the statistics
        print(f"\n{crypto} Statistics:")
        print("=" * 50)
        print(formatted_stats)
        print(f"\nStatistics saved to {excel_file}")
        
        # Add interpretation of test results
        t_pvalue = float(formatted_stats.loc['T-test p-value', 'Before'])
        ks_pvalue = float(formatted_stats.loc['KS-test p-value', 'Before'])
        
        print("\nTest Results Interpretation:")
        print("-" * 25)
        print("T-test (comparing means):")
        print(f"{'Significant difference in means' if t_pvalue < 0.05 else 'No significant difference in means'}")
        print(f"p-value: {t_pvalue:.4f}")
        
        print("\nKS-test (comparing distributions):")
        print(f"{'Significant difference in distributions' if ks_pvalue < 0.05 else 'No significant difference in distributions'}")
        print(f"p-value: {ks_pvalue:.4f}")

if __name__ == "__main__":
    print("Starting statistical analysis...")
    print(f"Break date: {BREAK_DATE}")
    print("-" * 50)
    main()
    print("\nAnalysis completed!")

Starting statistical analysis...
Break date: 2023-10-23
--------------------------------------------------

Analyzing BTC...

Date Ranges:
Before period: 2022-06-01 to 2023-10-22
After period: 2023-10-23 to 2024-12-31



Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '509' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '436' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.




BTC Statistics:
                           Before     After
Statistic                                  
Number of Observations        509       436
Mean Return (%)           0.0255%   0.2982%
Median Return (%)        -0.1050%   0.1451%
Mode Return (%)         -15.9747%  -8.3434%
Std Dev (%)               2.7065%   2.7490%
Variance (%)              0.0733%   0.0756%
Skewness                  -0.3694    0.5194
Kurtosis                   6.2763    2.1175
Q1 (%)                   -1.0182%  -1.1169%
Q3 (%)                    1.1092%   1.5559%
IQR (%)                   2.1274%   2.6728%
T-test Statistic          -1.5332   -1.5332
T-test p-value             0.1256    0.1256
KS-test Statistic          0.1011    0.1011
KS-test p-value            0.0150    0.0150

Statistics saved to BTC_statistics.xlsx

Test Results Interpretation:
-------------------------
T-test (comparing means):
No significant difference in means
p-value: 0.1256

KS-test (comparing distributions):
Significant difference in


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '539' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '436' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '539' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '436' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from scipy import stats
import plotly.figure_factory as ff

# Define parameters
BREAK_DATE = '2023-10-23'
cryptos = ['BTC', 'ETH', 'LTC']
colors = {'Before': '#1f77b4', 'After': '#ff7f0e'}

def clean_data(data):
    """Clean data by removing NaN and infinite values."""
    return data[~np.isinf(data) & ~np.isnan(data)]

def load_and_split_data(crypto):
    """Load crypto data and split into before/after periods with cleaning."""
    try:
        # Read data
        df = pd.read_csv(f"{crypto}_data.csv")
        df['Date'] = pd.to_datetime(df['Date'])
        df.set_index('Date', inplace=True)
        
        # Clean and split data
        returns = clean_data(df['Returns'])
        before_data = returns[returns.index < BREAK_DATE]
        after_data = returns[returns.index >= BREAK_DATE]
        
        return before_data, after_data
    except Exception as e:
        print(f"Error loading {crypto} data: {str(e)}")
        return pd.Series(), pd.Series()

def create_distribution_plots():
    """Create interactive distribution plots using Plotly."""
    fig = make_subplots(
        rows=len(cryptos), cols=1,
        subplot_titles=[f"{crypto} Returns Distribution" for crypto in cryptos],
        vertical_spacing=0.15
    )

    for idx, crypto in enumerate(cryptos):
        before_data, after_data = load_and_split_data(crypto)
        
        if before_data.empty or after_data.empty:
            print(f"Skipping {crypto} due to insufficient data")
            continue
            
        # Add histogram for before period
        fig.add_trace(
            go.Histogram(
                x=before_data,
                name=f'Before ({crypto})',
                nbinsx=50,
                opacity=0.7,
                histnorm='probability density',
                showlegend=True if idx == 0 else False
            ),
            row=idx+1, col=1
        )
        
        # Add histogram for after period
        fig.add_trace(
            go.Histogram(
                x=after_data,
                name=f'After ({crypto})',
                nbinsx=50,
                opacity=0.7,
                histnorm='probability density',
                showlegend=True if idx == 0 else False
            ),
            row=idx+1, col=1
        )
        
        try:
            # Add KDE curves
            for period, data in [('Before', before_data), ('After', after_data)]:
                if len(data) > 1:  # Check if enough data points
                    kde = stats.gaussian_kde(data)
                    x_range = np.linspace(data.min(), data.max(), 100)
                    y_kde = kde(x_range)
                    
                    fig.add_trace(
                        go.Scatter(
                            x=x_range,
                            y=y_kde,
                            name=f'KDE {period} ({crypto})',
                            line=dict(color=colors[period]),
                            showlegend=True if idx == 0 else False
                        ),
                        row=idx+1, col=1
                    )
                    
                    # Add vertical line for mean
                    fig.add_vline(
                        x=data.mean(),
                        line_dash="dash",
                        line_color=colors[period],
                        row=idx+1, col=1
                    )
            
            # Add statistical information as annotations
            if len(before_data) > 1 and len(after_data) > 1:
                stats_text = (
                    f"Before - Mean: {before_data.mean():.4f}, Std: {before_data.std():.4f}<br>"
                    f"After - Mean: {after_data.mean():.4f}, Std: {after_data.std():.4f}<br>"
                    f"KS test p-value: {stats.ks_2samp(before_data, after_data)[1]:.4f}"
                )
                
                fig.add_annotation(
                    text=stats_text,
                    xref="paper", yref="paper",
                    x=0.02, y=0.95,
                    showarrow=False,
                    font=dict(size=10),
                    bgcolor="white",
                    bordercolor="black",
                    borderwidth=1,
                    row=idx+1, col=1
                )
        except Exception as e:
            print(f"Error in KDE calculation for {crypto}: {str(e)}")
            continue

    # Update layout
    fig.update_layout(
        title_text="Cryptocurrency Returns Distribution Analysis",
        height=300 * len(cryptos) + 100,
        width=1000,
        showlegend=True,
        template="plotly_white",
        bargap=0.1
    )

    return fig

def create_qq_plots():
    """Create Q-Q plots using Plotly."""
    fig = make_subplots(
        rows=len(cryptos), cols=2,
        subplot_titles=[f"{crypto} Q-Q Plot (Before)" for crypto in cryptos] +
                      [f"{crypto} Q-Q Plot (After)" for crypto in cryptos],
        vertical_spacing=0.15,
        horizontal_spacing=0.1
    )

    for idx, crypto in enumerate(cryptos):
        before_data, after_data = load_and_split_data(crypto)
        
        if before_data.empty or after_data.empty:
            continue
            
        # Create Q-Q plots for both periods
        for col, (period, data) in enumerate([('Before', before_data), ('After', after_data)]):
            if len(data) > 1:
                theoretical_quantiles = stats.norm.ppf(np.linspace(0.01, 0.99, len(data)))
                sorted_data = np.sort(data)
                
                fig.add_trace(
                    go.Scatter(
                        x=theoretical_quantiles,
                        y=sorted_data,
                        mode='markers',
                        name=f'{crypto} - {period}',
                        marker=dict(color=colors[period]),
                        showlegend=True if idx == 0 else False
                    ),
                    row=idx+1, col=col+1
                )
                
                # Add diagonal line
                fig.add_trace(
                    go.Scatter(
                        x=[min(theoretical_quantiles), max(theoretical_quantiles)],
                        y=[min(theoretical_quantiles), max(theoretical_quantiles)],
                        mode='lines',
                        line=dict(color='red', dash='dash'),
                        showlegend=False
                    ),
                    row=idx+1, col=col+1
                )

    # Update layout
    fig.update_layout(
        title_text="Q-Q Plots for Cryptocurrency Returns",
        height=300 * len(cryptos) + 100,
        width=1200,
        template="plotly_white",
        showlegend=True
    )

    return fig

def create_boxplots():
    """Create interactive boxplots using Plotly."""
    all_data = []
    for crypto in cryptos:
        before_data, after_data = load_and_split_data(crypto)
        
        if not (before_data.empty or after_data.empty):
            all_data.extend([
                pd.DataFrame({
                    'Returns': before_data,
                    'Period': 'Before',
                    'Crypto': crypto
                }),
                pd.DataFrame({
                    'Returns': after_data,
                    'Period': 'After',
                    'Crypto': crypto
                })
            ])
    
    if all_data:
        combined_data = pd.concat(all_data)
        
        # Create boxplot
        fig = px.box(
            combined_data,
            x='Crypto',
            y='Returns',
            color='Period',
            title='Returns Distribution Comparison (Boxplot)',
            template='plotly_white',
            color_discrete_map=colors
        )
        
        # Update layout
        fig.update_layout(
            width=1000,
            height=600,
            boxmode='group'
        )
        
        return fig
    return None

if __name__ == "__main__":
    print("Starting distribution analysis with Plotly...")
    
    # Create and save all plots
    try:
        dist_fig = create_distribution_plots()
        dist_fig.write_html("crypto_distributions_interactive.html")
        print("Distribution plots saved successfully")
    except Exception as e:
        print(f"Error creating distribution plots: {str(e)}")
    
    try:
        qq_fig = create_qq_plots()
        qq_fig.write_html("crypto_qq_plots_interactive.html")
        print("Q-Q plots saved successfully")
    except Exception as e:
        print(f"Error creating Q-Q plots: {str(e)}")
    
    try:
        box_fig = create_boxplots()
        if box_fig:
            box_fig.write_html("crypto_boxplots_interactive.html")
            print("Boxplots saved successfully")
    except Exception as e:
        print(f"Error creating boxplots: {str(e)}")
    
    print("\nAnalysis completed!")

Starting distribution analysis with Plotly...
Distribution plots saved successfully
Q-Q plots saved successfully
Boxplots saved successfully

Analysis completed!
