In [12]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

def load_and_validate_data(file_path):
    """
    Load the cryptocurrency data file and check that it is not empty.

    The CSV is read with its 'Date' column as the index, and the index is
    then converted to datetimes.

    Parameters:
        file_path (str): Path to the CSV file

    Returns:
        DataFrame: The loaded data, indexed by datetime

    Raises:
        FileNotFoundError: If no file exists at file_path.
        Exception: For any other failure while reading or validating the
            file, including an empty dataset. The underlying error is
            attached as the cause.
    """
    try:
        data = pd.read_csv(file_path, index_col='Date')
        data.index = pd.to_datetime(data.index)

        if data.empty:
            raise ValueError("The loaded dataset is empty")

        return data

    except FileNotFoundError as err:
        # Re-raise with a friendlier message, explicitly chained to the original.
        raise FileNotFoundError(f"Could not find the data file at {file_path}") from err
    except Exception as err:
        # The empty-dataset ValueError raised above is also wrapped here.
        raise Exception(f"Error loading data: {err}") from err

def calculate_id_persistence(data, start_dates):
    """
    Calculate the percentage of IDs from a start date that persist in subsequent dates.

    For each start date, the non-null values in that row form the cohort.
    Every row dated on or after the start date is then compared with the
    cohort, and the share of cohort IDs still present is recorded.

    Start dates missing from the index, or whose row holds no IDs, are
    skipped with a printed warning and do not appear in either result.

    Parameters:
        data (DataFrame): Rotated cryptocurrency data where dates are rows and ranks are columns
        start_dates (list): List of dates to analyze persistence

    Returns:
        persistence_data (dict): Maps each start date to a list of
            (date, persistence percentage) tuples in index order
        summary_stats (dict): Maps each start date to a dict with keys
            'initial_count', 'min_persistence', 'min_persistence_date'
            and 'final_persistence'
    """
    persistence_data = {}
    summary_stats = {}

    for start_date in start_dates:
        if start_date not in data.index:
            print(f"Warning: Date {start_date} not found in the dataset.")
            continue

        start_ids = set(data.loc[start_date].dropna())
        initial_count = len(start_ids)
        if initial_count == 0:
            # An empty cohort would divide by zero below; skip it instead.
            print(f"Warning: Date {start_date} has no IDs to track.")
            continue

        # Walk the rows once instead of doing a label lookup per date.
        later_rows = data.loc[data.index >= start_date]
        series = [
            (current_date, len(start_ids & set(row.dropna())) / initial_count * 100)
            for current_date, row in later_rows.iterrows()
        ]
        persistence_data[start_date] = series

        # min() returns the first tuple holding the lowest percentage, so the
        # minimum date is always set, even if persistence never drops below 100%.
        min_date, min_persistence = min(series, key=lambda point: point[1])

        summary_stats[start_date] = {
            'initial_count': initial_count,
            'min_persistence': min_persistence,
            'min_persistence_date': min_date,
            'final_persistence': series[-1][1]
        }

    return persistence_data, summary_stats

def plot_persistence_plotly(persistence_data, summary_stats, start_dates):
    """
    Build a line chart with one persistence curve per cohort.

    Parameters:
        persistence_data (dict): Maps each start date to a sequence of
            (date, percentage) pairs, as produced by calculate_id_persistence
        summary_stats (dict): Summary statistics for each start date
            (accepted for interface compatibility; not used when drawing)
        start_dates (list): Start dates to plot, in legend order. Each is
            sliced with [:7] to form the trace name.

    Returns:
        The assembled plotly figure; the caller decides how to display it.
    """
    palette = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)']
    figure = make_subplots(rows=1, cols=1)

    # One trace per cohort. The colour is picked by the cohort's position in
    # start_dates, so a skipped cohort does not shift the colours of the rest.
    for position, cohort in enumerate(start_dates):
        if cohort in persistence_data:
            when, share = zip(*persistence_data[cohort])
            trace = go.Scatter(
                x=pd.to_datetime(when),
                y=share,
                mode='lines',
                name=f"Cohort {cohort[:7]}",
                line=dict(color=palette[position % len(palette)]),
                hovertemplate="Date: %{x|%Y-%m-%d}<br>Persistence: %{y:.1f}%<extra></extra>"
            )
            figure.add_trace(trace)

    # Faint watermark centred on the plotting area.
    watermark_font = dict(size=60, color="rgba(0,0,0,0.07)")
    figure.add_annotation(
        text="_cryptovizart",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=watermark_font,
        textangle=0
    )

    # X axis: no grid or axis line, a tick every six months.
    x_axis = dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        tickformat='%y-%b',
        tickfont=dict(color='black', size=12),
        dtick="M6"
    )

    # Y axis: visible black axis line, fixed 0-100 range.
    y_axis = dict(
        showgrid=False,
        showline=True,
        linecolor='black',
        linewidth=1,
        range=[0, 100],
        tickfont=dict(color='black', size=12),
        title=dict(
            text="Persistence Percentage",
            font=dict(color='black', size=14)
        )
    )

    # Horizontal legend anchored above the top-right corner of the plot.
    legend_box = dict(
        title=dict(
            text="Start Dates",
            font=dict(color='black', size=14)
        ),
        font=dict(color='black', size=12),
        bgcolor='rgba(255, 255, 255, 0.9)',
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )

    figure.update_layout(
        width=1200,   # pixels
        height=600,   # pixels
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis=x_axis,
        yaxis=y_axis,
        legend=legend_box,
        margin=dict(t=100),
        hovermode='x unified'
    )

    return figure

def process_summary_statistics(summary_stats):
    """
    Process and format summary statistics for reporting.

    Parameters:
        summary_stats (dict): Raw summary statistics. Each value is a dict
            with keys 'initial_count', 'min_persistence',
            'min_persistence_date' and 'final_persistence'.

    Returns:
        dict: Processed statistics with the same keys. Percentages are
            formatted to one decimal place with a trailing '%', and the
            minimum date is formatted as YYYY-MM-DD, or 'N/A' when it is None.
    """
    processed_stats = {}
    for date, stats in summary_stats.items():
        min_date = stats['min_persistence_date']
        processed_stats[date] = {
            'initial_count': stats['initial_count'],
            'min_persistence': f"{stats['min_persistence']:.1f}%",
            # The minimum date can be None when no minimum was recorded;
            # report a placeholder instead of failing on None.strftime.
            'min_persistence_date': (
                min_date.strftime('%Y-%m-%d') if min_date is not None else 'N/A'
            ),
            'final_persistence': f"{stats['final_persistence']:.1f}%"
        }
    return processed_stats

def main():
    """
    Run the persistence analysis end to end.

    Loads the CSV, computes persistence for the configured cohorts, shows
    the chart, then prints a short statistics report per cohort. Any error
    raised along the way is caught and printed rather than propagated.
    """
    csv_path = 'rotated_cryptocurrency_ids.csv'
    cohort_dates = ['2017-12-04', '2019-09-19', '2020-12-28']

    try:
        frame = load_and_validate_data(csv_path)
        persistence, raw_stats = calculate_id_persistence(frame, cohort_dates)

        chart = plot_persistence_plotly(persistence, raw_stats, cohort_dates)
        chart.show()

        report = process_summary_statistics(raw_stats)
        for cohort in report:
            entry = report[cohort]
            # Four report lines per cohort, emitted with a single print call.
            print("\n".join([
                f"\nStatistics for {cohort}:",
                f"Initial Count: {entry['initial_count']}",
                f"Minimum Persistence: {entry['min_persistence']} on {entry['min_persistence_date']}",
                f"Final Persistence: {entry['final_persistence']}",
            ]))

    except Exception as exc:
        print(f"Error in analysis: {str(exc)}")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Statistics for 2017-12-04:
Initial Count: 125
Minimum Persistence: 13.6% on 2024-08-27
Final Persistence: 13.6%

Statistics for 2019-09-19:
Initial Count: 125
Minimum Persistence: 24.8% on 2024-10-28
Final Persistence: 25.6%

Statistics for 2020-12-28:
Initial Count: 125
Minimum Persistence: 35.2% on 2024-11-01
Final Persistence: 36.0%
