# Workout Log Analysis Notebook

This notebook analyzes and processes log data from a simulated workout session to extract meaningful insights and visualize workout metrics.

## Import Required Libraries

Import necessary libraries such as pandas, matplotlib, and re for data processing and visualization.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import os
import json

# Set plot styling: select matplotlib's 'ggplot' style sheet, then apply
# seaborn settings with fonts scaled by 1.2.
# NOTE(review): sns.set() presumably also applies seaborn's default theme,
# which may override parts of the 'ggplot' style chosen on the line before it
# - confirm the resulting look is the intended one.
plt.style.use('ggplot')
sns.set(font_scale=1.2)

## Load and Parse Log Data

Load the log data into a structured format (a pandas DataFrame) by parsing each entry's timestamp, log level, component, and message.

In [None]:
def parse_log_file(log_file_path):
    """
    Parse a log file and return a structured DataFrame.

    Each entry is expected to look like::

        YYYY-MM-DD HH:MM:SS,mmm - LEVEL - component - message

    Non-blank lines that do not match this layout (for example traceback
    lines) are appended to the message of the preceding entry, separated by a
    newline. Blank lines are ignored.

    Args:
        log_file_path (str): Path to the log file

    Returns:
        pd.DataFrame: DataFrame with the columns ``timestamp``, ``level``,
        ``component`` and ``message``; an empty DataFrame if the file cannot
        be read or a timestamp cannot be converted.
    """
    # Log pattern: timestamp - level - component - message
    # The component may be a dotted logger name (e.g. "src.ftms.simulator"),
    # so dots are accepted there in addition to word characters.
    log_pattern = re.compile(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}) - (\w+) - ([\w.]+) - (.+)'
    )

    log_entries = []

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform default; undecodable bytes are replaced instead of aborting.
        with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
            for line in file:
                match = log_pattern.match(line)
                if match:
                    timestamp_str, level, component, message = match.groups()
                    log_entries.append({
                        'timestamp': datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S,%f'),
                        'level': level,
                        'component': component,
                        'message': message.strip()
                    })
                    continue

                # This might be a continuation of a previous message. Blank
                # lines are skipped so they do not leave stray newlines at
                # the end of that message.
                continuation = line.strip()
                if continuation and log_entries:
                    log_entries[-1]['message'] += '\n' + continuation
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: impossible timestamp
        # (e.g. month 13); TypeError: path of an unsupported type.
        print(f"Error parsing log file: {e}")
        return pd.DataFrame()

    return pd.DataFrame(log_entries)

# Example usage:
# Ask for the log file location, then parse it if the file is present
log_file_path = input("Enter the path to the log file: ")

if not os.path.exists(log_file_path):
    print(f"Log file not found at: {log_file_path}")
else:
    log_df = parse_log_file(log_file_path)
    if log_df.empty:
        print("No log entries were parsed from the file")
    else:
        print(f"Successfully parsed {len(log_df)} log entries")
        display(log_df.head())

## Filter Simulator Data

Extract simulator-related logs to analyze the data generation process.

In [None]:
def extract_simulator_logs(log_df):
    """
    Extract simulator-related logs from the parsed log DataFrame.

    A row is kept when its component contains "simulator" or its message
    contains "simulator" or "simulated" (both checks are case-insensitive).
    Rows whose component or message is missing simply do not match on that
    field.

    Args:
        log_df (pd.DataFrame): DataFrame with parsed log entries

    Returns:
        pd.DataFrame: Independent copy holding only simulator-related logs
        (empty if the input is empty or nothing matches)
    """
    if log_df.empty:
        print("No log data available")
        return pd.DataFrame()

    # na=False keeps both masks strictly boolean: without it a missing
    # component/message yields NA, which cannot be used for row selection.
    component_match = log_df['component'].str.contains('simulator', case=False, na=False)
    message_match = log_df['message'].str.contains('simulator|simulated', case=False, na=False)

    # Filter logs related to simulator
    simulator_df = log_df[component_match | message_match].copy()

    if simulator_df.empty:
        print("No simulator-related logs found")
    else:
        print(f"Found {len(simulator_df)} simulator-related log entries")

    return simulator_df

# Run the simulator filter only once a non-empty parsed log is available
if 'log_df' not in locals() or log_df.empty:
    print("Please parse a log file first")
else:
    simulator_logs = extract_simulator_logs(log_df)
    if not simulator_logs.empty:
        display(simulator_logs.head())

## Extract Workout Data from Logs

Parse the log entries to extract workout data points and metrics.

In [None]:
def extract_data_points(log_df):
    """
    Extract workout data points from log entries.

    Looks for messages of the form ``Received data: <JSON object>`` and turns
    each JSON object into one row, adding the entry's log timestamp under the
    ``log_timestamp`` key. Payloads that are not valid JSON, or that are valid
    JSON but not an object (e.g. a list or a bare number), are skipped.

    Args:
        log_df (pd.DataFrame): DataFrame with parsed log entries

    Returns:
        pd.DataFrame: DataFrame with structured workout data points (empty if
        none were found)
    """
    data_points = []

    # Look for log entries containing JSON data
    data_pattern = re.compile(r"Received data: (.+)")

    for _, row in log_df.iterrows():
        match = data_pattern.search(row['message'])
        if not match:
            continue

        try:
            data = json.loads(match.group(1).strip())
        except json.JSONDecodeError:
            # Not valid JSON, might be a different format
            continue

        if not isinstance(data, dict):
            # Valid JSON without named fields cannot carry metrics and cannot
            # take the timestamp key below, so it is not a data point.
            continue

        # Add timestamp from log
        data['log_timestamp'] = row['timestamp']
        data_points.append(data)

    if not data_points:
        print("No data points found in logs")
        return pd.DataFrame()

    # Convert to DataFrame
    data_df = pd.DataFrame(data_points)
    print(f"Extracted {len(data_df)} data points from logs")

    return data_df

# Extract data points only once non-empty simulator logs are available
if 'simulator_logs' not in locals() or simulator_logs.empty:
    print("Please extract simulator logs first")
else:
    data_points_df = extract_data_points(simulator_logs)
    if not data_points_df.empty:
        print("\nColumns in data points DataFrame:")
        print(data_points_df.columns.tolist())
        display(data_points_df.head())

## Analyze Data Generation Frequency

Analyze how frequently data points are generated during the workout.

In [None]:
def analyze_data_frequency(data_df, target_interval=1.0):
    """
    Analyze the frequency of data point generation.

    Prints min/max/mean/median of the time between consecutive data points
    and shows two plots: a histogram of those intervals and the intervals
    plotted against the timestamp of the later point of each pair. The input
    DataFrame is not modified.

    Args:
        data_df (pd.DataFrame): DataFrame with extracted data points; must
            contain a datetime ``log_timestamp`` column
        target_interval (float): Expected interval in seconds, drawn as a
            reference line on the second plot (default 1.0)
    """
    if data_df.empty or 'log_timestamp' not in data_df.columns:
        print("No timestamp data available for frequency analysis")
        return

    # Sort by timestamp (sort_values returns a new frame)
    ordered = data_df.sort_values('log_timestamp')

    # Calculate time differences between consecutive data points
    intervals = ordered['log_timestamp'].diff().dt.total_seconds()

    # The first row (and any row next to a missing timestamp) has no
    # interval. Keep one mask so the x and y values of the second plot always
    # have the same length.
    has_interval = intervals.notna().to_numpy()
    time_diffs = intervals[has_interval]

    if time_diffs.empty:
        print("Not enough data points to analyze frequency")
        return

    mean_interval = time_diffs.mean()

    print("\nData Generation Frequency Statistics (seconds):")
    print(f"Min interval: {time_diffs.min():.4f}")
    print(f"Max interval: {time_diffs.max():.4f}")
    print(f"Mean interval: {mean_interval:.4f}")
    print(f"Median interval: {time_diffs.median():.4f}")

    # Plot frequency histogram
    plt.figure(figsize=(10, 5))
    plt.hist(time_diffs, bins=20, alpha=0.7)
    plt.xlabel('Time Between Data Points (seconds)')
    plt.ylabel('Frequency')
    plt.title('Data Generation Frequency Distribution')
    plt.grid(True, alpha=0.3)
    plt.axvline(mean_interval, color='red', linestyle='dashed', linewidth=1)
    plt.text(mean_interval*1.1, plt.ylim()[1]*0.9, f'Mean: {mean_interval:.2f}s', color='red')
    plt.tight_layout()
    plt.show()

    # Plot time intervals over time to see if there's any pattern
    plt.figure(figsize=(12, 5))
    plt.plot(ordered['log_timestamp'][has_interval], time_diffs, '-o', alpha=0.5, markersize=3)
    plt.xlabel('Timestamp')
    plt.ylabel('Time Interval (seconds)')
    plt.title('Data Generation Intervals Over Time')
    plt.grid(True, alpha=0.3)
    plt.axhline(target_interval, color='green', linestyle='dashed', linewidth=1)
    plt.text(plt.xlim()[0], target_interval*1.1, f'Target: {float(target_interval)}s', color='green')
    plt.tight_layout()
    plt.show()

# Frequency analysis needs extracted data points that carry log timestamps
if ('data_points_df' not in locals() or data_points_df.empty
        or 'log_timestamp' not in data_points_df.columns):
    print("Please extract data points with timestamps first")
else:
    analyze_data_frequency(data_points_df)

## Visualize Workout Metrics

Create visualizations for key workout metrics extracted from logs.

In [None]:
def visualize_metrics(data_df):
    """
    Visualize key workout metrics over time.

    Draws one panel per recognized numeric metric column (power, cadence,
    heart rate, speed, distance, calories and their "instantaneous_"/"total_"
    variants), annotates each panel with its mean and maximum, and displays
    summary statistics. The x-axis is ``log_timestamp`` when that column
    exists, otherwise the row index.

    Args:
        data_df (pd.DataFrame): DataFrame with extracted data points
    """
    if data_df.empty:
        print("No data available for visualization")
        return

    # Identify numeric columns for potential metrics
    numeric_cols = data_df.select_dtypes(include=[np.number]).columns.tolist()

    # Common workout metrics to visualize if available
    metric_names = [
        'power', 'instantaneous_power',
        'cadence', 'instantaneous_cadence',
        'heart_rate',
        'speed', 'instantaneous_speed',
        'distance', 'total_distance',
        'calories', 'total_calories'
    ]

    # Filter for available metrics
    available_metrics = [col for col in metric_names if col in numeric_cols]

    if not available_metrics:
        print("No recognized workout metrics found in the data")
        print("Available numeric columns:", numeric_cols)
        return

    print(f"Visualizing {len(available_metrics)} workout metrics: {', '.join(available_metrics)}")

    # Sort by timestamp if available
    if 'log_timestamp' in data_df.columns:
        data_df = data_df.sort_values('log_timestamp')
        x_values = data_df['log_timestamp']
        x_start = x_values.iloc[0]
        x_label = 'Time'
    else:
        # Use row index as x-axis. An Index has no .iloc accessor, so the
        # first position is read with plain positional indexing.
        x_values = data_df.index
        x_start = x_values[0]
        x_label = 'Data Point Index'

    # Create a multi-panel figure for each metric. squeeze=False always
    # returns a 2-D array of axes, so a single metric needs no special case.
    _, axes_grid = plt.subplots(
        len(available_metrics), 1,
        figsize=(12, 4*len(available_metrics)),
        sharex=True, squeeze=False,
    )
    axes = axes_grid[:, 0]

    for ax, metric in zip(axes, available_metrics):
        values = data_df[metric]
        title = metric.replace('_', ' ').title()

        ax.plot(x_values, values, '-o', alpha=0.7, markersize=3)
        ax.set_ylabel(title)
        ax.set_title(f'{title} Over Time')
        ax.grid(True, alpha=0.3)

        # Add some statistics (skipped when the column holds no values at
        # all, since mean/max would be NaN and cannot be positioned)
        if values.notna().any():
            mean_val = values.mean()
            max_val = values.max()
            ax.axhline(mean_val, color='red', linestyle='dashed', linewidth=1)
            ax.text(x_start, mean_val*1.1, f'Mean: {mean_val:.1f}', color='red')
            ax.text(x_start, max_val*0.9, f'Max: {max_val:.1f}', color='blue')

    # Set common x-axis label
    axes[-1].set_xlabel(x_label)
    plt.tight_layout()
    plt.show()

    # Print summary statistics for each metric
    print("\nSummary Statistics for Workout Metrics:")
    display(data_df[available_metrics].describe())

# Plot the metrics only when data points have been extracted
if 'data_points_df' not in locals() or data_points_df.empty:
    print("Please extract data points first")
else:
    visualize_metrics(data_points_df)

## Detect Workout Start and End

Identify the start and end of workout sessions from the logs.

In [None]:
def detect_workout_sessions(log_df):
    """
    Detect workout start and end events in the logs.

    Every message is lower-cased and searched for a set of start phrases and
    a set of end phrases. The matching events are displayed, and start/end
    events are then paired in order to list sessions with their durations.

    Args:
        log_df (pd.DataFrame): DataFrame with parsed log entries
    """
    if log_df.empty:
        print("No log data available")
        return

    # Phrases announcing the beginning of a workout
    start_matchers = [re.compile(expr) for expr in (
        r"workout started",
        r"starting workout",
        r"begin(ning)? workout",
        r"workout session (started|beginning)",
        r"new workout (created|initiated)",
    )]

    # Phrases announcing the end of a workout
    end_matchers = [re.compile(expr) for expr in (
        r"workout (ended|finished|completed)",
        r"ending workout",
        r"stopping workout",
        r"workout session (ended|finished|completed)",
        r"workout (saved|recorded)",
    )]

    def _mentions(text, matchers):
        # True when at least one phrase occurs anywhere in the text
        return any(matcher.search(text) for matcher in matchers)

    def _as_event(entry):
        # Keep the original (not lower-cased) message for display
        return {
            'timestamp': entry['timestamp'],
            'message': entry['message']
        }

    # Collect start and end events in log order
    start_events, end_events = [], []
    for _, entry in log_df.iterrows():
        lowered = entry['message'].lower()
        if _mentions(lowered, start_matchers):
            start_events.append(_as_event(entry))
        if _mentions(lowered, end_matchers):
            end_events.append(_as_event(entry))

    print(f"Found {len(start_events)} workout start events and {len(end_events)} end events")

    if start_events:
        print("\nWorkout Start Events:")
        display(pd.DataFrame(start_events))

    if end_events:
        print("\nWorkout End Events:")
        display(pd.DataFrame(end_events))

    # Durations can only be computed when both kinds of event exist
    if not (start_events and end_events):
        return

    print("\nWorkout Sessions:")
    sessions = []

    # Simple in-order pairing - may need refinement for complex logs
    start_pos = 0
    end_pos = 0
    while start_pos < len(start_events) and end_pos < len(end_events):
        began = start_events[start_pos]['timestamp']
        finished = end_events[end_pos]['timestamp']

        # The end event is consumed either way: it closes the current start
        # or it has no matching start and is dropped.
        end_pos += 1

        if finished > began:
            duration = (finished - began).total_seconds()
            sessions.append({
                'start_time': began,
                'end_time': finished,
                'duration_seconds': duration,
                'duration_formatted': f"{int(duration//60)}:{int(duration%60):02d}"
            })
            start_pos += 1

    if not sessions:
        print("No clear workout sessions identified")
        return

    display(pd.DataFrame(sessions))

# Session detection works on the full parsed log
if 'log_df' not in locals() or log_df.empty:
    print("Please parse a log file first")
else:
    detect_workout_sessions(log_df)

## Analyze Error and Warning Logs

Review error and warning messages to identify potential issues.

In [None]:
def analyze_errors_and_warnings(log_df):
    """
    Analyze error and warning messages in the logs.

    Reports how many entries have level ERROR and WARNING, displays both
    groups, and for errors also shows a per-component count as a table and a
    bar chart.

    Args:
        log_df (pd.DataFrame): DataFrame with parsed log entries
    """
    if log_df.empty:
        print("No log data available")
        return

    # Split out the two severities of interest
    levels = log_df['level']
    error_logs = log_df[levels == 'ERROR']
    warning_logs = log_df[levels == 'WARNING']

    print(f"Found {len(error_logs)} ERROR logs and {len(warning_logs)} WARNING logs")

    shown_columns = ['timestamp', 'component', 'message']
    has_errors = not error_logs.empty

    # Errors first, then warnings
    if has_errors:
        print("\nERROR Messages:")
        display(error_logs[shown_columns])

    if not warning_logs.empty:
        print("\nWARNING Messages:")
        display(warning_logs[shown_columns])

    # The per-component breakdown only applies when errors exist
    if not has_errors:
        return

    print("\nErrors by Component:")
    error_counts = error_logs['component'].value_counts()
    display(error_counts)

    # Bar chart of the error distribution by component
    plt.figure(figsize=(10, 5))
    error_counts.plot(kind='bar')
    plt.title('Error Count by Component')
    plt.xlabel('Component')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

# Error/warning review works on the full parsed log
if 'log_df' not in locals() or log_df.empty:
    print("Please parse a log file first")
else:
    analyze_errors_and_warnings(log_df)

## Compare with Database

Compare log data with what's stored in the database to verify data flow.

In [None]:
def compare_with_database(db_path=None):
    """
    Connect to the database and compare log data with database records.

    Displays the five most recent rows of the ``workouts`` table and the
    ``workout_data`` rows of the most recent workout. If a non-empty global
    ``data_points_df`` exists, the number of data points extracted from the
    log is printed next to the number stored in the database.

    Any failure is reported with a printed message instead of being raised.

    Args:
        db_path (str, optional): Path to the SQLite database file. When
            omitted, a few default locations are tried: the current working
            directory, ``../src/data`` relative to it, and a fixed
            ``E:\\rogue_garmin_bridge\\src\\data`` directory.
    """
    try:
        import sqlite3
        from contextlib import closing

        if db_path is not None:
            possible_db_paths = [db_path]
        else:
            # Locate the database - could be in various locations
            possible_db_paths = [
                os.path.join(os.getcwd(), 'rogue_garmin.db'),
                os.path.join(os.getcwd(), '..', 'src', 'data', 'rogue_garmin.db'),
                os.path.join('E:\\', 'rogue_garmin_bridge', 'src', 'data', 'rogue_garmin.db')
            ]

        # Only connect to a file that already exists: sqlite3.connect() would
        # otherwise create a new, empty database at that path.
        db_path = next((path for path in possible_db_paths if os.path.exists(path)), None)

        if not db_path:
            print("Database not found. Please specify the correct path.")
            return

        print(f"Connecting to database at: {db_path}")

        # closing() guarantees the connection is released even when one of
        # the queries below raises.
        with closing(sqlite3.connect(db_path)) as conn:
            # Get recent workouts
            workouts_df = pd.read_sql_query(
                "SELECT * FROM workouts ORDER BY id DESC LIMIT 5",
                conn
            )

            if workouts_df.empty:
                print("No workouts found in the database")
                return

            print(f"Found {len(workouts_df)} recent workouts in the database")
            display(workouts_df)

            # Get the most recent workout
            most_recent_workout_id = workouts_df.iloc[0]['id']
            if hasattr(most_recent_workout_id, 'item'):
                # Convert a NumPy scalar to the plain Python value that
                # sqlite3 can bind as a query parameter.
                most_recent_workout_id = most_recent_workout_id.item()

            # Get data points for this workout (parameterized rather than
            # formatted into the SQL text)
            data_points_db = pd.read_sql_query(
                "SELECT * FROM workout_data WHERE workout_id = ? ORDER BY timestamp",
                conn,
                params=(most_recent_workout_id,)
            )

        print(f"Found {len(data_points_db)} data points for workout ID {most_recent_workout_id}")

        if not data_points_db.empty:
            display(data_points_db.head())

            # Compare with log data if available
            log_points = globals().get('data_points_df')
            if log_points is not None and not log_points.empty:
                print("\nComparison between log data and database data:")
                print(f"Log data points: {len(log_points)}")
                print(f"Database data points: {len(data_points_db)}")

                # More detailed comparison could be added here

    except Exception as e:
        # Deliberate best-effort boundary: this optional check reports
        # problems instead of interrupting the notebook.
        print(f"Error comparing with database: {e}")

# Uncomment to run the database comparison
# compare_with_database()

## Conclusion and Recommendations

Summarize findings and provide recommendations based on the log analysis.

### Key Findings

Based on the log analysis, here are the key findings:

1. **Data Generation Frequency**: The simulator appears to be generating data points at regular intervals (ideally every second)
2. **Metrics Coverage**: The data includes essential workout metrics like power, cadence, distance, and calories
3. **Workout Sessions**: The logs show clear workout start and end events
4. **Error Analysis**: ERROR and WARNING entries are listed by the analysis cell above, with errors counted per component; any that appear should be investigated

### Recommendations

1. **Data Consistency**: Ensure that timestamps are unique to prevent database collisions
2. **Error Handling**: Address any recurring errors or warnings in the logs
3. **Data Flow**: Verify that all data points from the simulator are correctly stored in the database
4. **Performance**: Monitor the time between data generation and storage to ensure real-time processing