In [None]:
# !pip install psutil matplotlib pandas
import psutil
import time
import threading
import pandas as pd
import matplotlib.pyplot as plt
import os

# Assuming TurboDB is in a file named turbo_tosec_engine.py
# from turbo_tosec_engine import TurboDB 

class IOMonitor:
    """
    Background thread that monitors Process I/O counters using psutil.
    Acts as an oscilloscope for Disk usage.

    Usage: call ``start()``, run the workload, then call ``stop()`` to get a
    pandas DataFrame with one row per sample.

    Attributes:
        interval: Target pause between two samples, in seconds.
        process:  psutil handle of the observed process (the current one).
        running:  True while the sampling loop is supposed to keep going.
        thread:   The sampling thread, or None before the first ``start()``.
        data:     Raw samples (one dict per snapshot) of the current recording.
        start_io: Counter snapshot taken by ``start()`` as a reference point.
        error:    The psutil error that ended sampling early, or None.
    """

    def __init__(self, interval=0.1):
        self.interval = interval
        self.process = psutil.Process(os.getpid())  # Capture current process
        self.running = False
        self.thread = None
        self.data = []
        self.start_io = None
        self.error = None
        # Lets stop() interrupt the pause between samples instead of having
        # to wait for a full interval to elapse.
        self._stop_event = threading.Event()

    def _monitor_loop(self):
        """Main loop running in the background thread."""
        while self.running:
            # Get IO Counters (Read/Write Count and Bytes)
            try:
                io_counters = self.process.io_counters()
            except psutil.Error as exc:
                # The process handle became unusable (gone, access denied...).
                # Remember why, so stop() can report it instead of the
                # recording just ending silently.
                self.error = exc
                break

            self.data.append({
                'timestamp': time.time(),
                'read_count': io_counters.read_count,    # Cumulative read operations
                'write_count': io_counters.write_count,  # Cumulative write operations
                'read_bytes': io_counters.read_bytes,    # Cumulative bytes read
                'write_bytes': io_counters.write_bytes,  # Cumulative bytes written
            })

            # wait() returns True as soon as stop() sets the event, and False
            # after a normal timeout of `interval` seconds.
            if self._stop_event.wait(self.interval):
                break

    def _halt_thread(self):
        """Signals the sampling thread to finish and waits until it has."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join()

    def start(self):
        """
        Starts the monitoring thread.

        Any recording that is still in progress is stopped first, so calling
        start() twice never leaves two threads appending to the same list.
        Previously collected samples are discarded.
        """
        if self.thread is not None and self.thread.is_alive():
            self._halt_thread()

        # Get a reference point (optional, but good for relative calc)
        self.start_io = self.process.io_counters()
        self.data = []
        self.error = None
        self._stop_event.clear()
        self.running = True
        # Daemon thread: a forgotten stop() (or a crash in the workload) must
        # not keep the interpreter alive at exit.
        self.thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.thread.start()
        print("🔴 I/O Recording Started...")

    def stop(self):
        """
        Stops the thread and returns a processed DataFrame.

        Returns:
            An empty DataFrame if nothing was sampled. Otherwise one row per
            sample with the raw cumulative counters plus:

            * ``time_rel``   - seconds since the first sample
            * ``read_mb`` / ``write_mb``   - MiB transferred since the
              previous sample (per-sample amount, not a rate)
            * ``ops_read`` / ``ops_write`` - operations since the previous
              sample
            * ``read_mb_s`` / ``write_mb_s`` / ``read_iops`` / ``write_iops``
              - the same deltas divided by the measured time between the two
              samples, i.e. true per-second rates
        """
        self._halt_thread()
        print("⚫ I/O Recording Stopped.")
        if self.error is not None:
            print(f"⚠️ Sampling ended early: {self.error}")

        # Convert raw data to Pandas DataFrame
        df = pd.DataFrame(self.data)

        if df.empty:
            return df

        # Calculate Relative Time (0, 0.1, 0.2...)
        df['time_rel'] = df['timestamp'] - df['timestamp'].iloc[0]

        # Calculate Deltas: psutil returns totals since process start, we want
        # the amount per sampling interval. The first row has no predecessor,
        # so its delta is defined as 0.
        bytes_per_mb = 1024 * 1024
        df['read_mb'] = df['read_bytes'].diff().fillna(0) / bytes_per_mb
        df['write_mb'] = df['write_bytes'].diff().fillna(0) / bytes_per_mb
        df['ops_read'] = df['read_count'].diff().fillna(0)
        df['ops_write'] = df['write_count'].diff().fillna(0)

        # Per-second rates. The real gap between samples is `interval` plus
        # sampling overhead, so divide by the measured gap rather than the
        # nominal interval. Non-positive gaps become NaN and end up as rate 0.
        gap = df['timestamp'].diff()
        gap = gap.where(gap > 0)
        for rate_col, delta_col in (
            ('read_mb_s', 'read_mb'),
            ('write_mb_s', 'write_mb'),
            ('read_iops', 'ops_read'),
            ('write_iops', 'ops_write'),
        ):
            df[rate_col] = (df[delta_col] / gap).fillna(0)

        return df

# ==============================================================================
# 🧪 EXPERIMENT EXECUTION
# ==============================================================================

# --- CONFIGURATION ---
# Point this to a real, reasonably large DAT file for valid testing
TEST_FILE_PATH = r"E:\HOME\RetroVault\TOSEC_DATs\Extracted\TOSEC-v2025-03-13\Commodore\C64\Games\Commodore 64 - Games - [DAT] (TOSEC-v2016-11-11).dat"
DB_FILE = "experiment_stats.duckdb"

# --- RUN TEST ---
# Clean start: remove the database left over from a previous run, if any.
# Attempting the removal directly avoids the check-then-delete race.
try:
    os.remove(DB_FILE)
except FileNotFoundError:
    pass

monitor = IOMonitor(interval=0.1)  # Sample every 100ms
# NOTE: TurboDB must already be in scope here (its import near the top of the
# file is commented out).
engine = TurboDB(DB_FILE)

# 1. Start Recording
monitor.start()

# The outer try/finally guarantees that the recorder thread is stopped and
# df_results is bound even if closing the engine fails.
try:
    # 2. Run the Engine (Streaming Mode)
    try:
        print(f"🚀 Processing: {os.path.basename(TEST_FILE_PATH)}")
        start_t = time.time()

        # Ingest the file
        count = engine.ingest_xml_stream(TEST_FILE_PATH)

        end_t = time.time()
        duration = end_t - start_t
        print(f"✅ Operation Completed: {count} ROMs processed.")
        print(f"⏱️ Duration: {duration:.2f} seconds")

    except Exception as e:
        # Report the failure but keep going, so whatever I/O was recorded up
        # to this point can still be inspected.
        print(f"❌ Error: {e}")
    finally:
        engine.close()
finally:
    # 3. Stop Recording
    df_results = monitor.stop()

# ==============================================================================
# 📊 VISUALIZATION & ANALYSIS
# ==============================================================================

if not df_results.empty:
    # The read_mb / write_mb / ops_* columns hold the amount per *sample*,
    # not per second. Divide by the measured gap between samples to get real
    # rates; the first row (no predecessor) and any non-positive gap become
    # NaN and are plotted as 0.
    sample_gap = df_results['time_rel'].diff()
    sample_gap = sample_gap.where(sample_gap > 0)
    rate = {
        col: (df_results[col] / sample_gap).fillna(0)
        for col in ('read_mb', 'write_mb', 'ops_read', 'ops_write')
    }

    fig, ax = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

    # Plot 1: Throughput (MB/s)
    ax[0].plot(df_results['time_rel'], rate['read_mb'], label='Read (MB/s)', color='blue', alpha=0.7)
    ax[0].plot(df_results['time_rel'], rate['write_mb'], label='Write (MB/s)', color='red', alpha=0.7)
    ax[0].set_title('Disk Throughput (MB/s) - Streaming Mode')
    ax[0].set_ylabel('Speed (MB/s)')
    ax[0].legend()
    ax[0].grid(True, linestyle='--', alpha=0.5)

    # Plot 2: IOPS (Operations per Second)
    ax[1].plot(df_results['time_rel'], rate['ops_read'], label='Read Ops (Syscalls)', color='cyan')
    ax[1].plot(df_results['time_rel'], rate['ops_write'], label='Write Ops (Syscalls)', color='orange')
    ax[1].set_title('I/O Operations (IOPS)')
    ax[1].set_xlabel('Time (Seconds)')
    ax[1].set_ylabel('Operations per Second')
    ax[1].legend()
    ax[1].grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

    # Statistical Summary
    # The raw counters are cumulative since the process started, so the
    # totals for this recording are last sample minus first sample, not the
    # maximum counter value.
    counter_cols = ['read_bytes', 'write_bytes', 'read_count', 'write_count']
    totals = df_results[counter_cols].iloc[-1] - df_results[counter_cols].iloc[0]
    bytes_per_mb = 1024 * 1024
    total_written_mb = totals['write_bytes'] / bytes_per_mb

    # Average speed = data written over the recorded time span. Guard against
    # a single-sample recording, whose time span is 0.
    elapsed = df_results['time_rel'].iloc[-1]
    avg_write_speed = total_written_mb / elapsed if elapsed > 0 else 0.0

    print("\n📊 STATISTICAL REPORT")
    print("-" * 30)
    print(f"Total Read      : {totals['read_bytes'] / bytes_per_mb:.2f} MB")
    print(f"Total Written   : {total_written_mb:.2f} MB")
    print(f"Total Read Ops  : {int(totals['read_count'])}")
    print(f"Total Write Ops : {int(totals['write_count'])}")
    print(f"Avg Write Speed : {avg_write_speed:.2f} MB/s")
    print("-" * 30)
else:
    print("⚠️ No data recorded.")