# AlgoSpace Data Preparation for Google Colab

This notebook handles data preparation and preprocessing for MARL training in Google Colab.

## Key Features:
- Download and process historical market data
- Generate training matrices for each agent
- Create train/validation/test splits
- Optimize data format for Colab training
- Upload processed data to Google Drive

## Data Sources:
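- Yahoo Finance via `yfinance` (for demonstration; the only source this notebook actually downloads from)
- Alpha Vantage API (optional; not implemented in this notebook)
- Your own CSV/Parquet files (not implemented in this notebook; load them into the same per-symbol DataFrame layout used below)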

## 1. Environment Setup

In [None]:
# Environment detection and storage layout.
# `os` is imported unconditionally because later cells call os.path.* in both
# environments (previously it was only imported on the non-Colab branch).
import os

# Check if running in Colab: the import is only a presence probe.
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
    print("✅ Running in Google Colab")
except ImportError:
    IN_COLAB = False
    print("⚠️ Not running in Google Colab")

# Pick the storage root: the mounted Google Drive in Colab, a local stand-in otherwise.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_BASE = "/content/drive/MyDrive/AlgoSpace"
else:
    DRIVE_BASE = "./drive_simulation"

# Create the same raw/processed/compressed layout in both environments
# (pure Python instead of a shell `mkdir`, so it does not depend on brace expansion).
for _subdir in ("raw", "processed", "compressed"):
    os.makedirs(os.path.join(DRIVE_BASE, "data", _subdir), exist_ok=True)

In [None]:
# Install required packages
!pip install -q yfinance pandas numpy h5py
!pip install -q ta pandas-ta
!pip install -q scikit-learn tqdm
!pip install -q pyarrow  # For parquet support

print("✅ Packages installed")

In [None]:
# Import libraries
import json
import os
import warnings
import zipfile
from datetime import datetime, timedelta

import h5py
import numpy as np
import pandas as pd
import ta
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from tqdm import tqdm

warnings.filterwarnings('ignore')

print("✅ Libraries imported")

## 2. Data Configuration

In [None]:
# Data configuration
# Symbols are grouped by asset class and flattened, in this order, into DATA_CONFIG['symbols'].
# NOTE(review): 'VIX' may not resolve on Yahoo Finance (the index is usually listed as '^VIX') — confirm.
_SYMBOL_GROUPS = (
    ['SPY', 'QQQ', 'IWM', 'DIA'],        # Major indices
    ['GLD', 'SLV', 'USO', 'UNG'],        # Commodities
    ['TLT', 'IEF', 'SHY', 'AGG'],        # Bonds
    ['VIX', 'VXX', 'UVXY'],              # Volatility
    ['AAPL', 'MSFT', 'GOOGL', 'AMZN'],   # Tech stocks
    ['JPM', 'BAC', 'GS', 'MS'],          # Financial stocks
)

DATA_CONFIG = dict(
    # Market symbols to download
    symbols=[ticker for group in _SYMBOL_GROUPS for ticker in group],

    # Date range
    start_date='2018-01-01',
    end_date='2023-12-31',

    # Data splits (fractions of the chronologically ordered samples)
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,

    # Feature engineering
    lookback_periods=[5, 10, 20, 50, 100, 200],
    technical_indicators=True,
    market_microstructure=True,

    # Matrix generation
    correlation_window=20,
    volume_profile_bins=10,
    orderbook_levels=5,

    # Normalization: 'standard' or 'robust'
    normalization_method='robust',
    clip_outliers=True,
    outlier_threshold=5,
)

# Echo the headline settings so the run is self-describing.
_split_text = "/".join(
    str(DATA_CONFIG[key]) for key in ('train_ratio', 'val_ratio', 'test_ratio')
)
print("📋 Data Configuration:")
print(f"- Symbols: {len(DATA_CONFIG['symbols'])} assets")
print(f"- Date Range: {DATA_CONFIG['start_date']} to {DATA_CONFIG['end_date']}")
print(f"- Train/Val/Test Split: {_split_text}")

## 3. Download Market Data

In [None]:
# Download market data
def download_market_data(symbols, start_date, end_date):
    """Fetch per-symbol price history from Yahoo Finance.

    Args:
        symbols: Iterable (with a length) of ticker strings.
        start_date: Start of the range, passed to ``Ticker.history(start=...)``.
        end_date: End of the range, passed to ``Ticker.history(end=...)``.

    Returns:
        dict mapping each successfully downloaded symbol to its history frame,
        with an added ``'Symbol'`` column. Symbols that return no rows or raise
        are left out and listed in the printed report.
    """
    print(f"📥 Downloading data for {len(symbols)} symbols...")

    frames_by_symbol = {}
    unavailable = []

    for symbol in tqdm(symbols, desc="Downloading"):
        try:
            history = yf.Ticker(symbol).history(start=start_date, end=end_date)

            # An empty result counts as a failure, not as data.
            if len(history) == 0:
                unavailable.append(symbol)
                continue

            history['Symbol'] = symbol
            frames_by_symbol[symbol] = history

        except Exception as e:
            # Best effort: one bad ticker must not abort the whole download.
            print(f"\n⚠️ Failed to download {symbol}: {e}")
            unavailable.append(symbol)

    print(f"\n✅ Downloaded data for {len(frames_by_symbol)} symbols")
    if unavailable:
        print(f"❌ Failed symbols: {unavailable}")

    return frames_by_symbol

# Download every configured symbol over the configured date range
market_data = download_market_data(
    symbols=DATA_CONFIG['symbols'],
    start_date=DATA_CONFIG['start_date'],
    end_date=DATA_CONFIG['end_date'],
)

In [None]:
# Data overview: preview the first five downloaded symbols
print("📊 Data Overview:")
for symbol in list(market_data)[:5]:
    frame = market_data[symbol]
    first_day = frame.index[0].date()
    last_day = frame.index[-1].date()
    print(f"\n{symbol}:")
    print(f"  - Shape: {frame.shape}")
    print(f"  - Date Range: {first_day} to {last_day}")
    print(f"  - Columns: {list(frame.columns)}")

## 4. Feature Engineering

In [None]:
# Add technical indicators
def add_technical_indicators(df, lookback_periods):
    """Append price, volume, volatility, momentum and trend columns to ``df``.

    The frame is modified in place and also returned. It must provide the
    ``Open``, ``High``, ``Low``, ``Close`` and ``Volume`` columns.

    Args:
        df: Per-symbol price frame.
        lookback_periods: Window lengths used for the SMA/EMA families.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    close = df['Close']
    high = df['High']
    low = df['Low']
    volume = df['Volume']

    # Price-based features
    df['returns'] = close.pct_change()
    df['log_returns'] = np.log(close / close.shift(1))
    df['high_low_ratio'] = high / low
    df['close_open_ratio'] = close / df['Open']

    # Volume features
    volume_sma = volume.rolling(window=20).mean()
    df['volume_sma'] = volume_sma
    df['volume_ratio'] = volume / volume_sma
    df['dollar_volume'] = close * volume

    # Moving averages and the close-to-average ratio, one family per lookback
    for period in lookback_periods:
        simple_average = close.rolling(window=period).mean()
        df[f'sma_{period}'] = simple_average
        df[f'ema_{period}'] = close.ewm(span=period).mean()
        df[f'close_sma_{period}_ratio'] = close / simple_average

    # Volatility indicators
    df['atr'] = ta.volatility.average_true_range(high, low, close)
    upper_band = ta.volatility.bollinger_hband(close)
    middle_band = ta.volatility.bollinger_mavg(close)
    lower_band = ta.volatility.bollinger_lband(close)
    df['bb_high'] = upper_band
    df['bb_mid'] = middle_band
    df['bb_low'] = lower_band
    band_span = df['bb_high'] - df['bb_low']
    df['bb_width'] = band_span / df['bb_mid']
    df['bb_position'] = (close - df['bb_low']) / band_span

    # Momentum indicators
    df['rsi'] = ta.momentum.rsi(close)
    df['macd'] = ta.trend.macd(close)
    df['macd_signal'] = ta.trend.macd_signal(close)
    df['macd_diff'] = df['macd'] - df['macd_signal']
    df['stoch'] = ta.momentum.stoch(high, low, close)

    # Trend indicators
    df['adx'] = ta.trend.adx(high, low, close)
    df['cci'] = ta.trend.cci(high, low, close)

    # Support/Resistance levels: rolling extremes and where the close sits between them
    for period in (20, 50):
        ceiling = high.rolling(window=period).max()
        floor = low.rolling(window=period).min()
        df[f'resistance_{period}'] = ceiling
        df[f'support_{period}'] = floor
        df[f'sr_ratio_{period}'] = (close - floor) / (ceiling - floor)

    return df

# Apply technical indicators to every downloaded symbol
print("🔧 Adding technical indicators...")
_lookbacks = DATA_CONFIG['lookback_periods']
for symbol in tqdm(list(market_data), desc="Processing"):
    market_data[symbol] = add_technical_indicators(market_data[symbol], _lookbacks)

print("✅ Technical indicators added")

In [None]:
# Add market microstructure features
def add_microstructure_features(df, window=20):
    """Add market microstructure features to ``df`` in place and return it.

    Requires the ``High``, ``Low``, ``Close`` and ``Volume`` columns plus the
    ``returns`` and ``dollar_volume`` columns.

    Args:
        df: Per-symbol frame; modified in place.
        window: Rolling window length (rows) used by every rolling statistic.

    Returns:
        The same ``df`` object with the microstructure columns added.
    """
    returns = df['returns']
    volume = df['Volume']
    abs_returns = returns.abs()

    # Spread estimation (using high-low as proxy)
    df['spread_pct'] = (df['High'] - df['Low']) / df['Close'] * 100
    df['spread_ma'] = df['spread_pct'].rolling(window=window).mean()

    # Volume profile: ratio of the 75th to the 25th rolling volume percentile.
    # Rolling quantiles replace a per-window Python callback; the result is 1
    # when the 25th percentile is not positive and NaN while the window is incomplete.
    rolling_volume = volume.rolling(window=window)
    q25 = rolling_volume.quantile(0.25)
    q75 = rolling_volume.quantile(0.75)
    df['volume_profile'] = (q75 / q25).where(q25 > 0, 1.0).where(q25.notna())

    # Price efficiency; a zero high-low range yields NaN instead of +/-inf
    df['price_efficiency'] = abs_returns / df['spread_pct'].replace(0, np.nan)

    # Amihud illiquidity (the epsilon guards against zero dollar volume)
    df['amihud_illiquidity'] = abs_returns / (df['dollar_volume'] + 1e-10)
    df['amihud_ma'] = df['amihud_illiquidity'].rolling(window=window).mean()

    # Roll measure (proxy for effective spread): 2 * sqrt(-cov(r_t, r_{t-1})).
    # The covariance must be taken against the lagged series; covariance of a
    # series with itself is just its variance. Non-negative autocovariance maps to 0.
    autocovariance = returns.rolling(window=window).cov(returns.shift(1))
    df['roll_measure'] = 2.0 * np.sqrt((-autocovariance).clip(lower=0))

    # Order flow imbalance (using volume and price direction)
    df['price_direction'] = np.sign(returns)
    df['signed_volume'] = volume * df['price_direction']
    df['order_imbalance'] = (
        df['signed_volume'].rolling(window=window).sum()
        / volume.rolling(window=window).sum()
    )

    return df

# Apply microstructure features (only when enabled in the configuration)
if DATA_CONFIG['market_microstructure']:
    print("🔬 Adding market microstructure features...")
    for symbol, frame in tqdm(list(market_data.items()), desc="Processing"):
        market_data[symbol] = add_microstructure_features(frame)
    print("✅ Market microstructure features added")

## 5. Generate Agent-Specific Matrices

In [None]:
# Generate correlation matrices for Regime Detector
def generate_correlation_matrices(data_dict, window=20):
    """Generate rolling correlation matrices of the symbols' returns.

    Args:
        data_dict: Mapping of symbol -> frame; frames without a ``returns``
            column are ignored.
        window: Number of rows in each correlation window.

    Returns:
        Tuple ``(matrices, dates, symbols)``:
        - ``matrices``: array of shape ``(n_windows, n_symbols, n_symbols)``;
          undefined correlations (e.g. a constant series) are stored as 0.
          The shape stays 3-D even when there are no windows.
        - ``dates``: index label of the row that immediately follows each window,
          i.e. the matrix for a date uses only the ``window`` rows before it.
        - ``symbols``: column order of the matrices.
    """
    # Combine returns data; rows where any symbol lacks a return are dropped
    returns_by_symbol = {
        symbol: data['returns']
        for symbol, data in data_dict.items()
        if 'returns' in data.columns
    }
    if returns_by_symbol:
        returns_data = pd.concat(returns_by_symbol, axis=1).dropna()
    else:
        returns_data = pd.DataFrame()

    n_symbols = returns_data.shape[1]

    print(f"📊 Generating correlation matrices (window={window})...")

    correlation_matrices = []
    dates = []
    for end in tqdm(range(window, len(returns_data)), desc="Processing"):
        corr_matrix = returns_data.iloc[end - window:end].corr().to_numpy()

        # `nan` must be passed by keyword: the second positional argument of
        # np.nan_to_num is `copy`, not the fill value.
        correlation_matrices.append(np.nan_to_num(corr_matrix, nan=0.0))
        dates.append(returns_data.index[end])

    if correlation_matrices:
        stacked = np.stack(correlation_matrices)
    else:
        stacked = np.empty((0, n_symbols, n_symbols))

    return stacked, dates, list(returns_data.columns)

# Build the rolling correlation matrices with the configured window
_corr_window = DATA_CONFIG['correlation_window']
corr_matrices, corr_dates, corr_symbols = generate_correlation_matrices(
    market_data, window=_corr_window
)

print(f"✅ Generated {len(corr_matrices)} correlation matrices")
print(f"   Shape: {corr_matrices.shape}")

In [None]:
# Generate volume profile matrices for Structure Analyzer
def _window_volume_profile(window_data, n_bins):
    """Return the normalised volume-by-price histogram of one window (zeros if the price range is flat)."""
    lowest = window_data['Low'].min()
    highest = window_data['High'].max()
    profile = np.zeros(n_bins)
    if not (highest - lowest > 0):
        return profile

    edges = np.linspace(lowest, highest, n_bins + 1)
    midpoints = ((window_data['High'] + window_data['Low']) / 2).to_numpy(dtype=float)
    volumes = window_data['Volume'].to_numpy(dtype=float)

    # Each bar's whole volume is assigned to the bin containing its high/low midpoint.
    # A midpoint exactly on the top edge belongs to the last bin rather than being dropped.
    usable = np.isfinite(midpoints) & np.isfinite(volumes)
    bin_idx = np.clip(np.digitize(midpoints[usable], edges) - 1, 0, n_bins - 1)
    profile = np.bincount(bin_idx, weights=volumes[usable], minlength=n_bins)

    total = profile.sum()
    if total > 0:
        profile = profile / total
    return profile


def generate_volume_profile_matrices(data_dict, n_bins=10, window=20, warmup=100):
    """Generate per-date volume profile matrices.

    Args:
        data_dict: Mapping of symbol -> frame with ``High``, ``Low`` and ``Volume``.
        n_bins: Number of price bins per profile.
        window: Look-back in rows; each profile covers the current row plus the
            ``window`` rows before it.
        warmup: Number of leading common dates skipped so history is available.

    Returns:
        Tuple ``(profiles, dates)`` where ``profiles`` has shape
        ``(n_dates, n_symbols, n_bins)`` (also when ``n_dates`` is 0) and
        ``dates`` lists the date of each matrix. A symbol with too little history
        or a flat price range on a date gets an all-zero row.
    """
    # Dates present in every symbol's index, in chronological order
    if data_dict:
        shared = set.intersection(*(set(data.index) for data in data_dict.values()))
    else:
        shared = set()
    common_dates = sorted(shared)

    print(f"📊 Generating volume profile matrices (bins={n_bins})...")

    n_symbols = len(data_dict)
    volume_profiles = []
    dates = []

    for date in tqdm(common_dates[warmup:], desc="Processing"):
        daily_profiles = np.zeros((n_symbols, n_bins))

        for row, data in enumerate(data_dict.values()):
            idx = data.index.get_loc(date)
            if idx < window:
                continue  # not enough history for this symbol: leave zeros
            daily_profiles[row] = _window_volume_profile(
                data.iloc[idx - window:idx + 1], n_bins
            )

        volume_profiles.append(daily_profiles)
        dates.append(date)

    if volume_profiles:
        stacked = np.stack(volume_profiles)
    else:
        stacked = np.empty((0, n_symbols, n_bins))

    return stacked, dates

# Build the volume profile matrices with the configured number of bins
_profile_bins = DATA_CONFIG['volume_profile_bins']
volume_matrices, volume_dates = generate_volume_profile_matrices(
    market_data, n_bins=_profile_bins
)

print(f"✅ Generated {len(volume_matrices)} volume profile matrices")
print(f"   Shape: {volume_matrices.shape}")

In [None]:
# Generate feature matrices for all agents
def _window_max_drawdown(prices):
    """Return the most negative peak-to-trough return inside one price window."""
    # The running peak starts at the first price, so a decline from the first
    # bar of the window is counted.
    running_peak = np.maximum.accumulate(prices)
    return (prices / running_peak - 1.0).min()


def _symbol_feature_column(data, feat):
    """Return one feature for every row of ``data`` as a float array."""
    if feat in data.columns:
        return data[feat].to_numpy(dtype=float)

    position = np.arange(len(data))

    if feat == 'vix_proxy':
        # Annualised volatility (in %) of the 20 returns preceding each row;
        # rows with fewer than 20 preceding rows get 0.
        prior_returns = data['returns'].shift(1)
        values = (
            prior_returns.rolling(window=20, min_periods=1).std() * np.sqrt(252) * 100
        ).to_numpy(dtype=float)
        values[position < 20] = 0.0
        return values

    if feat == 'max_drawdown':
        # Max drawdown over the current close and the 50 closes before it;
        # rows with fewer than 50 preceding rows get 0.
        close = data['Close'].ffill()
        values = close.rolling(window=51).apply(
            _window_max_drawdown, raw=True
        ).to_numpy(dtype=float)
        values[position < 50] = 0.0
        return values

    # Unknown feature name: constant zero column
    return np.zeros(len(data))


def generate_feature_matrices(data_dict, warmup=200):
    """Generate one feature matrix per agent.

    Args:
        data_dict: Mapping of symbol -> frame holding the indicator columns.
        warmup: Number of leading common dates skipped so that every rolling
            feature has history.

    Returns:
        Tuple ``(all_features, dates)``. ``all_features`` maps each agent name to
        an array of shape ``(n_dates, n_symbols, n_features)``; ``dates`` is the
        list of common dates after the warm-up. Features that are neither a
        column nor a derived feature (``vix_proxy``, ``max_drawdown``) are 0.
    """
    # Select features for each agent
    agent_features = {
        'regime_detector': ['returns', 'volume_ratio', 'atr', 'adx', 'vix_proxy'],
        'structure_analyzer': ['spread_pct', 'volume_profile', 'order_imbalance', 'amihud_illiquidity'],
        'tactical_trader': ['rsi', 'macd_diff', 'bb_position', 'stoch', 'close_sma_20_ratio'],
        'risk_manager': ['atr', 'bb_width', 'spread_pct', 'amihud_illiquidity', 'max_drawdown'],
    }

    print("🔨 Generating feature matrices for all agents...")

    # Dates present in every symbol's index, in chronological order
    if data_dict:
        shared = set.intersection(*(set(data.index) for data in data_dict.values()))
    else:
        shared = set()
    selected_dates = sorted(shared)[warmup:]

    # Row position of every selected date inside each symbol's own frame
    row_positions = {
        symbol: np.fromiter(
            (data.index.get_loc(date) for date in selected_dates),
            dtype=np.intp,
            count=len(selected_dates),
        )
        for symbol, data in data_dict.items()
    }

    all_features = {}
    for agent_name, feature_list in agent_features.items():
        per_symbol = []
        for symbol, data in tqdm(data_dict.items(), desc=f"Processing {agent_name}"):
            rows = row_positions[symbol]
            # Each feature is computed once over the whole frame, then sampled
            # at the selected dates (instead of one label lookup per cell).
            columns = [_symbol_feature_column(data, feat)[rows] for feat in feature_list]
            per_symbol.append(np.column_stack(columns))

        if per_symbol:
            # (n_symbols, n_dates, n_features) -> (n_dates, n_symbols, n_features)
            all_features[agent_name] = np.stack(per_symbol, axis=1)
        else:
            all_features[agent_name] = np.empty((0, 0, len(feature_list)))

    return all_features, selected_dates

# Build the per-agent feature matrices and report their shapes
feature_matrices, feature_dates = generate_feature_matrices(market_data)

print("\n✅ Feature matrices generated:")
for agent_name in feature_matrices:
    print(f"   {agent_name}: {feature_matrices[agent_name].shape}")

## 6. Data Normalization and Preprocessing

In [None]:
# Normalize data
def normalize_data(data, method='robust', clip_threshold=5, fit_samples=None):
    """Scale features with a RobustScaler or StandardScaler.

    Args:
        data: Array of shape ``(samples, features)`` or
            ``(samples, assets, features)``; 3-D input is flattened to
            ``(samples * assets, features)`` for scaling and reshaped back.
        method: ``'robust'`` selects RobustScaler; anything else StandardScaler.
        clip_threshold: If truthy, scaled values are clipped to
            ``[-clip_threshold, clip_threshold]``.
        fit_samples: If given, the scaler statistics are estimated from the first
            ``fit_samples`` samples only (the training portion of a
            chronologically ordered array) and then applied to all samples.
            ``None`` fits on everything.

    Returns:
        Tuple ``(normalized, scaler)`` with ``normalized`` in the input's shape.

    Raises:
        ValueError: If ``fit_samples`` is given but smaller than 1.
    """
    data = np.asarray(data, dtype=float)
    original_shape = data.shape
    is_3d = data.ndim == 3

    # Reshape from (samples, assets, features) to (samples*assets, features)
    flat = data.reshape(-1, original_shape[-1]) if is_3d else data

    # Replace NaN and +/-inf with 0. The fill values must be passed by keyword:
    # the second positional argument of np.nan_to_num is `copy`.
    flat = np.nan_to_num(flat, nan=0.0, posinf=0.0, neginf=0.0)

    scaler = RobustScaler() if method == 'robust' else StandardScaler()

    if fit_samples is None:
        fit_rows = flat
    else:
        if fit_samples < 1:
            raise ValueError("fit_samples must be at least 1")
        # Flattening is sample-major, so the first samples occupy the first rows.
        rows_per_sample = original_shape[1] if is_3d else 1
        fit_rows = flat[:fit_samples * rows_per_sample]

    scaler.fit(fit_rows)
    data_normalized = scaler.transform(flat)

    # Clip outliers
    if clip_threshold:
        data_normalized = np.clip(data_normalized, -clip_threshold, clip_threshold)

    # Reshape back
    if is_3d:
        data_normalized = data_normalized.reshape(original_shape)

    return data_normalized, scaler

# Normalize all data
print("🔧 Normalizing data...")

normalized_features = {}
scalers = {}

# Honour the `clip_outliers` switch instead of always clipping
_clip_threshold = DATA_CONFIG['outlier_threshold'] if DATA_CONFIG['clip_outliers'] else None

# Normalize feature matrices
for agent, matrix in feature_matrices.items():
    # Fit the scaler on the training portion only, using the same boundary as
    # the chronological split (int(n_samples * train_ratio)), so validation and
    # test statistics do not leak into the scaling parameters.
    n_train = max(1, int(len(matrix) * DATA_CONFIG['train_ratio']))
    normalized_features[agent], scalers[agent] = normalize_data(
        matrix,
        method=DATA_CONFIG['normalization_method'],
        clip_threshold=_clip_threshold,
        fit_samples=n_train
    )
    print(f"   {agent}: normalized")

# Correlation matrices are stored unscaled (already in [-1, 1] range)
normalized_features['correlation_matrices'] = corr_matrices

# Volume profiles are stored unscaled (already normalized to sum to 1)
normalized_features['volume_profiles'] = volume_matrices

print("✅ Data normalization complete")

## 7. Create Train/Validation/Test Splits

In [None]:
# Create data splits
def create_data_splits(data, dates, train_ratio, val_ratio, test_ratio):
    """Cut ``data`` and ``dates`` into consecutive train/val/test segments.

    The order of the samples is preserved (no shuffling). ``test_ratio`` is
    accepted but not used: the test segment is whatever remains after the
    train and validation segments.

    Args:
        data: Sliceable sequence of samples.
        dates: Sliceable sequence of dates, positionally matching ``data``.
        train_ratio: Fraction of samples in the train segment.
        val_ratio: Fraction of samples in the validation segment.
        test_ratio: Unused (see above).

    Returns:
        dict with keys ``'train'``, ``'val'`` and ``'test'``, each mapping to a
        ``(data_segment, dates_segment)`` tuple.
    """
    total = len(data)
    first_cut = int(total * train_ratio)
    second_cut = int(total * (train_ratio + val_ratio))

    segments = {
        'train': slice(None, first_cut),
        'val': slice(first_cut, second_cut),
        'test': slice(second_cut, None),
    }

    return {
        name: (data[segment], dates[segment])
        for name, segment in segments.items()
    }

# Create splits for all data
print("✂️ Creating data splits...")

# Matrix families that carry their own date axis; everything else uses feature_dates.
# NOTE(review): corr_dates, volume_dates and feature_dates come from different generators,
# so the same ratios may cut each series at a different calendar date — confirm the
# training code does not assume the splits are aligned across agents.
_date_sources = {
    'correlation_matrices': corr_dates,
    'volume_profiles': volume_dates,
}

data_splits = {}

for agent, matrix in normalized_features.items():
    dates = _date_sources.get(agent, feature_dates)

    agent_splits = create_data_splits(
        matrix,
        dates,
        DATA_CONFIG['train_ratio'],
        DATA_CONFIG['val_ratio'],
        DATA_CONFIG['test_ratio'],
    )
    data_splits[agent] = agent_splits

    print(f"\n{agent}:")
    for label, split_name in (("Train", "train"), ("Val", "val"), ("Test", "test")):
        print(f"  {label}: {agent_splits[split_name][0].shape} samples")

print("\n✅ Data splits created")

## 8. Save Processed Data

In [None]:
# Save data to HDF5 format
def save_to_hdf5(filename, data_splits, scalers, metadata):
    """Write the splits, scaler parameters and metadata to one HDF5 file.

    File layout:
        metadata            attributes; list/dict values are stored as JSON strings
        <agent>/<split>/data    gzip-compressed sample array
        <agent>/<split>/dates   dates converted with ``str`` and stored as byte strings
        scalers/<agent>/...     ``center`` + ``scale`` or ``mean`` + ``scale``

    Args:
        filename: Destination path; an existing file is overwritten.
        data_splits: Mapping agent -> split name -> ``(data, dates)``.
        scalers: Mapping agent -> fitted scaler; skipped entirely when empty.
        metadata: Mapping of attribute name -> value.
    """
    with h5py.File(filename, 'w') as h5_file:
        # Metadata goes into attributes of a dedicated group
        meta_group = h5_file.create_group('metadata')
        for key, value in metadata.items():
            needs_json = isinstance(value, (list, dict))
            meta_group.attrs[key] = json.dumps(value) if needs_json else value

        # One group per agent, one sub-group per split
        for agent, splits in data_splits.items():
            agent_group = h5_file.create_group(agent)

            for split_name, (values, dates) in splits.items():
                split_group = agent_group.create_group(split_name)
                split_group.create_dataset('data', data=values, compression='gzip')

                encoded_dates = np.array(list(map(str, dates)), dtype='S')
                split_group.create_dataset('dates', data=encoded_dates)

        # Scaler parameters, so the transform can be reproduced elsewhere
        if scalers:
            scaler_root = h5_file.create_group('scalers')
            for agent, scaler in scalers.items():
                target = scaler_root.create_group(agent)

                if hasattr(scaler, 'center_'):
                    location_name, location = 'center', scaler.center_
                elif hasattr(scaler, 'mean_'):
                    location_name, location = 'mean', scaler.mean_
                else:
                    # Neither attribute present: the agent's group stays empty
                    continue

                target.create_dataset(location_name, data=location)
                target.create_dataset('scale', data=scaler.scale_)

    print(f"✅ Data saved to {filename}")

# Prepare metadata
# One timestamp for the whole cell, so the metadata and the file name always agree
# (previously the clock was read separately for each).
_created_at = datetime.now()

metadata = {
    'created_date': _created_at.isoformat(),
    'data_config': DATA_CONFIG,
    'symbols': list(market_data.keys()),
    'n_symbols': len(market_data),
    'date_range': {
        'start': DATA_CONFIG['start_date'],
        'end': DATA_CONFIG['end_date']
    },
    'normalization_method': DATA_CONFIG['normalization_method'],
    # The feature axis is always the last one; indexing it as shape[-1] also
    # avoids an IndexError on arrays with fewer than two dimensions.
    'features_per_agent': {
        agent: int(matrix.shape[-1])
        for agent, matrix in normalized_features.items()
    }
}

# Save to HDF5
output_file = f"{DRIVE_BASE}/data/processed/marl_training_data_{_created_at.strftime('%Y%m%d')}.h5"
save_to_hdf5(output_file, data_splits, scalers, metadata)

# Also save a compressed version next to the HDF5 file.
# `os` and `zipfile` come from the notebook's import cell, so this cell also works
# in Colab, where `os` was previously never imported.
print("\n📦 Creating compressed version...")
compressed_file = output_file.replace('.h5', '.zip')
with zipfile.ZipFile(compressed_file, 'w', zipfile.ZIP_DEFLATED) as zf:
    zf.write(output_file, os.path.basename(output_file))
print(f"✅ Compressed file saved: {compressed_file}")

In [None]:
# Verify saved data
print("🔍 Verifying saved data...")


def print_structure(name, obj):
    """Callback for ``visititems``: print one line per dataset or group."""
    if isinstance(obj, h5py.Dataset):
        print(f"  Dataset: {name} - Shape: {obj.shape}")
        return
    if isinstance(obj, h5py.Group):
        print(f"  Group: {name}")


with h5py.File(output_file, 'r') as h5_file:
    print("\nFile structure:")
    h5_file.visititems(print_structure)

    # Print metadata; the config dictionary is abbreviated to keep the output short
    print("\nMetadata:")
    for key, value in h5_file['metadata'].attrs.items():
        shown = "<config dictionary>" if key == 'data_config' else value
        print(f"  {key}: {shown}")

# File sizes on disk, in MB
_bytes_per_mb = 1024 * 1024
file_size_mb = os.path.getsize(output_file) / _bytes_per_mb
compressed_size_mb = os.path.getsize(compressed_file) / _bytes_per_mb

print(f"\n📊 File sizes:")
print(f"  Original: {file_size_mb:.2f} MB")
print(f"  Compressed: {compressed_size_mb:.2f} MB")
print(f"  Compression ratio: {file_size_mb/compressed_size_mb:.2f}x")

## 9. Data Upload Summary

In [None]:
# Create data preparation summary
# The report is assembled from three parts: a header, one section per matrix
# family, and a footer with file information and a loading example.
_summary_header = f"""
# Data Preparation Summary

## Dataset Information
- **Created**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- **Symbols**: {len(market_data)} assets
- **Date Range**: {DATA_CONFIG['start_date']} to {DATA_CONFIG['end_date']}
- **Total Days**: {len(feature_dates)}

## Data Splits
- **Train**: {DATA_CONFIG['train_ratio']*100:.0f}% ({len(data_splits['regime_detector']['train'][0])} samples)
- **Validation**: {DATA_CONFIG['val_ratio']*100:.0f}% ({len(data_splits['regime_detector']['val'][0])} samples)
- **Test**: {DATA_CONFIG['test_ratio']*100:.0f}% ({len(data_splits['regime_detector']['test'][0])} samples)

## Feature Matrices
"""

# Axis legend appended after the shape, keyed by the number of dimensions
_axis_legend = {
    3: " (samples × assets × features)",
    2: " (samples × features)",
}

_agent_sections = []
for _agent, _splits in data_splits.items():
    _train_shape = _splits['train'][0].shape
    _agent_sections.append(
        f"\n### {_agent}"
        + f"\n- Shape: {_train_shape}"
        + _axis_legend.get(len(_train_shape), "")
    )

_summary_footer = f"""

## Technical Indicators Added
- Price-based: returns, log returns, ratios
- Moving averages: SMA, EMA (multiple periods)
- Volatility: ATR, Bollinger Bands
- Momentum: RSI, MACD, Stochastic
- Trend: ADX, CCI
- Microstructure: spread, order imbalance, illiquidity

## File Information
- **Output File**: `{os.path.basename(output_file)}`
- **File Size**: {file_size_mb:.2f} MB
- **Compressed Size**: {compressed_size_mb:.2f} MB
- **Location**: `{DRIVE_BASE}/data/processed/`

## Usage in Training
```python
# Load data in training notebook
import h5py

with h5py.File('{output_file}', 'r') as f:
    # Load regime detector training data
    regime_train = f['regime_detector/train/data'][:]
    regime_train_dates = f['regime_detector/train/dates'][:]
```
"""

summary = _summary_header + "".join(_agent_sections) + _summary_footer

print(summary)

# Save summary next to the processed data
summary_file = f"{DRIVE_BASE}/data/processed/data_preparation_summary_{datetime.now().strftime('%Y%m%d')}.md"
with open(summary_file, 'w') as summary_handle:
    summary_handle.write(summary)
print(f"\n✅ Summary saved to: {summary_file}")

In [None]:
# Closing message: next steps and the path to hand to the training notebook
_closing_lines = [
    "\n🎉 Data preparation complete!",
    "\n📋 Next steps:",
    "1. The processed data is now saved in your Google Drive",
    "2. Use the Master Training Notebook to load this data",
    "3. The data is optimized for efficient loading in Colab",
    "\nData file path for training:",
    f"'{output_file}'",
]
print("\n".join(_closing_lines))