In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Source of the training-configuration module. It is held in a variable so the
# exact same text can be written under more than one filename below.
TRAINING_CONFIG_SOURCE = """
from pathlib import Path

class TrainingConfig:
    '''
    All configurations required for the models to train
    '''

    DATA_DIR = Path('output/data')
    OUTPUT_DIR = Path('output')
    MODEL_DIR = OUTPUT_DIR / "models_greatlakes"
    VIZ_DIR = OUTPUT_DIR / "visualizations"

    #? Model Hyperparameters - Enhanced for better performance
    EMBEDDING_DIM = 256
    HIDDEN_DIM = 512
    NUM_HEADS = 8
    NUM_LAYERS = 4
    DROPOUT = 0.3
    MAX_SEQ_LENGTH = 50

    #? Training hyperparameters - Optimized
    BATCH_SIZE = 128
    LEARNING_RATE = 0.0005
    # NOTE(review): NUM_EPOCHS is lower than WARMUP_EPOCHS and PATIENCE; this
    # looks like a smoke-test value - confirm before a real training run.
    NUM_EPOCHS = 1
    PATIENCE = 8
    WARMUP_EPOCHS = 3
    LABEL_SMOOTHING = 0.1

    TOP_K_VALUES = [1, 5, 10, 20]

    #? System
    NUM_WORKERS = 4
    RANDOM_SEED = 42

    def __init__(self):
        '''Initialize output directories'''
        self.MODEL_DIR.mkdir(parents=True, exist_ok=True)
        self.VIZ_DIR.mkdir(parents=True, exist_ok=True)
        self.DATA_DIR.mkdir(parents=True, exist_ok=True)
"""

# main_2.py does `from TrainingConfig import TrainingConfig`, while the
# checkpoint-diagnostic cell imports the module through the misspelled name
# `TraningConfig`. Writing the module under both names keeps every existing
# import working on a fresh kernel; the misspelled copy is kept only for
# backward compatibility.
for config_filename in ("TrainingConfig.py", "TraningConfig.py"):
    with open(f"/kaggle/working/{config_filename}", "w", encoding="utf-8") as f:
        f.write(TRAINING_CONFIG_SOURCE)


In [None]:
# Write the data-pipeline configuration module to /kaggle/working/config.py.
# The generated module defines `Config`, which holds the input/output paths,
# the playlist/track filtering thresholds, the audio-feature column names, the
# KNN neighbour count, the random seed, the train/val/test ratios and the
# top-k values; instantiating it creates the output sub-directories.
# dataloader.py (written in a later cell) imports `Config` from this module.
# The text inside the triple-quoted string is written to disk verbatim, which
# is why the embedded code uses ''' for its own docstrings.
with open("/kaggle/working/config.py", "w") as f:
    f.write("""
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
from typing import List, Dict, Tuple
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# For ML models
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# For Arrow file export
import pyarrow as pa
import pyarrow.parquet as pq



class Config:
    '''
    This class holds all the configurations like 
    input file path, output file path and hyperparameters for the models
    '''
    MPD_DIR = Path('/kaggle/input/millionplaylistdataset/data')
    OUTPUT_DIR = Path('output')
    HF_DATASET_PATH = "/kaggle/input/mpd-audio-features/extracted_audio_features.csv"
    NUM_FILES_TO_PROCESS = 100  #? How many files should we process in the dataset
    MIN_PLAYLIST_LENGTH = 5 #? Lower bound for number of songs in the playlist
    MAX_PLAYLIST_LENGTH = 200  #? Upper bound for number of songs in the playlist
    MIN_SONG_FREQUENCY = 10  #? How many times that song should be present in all playlists in order to be used.

    AUDIO_FEATURES = ['danceability', 'energy', 'loudness', 'speechiness', 
                     'acousticness', 'instrumentalness', 'liveness', 
                     'valence', 'tempo', 'duration_ms']
    
    KNN_NEIGHBORS = 20 #? Hyperparameter for NearestNeigbor classifier
    RANDOM_SEED = 42 #? For reproduciblity of code
    
    # Data splits
    TRAIN_RATIO = 0.70 #? 70% of data for training
    VAL_RATIO = 0.15 #? 15% Data for validation
    TEST_RATIO = 0.15 #? 15% data for testing
    
    # Evaluation
    TOP_K_VALUES = [1, 5, 10, 20]
    
    def __init__(self):
        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        (self.OUTPUT_DIR / "models").mkdir(exist_ok=True)
        (self.OUTPUT_DIR / "visualizations").mkdir(exist_ok=True)
        (self.OUTPUT_DIR / "data").mkdir(exist_ok=True)
        (self.OUTPUT_DIR / "metrics").mkdir(exist_ok=True)

    
""")


In [None]:
# Write the data-loading module to /kaggle/working/dataloader.py.
# The text inside the triple-quoted string is written to disk verbatim, so the
# embedded code uses ''' for its own docstrings.
with open("/kaggle/working/dataloader.py", "w") as f:
    f.write("""
from config import Config
from typing import List, Dict
from tqdm import tqdm
import json
import pandas as pd
from collections import Counter

class SpotifyDataLoader:
    '''
    Handles loading dataset and merging spotify MPD audio features
    '''
    def __init__(self, config: Config):
        self.config = config
        self.playlists = []
        self.track_uri_to_id = {}
        self.audio_features_df = None

    def load_mpd_files(self) -> List[Dict]:
        '''
        Loading the Spotify MPD Dataset

        Reads the first NUM_FILES_TO_PROCESS JSON files (sorted by path) from
        config.MPD_DIR and returns the concatenated 'playlists' entries.
        Raises FileNotFoundError when the directory holds no JSON file.
        '''
        print('[1/7] Loading the spotify MPD Dataset')

        json_files = sorted(self.config.MPD_DIR.glob("*.json"))

        if len(json_files) == 0:
            raise FileNotFoundError(f'No JSON files found in {self.config.MPD_DIR}')

        files_to_load = json_files[:self.config.NUM_FILES_TO_PROCESS]

        print(f'Found {len(json_files)} files, loading {len(files_to_load)} files...')

        all_playlists = []

        for json_file in tqdm(files_to_load, desc="Loading JSON files"):
            # Explicit UTF-8 so decoding does not depend on the process locale.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            all_playlists.extend(data['playlists'])

        print(f"Loaded {len(all_playlists):,} playlists")
        return all_playlists


    def load_audio_features(self) -> pd.DataFrame:
        '''
        Read the audio-features CSV at config.HF_DATASET_PATH into a DataFrame
        '''
        df = pd.read_csv(self.config.HF_DATASET_PATH)

        print(f"Loaded {len(df):,} tracks with audio features")
        print(f"Columns: {df.columns.tolist()}")

        return df


    def preprocess_data(self, playlists: List[Dict], audio_df: pd.DataFrame):
        '''
        Clean and merge playlist data with audio features

        Steps:
          1. keep playlists whose length is within the configured bounds
          2. keep tracks that appear in at least MIN_SONG_FREQUENCY of them
             and that have a row in audio_df
          3. drop playlists that fall below MIN_PLAYLIST_LENGTH afterwards

        The input playlist dicts are not modified; filtered playlists are
        shallow copies carrying the reduced 'tracks' list.

        Returns (final_playlists, audio_df_filtered, uri_to_id).
        Raises KeyError when audio_df has neither an 'id' nor a 'track_uri'
        column.
        '''

        filtered_playlists = [
            p for p in playlists
            if self.config.MIN_PLAYLIST_LENGTH <= len(p['tracks']) <= self.config.MAX_PLAYLIST_LENGTH
        ]

        print(f"After length filtering: {len(filtered_playlists):,} playlists")

        # Count track frequency
        track_counter = Counter(
            track['track_uri']
            for playlist in filtered_playlists
            for track in playlist['tracks']
        )

        frequent_tracks = {
            uri for uri, count in track_counter.items()
            if count >= self.config.MIN_SONG_FREQUENCY
        }
        print(f"Tracks appearing in {self.config.MIN_SONG_FREQUENCY}+ playlists: {len(frequent_tracks):,}")

        print("Building track URI to ID mapping...")
        if 'track_uri' in audio_df.columns and 'id' not in audio_df.columns:
            audio_df = audio_df.rename(columns={'track_uri': 'id'})
        if 'id' not in audio_df.columns:
            # Fail early with a clear message instead of a late KeyError below.
            raise KeyError("audio_df must contain an 'id' or 'track_uri' column")

        # Iterate the single 'id' column instead of DataFrame.iterrows(): no
        # per-row Series is built, which matters for millions of rows.
        track_ids = audio_df['id'].dropna()
        uri_to_id = {
            f"spotify:track:{track_id}": track_id
            for track_id in tqdm(track_ids, total=len(track_ids), desc="Mapping URIs")
        }

        print(f"Mapped {len(uri_to_id):,} track URIs")

        final_playlists = []
        for playlist in tqdm(filtered_playlists, desc="Filtering tracks"):
            filtered_tracks = [
                t for t in playlist['tracks']
                if t['track_uri'] in uri_to_id and t['track_uri'] in frequent_tracks
            ]
            if len(filtered_tracks) >= self.config.MIN_PLAYLIST_LENGTH:
                # Shallow copy: the caller's raw playlist keeps its full track list.
                final_playlists.append({**playlist, 'tracks': filtered_tracks})

        print(f"Final dataset: {len(final_playlists):,} playlists")

        final_track_uris = {
            track['track_uri']
            for playlist in final_playlists
            for track in playlist['tracks']
        }

        print(f"Final vocabulary: {len(final_track_uris):,} unique tracks")

        # Every surviving URI passed the uri_to_id membership test above.
        final_track_ids = {uri_to_id[uri] for uri in final_track_uris}

        audio_df_filtered = audio_df[audio_df['id'].isin(final_track_ids)].copy()

        print(f"Audio features for {len(audio_df_filtered):,} tracks")

        self.playlists = final_playlists
        self.track_uri_to_id = uri_to_id
        self.audio_features_df = audio_df_filtered

        return final_playlists, audio_df_filtered, uri_to_id
""")


No need to install any packages — they are already available in this environment.

In [None]:
# Write the plotting module to /kaggle/working/DeepLearningVisualizer.py.
# The text inside the triple-quoted string is written to disk verbatim, so the
# embedded code uses ''' for its own docstrings.
with open("/kaggle/working/DeepLearningVisualizer.py", "w") as f:
    f.write("""
import matplotlib.pyplot as plt
from PlaylistTransformer import PlaylistTransformer
import torch
import seaborn as sns
import numpy as np

# Device detection with proper error handling
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
class DeepLearningVisualizer:
    '''Visualizations for deep learning results'''

    def __init__(self, config):
        # config must expose VIZ_DIR (output directory) and TOP_K_VALUES.
        self.config = config

    def plot_training_curves(self, trainers_dict):
        '''Plot training and validation curves

        trainers_dict maps a display name to a trainer object exposing
        train_losses, val_losses and val_accuracies (one entry per epoch;
        each accuracy entry is indexed by k and must contain k=10).
        '''
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Loss curves
        for name, trainer in trainers_dict.items():
            epochs = range(1, len(trainer.train_losses) + 1)
            axes[0].plot(epochs, trainer.train_losses, label=f'{name} Train', marker='o')
            axes[0].plot(epochs, trainer.val_losses, label=f'{name} Val', marker='s', linestyle='--')

        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training and Validation Loss')
        axes[0].legend()
        axes[0].grid(alpha=0.3)

        # Accuracy curves (Top-10)
        for name, trainer in trainers_dict.items():
            epochs = range(1, len(trainer.val_accuracies) + 1)
            top10_accs = [acc[10] for acc in trainer.val_accuracies]
            axes[1].plot(epochs, top10_accs, label=name, marker='o')

        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Top-10 Accuracy (%)')
        axes[1].set_title('Validation Top-10 Accuracy')
        axes[1].legend()
        axes[1].grid(alpha=0.3)

        plt.tight_layout()
        plt.savefig(self.config.VIZ_DIR / 'training_curves.png', dpi=300, bbox_inches='tight')
        print("‚úì Saved: training_curves.png")
        plt.close()

    def plot_model_comparison(self, results_dict):
        '''Compare all models including baselines

        results_dict maps a model name to a {k: accuracy} mapping. A model
        that lacks any k from config.TOP_K_VALUES is reported and left out.
        '''
        fig, ax = plt.subplots(figsize=(12, 6))

        k_values = self.config.TOP_K_VALUES

        x = np.arange(len(k_values))
        width = 0.15

        # Offsets are based on the number of models actually drawn, so a
        # skipped model neither leaves a gap nor shifts the tick centring.
        plotted = 0
        for model, model_results in results_dict.items():
            missing = [k for k in k_values if k not in model_results]
            if missing:
                # Skip this model if it doesn't have all required k values
                print(f"‚ö†Ô∏è Warning: {model} missing Top-{missing[0]} results, skipping this model")
                continue
            accuracies = [model_results[k] for k in k_values]
            ax.bar(x + plotted * width, accuracies, width, label=model)
            plotted += 1

        ax.set_xlabel('Top-K')
        ax.set_ylabel('Accuracy (%)')
        ax.set_title('Model Performance Comparison (All Methods)')
        ax.set_xticks(x + width * max(plotted - 1, 0) / 2)
        ax.set_xticklabels([f'Top-{k}' for k in k_values])
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(self.config.VIZ_DIR / 'model_comparison_all.png', dpi=300, bbox_inches='tight')
        print("‚úì Saved: model_comparison_all.png")
        plt.close()

    def visualize_attention(self, model, dataset, idx=0):
        '''Visualize attention weights from Transformer

        Takes dataset[idx]['history'] as a single-sequence batch, asks the
        model for its attention weights and saves a heatmap of the last
        layer averaged over heads. Does nothing for non-Transformer models.
        '''
        if not isinstance(model, PlaylistTransformer):
            print("Attention visualization only available for Transformer")
            return

        model.eval()
        sample = dataset[idx]

        input_ids = torch.LongTensor([sample['history']]).to(device)
        mask = torch.ones(1, len(sample['history']), dtype=torch.bool).to(device)

        # Inference only: no autograd graph is needed, and .numpy() below
        # would raise on a tensor that still requires grad.
        with torch.no_grad():
            attention_weights = model.get_attention_weights(input_ids, mask)

        if len(attention_weights) > 0:
            # Plot attention from last layer
            last_layer_attn = attention_weights[-1][0].detach().cpu().numpy()  # (num_heads, seq_len, seq_len)

            # Average over heads
            avg_attn = last_layer_attn.mean(axis=0)

            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(avg_attn, cmap='viridis', ax=ax, cbar_kws={'label': 'Attention Weight'})
            ax.set_xlabel('Key Position')
            ax.set_ylabel('Query Position')
            ax.set_title('Transformer Attention Weights (Last Layer, Averaged over Heads)')

            plt.tight_layout()
            plt.savefig(self.config.VIZ_DIR / 'attention_heatmap.png', dpi=300, bbox_inches='tight')
            print("‚úì Saved: attention_heatmap.png")
            plt.close()

""")

In [None]:
# Write the deep-learning training entry point to /kaggle/working/main_2.py.
# The filename must match the `!python /kaggle/working/main_2.py` cell that
# runs it (the previous "main_2.py.py" was never executed by that command).
# UTF-8 is requested explicitly because the script text contains non-ASCII
# characters and Python reads source files as UTF-8.
# Inside the string, "\\n" is a doubled backslash on purpose: it becomes a
# plain "\n" escape in the generated file.
with open("/kaggle/working/main_2.py", "w", encoding="utf-8") as f:
    f.write("""
try:
    from TrainingConfig import TrainingConfig
except ImportError:
    # The config cell writes the module as "TraningConfig.py" (misspelled).
    from TraningConfig import TrainingConfig
import torch
import numpy as np
import pandas as pd
from PlayListDataset import PlaylistDataset
from torch.utils.data import DataLoader
from GRU4Rec import GRU4Rec
from Trainer import Trainer
from PlaylistTransformer import PlaylistTransformer
from DeepLearningVisualizer import DeepLearningVisualizer
import json

def build_vocabulary(train_path):
    '''Build track vocabulary from training data

    Reads the parquet file and returns the sorted list of every track that
    occurs in a 'history' (either a '|'-joined string or a sequence) or as
    a 'target'.
    '''
    df = pd.read_parquet(train_path)

    all_tracks = set()
    for _, row in df.iterrows():
        if isinstance(row['history'], str):
            tracks = row['history'].split('|') if row['history'] else []
        else:
            tracks = row['history']
        all_tracks.update(tracks)
        all_tracks.add(row['target'])

    vocab = sorted(list(all_tracks))
    print(f"Vocabulary size: {len(vocab):,} tracks")

    return vocab


def main():
    '''Main training pipeline

    Builds the vocabulary, creates the datasets/loaders, prints dataset
    diagnostics, trains GRU4Rec and the Transformer, evaluates both on the
    test split, renders the visualizations and writes the final metrics.
    '''
    config = TrainingConfig()

    # Set random seeds
    torch.manual_seed(config.RANDOM_SEED)
    np.random.seed(config.RANDOM_SEED)

    print("=" * 80)
    print(" DEEP LEARNING TRAINING")
    print("=" * 80)

    # Build vocabulary
    print("\\n[1/5] Building vocabulary...")
    vocab = build_vocabulary(config.DATA_DIR / "train.parquet")
    num_items = len(vocab) + 2  # +2 for PAD and UNK

    # Create datasets
    print("\\n[2/5] Creating datasets...")
    train_dataset = PlaylistDataset(
        config.DATA_DIR / "train.parquet",
        vocab,
        config.MAX_SEQ_LENGTH
    )
    val_dataset = PlaylistDataset(
        config.DATA_DIR / "val.parquet",
        vocab,
        config.MAX_SEQ_LENGTH
    )
    test_dataset = PlaylistDataset(
        config.DATA_DIR / "test.parquet",
        vocab,
        config.MAX_SEQ_LENGTH
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=config.NUM_WORKERS
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        collate_fn=val_dataset.collate_fn,
        num_workers=config.NUM_WORKERS
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        collate_fn=test_dataset.collate_fn,
        num_workers=config.NUM_WORKERS
    )


    print(f"Train: {len(train_dataset):,} sequences")
    print(f"Val: {len(val_dataset):,} sequences")
    print(f"Test: {len(test_dataset):,} sequences")
    # Add this RIGHT AFTER creating your dataset in main()
    print("\\n" + "=" * 80)
    print("DATASET DIAGNOSTICS")
    print("=" * 80)

    # Check vocabulary
    print(f"\\nVocabulary size: {len(vocab):,}")
    print(f"Dataset expects num_items: {num_items:,}")
    print(f"Match: {'‚úÖ' if num_items == len(vocab) + 2 else '‚ùå'}")

    # Check a sample from the dataset
    sample = train_dataset[0]
    print(f"\\nSample training example:")
    print(f"  History indices: {sample['history'][:5]}... (first 5)")
    print(f"  Target index: {sample['target']}")
    print(f"  Sequence length: {sample['seq_length']}")

    # Critical checks
    print(f"\\nüîç Critical Checks:")
    print(f"  Max history index: {max([max(train_dataset[i]['history']) if len(train_dataset[i]['history']) > 0 else 0 for i in range(min(100, len(train_dataset)))])}")
    print(f"  Max target index: {max([train_dataset[i]['target'] for i in range(min(100, len(train_dataset)))])}")
    print(f"  Num items in model: {num_items}")

    # Check if indices are in range
    max_idx_found = max([max(train_dataset[i]['history'] + [train_dataset[i]['target']]) if len(train_dataset[i]['history']) > 0 else train_dataset[i]['target'] for i in range(min(1000, len(train_dataset)))])
    print(f"  Max index found: {max_idx_found}")
    print(f"  Should be < {num_items}: {'‚úÖ' if max_idx_found < num_items else '‚ùå PROBLEM!'}")

    # Check for unknown tokens. The rate is computed over the number of
    # samples actually inspected, which is below 1000 for a small dataset.
    n_checked = min(1000, len(train_dataset))
    unk_count = sum([1 for i in range(n_checked) if train_dataset.UNK_IDX in train_dataset[i]['history'] or train_dataset[i]['target'] == train_dataset.UNK_IDX])
    unk_rate = 100.0 * unk_count / n_checked if n_checked else 0.0
    print(f"  Samples with UNK tokens: {unk_count}/{n_checked} ({unk_rate:.1f}%)")
    if unk_rate > 10:
        print(f"  ‚ö†Ô∏è HIGH UNK RATE - Vocabulary mismatch!")

    print("=" * 80 + "\\n")
    # Train GRU4Rec
    print("\\n[3/5] Training GRU4Rec...")
    gru_model = GRU4Rec(
        num_items=num_items,
        embedding_dim=config.EMBEDDING_DIM,
        hidden_dim=config.HIDDEN_DIM,
        dropout=config.DROPOUT
    )

    gru_trainer = Trainer(gru_model, train_loader, val_loader, config, "GRU4Rec")
    gru_results = gru_trainer.train()

    # Train Transformer
    print("\\n[4/5] Training Transformer...")
    transformer_model = PlaylistTransformer(
        num_items=num_items,
        d_model=config.EMBEDDING_DIM,
        nhead=config.NUM_HEADS,
        num_layer=config.NUM_LAYERS,
        dropout=config.DROPOUT
    )

    transformer_trainer = Trainer(
        transformer_model, train_loader, val_loader, config, "Transformer"
    )
    transformer_results = transformer_trainer.train()

    # Final evaluation on test set
    print("\\n[5/5] Final evaluation on test set...")

    # Load best models
    gru_model.load_state_dict(
        torch.load(config.MODEL_DIR / 'GRU4Rec_best.pt')['model_state_dict']
    )
    transformer_model.load_state_dict(
        torch.load(config.MODEL_DIR / 'Transformer_best.pt')['model_state_dict']
    )

    gru_test_loss, gru_test_accs = gru_trainer.evaluate(test_loader)
    trans_test_loss, trans_test_accs = transformer_trainer.evaluate(test_loader)

    print("\\n" + "=" * 80)
    print("FINAL TEST RESULTS")
    print("=" * 80)
    print("\\nGRU4Rec:")
    for k, acc in gru_test_accs.items():
        print(f"  Top-{k}: {acc:.2f}%")

    print("\\nTransformer:")
    for k, acc in trans_test_accs.items():
        print(f"  Top-{k}: {acc:.2f}%")

    # Visualizations
    print("\\n" + "=" * 80)
    print("GENERATING VISUALIZATIONS")
    print("=" * 80)

    viz = DeepLearningVisualizer(config)

    trainers_dict = {
        'GRU4Rec': gru_trainer,
        'Transformer': transformer_trainer
    }
    viz.plot_training_curves(trainers_dict)

    # Load baseline results if available
    results_dict = {
        'GRU4Rec': gru_test_accs,
        'Transformer': trans_test_accs
    }

    # TrainingConfig does not create the metrics directory, so make sure it
    # exists before anything is read from or written to it.
    metrics_dir = config.OUTPUT_DIR / "metrics"
    metrics_dir.mkdir(parents=True, exist_ok=True)

    # Try to load baseline results (best effort: a missing or malformed
    # summary is reported and the comparison continues without baselines)
    try:
        with open(metrics_dir / "summary.json", 'r') as f:
            summary = json.load(f)
            if 'baseline_results' in summary:
                # Convert string keys to integers for consistency
                for model_name, model_results in summary['baseline_results'].items():
                    results_dict[model_name] = {int(k): v for k, v in model_results.items()}
                print(f"‚úì Loaded baseline results for: {', '.join(summary['baseline_results'].keys())}")
    except Exception as e:
        print(f"Could not load baseline results: {e}")

    # Debug: Print structure of results_dict
    print("\\n" + "=" * 80)
    print("DEBUG: results_dict structure")
    print("=" * 80)
    for model_name, model_results in results_dict.items():
        print(f"\\n{model_name}:")
        print(f"  Type: {type(model_results)}")
        if isinstance(model_results, dict):
            print(f"  Keys: {list(model_results.keys())}")
            print(f"  Sample values: {dict(list(model_results.items())[:3])}")
        else:
            print(f"  Value: {model_results}")
    print("=" * 80 + "\\n")

    viz.plot_model_comparison(results_dict)
    viz.visualize_attention(transformer_model, test_dataset, idx=10)

    # Save final results
    final_results = {
        'test_results': {
            'GRU4Rec': gru_test_accs,
            'Transformer': trans_test_accs
        },
        'config': {
            'embedding_dim': config.EMBEDDING_DIM,
            'hidden_dim': config.HIDDEN_DIM,
            'num_heads': config.NUM_HEADS,
            'num_layers': config.NUM_LAYERS,
            'learning_rate': config.LEARNING_RATE,
            'batch_size': config.BATCH_SIZE
        }
    }

    with open(metrics_dir / "deep_learning_results.json", 'w') as f:
        json.dump(final_results, f, indent=2)

    print("\\n" + "=" * 80)
    print("TRAINING COMPLETE!")
    print("=" * 80)
    print(f"\\n‚úÖ Models saved to: {config.MODEL_DIR}")
    print(f"‚úÖ Visualizations saved to: {config.VIZ_DIR}")
    print(f"‚úÖ Results saved to: {config.OUTPUT_DIR / 'metrics'}")


if __name__ == "__main__":
    main()

""")

In [None]:
!python /kaggle/working/main.py


STARTING ML PIPELINE

[1/7] Loading the spotify MPD Dataset
Found 1000 files, loading 100 files...
Loading JSON files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:42<00:00,  2.37it/s]
Loaded 100,000 playlists
Loaded 2,262,292 tracks with audio features
Columns: ['acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence', 'track_uri']
After length filtering: 96,507 playlists
Tracks appearing in 10+ playlists: 67,345
Building track URI to ID mapping...
Mapping URIs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2262292/2262292 [01:27<00:00, 25886.61it/s]
Mapped 2,262,292 track URIs
Filtering tracks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 96507/96507 [00:04<00:00, 23717.82it/s]
Final dataset: 92,991 playlists
Final vocabulary: 67,345 unique tracks
Audio features for 67,345 tracks

[4/7] Creating audio feature embedding

In [None]:
!python /kaggle/working/main_2.py

DEEP LEARNING TRAINING

[1/5] Building vocabulary...
Vocabulary size: 67,345 tracks

[2/5] Creating datasets...
Train: 3,293,908 sequences
Val: 707,739 sequences
Test: 702,677 sequences

DATASET DIAGNOSTICS

Vocabulary size: 67,345
Dataset expects num_items: 67,347
Match: ‚úÖ

Sample training example:
  History indices: [4255]... (first 5)
  Target index: 54293
  Sequence length: 1

üîç Critical Checks:
  Max history index: 66986
  Max target index: 66986
  Num items in model: 67347
  Max index found: 67328
  Should be < 67347: ‚úÖ
  Samples with UNK tokens: 0/1000 (0.0%)


[3/5] Training GRU4Rec...

Training GRU4Rec

Warmup Epoch 1/3 - LR: 0.000167

POST-EPOCH-1 DIAGNOSTICS

Embedding Statistics:
  Mean norm: 15.9827
  Std norm: 0.7089
  Min norm: 13.1815
  Max norm: 19.2997

Prediction Diversity:
  Entropy: 11.0911 / 11.1176 (99.8%)


Epoch 1/1
Training GRU4Rec: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25734/25734 [21:42<00:00, 19.76it/s, loss=9.38]
Evaluating GRU4Rec: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

In [None]:
"""
Quick diagnostic script to check what's in your model checkpoints.
This helps identify if models have actual training data or just zeros.
"""
import sys
sys.path.insert(0, '/kaggle/working/')
import torch
from pathlib import Path
from TraningConfig import TrainingConfig

def _as_float(tensor):
    """Return ``tensor`` in a floating dtype so mean()/std() are defined for integer/bool tensors."""
    if tensor.is_floating_point() or tensor.is_complex():
        return tensor
    return tensor.float()


def _print_loss_history(label, empty_note, losses):
    """Print first/last/best values of a per-epoch loss list, or ``empty_note`` when it is empty."""
    print(f"\n{label} ({len(losses)} epochs):")
    if losses:
        print(f"  First epoch: {losses[0]:.4f}")
        print(f"  Last epoch:  {losses[-1]:.4f}")
        print(f"  Best (min):  {min(losses):.4f}")
    else:
        print(empty_note)


def _print_accuracy_history(accs):
    """Print the first, last and best validation-accuracy entries of ``accs``."""
    print(f"\nValidation Accuracies ({len(accs)} epochs):")
    if not accs:
        print("  ‚ö†Ô∏è  EMPTY - No accuracy data recorded!")
        return

    # Entries that are not {k: accuracy} dicts are shown as-is.
    if not isinstance(accs[0], dict):
        print(f"  First: {accs[0]}")
        print(f"  Last:  {accs[-1]}")
        return

    # "Best" is judged on the Top-10 entry; an epoch without it counts as 0.
    best_epoch = max(range(len(accs)), key=lambda i: accs[i].get(10, 0))
    sections = (
        ("First epoch:", accs[0]),
        ("Last epoch:", accs[-1]),
        (f"Best epoch: {best_epoch + 1}", accs[best_epoch]),
    )
    for heading, epoch_acc in sections:
        print(f"\n  {heading}")
        for k, v in epoch_acc.items():
            print(f"    Top-{k}: {v:.2f}%")

    if all(all(v == 0 for v in epoch_acc.values()) for epoch_acc in accs):
        print("\n  ‚ö†Ô∏è  WARNING: All accuracies are 0! Model may not have trained properly.")


def _print_model_state(state_dict):
    """Print summary statistics for the embedding table and one sample parameter."""
    print(f"\nModel parameters: {len(state_dict)} tensors")

    # Check a few key parameters
    if 'embedding.weight' in state_dict:
        emb = _as_float(state_dict['embedding.weight'])
        print(f"\nEmbedding layer:")
        print(f"  Shape: {emb.shape}")
        print(f"  Mean: {emb.mean():.6f}")
        print(f"  Std:  {emb.std():.6f}")
        print(f"  Min:  {emb.min():.6f}")
        print(f"  Max:  {emb.max():.6f}")

        # Check if initialized (should not be all zeros)
        if emb.abs().max() < 1e-6:
            print("  ‚ö†Ô∏è  WARNING: Embeddings appear uninitialized (all near zero)!")

    keys = list(state_dict.keys())
    if not keys:
        # Nothing to sample from; previously this raised IndexError.
        return

    # Sample the sixth entry when there is one, otherwise the first.
    sample_key = keys[5] if len(keys) > 5 else keys[0]
    sample_value = state_dict[sample_key]
    if not torch.is_tensor(sample_value):
        return
    # Statistics are taken on a float view: integer buffers would otherwise
    # make mean()/std() raise and the whole check be reported as a load error.
    sample_tensor = _as_float(sample_value)
    print(f"\nSample parameter '{sample_key}':")
    print(f"  Shape: {sample_tensor.shape}")
    print(f"  Mean: {sample_tensor.mean():.6f}")
    print(f"  Std:  {sample_tensor.std():.6f}")


def _has_nonzero_accuracy(accs):
    """Return True when any recorded validation accuracy is greater than zero."""
    if not accs:
        return False
    if isinstance(accs[0], dict):
        return any(any(v > 0 for v in epoch_acc.values()) for epoch_acc in accs)
    # Plain numeric histories (one number per epoch) are accepted as well.
    return any(isinstance(a, (int, float)) and a > 0 for a in accs)


def check_checkpoint(checkpoint_path):
    """Analyze a model checkpoint and report whether it holds usable training results.

    Args:
        checkpoint_path: ``pathlib.Path`` of the ``.pt`` file to inspect.

    Returns:
        True when the checkpoint contains ``train_losses``, ``val_losses`` and
        ``val_accuracies``, the training-loss history is non-empty and at
        least one validation accuracy is above zero. False when the file is
        missing, cannot be loaded or analysed, or the history is
        empty/incomplete/all zero.

    Everything is reported on stdout; no exception escapes this function.
    """
    print(f"\n{'='*80}")
    print(f"Analyzing: {checkpoint_path.name}")
    print('='*80)

    if not checkpoint_path.exists():
        print("‚ùå File does not exist!")
        return False

    try:
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        print("‚úÖ Checkpoint loaded successfully\n")

        # Check what's in the checkpoint
        print("Checkpoint contents:")
        for key in checkpoint.keys():
            print(f"  ‚Ä¢ {key}")

        # Check training history
        print("\n" + "-"*80)
        print("TRAINING HISTORY")
        print("-"*80)

        if 'train_losses' in checkpoint:
            _print_loss_history(
                "Train Losses",
                "  ‚ö†Ô∏è  EMPTY - No training data recorded!",
                checkpoint['train_losses'],
            )

        if 'val_losses' in checkpoint:
            _print_loss_history(
                "Validation Losses",
                "  ‚ö†Ô∏è  EMPTY - No validation data recorded!",
                checkpoint['val_losses'],
            )

        if 'val_accuracies' in checkpoint:
            _print_accuracy_history(checkpoint['val_accuracies'])

        # Check model state
        print("\n" + "-"*80)
        print("MODEL STATE")
        print("-"*80)

        if 'model_state_dict' in checkpoint:
            _print_model_state(checkpoint['model_state_dict'])

        # Overall assessment
        print("\n" + "="*80)
        print("ASSESSMENT")
        print("="*80)

        has_training_data = (
            'train_losses' in checkpoint and
            'val_losses' in checkpoint and
            'val_accuracies' in checkpoint and
            len(checkpoint.get('train_losses', [])) > 0
        )
        has_nonzero_accs = _has_nonzero_accuracy(checkpoint.get('val_accuracies'))

        if has_training_data and has_nonzero_accs:
            print("‚úÖ Checkpoint looks good! Contains valid training data.")
            return True
        elif has_training_data and not has_nonzero_accs:
            print("‚ö†Ô∏è  Checkpoint has training history but all accuracies are 0.")
            print("    This suggests the model didn't learn anything.")
            print("    Possible issues:")
            print("    - Learning rate too low/high")
            print("    - Data loading issue")
            print("    - Model architecture problem")
            return False
        else:
            print("‚ùå Checkpoint is incomplete or empty.")
            print("    You may need to re-train the model.")
            return False

    except Exception as e:
        # Deliberately broad: the diagnostic must keep going for the
        # remaining checkpoints, so any failure is reported as "invalid".
        print(f"‚ùå Error loading checkpoint: {e}")
        return False


def main():
    """Run the checkpoint diagnostic over every ``*.pt`` file in the model directory.

    Lists the checkpoint files found under ``TrainingConfig().MODEL_DIR``,
    passes each one to ``check_checkpoint``, then prints a per-file verdict
    table followed by recommendations chosen from the number of checkpoints
    that were judged valid. Returns early, after printing training
    instructions, when the directory contains no ``*.pt`` files.
    """
    config = TrainingConfig()
    rule = "=" * 80

    print(rule)
    print("MODEL CHECKPOINT DIAGNOSTIC")
    print(rule)
    print(f"\nChecking models in: {config.MODEL_DIR}")

    # Discover candidate checkpoints
    found = list(config.MODEL_DIR.glob("*.pt"))

    # Guard clause: nothing to inspect
    if len(found) == 0:
        print(f"\n‚ùå No checkpoint files found in {config.MODEL_DIR}")
        print("\nYou need to train models first:")
        print("  python main_2.py")
        return

    print(f"\nFound {len(found)} checkpoint files:")
    for ckpt in found:
        print(f"  ‚Ä¢ {ckpt.name}")

    # Validate each checkpoint, keyed by file name (insertion order == discovery order)
    verdicts = {ckpt.name: check_checkpoint(ckpt) for ckpt in found}

    # Per-file verdict table
    print("\n" + rule)
    print("SUMMARY")
    print(rule)

    for file_name, ok in verdicts.items():
        if ok:
            label = "‚úÖ VALID"
        else:
            label = "‚ùå INVALID/EMPTY"
        print(f"{file_name:<40} {label}")

    # Recommendations depend on how many checkpoints passed
    print("\n" + rule)
    print("RECOMMENDATIONS")
    print(rule)

    n_total = len(verdicts)
    n_valid = sum(verdicts.values())

    if n_valid == 0:
        advice = (
            "\n‚ùå No valid checkpoints found!",
            "\nAction needed:",
            "  1. Re-train your models: python main_2.py",
            "  2. Check for training errors in the logs",
            "  3. Verify your dataset is loaded correctly",
        )
    elif n_valid >= n_total:
        advice = (
            "\n‚úÖ All checkpoints are valid!",
            "\nNext steps:",
            "  1. Run: python evaluate_pretrained_models.py",
            "  2. Then: python create_poster_visualizations.py",
        )
    else:
        advice = (
            f"\n‚ö†Ô∏è  Only {n_valid}/{n_total} checkpoints are valid.",
            "\nAction needed:",
            "  1. Re-train models that failed",
            "  2. Check training logs for errors",
        )

    for line in advice:
        print(line)

    print(rule)

# Entry point: run the checkpoint diagnostic when this code executes as the main module.
if __name__ == "__main__":
    main()

MODEL CHECKPOINT DIAGNOSTIC

Checking models in: output/models_greatlakes

Found 2 checkpoint files:
  ‚Ä¢ GRU4Rec_best.pt
  ‚Ä¢ Transformer_best.pt

Analyzing: GRU4Rec_best.pt
‚úÖ Checkpoint loaded successfully

Checkpoint contents:
  ‚Ä¢ model_state_dict
  ‚Ä¢ optimizer_state_dict
  ‚Ä¢ train_losses
  ‚Ä¢ val_losses
  ‚Ä¢ val_accuracies

--------------------------------------------------------------------------------
TRAINING HISTORY
--------------------------------------------------------------------------------

Train Losses (1 epochs):
  First epoch: 9.6372
  Last epoch:  9.6372
  Best (min):  9.6372

Validation Losses (1 epochs):
  First epoch: 9.1404
  Last epoch:  9.1404
  Best (min):  9.1404

Validation Accuracies (1 epochs):

  First epoch:
    Top-1: 0.49%
    Top-5: 2.02%
    Top-10: 3.58%
    Top-20: 6.19%

  Last epoch:
    Top-1: 0.49%
    Top-5: 2.02%
    Top-10: 3.58%
    Top-20: 6.19%

  Best epoch: 1
    Top-1: 0.49%
    Top-5: 2.02%
    Top-10: 3.58%
    Top-20: 6.1

In [13]:
"""
Integration script to create all poster visualizations.
Run this AFTER training your models (main_2.py).

This script loads your trained models and results, then generates
all the publication-quality visualizations for your poster.
"""

import json
import pickle
import pandas as pd
from pathlib import Path
from EnhancedVisualizer import EnhancedVisualizer
from TraningConfig import TrainingConfig

def load_training_results(config, model_names=('GRU4Rec', 'Transformer')):
    """Load per-model training histories from the saved "best" checkpoints.

    For every name in ``model_names`` the file ``<MODEL_DIR>/<name>_best.pt``
    is looked up. When it exists it is loaded onto the CPU and its
    ``train_losses``, ``val_losses`` and ``val_accuracies`` entries are exposed
    as attributes of a lightweight stand-in object (each defaults to an empty
    list when the key is absent from the checkpoint).

    Args:
        config: Object exposing ``MODEL_DIR`` as a ``pathlib.Path``-like value.
        model_names: Model names to look for. Defaults to the two models the
            original implementation handled, in the same order.

    Returns:
        dict: ``{model_name: history_object}`` for every checkpoint that was
        found and loaded. Missing checkpoints are skipped silently; checkpoints
        that fail to load are skipped after printing a warning.
    """
    print("\n[1/4] Loading training results...")

    class TrainerData:
        """Stand-in for a trainer, carrying only the recorded history lists."""

        def __init__(self, checkpoint):
            self.train_losses = checkpoint.get('train_losses', [])
            self.val_losses = checkpoint.get('val_losses', [])
            self.val_accuracies = checkpoint.get('val_accuracies', [])

    trainers_dict = {}

    for model_name in model_names:
        # Each model is loaded independently so one bad file does not block the others.
        try:
            checkpoint_path = config.MODEL_DIR / f'{model_name}_best.pt'
            if checkpoint_path.exists():
                # Imported lazily (inside the try) so a missing torch install is
                # reported as a load warning rather than aborting the run.
                import torch

                # NOTE(review): torch.load unpickles the file; only point this
                # at checkpoints produced by a trusted training run.
                checkpoint = torch.load(checkpoint_path, map_location='cpu')

                trainers_dict[model_name] = TrainerData(checkpoint)
                print(f"  ‚úì Loaded {model_name} results")
        except Exception as e:
            print(f"  ‚ö† Could not load {model_name}: {e}")

    return trainers_dict

def load_test_results(config):
    """Gather final test metrics from the JSON files under ``OUTPUT_DIR/metrics``.

    Two optional files are consulted, in this order:

    * ``deep_learning_results.json`` - the mapping stored under its
      ``'test_results'`` key is merged in (nothing is merged if the key is absent).
    * ``summary.json`` - the mapping stored under ``'baseline_results'`` is
      merged in when that key is present.

    A file that does not exist is skipped silently; a file that cannot be read
    or parsed produces a warning and is skipped.

    Args:
        config: Object exposing ``OUTPUT_DIR`` as a ``pathlib.Path``-like value.

    Returns:
        dict: The merged metrics; empty when neither file contributed anything.
    """
    print("\n[2/4] Loading test results...")

    merged_metrics = {}

    # --- deep learning models -------------------------------------------------
    try:
        dl_file = config.OUTPUT_DIR / "metrics" / "deep_learning_results.json"
        if dl_file.exists():
            with dl_file.open('r') as handle:
                dl_payload = json.load(handle)
            merged_metrics.update(dl_payload.get('test_results', {}))
            print("  ‚úì Loaded deep learning test results")
    except Exception as err:
        print(f"  ‚ö† Could not load DL results: {err}")

    # --- baselines --------------------------------------------------------------
    try:
        summary_file = config.OUTPUT_DIR / "metrics" / "summary.json"
        if summary_file.exists():
            with summary_file.open('r') as handle:
                summary_payload = json.load(handle)
            if 'baseline_results' in summary_payload:
                merged_metrics.update(summary_payload['baseline_results'])
            print("  ‚úì Loaded baseline results")
    except Exception as err:
        print(f"  ‚ö† Could not load baseline results: {err}")

    return merged_metrics

def load_sequences_data(config):
    """Read the training sequences from ``DATA_DIR/train.parquet``.

    Args:
        config: Object exposing ``DATA_DIR`` as a ``pathlib.Path``-like value.

    Returns:
        The DataFrame read from the parquet file, or ``None`` (after printing
        a warning) when the file cannot be read for any reason.
    """
    print("\n[3/4] Loading sequence data...")

    sequences = None
    try:
        loaded = pd.read_parquet(config.DATA_DIR / "train.parquet")
        print(f"  ‚úì Loaded {len(loaded):,} training sequences")
        sequences = loaded
    except Exception as err:
        print(f"  ‚ö† Could not load sequences: {err}")

    return sequences

# Default location of the audio-features CSV (HuggingFace-hosted dataset).
_AUDIO_FEATURES_URL = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/main/dataset.csv"


def load_audio_features(config, source=_AUDIO_FEATURES_URL):
    """Load the audio-features table used for the feature analysis plots.

    Args:
        config: Accepted for signature parity with the other loaders; it is
            not read by this function.
        source: Anything ``pandas.read_csv`` accepts (URL, local path, file
            object). Defaults to the HuggingFace-hosted CSV, so existing
            callers keep their behaviour; pass a local path to avoid the
            network round trip.

    Returns:
        The DataFrame parsed from ``source``, or ``None`` (after printing a
        warning) when it cannot be fetched or parsed.
    """
    print("\n[4/4] Loading audio features...")

    try:
        audio_df = pd.read_csv(source)
        print(f"  ‚úì Loaded {len(audio_df):,} tracks with audio features")
        return audio_df
    except Exception as e:
        print(f"  ‚ö† Could not load audio features: {e}")
        return None

def calculate_summary_stats(config, results_dict, sequences_df):
    """Assemble the summary statistics shown on the poster.

    The result is built up in this order (later sources overwrite earlier
    keys of the same name):

    1. Dataset counts derived from ``sequences_df`` (when it is not ``None``):
       ``num_playlists``, ``train_sequences`` and ``num_unique_tracks``.
    2. Every top-level entry of ``OUTPUT_DIR/metrics/summary.json`` when that
       file exists and can be parsed.
    3. ``best_model_results`` - the ``'Transformer'`` entry of ``results_dict``
       (the Transformer is assumed to be the best model).
    4. ``baseline_top10`` - the top-10 metric of the ``'KNN'`` entry.

    Args:
        config: Object exposing ``OUTPUT_DIR`` as a ``pathlib.Path``-like value.
        results_dict: Mapping of model name -> metrics mapping.
        sequences_df: DataFrame with ``playlist_id`` and ``target`` columns,
            or ``None`` when sequence data is unavailable.

    Returns:
        dict: The collected statistics (possibly empty).
    """
    summary_stats = {}

    # Dataset stats
    if sequences_df is not None:
        summary_stats['num_playlists'] = sequences_df['playlist_id'].nunique()
        summary_stats['train_sequences'] = len(sequences_df)
        summary_stats['num_unique_tracks'] = len(sequences_df['target'].unique())

    # Merge in previously saved summary values. This stays best-effort, but a
    # corrupt or unreadable file is now reported instead of silently ignored.
    try:
        summary_path = config.OUTPUT_DIR / "metrics" / "summary.json"
        if summary_path.exists():
            with open(summary_path, 'r') as f:
                saved_summary = json.load(f)
            summary_stats.update(saved_summary)
    except Exception as e:
        print(f"  ‚ö† Could not load saved summary: {e}")

    # Best model results (assume Transformer is best)
    if 'Transformer' in results_dict:
        summary_stats['best_model_results'] = results_dict['Transformer']

    # Baseline for comparison. Metrics that round-tripped through JSON have
    # string keys ("10"), so fall back to the string form when the integer key
    # is absent; otherwise the baseline would always be reported as 0.
    if 'KNN' in results_dict:
        knn_results = results_dict['KNN']
        summary_stats['baseline_top10'] = knn_results.get(10, knn_results.get('10', 0))

    return summary_stats

def main():
    """Create all poster visualizations from previously saved artifacts.

    Loads training histories, test metrics, training sequences and audio
    features through the sibling loader functions, derives the summary
    statistics, and then runs each plotting method of ``EnhancedVisualizer``.

    Every plotting step is isolated: an exception is reported, recorded, and
    the remaining steps still run. The closing banner states whether all
    attempted steps succeeded or names the ones that failed, rather than
    unconditionally claiming success.

    Returns early (after printing instructions) when neither training
    histories nor test results could be loaded.
    """
    rule = "=" * 80

    print(rule)
    print("CREATING POSTER VISUALIZATIONS")
    print(rule)

    # Initialize
    config = TrainingConfig()
    viz = EnhancedVisualizer(config.OUTPUT_DIR)

    # Load all data
    trainers_dict = load_training_results(config)
    results_dict = load_test_results(config)
    sequences_df = load_sequences_data(config)
    audio_df = load_audio_features(config)

    if not trainers_dict and not results_dict:
        print("\n‚ùå ERROR: No training results found!")
        print("Please run main_2.py first to train models.")
        return

    # Calculate summary stats
    summary_stats = calculate_summary_stats(config, results_dict, sequences_df)

    print("\n" + rule)
    print("GENERATING VISUALIZATIONS")
    print(rule)

    failed_steps = []

    def run_step(announcement, label, render):
        """Announce and run one plotting step; report and record any failure."""
        try:
            print(announcement)
            # `render` is a zero-argument callable so that even the attribute
            # lookup on `viz` happens inside this try block.
            render()
        except Exception as e:
            print(f"  ‚ö† Error creating {label}: {e}")
            failed_steps.append(label)

    # 1. Learning curves (if training data available)
    if trainers_dict:
        run_step("\n[1/9] Creating detailed learning curves...",
                 "learning curves",
                 lambda: viz.plot_learning_curves_comparison(trainers_dict))

    # 2. Architecture comparison (if test results available)
    if results_dict:
        run_step("\n[2/9] Creating architecture comparison...",
                 "architecture comparison",
                 lambda: viz.plot_architecture_comparison(results_dict))

    # 3. Model improvement timeline
    if results_dict:
        run_step("\n[3/9] Creating model improvement timeline...",
                 "improvement timeline",
                 lambda: viz.plot_model_improvement_timeline(results_dict))

    # 4. Training efficiency
    if trainers_dict:
        run_step("\n[4/9] Creating training efficiency analysis...",
                 "training efficiency",
                 lambda: viz.plot_training_efficiency(trainers_dict))

    # 5. Poster summary
    if summary_stats:
        run_step("\n[5/9] Creating poster summary figure...",
                 "poster summary",
                 lambda: viz.create_poster_summary_figure(summary_stats))

    # 6. Data distribution analysis
    if sequences_df is not None:
        run_step("\n[6/9] Creating data distribution analysis...",
                 "data distribution",
                 lambda: viz.plot_data_distribution_analysis(sequences_df))

    # 7. Audio features analysis
    if audio_df is not None:
        run_step("\n[7/9] Creating audio features analysis...",
                 "audio features",
                 lambda: viz.plot_audio_features_analysis(audio_df))

    # 8. Feature importance (example - you can customize)
    # These are example values - you can calculate real ones from your embeddings
    feature_correlations = {
        'energy': 0.342,
        'danceability': 0.298,
        'valence': 0.256,
        'tempo': 0.189,
        'loudness': 0.167,
        'acousticness': 0.134,
        'instrumentalness': 0.098,
        'speechiness': 0.076
    }
    run_step("\n[8/9] Creating feature importance plot...",
             "feature importance",
             lambda: viz.plot_feature_importance(feature_correlations))

    # 9. Error analysis (if you have predictions - optional)
    # Uncomment this if you want to analyze specific predictions
    # print("\n[9/9] Creating error analysis...")
    # predictions = [...]  # Load your predictions
    # targets = [...]      # Load your targets
    # positions = [...]    # Load position data
    # viz.plot_error_analysis(predictions, targets, positions)

    # Closing banner: only claim full success when no step raised.
    print("\n" + rule)
    if failed_steps:
        print(f"‚ö† COMPLETED WITH ERRORS: {len(failed_steps)} step(s) failed "
              f"({', '.join(failed_steps)})")
    else:
        print("‚úÖ ALL VISUALIZATIONS CREATED!")
    print(rule)
    print(f"\nüìÅ Visualizations saved to: {config.VIZ_DIR}")
    print("\nüìä Files created:")

    # List all created files
    viz_files = list(config.VIZ_DIR.glob("*.png"))
    for i, file in enumerate(sorted(viz_files), 1):
        print(f"  {i}. {file.name}")

    print("\nüí° Tips for your poster:")
    print("  ‚Ä¢ Use 'poster_summary.png' in your header section")
    print("  ‚Ä¢ Use 'architecture_comparison.png' for results section")
    print("  ‚Ä¢ Use 'improvement_timeline.png' to show your contribution")
    print("  ‚Ä¢ Use 'learning_curves_detailed.png' to show training process")
    print("  ‚Ä¢ Use 'data_distribution.png' in methodology section")
    print("  ‚Ä¢ All images are 300 DPI - suitable for printing!")

    print("\nüéì Recommended poster sections:")
    print("  1. TITLE + SUMMARY: poster_summary.png")
    print("  2. DATASET: data_distribution.png")
    print("  3. METHODOLOGY: architecture diagram (create separately)")
    print("  4. TRAINING: learning_curves_detailed.png")
    print("  5. RESULTS: architecture_comparison.png + improvement_timeline.png")
    print("  6. FEATURES: audio_features.png + feature_importance.png")
    print("  7. CONCLUSION: Highlight key findings from summary")

    print("\n" + rule)

# Entry point: generate the poster visualizations when this code executes as the main module.
if __name__ == "__main__":
    main()

CREATING POSTER VISUALIZATIONS

[1/4] Loading training results...
  ‚úì Loaded GRU4Rec results
  ‚úì Loaded Transformer results

[2/4] Loading test results...
  ‚úì Loaded deep learning test results
  ‚úì Loaded baseline results

[3/4] Loading sequence data...
  ‚úì Loaded 3,293,908 training sequences

[4/4] Loading audio features...
  ‚úì Loaded 114,000 tracks with audio features

GENERATING VISUALIZATIONS

[1/9] Creating detailed learning curves...
‚úì Saved: learning_curves_detailed.png

[2/9] Creating architecture comparison...
‚úì Saved: architecture_comparison.png

[3/9] Creating model improvement timeline...
‚úì Saved: improvement_timeline.png

[4/9] Creating training efficiency analysis...
‚úì Saved: training_efficiency.png

[5/9] Creating poster summary figure...
‚úì Saved: poster_summary.png

[6/9] Creating data distribution analysis...
‚úì Saved: data_distribution.png

[7/9] Creating audio features analysis...
‚úì Saved: audio_features.png

[8/9] Creating feature importance 