# Data Loader

This file provides utility functions and classes for loading and handling the project's data.

### Class to import in other files to load datasets of the project

In [7]:
import os
import pandas as pd
from typing import Dict, List, Optional, Tuple, Any
import numpy as np

class DataLoader:
    """
    Load the project's CSV datasets from a single data folder.

    The class maps short dataset keys (``"reviews"``, ``"train_k6"``, ...) to
    CSV file names inside ``data_folder`` and exposes one loader per dataset,
    plus a Cornac-oriented entry point and a few inspection utilities.

    Attributes:
        data_folder: Folder that contains the CSV files.
        files: Mapping of dataset key -> CSV file name.
        paths: Mapping of dataset key -> file path (``data_folder`` joined
            with the file name).
        library_configs: Per-library column configuration. Only ``'cornac'``
            is configured.
    """

    def __init__(self, data_folder: str):
        """
        Build the dataset path table; no file is read at construction time.

        Args:
            data_folder: Folder that contains the CSV files.
        """
        self.data_folder = data_folder

        # Core dataset files
        self.files = {
            # Original datasets
            "reviews": "reviews.csv",
            "recipes": "recipes.csv",
            "clean_reviews": "clean_reviews.csv",

            # K-core filtered datasets for Cornac
            "reviews_k6": "reviews_dataset_k6.csv",
            "train_k6": "reviews_train_k6.csv",
            "test_k6": "reviews_test_k6.csv",
        }

        self.paths = {
            key: os.path.join(self.data_folder, filename)
            for key, filename in self.files.items()
        }

        # Library-specific configurations
        self.library_configs = {
            'cornac': {
                'required_columns': ['AuthorId', 'RecipeId', 'Rating'],
                'user_col': 'AuthorId',
                'item_col': 'RecipeId',
                'rating_col': 'Rating',
                'default_dataset': 'reviews_k6'
            }
        }

    # ==================== BASIC LOADING METHODS ====================

    def load_raw_reviews(self) -> pd.DataFrame:
        """Load original reviews dataset"""
        return pd.read_csv(self.paths["reviews"])

    def load_recipes(self) -> pd.DataFrame:
        """Load recipes dataset"""
        return pd.read_csv(self.paths["recipes"])

    def load_reviews(self) -> pd.DataFrame:
        """
        Load cleaned reviews dataset.

        Falls back to the original reviews file when the cleaned file does
        not exist.
        """
        if os.path.exists(self.paths["clean_reviews"]):
            return pd.read_csv(self.paths["clean_reviews"])
        # The fallback must target the raw loader: calling load_reviews()
        # here again would recurse forever while the cleaned file is missing.
        return self.load_raw_reviews()

    # ==================== K-CORE FILTERED DATASETS ====================

    def load_reviews_k6(self) -> pd.DataFrame:
        """Load k-core filtered dataset (K=6) - optimized for Cornac"""
        return pd.read_csv(self.paths["reviews_k6"])

    def load_train_k6(self) -> pd.DataFrame:
        """Load k-core filtered training set (K=6)"""
        return pd.read_csv(self.paths["train_k6"])

    def load_test_k6(self) -> pd.DataFrame:
        """Load k-core filtered test set (K=6)"""
        return pd.read_csv(self.paths["test_k6"])

    def load_k6_split(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load both k-core filtered train and test sets"""
        return self.load_train_k6(), self.load_test_k6()

    # ==================== LIBRARY-SPECIFIC METHODS ====================

    def load_for_cornac(self, dataset_type: str = 'full', return_mappings: bool = False) -> Any:
        """
        Load a k-core dataset and check it has the columns configured for Cornac.

        Args:
            dataset_type: 'full', 'train', 'test', or 'split'
            return_mappings: If True, also return user/item ID mappings.
                Ignored when ``dataset_type`` is 'split'.

        Returns:
            * 'full' / 'train' / 'test': the DataFrame, or
              ``(df, user_mapping, item_mapping)`` when ``return_mappings`` is
              True. Each mapping assigns every distinct ID its position in
              order of first appearance.
            * 'split': the tuple ``(train_df, test_df)``.

        Raises:
            ValueError: If ``dataset_type`` is unknown or a loaded dataset is
                missing one of the required columns.
        """
        config = self.library_configs['cornac']
        required_columns = config['required_columns']

        if dataset_type == 'split':
            train_df, test_df = self.load_k6_split()
            # Validate both halves so 'split' gives the same column guarantee
            # as the single-frame modes.
            self._validate_columns(train_df, required_columns)
            self._validate_columns(test_df, required_columns)
            return train_df, test_df

        loaders = {
            'full': self.load_reviews_k6,
            'train': self.load_train_k6,
            'test': self.load_test_k6,
        }
        if dataset_type not in loaders:
            raise ValueError(f"Unknown dataset_type: {dataset_type}")
        df = loaders[dataset_type]()

        # Ensure required columns exist
        self._validate_columns(df, required_columns)

        if not return_mappings:
            return df

        user_mapping = {user: idx for idx, user in enumerate(df[config['user_col']].unique())}
        item_mapping = {item: idx for idx, item in enumerate(df[config['item_col']].unique())}
        return df, user_mapping, item_mapping

    # ==================== UTILITY METHODS ====================

    def get_available_files(self) -> List[str]:
        """Get list of available dataset files, formatted as ``"key: filename"``"""
        return [
            f"{key}: {filename}"
            for key, filename in self.files.items()
            if os.path.exists(self.paths[key])
        ]

    def get_dataset_info(self, dataset_name: str) -> Dict[str, Any]:
        """
        Get information about a specific dataset.

        The CSV file is read in full in order to compute the statistics.

        Args:
            dataset_name: One of the keys of ``self.files``.

        Returns:
            ``{"exists": False, "path": ...}`` when the file is missing.
            Otherwise a dict with ``exists``, ``path``, ``shape``, ``columns``
            and ``file_size_mb``; when at least one of the AuthorId / RecipeId /
            Rating columns is present it also holds ``n_users``, ``n_items``,
            ``n_ratings``, ``rating_range`` and ``sparsity`` (each one None
            when the column(s) it depends on are absent).

        Raises:
            ValueError: If ``dataset_name`` is not a known dataset key.
        """
        if dataset_name not in self.files:
            raise ValueError(f"Unknown dataset: {dataset_name}")

        path = self.paths[dataset_name]
        if not os.path.exists(path):
            return {"exists": False, "path": path}

        # Load dataset to get info
        df = pd.read_csv(path)

        info = {
            "exists": True,
            "path": path,
            "shape": df.shape,
            "columns": list(df.columns),
            "file_size_mb": os.path.getsize(path) / (1024 * 1024)
        }

        has_user = 'AuthorId' in df.columns
        has_item = 'RecipeId' in df.columns
        has_rating = 'Rating' in df.columns

        # Add specific info if it's a reviews dataset
        if has_user or has_item or has_rating:
            info.update({
                "n_users": df['AuthorId'].nunique() if has_user else None,
                "n_items": df['RecipeId'].nunique() if has_item else None,
                "n_ratings": len(df),
                "rating_range": (df['Rating'].min(), df['Rating'].max()) if has_rating else None,
                "sparsity": self._calculate_sparsity(df) if has_user and has_item else None
            })

        return info

    def list_datasets_by_library(self, library: str) -> Dict[str, Any]:
        """
        List recommended datasets for a specific library.

        Only the k-core datasets whose file exists and that contain every
        column required by the library are reported.

        Args:
            library: A key of ``self.library_configs``.

        Returns:
            Dict with ``library``, ``config`` and ``recommended_datasets``
            (a list of ``{'name', 'info', 'recommended_for'}`` dicts).

        Raises:
            ValueError: If ``library`` is not configured.
        """
        if library not in self.library_configs:
            raise ValueError(f"Unknown library: {library}. Supported: {list(self.library_configs.keys())}")

        config = self.library_configs[library]
        recommended_datasets = []

        # Check which datasets are compatible
        for dataset_name in ['reviews_k6', 'train_k6', 'test_k6']:
            if not os.path.exists(self.paths[dataset_name]):
                continue
            info = self.get_dataset_info(dataset_name)
            if not all(col in info['columns'] for col in config['required_columns']):
                continue
            is_split_part = 'train' in dataset_name or 'test' in dataset_name
            recommended_datasets.append({
                'name': dataset_name,
                'info': info,
                'recommended_for': ['training', 'evaluation'] if is_split_part else ['full_dataset']
            })

        return {
            'library': library,
            'config': config,
            'recommended_datasets': recommended_datasets
        }

    # ==================== PRIVATE HELPER METHODS ====================

    def _validate_columns(self, df: pd.DataFrame, required_cols: List[str]) -> None:
        """
        Validate that DataFrame has required columns.

        Raises:
            ValueError: Listing every required column that is missing.
        """
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

    def _calculate_sparsity(self, df: pd.DataFrame) -> float:
        """
        Calculate sparsity of the user-item matrix, as a percentage.

        Sparsity is ``(1 - n_ratings / (n_users * n_items)) * 100`` where
        ``n_ratings`` is the number of rows of ``df``.

        Returns:
            The sparsity percentage, or NaN when the matrix has no users or
            no items (the ratio is undefined in that case).
        """
        n_users = df['AuthorId'].nunique()
        n_items = df['RecipeId'].nunique()
        possible_ratings = n_users * n_items
        if possible_ratings == 0:
            # Empty frame, or an ID column holding only missing values:
            # avoid a ZeroDivisionError and report "undefined".
            return float("nan")
        return (1 - len(df) / possible_ratings) * 100

    @staticmethod
    def _format_stat(value: Any, spec: str) -> str:
        """Format ``value`` with format spec ``spec``; return 'N/A' when it is None."""
        return "N/A" if value is None else format(value, spec)

    # ==================== HELP METHOD ====================

    def help(self) -> None:
        """
        Display a comprehensive help guide for the DataLoader class.

        Prints an overview, the dataset files currently present, the public
        methods of the class, quick-start examples and, when the k-core
        dataset file exists, a few statistics about it. Statistics are
        best-effort: a failure to read the file is reported in the output
        instead of being raised.
        """
        print("=" * 100)
        print("🚀 DataLoader Class - Comprehensive Help Guide")
        print("=" * 100)

        print("\n📋 OVERVIEW:")
        print("-" * 50)
        print("Flexible DataLoader for loading datasets optimized for different recommender system libraries")
        print("Supports: Cornac and custom usage")
        print(f"Data folder: {self.data_folder}")

        print("\n📁 AVAILABLE DATASETS:")
        print("-" * 50)
        available = self.get_available_files()
        if available:
            for file_info in available:
                print(f"  ✅ {file_info}")
        else:
            print("  ❌ No datasets found in the data folder")

        # Only methods that actually exist on this class are advertised below.
        print("\n🔧 BASIC LOADING METHODS:")
        print("-" * 50)
        basic_methods = [
            ("load_raw_reviews()", "Load original reviews dataset"),
            ("load_recipes()", "Load recipes dataset"),
            ("load_reviews()", "Load cleaned reviews dataset (falls back to original reviews)")
        ]
        for method, description in basic_methods:
            print(f"  📄 {method:<25} → {description}")

        print("\n⭐ K-CORE FILTERED DATASETS (Recommended for Cornac):")
        print("-" * 50)
        kcore_methods = [
            ("load_reviews_k6()", "Load k-core filtered dataset (K=6) - optimized for Cornac"),
            ("load_train_k6()", "Load k-core filtered training set (K=6)"),
            ("load_test_k6()", "Load k-core filtered test set (K=6)"),
            ("load_k6_split()", "Load both k-core filtered train and test sets")
        ]
        for method, description in kcore_methods:
            print(f"  ⭐ {method:<25} → {description}")

        print("\n🎯 LIBRARY-SPECIFIC METHODS:")
        print("-" * 50)
        library_methods = [
            ("load_for_cornac()", "Load data optimized for Cornac library")
        ]
        for method, description in library_methods:
            print(f"  🎯 {method:<25} → {description}")

        print("\n🛠️  UTILITY METHODS:")
        print("-" * 50)
        utility_methods = [
            ("get_available_files()", "Get list of available dataset files"),
            ("get_dataset_info(name)", "Get detailed information about a specific dataset"),
            ("list_datasets_by_library(lib)", "List recommended datasets for a specific library"),
            ("help()", "Display this help guide")
        ]
        for method, description in utility_methods:
            print(f"  🛠️  {method:<25} → {description}")

        print("\n💡 QUICK START EXAMPLES:")
        print("-" * 50)

        print("\n  🔹 For Cornac:")
        print("     train_df, test_df = loader.load_for_cornac(dataset_type='split')")

        print("\n  🔹 Get dataset info:")
        print("     info = loader.get_dataset_info('reviews_k6')")

        print("\n  🔹 Check library compatibility:")
        print("     recommendations = loader.list_datasets_by_library('cornac')")

        print("\n📊 DATASET PARAMETERS:")
        print("-" * 50)
        print("  dataset_type options: 'full', 'train', 'test', 'split'")
        supported = ", ".join(repr(name) for name in self.library_configs)
        print(f"  Supported libraries: {supported}")

        # Show current dataset stats if available
        if os.path.exists(self.paths['reviews_k6']):
            try:
                info = self.get_dataset_info('reviews_k6')
            except Exception as exc:
                # help() is a display boundary: an unreadable file must not
                # abort the guide, but the failure is reported, not hidden.
                print(f"\n⚠️  Could not read k-core dataset stats: {exc}")
            else:
                print("\n📈 CURRENT K-CORE DATASET STATS:")
                print("-" * 50)
                # _format_stat prints 'N/A' for missing values instead of
                # applying a numeric format spec to a non-number.
                print(f"  👥 Users: {self._format_stat(info.get('n_users'), ',')}")
                print(f"  🍳 Recipes: {self._format_stat(info.get('n_items'), ',')}")
                print(f"  ⭐ Ratings: {self._format_stat(info.get('n_ratings'), ',')}")
                sparsity = self._format_stat(info.get('sparsity'), '.2f')
                print(f"  📏 Sparsity: {sparsity}{'' if sparsity == 'N/A' else '%'}")
                size_mb = self._format_stat(info.get('file_size_mb'), '.1f')
                print(f"  💾 File size: {size_mb}{'' if size_mb == 'N/A' else ' MB'}")

        print("\n" + "=" * 100)
        print("🎉 Ready to load data for your recommender system experiments!")
        print("💬 Use loader.help() anytime to see this guide again.")
        print("=" * 100)

In [8]:
# Usage examples for DataLoader. They are wrapped in a bare string literal, so
# none of the statements below execute when this cell runs; copy them out of the
# string to try them.
"""
# ==================== USAGE EXAMPLES ====================

# Initialize the DataLoader
data_folder = "data"  # Adjust path as needed
loader = DataLoader(data_folder)

print("=" * 80)
print("DATALOADER USAGE EXAMPLES")
print("=" * 80)

# 1. Check available datasets
print("\n1. Available datasets:")
print("-" * 40)
available_files = loader.get_available_files()
for file in available_files:
    print(f"  ✓ {file}")

# 2. Load k-core filtered dataset for Cornac
print("\n2. Loading data for Cornac:")
print("-" * 40)
try:
    # Load full k-core dataset
    cornac_data = loader.load_for_cornac(dataset_type='full')
    print(f"  ✓ Full dataset loaded: {cornac_data.shape}")
    print(f"    Users: {cornac_data['AuthorId'].nunique():,}")
    print(f"    Recipes: {cornac_data['RecipeId'].nunique():,}")
    print(f"    Ratings: {len(cornac_data):,}")
    
    # Load train/test split
    train_data, test_data = loader.load_for_cornac(dataset_type='split')
    print(f"  ✓ Train/Test split loaded:")
    print(f"    Train: {train_data.shape}")
    print(f"    Test: {test_data.shape}")
    
except Exception as e:
    print(f"  ✗ Error loading Cornac data: {e}")

# 5. Get dataset information
print("\n5. Dataset information:")
print("-" * 40)
try:
    info = loader.get_dataset_info('reviews_k6')
    print(f"  ✓ K6 dataset info:")
    print(f"    Shape: {info.get('shape', 'N/A')}")
    print(f"    Users: {info.get('n_users', 'N/A'):,}")
    print(f"    Items: {info.get('n_items', 'N/A'):,}")
    print(f"    Ratings: {info.get('n_ratings', 'N/A'):,}")
    print(f"    Sparsity: {info.get('sparsity', 'N/A'):.2f}%")
    print(f"    File size: {info.get('file_size_mb', 'N/A'):.1f} MB")
except Exception as e:
    print(f"  ✗ Error getting dataset info: {e}")

# 6. Library-specific recommendations
print("\n6. Library-specific recommendations:")
print("-" * 40)
for library in ['cornac']:
    try:
        recommendations = loader.list_datasets_by_library(library)
        print(f"  ✓ {library.upper()}:")
        print(f"    Default dataset: {recommendations['config']['default_dataset']}")
        print(f"    Available datasets: {len(recommendations['recommended_datasets'])}")
    except Exception as e:
        print(f"  ✗ Error getting {library} recommendations: {e}")

print("\n" + "=" * 80)
print("DataLoader ready for use! 🚀")
print("=" * 80)"""



In [9]:
# loader.help()  # Display the help guide