# Test Notebook Setup

## Setup and Imports

In [3]:
import os
import subprocess
import zipfile

import pandas as pd
from datasets import load_dataset

# Announce the start of the run; this only prints if the imports above succeeded.
print("Starting dataset download process...")

Starting dataset download process...


### Download Chatbot Arena Dataset

In [5]:
# Download the Chatbot Arena human-preference CSV from Hugging Face and keep a
# local copy in the raw data folder.
print("Downloading Chatbot Arena dataset from Hugging Face...")
print("This may take 5-10 minutes depending on your internet speed...")

# read_csv already returns a DataFrame, so no conversion step is needed.
df = pd.read_csv("hf://datasets/lmarena-ai/arena-human-preference-55k/train.csv")

# Both names stay bound (to the same frame) for any later cell that uses either.
df_arena = df

# Save to our raw data folder. Create the folder first: to_csv does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
output_path = '../data/raw/chatbot_arena.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_arena.to_csv(output_path, index=False)

print(f"✓ Chatbot Arena dataset saved to {output_path}")
print(f"  Shape: {df_arena.shape}")
print(f"  Columns: {list(df_arena.columns)}")

Downloading Chatbot Arena dataset from Hugging Face...
This may take 5-10 minutes depending on your internet speed...
✓ Chatbot Arena dataset saved to ../data/raw/chatbot_arena.csv
  Shape: (57477, 9)
  Columns: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']


### Download Kaggle Dataset

In [6]:
# Download the Kaggle LLM comparison dataset with the Kaggle CLI, then unpack
# the archive into the raw data folder.
print("Downloading Kaggle LLM comparison dataset...")

# Make sure the download target exists before handing it to the CLI.
raw_dir = '../data/raw/'
os.makedirs(raw_dir, exist_ok=True)

# Run the CLI with an argument list (no shell string to quote) and keep its
# exit status so a failed download is reported instead of silently ignored.
download_cmd = [
    'kaggle', 'datasets', 'download',
    '-d', 'samayashar/large-language-models-comparison-dataset',
    '-p', raw_dir,
]
try:
    exit_code = subprocess.run(download_cmd, check=False).returncode
except FileNotFoundError:
    # The kaggle executable is not on PATH; warn and fall through to the
    # zip check below rather than aborting the cell with a traceback.
    exit_code = None
    print("Warning: 'kaggle' command not found. Is the Kaggle CLI installed?")
else:
    if exit_code != 0:
        print(f"Warning: kaggle download exited with status {exit_code}.")

# Find and unzip the file
zip_path = '../data/raw/large-language-models-comparison-dataset.zip'

if os.path.exists(zip_path):
    print("Extracting zip file...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('../data/raw/')
    print("✓ Dataset extracted")

    # Remove zip file to save space
    os.remove(zip_path)
    print("✓ Zip file removed")
else:
    print("Warning: Zip file not found. Check if download completed.")

Downloading Kaggle LLM comparison dataset...
Dataset URL: https://www.kaggle.com/datasets/samayashar/large-language-models-comparison-dataset
License(s): CC0-1.0
Downloading large-language-models-comparison-dataset.zip to ../data/raw

Extracting zip file...
✓ Dataset extracted
✓ Zip file removed


100%|██████████| 5.76k/5.76k [00:00<00:00, 11.7MB/s]


### Load and Inspect Kaggle Dataset

In [8]:
# Find CSV files in the raw folder, skipping the Chatbot Arena export (its
# filename contains 'chatbot'). os.listdir returns entries in arbitrary order,
# so sort them to make the choice of file below reproducible across runs.
csv_files = sorted(
    f for f in os.listdir('../data/raw/')
    if f.endswith('.csv') and 'chatbot' not in f
)
print(f"CSV files found: {csv_files}")

# Load the Kaggle dataset (adjust filename as needed)
if csv_files:
    kaggle_file = csv_files[0]
    if len(csv_files) > 1:
        # Several candidates: say which one was picked so it can be overridden.
        print(f"Note: multiple CSV files found; using the first one: {kaggle_file}")
    df_kaggle = pd.read_csv(os.path.join('../data/raw/', kaggle_file))

    print(f"\n✓ Kaggle dataset loaded: {kaggle_file}")
    print(f"  Shape: {df_kaggle.shape}")
    print(f"  Columns: {list(df_kaggle.columns)}")
else:
    # Without this branch the cell finishes silently and df_kaggle is left
    # undefined (or stale from an earlier run of the kernel).
    print("Warning: No Kaggle CSV file found in ../data/raw/. Check if the download and extraction completed.")

CSV files found: ['llm_comparison_dataset.csv']

✓ Kaggle dataset loaded: llm_comparison_dataset.csv
  Shape: (200, 15)
  Columns: ['Model', 'Provider', 'Context Window', 'Speed (tokens/sec)', 'Latency (sec)', 'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)', 'Open-Source', 'Price / Million Tokens', 'Training Dataset Size', 'Compute Power', 'Energy Efficiency', 'Quality Rating', 'Speed Rating', 'Price Rating']
