In [1]:
!pip install nltk




In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import json
import re
import random
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# Load tokenizer for token counting
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

def preprocess_document(text):
    """Clean and preprocess the document text."""
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters that might interfere with processing
    text = re.sub(r'[^\w\s.,;:?!()\[\]{}\-\'\"$%]', ' ', text)
    return text.strip()

def extract_sections(text):
    """Extract main sections from the document."""
    # Simple section extraction based on patterns in the document
    sections = {}

    # Extract overview
    if "Deepseek V3: A Game-Changer in A.I. Here's Why It Matters" in text:
        overview_end = text.find("So what made Deepseek such a big impact to A.I. ?")
        if overview_end > 0:
            sections["overview"] = text[:overview_end].strip()

    # Extract significance
    significance_start = text.find("So what made Deepseek such a big impact to A.I. ?")
    significance_end = text.find("Summary of how Deepseek v3 was so efficient at training the frontier model")
    if significance_start > 0 and significance_end > 0:
        sections["significance"] = text[significance_start:significance_end].strip()

    # Extract model architecture
    architecture_start = text.find("Model Architecture")
    architecture_end = text.find("FP8 Mixed Precision Training:")
    if architecture_start > 0 and architecture_end > 0:
        sections["architecture"] = text[architecture_start:architecture_end].strip()

    # Extract training approach
    training_start = text.find("FP8 Mixed Precision Training:")
    training_end = text.find("Load Balancing Strategy")
    if training_start > 0 and training_end > 0:
        sections["training"] = text[training_start:training_end].strip()

    # Extract cost analysis
    cost_start = text.find("Breakdown of the costs of the Deepseek v3 model")
    if cost_start > 0:
        sections["cost"] = text[cost_start:].strip()

    return sections

def generate_qa_pairs(document, sections):
    """Generate question-answer pairs from the document."""
    qa_pairs = []

    # General questions about the paper
    qa_pairs.extend([
        {
            "question": "What is DeepSeek V3 and why is it significant in the AI industry?",
            "answer": "DeepSeek V3 is an AI model from the Chinese startup Deepseek that has caused a stir in the AI space due to its impressive performance at a fraction of the training and inference cost compared to competitors. It uses a Mixture-of-Experts (MoE) architecture with 671B parameters but only 37B active parameters per token, which significantly reduces compute requirements. Its significance lies in improving algorithms at the software level rather than pushing for better hardware, making AI development more efficient and potentially disrupting the industry.",
            "source": "overview"
        },
        {
            "question": "How did DeepSeek V3 achieve training efficiency compared to other large language models?",
            "answer": "DeepSeek V3 achieved significant training efficiency through several innovations: 1) Using a Mixture-of-Experts (MoE) architecture where only 37B parameters fire for each token out of the total 671B, 2) Implementing FP8 mixed precision training which reduced memory usage by up to 50% compared to traditional formats, 3) Developing a custom training framework called HAI-LLM with optimizations like the DualPipe algorithm for efficient pipeline parallelism, 4) Using Multi-head Latent Attention (MLA) to compress the Key-Value cache, 5) Pioneering an auxiliary loss-free strategy for load balancing in the MoE architecture, and 6) Implementing efficient cross-node communication kernels and memory optimizations.",
            "source": "significance,architecture,training"
        },
        {
            "question": "What was the approximate cost of training DeepSeek V3 and how does it compare to other models?",
            "answer": "The approximate cost of training DeepSeek V3 was around $5.5-5.6 million, which included approximately 2.788 million GPU hours at an estimated rate of $2 per GPU hour. The training used a cluster of 2,048 H800 GPUs and was completed in less than two months. This is significantly more efficient than competitor models like Llama 3.1, which reportedly required 30.84 million GPU hours for training on a similar amount of data (15 trillion tokens vs. DeepSeek's 14.8 trillion tokens).",
            "source": "cost"
        }
    ])

    # Extract sentences and generate more specific QA pairs
    for section_name, section_text in sections.items():
        sentences = sent_tokenize(section_text)

        if section_name == "architecture":
            qa_pairs.extend([
                {
                    "question": "What type of architecture does DeepSeek V3 use?",
                    "answer": "DeepSeek V3 uses a Mixture-of-Experts (MoE) architecture where only 37B parameters fire for each token out of the total 671B parameters. This sparse activation significantly reduces compute requirements compared to dense models. The model also uses Multi-head Latent Attention (MLA) which compresses the Key-Value cache, reducing memory usage and enabling more efficient training.",
                    "source": "architecture"
                },
                {
                    "question": "How many parameters does DeepSeek V3 have and how many are active during inference?",
                    "answer": "DeepSeek V3 has a total of 671B parameters in its Mixture-of-Experts (MoE) architecture, but only 37B parameters are active (fire) for each token during processing. This sparse activation approach significantly reduces the computational requirements compared to dense models of similar size.",
                    "source": "architecture"
                }
            ])

        elif section_name == "training":
            qa_pairs.extend([
                {
                    "question": "What precision format did DeepSeek V3 use for training and what were its benefits?",
                    "answer": "DeepSeek V3 implemented an FP8 mixed precision training framework, which reduced memory usage and accelerated training compared to higher precision formats. This approach reduced the memory footprint by up to 50% compared to traditional FP16/FP32 formats. They used fine-grained quantization strategies and increased accumulation precision to maintain accuracy while gaining the efficiency benefits.",
                    "source": "training"
                },
                {
                    "question": "What is the HAI-LLM framework mentioned in the DeepSeek V3 paper?",
                    "answer": "HAI-LLM is a custom training framework developed by DeepSeek for training their V3 model. It includes several optimizations such as the DualPipe algorithm for efficient pipeline parallelism (which reduces pipeline bubbles and overlaps computation and communication), efficient cross-node all-to-all communication kernels to fully utilize network bandwidth, and careful memory optimizations to avoid using costly tensor parallelism.",
                    "source": "training"
                }
            ])

        elif section_name == "cost":
            qa_pairs.extend([
                {
                    "question": "How many GPU hours did it take to train DeepSeek V3?",
                    "answer": "Training DeepSeek V3 took approximately 2.788 million GPU hours in total. This breaks down to 2,664,000 GPU hours for pre-training, 119,000 GPU hours for context length extension, and 5,000 GPU hours for post-training. The training was completed in less than two months using a cluster of 2,048 H800 GPUs.",
                    "source": "cost"
                },
                {
                    "question": "How many tokens was DeepSeek V3 trained on?",
                    "answer": "DeepSeek V3 was trained on approximately 14.8 trillion high-quality tokens. For perspective, this is comparable to Llama 3.1's 15 trillion tokens, but DeepSeek achieved much greater efficiency in its training process.",
                    "source": "cost"
                }
            ])

    # Generate more detailed/specific questions based on document content
    specific_facts = [
        {
            "question": "What compression ratio did DeepSeek achieve for key value indices?",
            "answer": "According to the document, DeepSeek compressed key value indices which consume significant VRAM, achieving 93% compression ratios. This was part of their overall approach to make training more efficient.",
            "source": "significance"
        },
        {
            "question": "What is Multi-head Latent Attention (MLA) and how was it used in DeepSeek V3?",
            "answer": "Multi-head Latent Attention (MLA) is a technique used in DeepSeek V3 that compresses the Key-Value cache. This reduces memory usage and enables more efficient training by decreasing the memory footprint required during model operation.",
            "source": "architecture"
        },
        {
            "question": "How does DeepSeek V3 perform at reasoning and math compared to other models?",
            "answer": "According to the document, DeepSeek V3 excels at reasoning and math tasks, surpassing the performance of GPT-4 and Claude 3.5 Sonnet in these areas. For writing and coding tasks, however, Claude 3.5 Sonnet maintains a slight lead over DeepSeek V3.",
            "source": "cost"
        },
        {
            "question": "What was DeepSeek's approach to load balancing in their MoE architecture?",
            "answer": "DeepSeek pioneered an auxiliary loss-free strategy for load balancing in their Mixture-of-Experts (MoE) architecture. This improved performance without the drawbacks of traditional auxiliary loss methods that are typically used in MoE architectures.",
            "source": "training"
        }
    ]
    qa_pairs.extend(specific_facts)

    # Generate comparative questions
    comparative_questions = [
        {
            "question": "How does the training efficiency of DeepSeek V3 compare to Llama 3.1?",
            "answer": "DeepSeek V3 was significantly more efficient in training compared to Llama 3.1. While DeepSeek V3 required about 2.788 million GPU hours to train on 14.8 trillion tokens, Llama 3.1 reportedly required 30.84 million GPU hours to train on a similar amount of data (15 trillion tokens). This means DeepSeek V3 was approximately 11 times more efficient than Llama 3.1 in terms of GPU hours required for training.",
            "source": "cost"
        },
        {
            "question": "What are the key differences between DeepSeek's approach and that of other AI companies?",
            "answer": "The key difference in DeepSeek's approach compared to other AI companies is that while others pushed for better hardware, DeepSeek focused on improving algorithms, achieving better results at a software level. Specifically, they made training 45 times more efficient by: using 8-bit instead of 32-bit to save memory, compressing key value indices with 93% compression ratios, implementing multi-token prediction instead of single-token prediction (doubling inference speeds), and using a Mixture-of-Experts model that decomposes a big model into small models capable of running on consumer-grade hardware.",
            "source": "significance"
        }
    ]
    qa_pairs.extend(comparative_questions)

    # Generate application and future-focused questions
    application_questions = [
        {
            "question": "What potential impact could DeepSeek V3's efficient approach have on the AI industry?",
            "answer": "DeepSeek V3's efficient approach could have several significant impacts on the AI industry: 1) It could democratize access to powerful AI by reducing the hardware requirements and costs, 2) It could accelerate AI development by enabling faster training cycles, 3) It could shift industry focus from hardware scaling to algorithmic efficiency, 4) It might lead to more environmentally sustainable AI by reducing energy consumption, and 5) It could disrupt the market dominance of major US tech companies, as evidenced by the market concerns and impact on share prices mentioned in the document.",
            "source": "overview,significance"
        },
        {
            "question": "How might the techniques used in DeepSeek V3 be applied to smaller models or different domains?",
            "answer": "The techniques used in DeepSeek V3 could be applied to smaller models or different domains in several ways: 1) The Mixture-of-Experts architecture could help create more efficient smaller models by activating only relevant parameters, 2) FP8 mixed precision training could reduce memory requirements for models of any size, 3) The Key-Value cache compression techniques could improve efficiency in models designed for limited hardware, 4) Multi-token prediction could speed up inference across various applications, and 5) The custom training framework optimizations could benefit any distributed training setup regardless of model size or application domain.",
            "source": "architecture,training,significance"
        }
    ]
    qa_pairs.extend(application_questions)

    return qa_pairs

def create_instruction_dataset(qa_pairs):
    """Convert QA pairs to instruction format for fine-tuning."""
    dataset = []

    for pair in qa_pairs:
        # Format for Qwen2.5 Instruct model
        instruction = {
            "conversations": [
                {"role": "user", "content": f"Based on the research about DeepSeek V3, please answer the following question: {pair['question']}"},
                {"role": "assistant", "content": pair['answer']}
            ]
        }
        dataset.append(instruction)

    return dataset

# Load the document
document_text = """author - Visith Kumarapperuma

Deepseek V3: A Game-Changer in A.I. Here's Why It Matters
Currently, the AI models from the Chinese startup Deepseek are causing quite a stir in the AI space. Their latest reasoning model, Deepseek r1, shows better or equal performance to competitors. But above all, they achieved it with a fraction of the training and inference cost. DeepSeek's AI Assistant overtook ChatGPT to become the most downloaded free app on the U.S. App Store. This development has led to market concerns about A.I. investments to major U.S. tech companies. Impacting share prices of tech firms including Nvidia.

So what made Deepseek such a big impact to A.I. ?
The significance of Deepseek as a disruptor in the industry lies in its approach. Unlike other companies that pushed for better hardware, Deepseek improved the algorithms. Thus achieving better results at a software level. Note that the following details are for the Deepseek V3 model. • Deepseek said it trained a model using a data centre of some 2,000 of Nvidia H800 GPUs. • Time duration 2 months with the cost of the *final training run being ~$5.5 million This ~$5.5M reflects the "rental" cost for the GPU hours needed to train DeepSeek‑V3. It does not include:

The capital expenditure for owning the hardware.
Costs associated with prior research, ablation studies, or experiments on alternative architectures/algorithms/data.
Deepseek made training more efficient (45 times more efficient)
Use 8-bit instead of 32-bit to save memory.
Compress key value indices which eat up a lot of VRAM; they got 93% compression ratios.
Do multi-token prediction instead of single-token prediction -> doubled inference speeds
The MOE model decomposes a big model into small models that can run on consumer-grade hardware.
Summary of how Deepseek v3 was so efficient at training the frontier model
Model Architecture The model employs a Mixture-of-Experts (MoE) architecture, where only 37B parameters fire for each token out of the total 671B. This sparse activation significantly reduces compute requirements compared to dense models. The model uses Multi-head Latent Attention (MLA). This compresses the Key-Value cache, reducing memory usage and enabling more efficient training.
FP8 Mixed Precision Training: They implemented an FP8 mixed precision training framework. Which reduces memory usage and accelerates training compared to higher precision formats. Reduced memory footprint by up to 50% compared to traditional FP16/FP32 formats. They use fine-grained quantisation strategies and increased accumulation precision to maintain accuracy.
Load Balancing Strategy They pioneered an auxiliary loss-free strategy for load balancing in the MoE architecture. This improved performance without the drawbacks of traditional auxiliary loss methods.
Training Framework They developed a custom training framework called HAI-LLM with several optimisations: DualPipe algorithm for efficient pipeline parallelism. This reduces pipeline bubbles and overlapping computation and communication. Efficient cross-node all-to-all communication kernels to fully utilise network bandwidth. Careful memory optimisations to avoid using costly tensor parallelism.
Breakdown of the costs of the Deepseek v3 model
Deepseek's flagship model v3 showcases an architecture with a 671B parameter MOE (Mixture of Agents) with 37B active parameters per token

Their success stems from breakthrough engineering: using MoE architecture, implementing FP8 mixed precision training, and developing a custom HAI-LLM framework.
Deepseek excels at reasoning and math, surpassing GPT-4 and Claude 3.5 Sonnet.
For writing and coding tasks, Claude 3.5 Sonnet maintains a slight lead.
Deepseek pre-trained this model on 14.8 trillion high-quality data, taking 2,788,000 GPU hours on the Nvidia h800s cluster, costing around only $6 million
the Llama 403b was trained on 11x of that, taking 30,840,000 GPU hours, also on 15 trillion tokens. So how true is the claim of $5.5 million, or is it another marketing trick?
Underlying FLOP calculations Model Details:
Active Parameters: 37B (using FP8 precision)
FLOPs per token: Using the rule of thumb "6 FLOPs per parameter per token." 37B×6 = 222B FLOPs per token
Total Training Tokens: Approximately 14.8 trillion tokens
Total FLOPs required: 222 B FLOPs/token×14.8 T tokens ≈ 3.3×10²⁴ FLOPs
GPU FLOP Capacity (H800/H100):
An H100 is roughly estimated to deliver about. 3.958×10¹⁵ FLOPs (per second or per some standardised interval — here used as a comparative metric). Ideal (Perfect Efficiency) GPU hours. (Dividing total required FLOPs by per‑GPU capability gives) 3.3×10²⁴ / 3.958×10¹⁵ ​≈ 8.33×10⁸ seconds⇒≈0.4 million GPU hour Note: This "perfect efficiency" scenario is a lower bound. Real-world training is less efficient. 2. Adjusting for Real‑World Inefficiencies (Comparison with Llama 3.1) Reference Model: Llama 3.1 (405B parameters, 15 T tokens) reportedly required 30.84 M GPU hours in practice. Recalculating FLOPs for Llama 3.1: Using the same math: 3.64×10²⁵ FLOPs required Scaling Efficiency Using the ratio of FLOPs needed for DeepSeek‑V3 versus Llama 3.1. and assuming similar inefficiencies. The estimate adjusts to roughly 2.79M GPU hours for DeepSeek‑V3 training. 3. DeepSeek‑V3 Reported Training Breakdown According to the DeepSeek‑V3 paper Pre‑training Stage:

Per Trillion Tokens: 180K H800 GPU hours
Overall Pre‑training: Total of 2,664K GPU hours
This stage was completed in less than two months using a cluster of 2,048 H800 GPUs. Context Length Extension:
Additional 119K GPU hours Post‑training:
An extra 5K GPU hours Total GPU Hours: 2,664 K+119 K+5 K≈2.788M GPU hours
Cost Estimation Assumed GPU Rental Price: $2 per GPU hour Total Rental Cost: 2.788M GPU hours×$2/hour≈$5.576 million as stated in Deepseek paper During the pre‑training stage, training DeepSeek‑V3 on each trillion tokens requires only 180K H800 GPU hours… Consequently, our pre‑training stage is completed in less than two months and costs 2664K GPU hours. Combined with 119K GPU hours for the context length extension and 5K GPU hours for post‑training, DeepSeek‑V3 costs only 2.788M GPU hours for its full training. Assuming the rental price of the H800 GPU is $2 per GPU hour, our total training costs amount to only $5.576M.
Summary Theoretical (Perfect Efficiency) Estimate: ~0.4 M GPU hours (using idealised FLOP counts and assuming perfect hardware utilisation0 Adjusted (Real‑World) Estimate (via Llama 3.1 comparison): ~2.79 GPU hours DeepSeek‑V3 Reported Breakdown: Pre‑training: 2,664K GPU hours Context Extension: 119K GPU hours Post‑training: 5K GPU hours Total: ~2.788 M GPU hours
Cost (at $2 per GPU hour): ~$5.576 million"""

# Clean the document
clean_doc = preprocess_document(document_text)

# Extract sections
doc_sections = extract_sections(clean_doc)

# Generate question-answer pairs
qa_pairs = generate_qa_pairs(clean_doc, doc_sections)

# Convert to instruction format
instruction_dataset = create_instruction_dataset(qa_pairs)

# Augment data by creating variations
def augment_data(dataset, augmentation_factor=2):
    augmented_dataset = dataset.copy()
    question_variations = [
        "Can you tell me about {}?",
        "Please explain {}.",
        "I'd like to know more about {}.",
        "What information do you have on {}?",
        "Could you elaborate on {}?",
        "What does the research say about {}?"
    ]

    for _ in range(augmentation_factor - 1):
        for item in dataset:
            original_question = item["conversations"][0]["content"]
            # Extract the core question by removing the prefix
            core_question = original_question.replace("Based on the research about DeepSeek V3, please answer the following question: ", "")

            # Create a new variation
            template = random.choice(question_variations)
            new_question = f"Based on the DeepSeek V3 research paper, {template.format(core_question.lower().rstrip('?'))}"

            # Create a new data point
            new_item = {
                "conversations": [
                    {"role": "user", "content": new_question},
                    {"role": "assistant", "content": item["conversations"][1]["content"]}
                ]
            }
            augmented_dataset.append(new_item)

    return augmented_dataset

# Augment the dataset
augmented_dataset = augment_data(instruction_dataset, augmentation_factor=3)

# Generate additional training examples with RAG context format
def create_rag_format_examples(qa_pairs, doc_sections):
    rag_examples = []

    for pair in qa_pairs:
        source_section = pair.get("source", "").split(",")[0]
        context = doc_sections.get(source_section, "")

        if context:
            # Format for RAG scenario where context is provided
            instruction = {
                "conversations": [
                    {"role": "user", "content": f"""Here is information about DeepSeek V3:

{context}

Based on this information, please answer: {pair['question']}"""},
                    {"role": "assistant", "content": pair['answer']}
                ]
            }
            rag_examples.append(instruction)

    return rag_examples

# Create RAG-format examples
rag_examples = create_rag_format_examples(qa_pairs, doc_sections)

# Combine all examples
combined_dataset = augmented_dataset + rag_examples

# Function to calculate token counts for dataset assessment
def calculate_token_count(text):
    return len(tokenizer.encode(text))

# Calculate token statistics
token_counts = []
for item in combined_dataset:
    user_msg = item["conversations"][0]["content"]
    assistant_msg = item["conversations"][1]["content"]
    token_counts.append({
        "user_tokens": calculate_token_count(user_msg),
        "assistant_tokens": calculate_token_count(assistant_msg),
        "total_tokens": calculate_token_count(user_msg) + calculate_token_count(assistant_msg)
    })

token_df = pd.DataFrame(token_counts)
token_stats = {
    "mean_user_tokens": token_df["user_tokens"].mean(),
    "mean_assistant_tokens": token_df["assistant_tokens"].mean(),
    "mean_total_tokens": token_df["total_tokens"].mean(),
    "max_total_tokens": token_df["total_tokens"].max(),
    "min_total_tokens": token_df["total_tokens"].min()
}

# Split into train, validation, and test sets
train_data, test_data = train_test_split(combined_dataset, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Save the datasets to JSON files
with open("train_data.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open("val_data.json", "w") as f:
    json.dump(val_data, f, indent=2)

with open("test_data.json", "w") as f:
    json.dump(test_data, f, indent=2)

# Print dataset statistics
print(f"Total examples: {len(combined_dataset)}")
print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")
print(f"Test examples: {len(test_data)}")
print("\nToken statistics:")
for key, value in token_stats.items():
    print(f"{key}: {value:.2f}")

# Generate a few examples for review
print("\nExample data points:")
for i in range(min(3, len(combined_dataset))):
    print(f"\nExample {i+1}:")
    print(f"User: {combined_dataset[i]['conversations'][0]['content'][:200]}...")
    print(f"Assistant: {combined_dataset[i]['conversations'][1]['content'][:200]}...")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Total examples: 68
Training examples: 48
Validation examples: 6
Test examples: 14

Token statistics:
mean_user_tokens: 131.87
mean_assistant_tokens: 86.29
mean_total_tokens: 218.16
max_total_tokens: 1207.00
min_total_tokens: 62.00

Example data points:

Example 1:
User: Based on the research about DeepSeek V3, please answer the following question: What is DeepSeek V3 and why is it significant in the AI industry?...
Assistant: DeepSeek V3 is an AI model from the Chinese startup Deepseek that has caused a stir in the AI space due to its impressive performance at a fraction of the training and inference cost compared to compe...

Example 2:
User: Based on the research about DeepSeek V3, please answer the following question: How did DeepSeek V3 achieve training efficiency compared to other large language models?...
Assistant: DeepSeek V3 achieved significant training efficiency through several innovations: 1) Using a Mixture-of-Experts (MoE) architecture where only 37B parameters fire for 