In [None]:
pip install 'torch==2.3.1' 'accelerate==0.31.0' 'flash_attn==2.5.8' 'transformers==4.43.0' Minio dotenv

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from dotenv import load_dotenv
from minio import Minio

import torch
import os
import pickle

# Seed PyTorch's global random number generator with a fixed value (0).
torch.random.manual_seed(0)

<torch._C.Generator at 0x7f93843ca6d0>

In [4]:
# Load environment variables via python-dotenv before reading the credentials.
load_dotenv()

# Object-storage credentials; each one is None when its variable is unset.
YANDEX_CLOUD_ACCESS_KEY = os.environ.get("YANDEX_CLOUD_ACCESS_KEY")
YANDEX_CLOUD_SECRET_KEY = os.environ.get("YANDEX_CLOUD_SECRET_KEY")

# Name of the S3 bucket used for downloads.
BUCKET_NAME = 'rag-project'

# 1. Initialize the model and tokenizer, define helper functions

In [8]:
# Checkpoint shared by the model and its tokenizer.
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"

# Load the causal LM in float16 on the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

# Load the tokenizer from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   3%|3         | 157M/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
# `generation_config` is the model's own config object (not a copy), so the
# assignment below changes the model's default `max_length` to 4096.
generation_config = model.generation_config
generation_config.max_length = 4096

In [None]:
# Text-generation pipeline built on the already-loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
# Keyword arguments forwarded to every `pipe(...)` call: at most 500 new
# tokens, no sampling, only the completion returned, EOS reused as padding id.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

In [20]:
def build_rag_prompt(user_query: str, rag_context: str) -> str:
    """Render a chat-formatted prompt, with or without RAG context.

    Args:
        user_query: The question, sent as the ``user`` message.
        rag_context: Retrieved text to embed in the system message. Any falsy
            value (``None`` or ``""``) selects the generic system prompt.

    Returns:
        The prompt rendered by the tokenizer's chat template as a string
        (``tokenize=False``) with the generation prompt appended
        (``add_generation_prompt=True``).
    """
    if rag_context:
        # NOTE(review): this wording (typos included) is kept verbatim so the
        # outputs recorded in the notebook remain reproducible.
        system_content = (
            "You - AI-asistant with access to documents. "
            "Use the provided context to respond..\n\n"
            f"Context:\n{rag_context}"
        )
    else:
        system_content = "You are a helpful AI assistant."

    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_query},
    ]

    # Only arguments that `apply_chat_template` actually defines are passed.
    # `tokenize_special_tokens` is not one of them; extra keyword arguments
    # are merely handed to the template renderer as variables.
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

In [11]:
def safe_truncate(text: str, max_tokens: int = 4096) -> str:
    """Cut ``text`` down to at most ``max_tokens`` tokens.

    The text is tokenized without special tokens, so the budget is spent on
    the text itself and no special-token markers end up in the result. Text
    that already fits is returned unchanged instead of being round-tripped
    through encode/decode.

    Args:
        text: Text to shorten.
        max_tokens: Maximum number of tokens to keep; must be non-negative.

    Returns:
        ``text`` itself when it fits, otherwise the decoded first
        ``max_tokens`` tokens.

    Raises:
        ValueError: If ``max_tokens`` is negative.
    """
    if max_tokens < 0:
        # A negative bound would silently slice tokens off the end instead.
        raise ValueError(f"max_tokens must be non-negative, got {max_tokens}")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= max_tokens:
        return text
    return tokenizer.decode(tokens[:max_tokens])

In [None]:
def model_inference(user_query, rag_context=None):
    """Generate an answer to ``user_query``, optionally grounded in ``rag_context``.

    The prompt is built with ``build_rag_prompt`` and run through ``pipe``
    using the shared ``generation_args``; the ``generated_text`` of the first
    result is returned.
    """
    generations = pipe(
        build_rag_prompt(user_query=user_query, rag_context=rag_context),
        **generation_args,
    )
    first_generation = generations[0]
    return first_generation['generated_text']

# 2. Load from S3

In [12]:
# S3-compatible client for Yandex Object Storage, connecting over HTTPS with
# the credentials read from the environment.
client = Minio(
    endpoint="storage.yandexcloud.net",
    access_key=YANDEX_CLOUD_ACCESS_KEY,
    secret_key=YANDEX_CLOUD_SECRET_KEY,
    secure=True,
)

In [13]:
# Object key inside the bucket and the local file it is downloaded to.
PROMPT_PICKLE_OBJECT = 'pdf_2412_19437v1_prompt1.pkl'
PROMPT_PICKLE_PATH = '/content/pdf_2412_19437v1_prompt1.pkl'

# Download the pickle from S3 to local disk.
client.fget_object(
    bucket_name=BUCKET_NAME,
    object_name=PROMPT_PICKLE_OBJECT,
    file_path=PROMPT_PICKLE_PATH,
)

# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load objects from a bucket you trust.
with open(PROMPT_PICKLE_PATH, "rb") as pickle_file:
    pdf_2412_19437v1_prompt1 = pickle.load(pickle_file)


# 3. Generate 

RAG based on [DeepSeek-v3 Technical Report](https://arxiv.org/abs/2412.19437)

### Page 1

In [49]:
# Show the first entry (index 0) of the loaded data, i.e. the page-1 text.
print(pdf_2412_19437v1_prompt1[0])

DeepSeek-V3 Technical Report
DeepSeek-AI
research@deepseek.com
Abstract
We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full traini

In [69]:
# Use entry 0 as the RAG context, with every newline replaced by a space.
rag_context = pdf_2412_19437v1_prompt1[0].replace('\n', ' ')

____How many total parameters does DeepSeek-V3 have?____

In [70]:
user_query = 'How many total parameters does DeepSeek-V3 have?'
divider = '-' * 150

# Reference answer to compare the generations against.
print('Correct: Deepseek-v3 have 671b parameters')

# Ask the same question twice: first grounded in the page text, then with no
# context at all, printing a divider before each answer.
for header, context in (
    ('With context:\n', rag_context),
    ('Without context :\n', None),
):
    print(divider)
    answer = model_inference(user_query=user_query, rag_context=context)
    print(header, answer)

Correct: Deepseek-v3 have 671b parameters
------------------------------------------------------------------------------------------------------------------------------------------------------
With context:
  DeepSeek-V3 has a total of 671 billion parameters.
------------------------------------------------------------------------------------------------------------------------------------------------------
Without context :
  I'm sorry, but I do not have specific information on a model named "DeepSeek-V3." The number of parameters in a neural network model depends on its architecture, including the number and size of its layers, the type of layers (e.g., convolutional, recurrent, fully connected), and other design choices.

If "DeepSeek-V3" is a model from a research paper, a proprietary system, or a model from a specific software library, you would need to refer to the original source or documentation for the exact number of parameters.

If you provide more context or details about t

____What rating did DeepSeek-V3 get on the MMLU-Pro benchmark, and what is the increase relative to DeepSeek-V2.5?____

In [71]:
user_query = 'What rating did deepseek-v3 get on the MMLU-Pro benchmark and what is the increase relative to deepseek-v2.5?'
divider = '-' * 150

# Reference answer to compare the generations against.
print('Correct: Deepseek-v3 75.9 VS DeepSeek-v2.5 66.2')

# Ask the same question twice: first grounded in the page text, then with no
# context at all, printing a divider before each answer.
for header, context in (
    ('With context:\n', rag_context),
    ('Without context :\n', None),
):
    print(divider)
    answer = model_inference(user_query=user_query, rag_context=context)
    print(header, answer)

Correct: Deepseek-v3 75.9 VS DeepSeek-v2.5 66.2
------------------------------------------------------------------------------------------------------------------------------------------------------
With context:
  DeepSeek-V3 achieved a rating of 75.9% on the MMLU-Pro benchmark. When compared to DeepSeek-V2.5, which scored 59.1%, DeepSeek-V3 showed an increase of 16.8 percentage points. This indicates a significant improvement in performance on the MMLU-Pro benchmark.
------------------------------------------------------------------------------------------------------------------------------------------------------
Without context :
  I'm sorry, but as of my last update in April 2023, I don't have real-time or specific data access, including performance metrics for models like "deepseek-v3" on benchmarks such as MMLU-Pro. To find this information, you would need to refer to the latest research papers, model benchmarking results, or the official repositories where these models are doc

### Page 5

In [53]:
# Show the fifth entry (index 4) of the loaded data, i.e. the page-5 text.
print(pdf_2412_19437v1_prompt1[4])

Table 1 | Training Costs of DeepSeek-V3
| Training Costs | Pre-Training | Context Extension | Post-Training | Total |
|----------------|---------------|-------------------|----------------|-------|
| in H800 GPU Hours | 2664K         | 119K              | 5K             | 2788K |
| in USD          | $5.328M       | $0.238M           | $0.01M         | $5.576M |
Table 1 | Training Costs of DeepSeek-V3, Assuming the Rental Price of H800 is $2 per GPU Hour.
We evaluate DeepSeek-V3 on a comprehensive array of benchmarks. Despite its economical training costs, comprehensive evaluations reveal that DeepSeek-V3-Base has emerged as the strongest open-source base model currently available, especially in code and math. Its chat version also outperforms other open-source models and achieves performance comparable to leading closed-source models, including GPT-4o and Claude-3.5-Sonnet, on a series of standard and open-ended benchmarks.
Lastly, we emphasize again the economical training costs of De

In [73]:
# Switch the RAG context to entry 4, with every newline replaced by a space.
rag_context = pdf_2412_19437v1_prompt1[4].replace('\n', ' ')

____How many H800 GPU hours does the DeepSeek-V3 pre-training stage require per trillion tokens, and how many GPUs are in the cluster?____

In [74]:
user_query = 'How much does the DeepSeek-V3 pre-training stage requires GPU hours on each trillion? How many GPUs in 1 cluster?'
divider = '-' * 150

# Reference answer to compare the generations against.
print('Correct: During the pre-training stage, training DeepSeek-V3 on each trillion tokens requires only 180K H800 GPU hours, i.e., 3.7 days on our cluster with 2048 H800 GPUs.')

# Ask the same question twice: first grounded in the page text, then with no
# context at all, printing a divider before each answer.
for header, context in (
    ('With context:\n', rag_context),
    ('Without context :\n', None),
):
    print(divider)
    answer = model_inference(user_query=user_query, rag_context=context)
    print(header, answer)

Correct: During the pre-training stage, training DeepSeek-V3 on each trillion tokens requires only 180K H800 GPU hours, i.e., 3.7 days on our cluster with 2048 H800 GPUs.
------------------------------------------------------------------------------------------------------------------------------------------------------
With context:
  The DeepSeek-V3 pre-training stage requires 180,000 GPU hours (180K GPU hours) to train on each trillion tokens. The cluster mentioned in the context consists of 2048 H800 GPUs. This information is derived from the context provided, which states that "training DeepSeek-V3 on each trillion tokens requires only 180K H800 GPU hours."
------------------------------------------------------------------------------------------------------------------------------------------------------
Without context :
  The DeepSeek-V3 pre-training stage, like any machine learning model training, depends on various factors such as the complexity of the model, the size of the 