In [1]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [5]:
! pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.1


In [9]:
# Restart the kernel after running this cell.
!pip install -U bitsandbytes



In [2]:
! pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [1]:
!pip install accelerate
!pip install -U bitsandbytes
!pip install transformers



In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import PrefixTuningConfig, get_peft_model
from datasets import Dataset
import jsonlines

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load the model in 4-bit precision
    bnb_4bit_use_double_quant=True,  # Use double quantization for better efficiency
    bnb_4bit_quant_type="nf4",  # Use normalized float 4-bit quantization
    bnb_4bit_compute_dtype=torch.float16  # Compute dtype for 4-bit models
)

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    quantization_config=quantization_config,
    device_map="auto"  # Automatically map the model to available hardware (CPU/GPU)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [5]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set the padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

# Move the model to the GPU
model.to(device)

# Define prefix tuning configuration
prefix_tuning_config = PrefixTuningConfig(
    task_type="CAUSAL_LM",  # Task type (causal language modeling)
    num_virtual_tokens=10,  # Number of virtual tokens (prefix length)
)

# Apply prefix tuning to the model
model = get_peft_model(model, prefix_tuning_config)

# Load the dataset from the uploaded file
data = []
with open("book_dataset.jsonl", "r", encoding="utf-8") as f:
    for item in jsonlines.Reader(f):
        data.append(item)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({
    "instruction": [item["instruction"] for item in data],
    "input": [item["input"] for item in data],
    "output": [item["output"] for item in data]
})

# Preprocessing function
def preprocess_function(examples):
    prompts = [
        f"Instruction: {inst}\nInput: {inp}\nOutput: {out}"
        for inst, inp, out in zip(examples["instruction"], examples["input"], examples["output"])
    ]
    tokenized_inputs = tokenizer(prompts, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()

    # Move tokenized inputs to the GPU
    for key in tokenized_inputs:
        tokenized_inputs[key] = tokenized_inputs[key].to(device)

    return tokenized_inputs

# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save the model
    per_device_train_batch_size=4,  # Batch size
    num_train_epochs=3,  # Number of epochs
    logging_dir="./logs",  # Directory for logs
    save_steps=500,  # Save model every 500 steps
    save_total_limit=2,  # Keep only the last 2 models
    fp16=True,  # Use mixed precision (if supported)
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    logging_steps=10,  # Log every 10 steps
    evaluation_strategy="no",  # No evaluation
    load_best_model_at_end=False,  # Disable loading the best model at the end
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Using device: cuda


Map:   0%|          | 0/3045 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
10,8.3804
20,7.9546
30,7.553
40,7.1179
50,6.7685
60,6.3261
70,5.9857
80,5.5967
90,5.2414
100,4.9308


Step,Training Loss
10,8.3804
20,7.9546
30,7.553
40,7.1179
50,6.7685
60,6.3261
70,5.9857
80,5.5967
90,5.2414
100,4.9308


TrainOutput(global_step=1143, training_loss=2.8118392476259566, metrics={'train_runtime': 2322.2756, 'train_samples_per_second': 3.934, 'train_steps_per_second': 0.492, 'total_flos': 4.332082227314688e+16, 'train_loss': 2.8118392476259566, 'epoch': 3.0})

In [None]:
"""from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
model = AutoModelForCausalLM.from_pretrained("./results")  # Load your trained model

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

# Set the padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

# Function to generate a response
def generate_response(prompt, max_length=100):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate output
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,  # Maximum length of the generated text
        num_return_sequences=1,  # Number of sequences to generate
        no_repeat_ngram_size=2,  # Prevent repetition of n-grams
        top_k=50,  # Top-k sampling
        top_p=0.95,  # Top-p (nucleus) sampling
        temperature=0.7,  # Sampling temperature
        do_sample=True,  # Enable sampling
    )

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop to ask questions
while True:
    # Get user input
    user_input = input("You: ")

    # Exit condition
    if user_input.lower() in ["exit", "quit"]:
        print("Exiting...")
        break

    # Format the prompt
    prompt = f"Instruction: Continue this story or summarize.\nInput: {user_input}\nOutput:"

    # Generate and print the model's response
    response = generate_response(prompt)
    print(f"Model: {response}\n")"""

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import PeftModel, PeftConfig  # Import PeftModel and PeftConfig

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

# Load the PEFT configuration
peft_config = PeftConfig.from_pretrained("/content/results/checkpoint-1143")  # Load PEFT config

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # ... other configurations if needed (like quantization_config) ...
)

# Load the PEFT adapter weights into the base model
model = PeftModel.from_pretrained(model, "/content/results/checkpoint-1143", config=peft_config)  # Apply PEFT

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the GPU
model.to(device)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token

# Function to generate a response
def generate_response(prompt, max_length=100):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate output
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,  # Maximum length of the generated text
        num_return_sequences=1,  # Number of sequences to generate
        no_repeat_ngram_size=2,  # Prevent repetition of n-grams
        top_k=50,  # Top-k sampling
        top_p=0.95,  # Top-p (nucleus) sampling
        temperature=0.7,  # Sampling temperature
        do_sample=True,  # Enable sampling
    )

    # Decode the generated text
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop to ask questions
while True:
    # Get user input
    user_input = input("You: ")

    # Exit condition
    if user_input.lower() in ["exit", "quit"]:
        print("Exiting...")
        break

    # Format the prompt
    prompt = f"Instruction: Continue this story or summarize.\nInput: {user_input}\nOutput:"

    # Generate and print the model's response
    response = generate_response(prompt)
    print(f"Model: {response}\n")