In [None]:
%%capture
!pip install rich accelerate gradio transformers numba datasets peft bitsandbytes torch

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import datasets
import re
from tqdm.auto import tqdm

In [None]:
# Pick the compute device first: the GPU when CUDA is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained tokenizer and the bfloat16 model weights.
# trust_remote_code=True is passed through to both loaders unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-coder-6.7b-instruct",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Wrap in DataParallel, then move the wrapped model onto the chosen device.
# Kept as the last expression so the cell still displays the module repr.
model = nn.DataParallel(model)
model.to(device)

In [None]:
# LeetCode dataset, loaded by its repository id.
dataset = datasets.load_dataset("RayBernard/leetcode")

In [None]:
# Bare expression as the cell's last line: shows the loaded dataset's repr as the cell output.
dataset

In [None]:
def preprocess_function(examples, max_length=485):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Each row becomes ONE sequence, ``prompt + python_solution + EOS``, so the
    model actually sees the answer tokens it is trained to predict. ``labels``
    is a copy of ``input_ids`` with the prompt and padding positions set to
    ``-100`` (the default ``ignore_index`` of PyTorch's cross-entropy), so only
    the solution tokens contribute to the loss.

    When a row does not fit in ``max_length`` the prompt is trimmed first, but
    always keeps at least half of the window; the answer gets the remainder.

    Args:
        examples: Batched mapping with ``"instruction"``, ``"input"`` and
            ``"output"`` lists of strings.
        max_length: Fixed length every returned row is truncated/padded to.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; each
        value is a list of ``max_length``-long lists of ints.
    """
    prompts = [
        f"Instruction: {inst}\nInput: {inp}"
        for inst, inp in zip(examples["instruction"], examples["input"])
    ]
    # Keep only the fenced Python code from each reference answer.
    # NOTE(review): rows without a ```python block yield an empty answer, so
    # their only supervised token is EOS — consider filtering them upstream.
    answers = [
        "\n".join(re.findall(r"```python(.*?)```", output, re.DOTALL))
        for output in examples["output"]
    ]
    if not prompts:
        return {"input_ids": [], "attention_mask": [], "labels": []}

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id if eos_id is not None else 0

    # The prompt keeps the tokenizer's default special tokens; the answer is a
    # continuation of the same sequence, so it must not get its own.
    prompt_rows = tokenizer(prompts)["input_ids"]
    answer_rows = tokenizer(answers, add_special_tokens=False)["input_ids"]

    ignore_index = -100
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, answer_ids in zip(prompt_rows, answer_rows):
        prompt_ids = list(prompt_ids)
        answer_ids = list(answer_ids)
        if eos_id is not None:
            # Teach the model where the solution ends.
            answer_ids.append(eos_id)

        if len(prompt_ids) + len(answer_ids) > max_length:
            keep_prompt = max(max_length - len(answer_ids), max_length // 2)
            prompt_ids = prompt_ids[:keep_prompt]
            answer_ids = answer_ids[: max_length - len(prompt_ids)]

        n_real = len(prompt_ids) + len(answer_ids)
        n_pad = max_length - n_real
        batch["input_ids"].append(prompt_ids + answer_ids + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * n_real + [0] * n_pad)
        batch["labels"].append(
            [ignore_index] * len(prompt_ids) + answer_ids + [ignore_index] * n_pad
        )
    return batch

In [None]:
# Run the preprocessing over the dataset in batches and drop the columns
# listed for the "train" split, leaving only what preprocess_function returns.
tokenized_datasets = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

In [None]:
# Hold out 10% of the tokenized "train" split; the fixed seed keeps the split
# reproducible. The result is indexed with "train" / "test" further down.
train_dataset = tokenized_datasets["train"].train_test_split(
    seed=42,
    test_size=0.1,
)

In [None]:
# Standalone cross-entropy criterion and an AdamW optimizer over every model parameter.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
)

In [None]:
def collate_fn(batch):
    """Stack a list of tokenized examples into a dict of batch tensors.

    Args:
        batch: Sequence of mappings, each holding ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``.

    Returns:
        dict with the same three keys, each a ``torch.Tensor`` built from the
        per-example values in order.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.tensor([example[field] for example in batch])
        for field in fields
    }

# Batches of 16 for both splits; only the training loader is shuffled.
train_dataloader = DataLoader(
    train_dataset["train"],
    collate_fn=collate_fn,
    batch_size=16,
    shuffle=True,
)
val_dataloader = DataLoader(
    train_dataset["test"],
    collate_fn=collate_fn,
    batch_size=16,
)

In [None]:
# Total number of optimizer steps across all epochs; sizes the shared progress bar.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in train_dataloader:
        # Move the batch onto the training device; labels are passed separately.
        inputs = {key: value.to(device) for key, value in batch.items() if key != "labels"}
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        # The model is wrapped in nn.DataParallel: with several GPUs the
        # gathered loss holds one value per replica, and backward() needs a
        # scalar. .mean() reduces it and is a no-op for an already-scalar loss.
        loss = outputs.loss.mean()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        progress_bar.update(1)

    # Mean loss per batch for this epoch; the max() guard avoids a
    # ZeroDivisionError when the dataloader is empty.
    train_loss /= max(len(train_dataloader), 1)
    progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}")
    