Install Dependencies

In [None]:
!pip install transformers huggingface_hub
!pip install transformers datasets

Log in with a Hugging Face access token

In [None]:
from huggingface_hub import notebook_login

# Ask for a Hugging Face access token inside the notebook.
# NOTE(review): presumably required so the `from_pretrained` downloads further
# down run authenticated -- confirm whether the chosen model repo needs it.
notebook_login()

Import Libraries

In [None]:
import csv
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers.optimization import AdamW

Dataset Processing

In [None]:
def csv_to_jsonl(csv_file_path, jsonl_file_path):
    """Convert a two-column CSV file into a JSON Lines file.

    Every usable CSV row becomes one line of the form
    ``{"input": <column 0>, "output": <column 1>}``. Columns after the second
    are ignored. No header handling is performed: the first row is converted
    like any other row.

    Rows with fewer than two columns are skipped instead of raising
    ``IndexError``; this includes the empty list ``csv.reader`` yields for a
    blank line.

    Args:
        csv_file_path: Path of the CSV file to read.
        jsonl_file_path: Path of the JSONL file to create (overwritten if it
            already exists).
    """
    # An explicit encoding keeps the result independent of the platform default.
    with open(csv_file_path, newline='', encoding='utf-8') as csvfile, \
            open(jsonl_file_path, 'w', encoding='utf-8') as jsonlfile:
        for row in csv.reader(csvfile):
            if len(row) < 2:
                # Blank or incomplete row: nothing to pair up.
                continue
            jsonlfile.write(json.dumps({"input": row[0], "output": row[1]}) + '\n')


# Destination JSONL (relative to the working directory) and the source CSV.
jsonl_file_path = 'Finetune_dataset.jsonl'
csv_file_path = '/content/LLM_Dataset.csv'

# One-off conversion; later cells read the JSONL file through `jsonl_file_path`.
csv_to_jsonl(csv_file_path=csv_file_path, jsonl_file_path=jsonl_file_path)

Get the pretrained model and the tokenizer

In [None]:
model_name = "mistralai/Mistral-7B-v0.1"  # HF repo name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some causal-LM tokenizers (Mistral's among them) define no padding token, and
# tokenizing with `padding="max_length"` raises in that case. Reusing EOS as
# the pad token needs no new vocabulary entry, so the model's embedding matrix
# stays unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Prompt/answer pairs from a JSONL file, encoded for causal-LM fine-tuning.

    Each non-blank line of ``data_file`` must be a JSON object with the string
    fields ``"input"`` (prompt) and ``"output"`` (answer).

    Every item is a dict of three ``torch.long`` tensors of length
    ``max_length``:

    * ``input_ids``      -- prompt tokens, then answer tokens, then EOS, then padding.
    * ``attention_mask`` -- 1 for real tokens, 0 for padding.
    * ``labels``         -- a copy of ``input_ids`` in which prompt and padding
      positions are replaced by ``IGNORE_INDEX``, so only the answer (and its
      EOS) contributes to the loss.

    Causal-LM heads shift the labels internally, so ``labels`` has to be
    position-aligned with ``input_ids``; tokenizing the answer on its own and
    using that as labels would compare unrelated positions.

    Args:
        data_file: Path of the JSONL file.
        tokenizer: Callable tokenizer returning an object with ``input_ids``;
            must accept ``add_special_tokens`` and expose ``eos_token_id`` and
            ``pad_token_id``.
        max_length: Fixed length every example is truncated/padded to.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, data_file, tokenizer, max_length=128):
        self.examples = []
        with open(data_file, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                self.examples.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]

        # The prompt keeps the tokenizer's default special tokens (e.g. BOS);
        # the answer gets none, because EOS is appended explicitly below.
        prompt_ids = list(self.tokenizer(example['input']).input_ids)
        answer_ids = list(self.tokenizer(example['output'], add_special_tokens=False).input_ids)
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            answer_ids.append(eos_id)

        # Leave room for at least one answer token: an example whose labels are
        # all IGNORE_INDEX would produce an undefined (NaN) mean loss.
        prompt_ids = prompt_ids[:max(self.max_length - 1, 0)]

        input_ids = (prompt_ids + answer_ids)[:self.max_length]
        labels = ([self.IGNORE_INDEX] * len(prompt_ids) + answer_ids)[:self.max_length]
        attention_mask = [1] * len(input_ids)

        # Pad manually on the right so no pad token has to be configured on the
        # tokenizer; the pad id is irrelevant to the loss because those
        # positions are masked in both attention_mask and labels.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0
        pad_count = self.max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * pad_count
        attention_mask = attention_mask + [0] * pad_count
        labels = labels + [self.IGNORE_INDEX] * pad_count

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

Dataloader

In [None]:
# One example per optimisation step.
batch_size = 1

# Wrap the JSONL file produced earlier and serve it in a freshly shuffled
# order on every pass over the loader.
dataset = CustomDataset(data_file=jsonl_file_path, tokenizer=tokenizer)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)


Training Parameters

In [None]:
# Load the pretrained causal-LM weights and move them to the GPU when one is
# available, otherwise keep them on the CPU.
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# `transformers.optimization.AdamW` is deprecated in favour of PyTorch's own
# implementation. `eps` and `weight_decay` are spelled out to keep the
# hyperparameters the deprecated class used by default (torch's defaults are
# eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Training loop

In [None]:
# Put the model into training mode before optimising.
model.train()

num_epochs = 10  # Adjust the number of epochs as needed
tensor_keys = ('input_ids', 'attention_mask', 'labels')

for epoch in range(num_epochs):
    for batch in train_loader:
        # Move exactly the tensors the model consumes onto its device.
        model_inputs = {key: batch[key].to(device) for key in tensor_keys}

        # Standard step: clear stale gradients, forward pass (the model
        # computes the loss itself because labels are supplied), backward
        # pass, parameter update.
        optimizer.zero_grad()
        outputs = model(**model_inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Save Model

In [None]:
# Model weights and tokenizer files go into the same directory so the
# checkpoint can be loaded from a single path.
save_dir = "./fine-tuned-misral-7b"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)