# Model Mistral test


# Create and enable a 32 GiB swap file.
vm_file_path="/home/lujun/local/causalllm/temp/virtual_memory_file"

# Fill the file with 32768 blocks of 1 MiB of zeros so it is fully allocated.
sudo dd if=/dev/zero of="$vm_file_path" bs=1M count=32768

# Swap may hold memory pages of any process: restrict access to the owner
# before formatting (mkswap warns about looser permissions).
sudo chmod 600 "$vm_file_path"

sudo mkswap "$vm_file_path"

# Register the file in /etc/fstab only if it is not listed yet, so that
# re-running these commands does not append duplicate entries.
grep -qsF "$vm_file_path none swap" /etc/fstab || echo "$vm_file_path none swap sw 0 0" | sudo tee -a /etc/fstab

# Activate every swap area listed in /etc/fstab, including the new file.
sudo swapon -a

In [1]:
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
def download_model_locally(model_name, local_path):
    """Fetch a pretrained model and its tokenizer, then store both on disk.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the tokenizer.
        local_path: Directory handed to ``save_pretrained`` for both objects.

    Returns:
        ``local_path``, unchanged.
    """
    # Load both artifacts before anything is written, model first.
    pretrained = (
        AutoModel.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )

    # Persist them in the same order they were loaded.
    for artifact in pretrained:
        artifact.save_pretrained(local_path)

    return local_path

# Identifier passed to from_pretrained for both the model and the tokenizer.
model_name = "mistralai/Mistral-7B-v0.1"
# Directory that receives the saved model and tokenizer files.
local_path = "/home/lujun/local/causalllm/resource/"

# Download the model and tokenizer and save both under local_path;
# download_model_locally returns the same path it was given.
downloaded_path = download_model_locally(model_name, local_path)

KeyboardInterrupt: 

In [3]:
# Relative path, so it is resolved against the current working directory.
# NOTE(review): this is not the same string as the directory the download
# cell saves to ("/home/lujun/local/causalllm/resource/") — confirm that the
# checkpoint files actually live here.
model_path = "resource/mistralai/"
# Load the model and tokenizer from the local directory
local_model = AutoModel.from_pretrained(model_path)
local_tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 6/6 [00:52<00:00,  8.76s/it]


In [3]:
import torch
from typing import List
from mistral_src.mistral.model import ModelArgs, Transformer 
from mistral_src.main import generate 

class DebugTokenizer:
    """Toy tokenizer whose "text" is a space-separated list of integer ids.

    ``encode`` parses the integers out of a string and ``decode`` renders a
    list of ids back into that textual form; no vocabulary is involved.
    """

    @property
    def bos_id(self) -> int:
        """Id used for the beginning-of-sequence marker (always 0)."""
        return 0

    @property
    def eos_id(self) -> int:
        """Id used for the end-of-sequence marker (always 1)."""
        return 1

    @property
    def pad_id(self) -> int:
        """Id used for padding (always -1)."""
        return -1

    def encode(self, s: str, bos: bool = True) -> List[int]:
        """Convert whitespace-separated integers in ``s`` to a list of ids.

        When ``bos`` is true, ``bos_id`` is placed in front of the parsed ids.
        A piece that ``int`` cannot parse raises ``ValueError``.
        """
        assert isinstance(s, str)
        ids = list(map(int, s.split()))
        return [self.bos_id] + ids if bos else ids

    def decode(self, t: List[int]) -> str:
        """Render the ids in ``t`` as a single space-separated string."""
        return " ".join(map(str, t))

def test_hello_world_generation():
    """Check that two ``generate`` calls report matching log-prob values.

    A single-layer ``Transformer`` is built on CUDA with a fixed seed and
    ``generate`` is run with ``max_tokens=7``. The returned strings, minus
    their first space-separated piece, are then passed to ``generate`` again
    with ``max_tokens=0``. The second element returned by each call must
    agree element-wise to within 1e-5.

    NOTE(review): ``DebugTokenizer.encode`` parses every whitespace-separated
    piece with ``int``. If ``generate`` encodes the prompt through the
    tokenizer, "hello world" would raise ``ValueError`` — confirm.
    """
    torch.manual_seed(42)

    prompt = ["hello world"]
    tiny_config = ModelArgs(
        dim=512,
        n_layers=1,
        head_dim=128,
        hidden_dim=2048,
        n_heads=4,
        n_kv_heads=2,
        sliding_window=3,
        norm_eps=1e-5,
        vocab_size=32_000,
        max_batch_size=len(prompt),
    )
    model = Transformer(tiny_config).to("cuda", dtype=torch.float32)
    tokenizer = DebugTokenizer()

    first_pass, all_logprobs_old = generate(prompt, model, tokenizer, max_tokens=7)

    # Strip the first piece of every returned string (the BOS id).
    tokenized_prompt = []
    for sequence in first_pass:
        _, *remainder = sequence.split(" ")
        tokenized_prompt.append(" ".join(remainder))

    # With max_tokens=0 nothing new may be produced.
    generated, all_logprobs_new = generate(tokenized_prompt, model, tokenizer, max_tokens=0)
    assert generated == []

    # Both passes must yield one entry per prompt, with matching values.
    assert len(prompt) == len(all_logprobs_old) == len(all_logprobs_new)
    for lp_old, lp_new in zip(all_logprobs_old, all_logprobs_new):
        assert all(abs(a - b) < 1e-5 for a, b in zip(lp_old, lp_new)), f"\n{lp_old}\n{lp_new}"

    print("Hello World generation test passed.")

# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_hello_world_generation()


ModuleNotFoundError: No module named 'mistral'

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
import torch

# Replace with your task data and labels
class CustomDataset(Dataset):
    """Dataset pairing raw texts with integer labels for classification.

    Args:
        texts: Indexable collection of inputs; each item is passed through
            ``str`` before tokenization.
        labels: Indexable collection of labels; each item is passed through
            ``int``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` whose
            result is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened encoding and label tensor for sample ``idx``."""
        raw_text = str(self.texts[idx])
        target = int(self.labels[idx])
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Flatten each encoded field so one sample is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

# Replace with your fine-tuning data. Every entry must be real data:
# CustomDataset.__getitem__ calls int() on each label, so a literal `...`
# placeholder left in these lists raises TypeError as soon as it is sampled.
train_texts = ["your input text 1", "your input text 2"]
train_labels = [0, 1]

# Placeholder: replace "mistraleai" with the name or path of the model you
# want to fine-tune; it is passed to from_pretrained below.
model_name = "mistraleai"

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU for training when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Create dataset and data loader (batches of 8, reshuffled every epoch)
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Define optimizer and loss function
# NOTE(review): AdamW here is the one imported from transformers, which has
# been deprecated upstream in favour of torch.optim.AdamW — confirm against
# the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)
# Not used by the loop below, which reads the loss from the model output
# (labels are passed to the forward call).
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tuning
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss}")

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("./path/to/fine_tuned_model")
tokenizer.save_pretrained("./path/to/fine_tuned_model")
