In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

In [None]:
import nltk # to convert text into some more presentable way
# Fetch the NLTK resources used for sentence / word tokenization.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
file_name = "little piece"

# Load the entire corpus into a single string.
with open(file_name, "r", encoding="utf-8") as file:
    f = file.read()

# Break the text into sentences, then each sentence into word tokens.
sentenses = sent_tokenize(f)
tokenized_sentenses = list(map(word_tokenize, sentenses))

In [None]:
from transformers import BitsAndBytesConfig
# !pip install bitsandbytes accelerate
phi = "microsoft/phi-2"  # model id passed to both from_pretrained calls

# Tokenizer: the EOS token is reused as the padding token.
phi_tokenizer = AutoTokenizer.from_pretrained(phi)
phi_tokenizer.pad_token = phi_tokenizer.eos_token

# Base model loaded with 8-bit quantization (load_in_8bit=True).
quant_config = BitsAndBytesConfig(load_in_8bit=True)
phi_model = AutoModelForCausalLM.from_pretrained(
    phi,
    quantization_config=quant_config,
    device_map="auto",
    # NOTE(review): trust_remote_code allows code shipped with the checkpoint
    # to run locally -- confirm it is still needed for this model.
    trust_remote_code=True,
)


In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Run PEFT's k-bit training preparation on the quantized base model.
phi_model = prepare_model_for_kbit_training(phi_model)

# LoRA adapter settings.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # depends on the architecture; to be clarified below
)

# Wrap the model with the LoRA adapters, then turn on gradient checkpointing.
phi_model = get_peft_model(phi_model, lora_config)
phi_model.gradient_checkpointing_enable()

In [None]:
from datasets import Dataset

# Build one training sample per paragraph (chunks separated by a blank line).
# Empty / whitespace-only chunks (e.g. from runs of 3+ newlines or a trailing
# blank line) are dropped: padded to max_length they would consist of padding
# only and carry no training signal.
dataset = Dataset.from_dict(
    {"text": [chunk for chunk in f.split("\n\n") if chunk.strip()]}
)

def token_func(example):
    """Tokenize the ``"text"`` field with ``phi_tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        example: Mapping with a ``"text"`` entry. When applied through
            ``dataset.map(..., batched=True)`` this is a batch, so the
            entry holds a list of strings.

    Returns:
        The tokenizer output for ``example["text"]``.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    encoded = phi_tokenizer(example["text"], **tokenizer_options)
    return encoded

# Apply token_func over the whole dataset; batched=True hands it batches of samples per call.
token_dataset = dataset.map(token_func, batched=True)

In [None]:
# Collator for language modeling; mlm=False disables masked-LM mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=phi_tokenizer, mlm=False)

training_args = TrainingArguments(
    # Where checkpoints are written (existing contents may be overwritten).
    output_dir="./mistral-continued-pretrain",
    overwrite_output_dir=True,
    # Optimisation schedule.
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    max_steps=1000,
    fp16=False,  # fp16 is turned off for the T4
    # Logging and checkpointing cadence.
    logging_steps=50,
    save_steps=500,
    save_total_limit=1,
    prediction_loss_only=True,
)


# Assemble the Trainer from the LoRA-wrapped model, the tokenized dataset
# and the collator, then start training.
# NOTE(review): newer transformers releases prefer `processing_class` over
# `tokenizer` here -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=phi_model,
    tokenizer=phi_tokenizer,
    data_collator=data_collator,
    train_dataset=token_dataset,
)

trainer.train()

In [None]:
import safetensors.torch

# Checkpoint directory produced by the training run above (max_steps=1000).
checkpoint_path = "./mistral-continued-pretrain/checkpoint-1000"

# Read the saved adapter tensors; the file path is derived from
# checkpoint_path so the directory is spelled out in one place only.
adapter_weights = safetensors.torch.load_file(f"{checkpoint_path}/adapter_model.safetensors")

In [None]:
import shutil

from peft import set_peft_model_state_dict
from transformers import pipeline  # was used below without being imported

# Restore the adapter weights into the PEFT model. A plain
# load_state_dict(..., strict=False) does not work for PEFT checkpoints:
# the saved keys omit the adapter name, so none of them match the model's
# parameter names and strict=False hides the mismatch. PEFT's own helper
# maps the keys back before loading.
set_peft_model_state_dict(phi_model, adapter_weights)

# Generate a sample continuation. The result is printed explicitly because
# only the last expression of a cell is displayed automatically.
pipe = pipeline("text-generation", model=phi_model, tokenizer=phi_tokenizer)
generated = pipe("church of Panagia Aggeloktisti")
print(generated)

# Zip the checkpoint directory and offer it for download (Colab only).
shutil.make_archive("model-checkpoint", 'zip', checkpoint_path)
from google.colab import files
files.download("model-checkpoint.zip")
