## Подготовка

In [None]:
# Download the dataset archive and unpack it into the current directory.
!wget -N https://github.com/tonsoleils/AI/raw/main/lab3/dataset/bbc-news-summary.zip
!unzip -q -o bbc-news-summary.zip -d ./

# Install the libraries used by the notebook.
# NOTE(review): none of the packages are version-pinned, and transformers is
# installed once and then upgraded again by the last line.
%pip install -q sentencepiece
%pip install -q transformers  datasets
%pip install -q accelerate
%pip install -q deepspeed mpi4py
%pip install -q pynvml
%pip install -q wandb
%pip install --upgrade transformers

import torch
import pandas as pd
import time

class Profiler():
    """Report GPU memory usage and per-phase timings of a single training step.

    Memory figures are read with ``torch.cuda.mem_get_info`` and converted to
    MiB (bytes divided by 2**20).
    """

    def __init__(self,) -> None:
        pass

    def gpu_mem(self):
        """Return ``(used, total)`` GPU memory in MiB.

        ``used`` is computed as total minus free, so it includes memory held
        by anything on the device, not only by this process.
        """
        free_mb, total_mb = (n / pow(2, 20) for n in torch.cuda.mem_get_info())
        return total_mb - free_mb, total_mb

    def gpu_mem_info(self,title = ''):
        """Print a one-line ``used/total`` GPU memory summary prefixed with ``title``."""
        used,total = self.gpu_mem()
        print(f'🤖 {title} gpu mem : {used:.1f}/{total:.1f} mb')

    def one_step_report(self,batch, model, optimizer, do_backward = True,device = torch.device('cpu'),print_loss = False,deepspeed = False):
        """Run one training step and return a per-phase memory/time report.

        Args:
            batch: mapping with ``'input_ids'`` and ``'labels'`` tensors.
            model: callable as ``model(input_ids=..., labels=...)`` whose
                result holds the loss at index 0. With ``deepspeed=True`` it
                must also provide ``backward(loss)`` and ``step()``.
            optimizer: optimizer used for ``zero_grad()`` and, without
                deepspeed, ``step()``. Not touched when ``do_backward`` is
                false.
            do_backward: when false only the forward pass is measured.
            device: device the batch tensors are moved to.
            print_loss: print the loss after the step.
            deepspeed: route backward/step through the model engine.

        Returns:
            ``pandas.DataFrame`` with columns ``used_mem``, ``delta_mem``
            (relative to the ``begin`` row) and ``delta_time`` (seconds,
            rounded to 3 decimals). Rows are ``begin``, ``forward``,
            ``backward`` and ``optim_step`` (only when ``do_backward``),
            ``end`` and ``total``. The ``total`` row holds the total device
            memory and the wall time from the start of the forward pass.

        Note:
            CUDA synchronisation and memory queries are issued regardless of
            ``device``, so a CUDA runtime is required.
        """
        # Phase labels are recorded together with the measurements so the
        # report index always matches the samples that were actually taken.
        phases = ['begin']
        delta_time = [0]
        used_mem = [self.gpu_mem()[0]]

        self.gpu_mem_info('begin')

        model.train()

        ids = batch['input_ids'].to(device,dtype=torch.long)
        labels = batch['labels'].to(device,dtype=torch.long)

        def close_phase(phase, label, since):
            """Wait for the GPU, store elapsed time and used memory, print them."""
            torch.cuda.synchronize()
            now = time.time()
            phases.append(phase)
            delta_time.append(now - since)
            used_mem.append(self.gpu_mem()[0])
            self.gpu_mem_info(f'{delta_time[-1]:.3f}s {label}')
            return now

        torch.cuda.synchronize()
        start_time = time.time()

        outputs = model(input_ids = ids,labels = labels)
        loss = outputs[0]

        last_time = close_phase('forward', 'forward', start_time)

        if do_backward:
            # zero_grad is deliberately counted as part of the backward phase.
            optimizer.zero_grad()
            if deepspeed:
                model.backward(loss)
            else:
                loss.backward()
            last_time = close_phase('backward', 'backward', last_time)

            if deepspeed:
                model.step()
            else:
                optimizer.step()
            last_time = close_phase('optim_step', 'optimizer_step', last_time)

        if (print_loss):
            print('loss',loss)

        torch.cuda.empty_cache()
        phases.append('end')
        used_mem.append(self.gpu_mem()[0])
        torch.cuda.synchronize()
        end_time = time.time()
        # Measured from the last completed phase, whichever that was.
        delta_time.append(end_time - last_time)

        report_df = pd.DataFrame(
            {'used_mem': used_mem, 'delta_time': delta_time},
            index=phases,
        )
        report_df['delta_mem'] = report_df['used_mem'] - report_df.loc['begin','used_mem']
        report_df = report_df[['used_mem','delta_mem','delta_time']]

        report_df.loc['total'] = [self.gpu_mem()[1],0,end_time-start_time]
        report_df['delta_time'] = report_df['delta_time'].map(lambda t : round(t,3))

        return report_df

# Create a profiler instance and print the current GPU memory usage.
prof = Profiler()
prof.gpu_mem()  # the returned (used, total) tuple is discarded here
prof.gpu_mem_info()

def gpu_mem():
    """Return ``(used, total)`` GPU memory in MiB, where used = total - free."""
    free_mb, total_mb = [amount / pow(2, 20) for amount in torch.cuda.mem_get_info()]
    return total_mb - free_mb, total_mb

def gpu_mem_info(title = ''):
    """Print the used/total GPU memory in MiB, prefixed with ``title``."""
    used, total = gpu_mem()
    summary = f'🤖 {title} gpu mem : {used:.1f}/{total:.1f} mb'
    print(summary)

--2025-05-21 07:27:56--  https://github.com/tonsoleils/AI/raw/main/lab3/dataset/bbc-news-summary.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/tonsoleils/AI/main/lab3/dataset/bbc-news-summary.zip [following]
--2025-05-21 07:27:57--  https://raw.githubusercontent.com/tonsoleils/AI/main/lab3/dataset/bbc-news-summary.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4558162 (4.3M) [application/zip]
Saving to: ‘bbc-news-summary.zip’


Last-modified header missing -- time-stamps turned off.
2025-05-21 07:27:57 (81.6 MB/s) - ‘bbc-news-summary.zip’ saved [4558162/4558162]

🤖  gpu mem : 102.9/15

In [None]:
# List the contents of the working directory.
!ls

bbc-news-summary      cached_lm_GPT2Tokenizer_512_ds.txt       ds.txt
bbc-news-summary.zip  cached_lm_GPT2Tokenizer_512_ds.txt.lock  sample_data


## Выберем датасет и модель

In [None]:
# Notebook configuration: the pretrained checkpoint to fine-tune and the
# dataset sub-folder to train on. The trailing `#@param` comments list the
# allowed choices (they look like Colab form annotations — keep them intact).
MODEL_NAME = 'ai-forever/rugpt3small_based_on_gpt2' #@param ['ai-forever/rugpt3small_based_on_gpt2', 'ai-forever/rugpt3medium_based_on_gpt2','ai-forever/rugpt3large_based_on_gpt2', 'gpt2-large']
DATASET_PATH1 = "Summaries/tech" # @param ["News Articles/business", "News Articles/entertainment", "News Articles/politics", "News Articles/sport", "News Articles/tech", "Summaries/business", "Summaries/entertainment", "Summaries/politics", "Summaries/sport", "Summaries/tech"]
# Full path of the selected folder inside the unpacked archive.
DATASET_PATH = "bbc-news-summary/" + DATASET_PATH1

import os
from nltk.tokenize import sent_tokenize

def read_text(paths):
    """Read every file under ``paths`` and return the contents joined by newlines.

    The directory tree is walked recursively. Directories and files are
    visited in sorted order so the resulting text does not depend on the
    order in which the filesystem lists entries.

    Files are decoded as UTF-8; undecodable bytes are replaced with U+FFFD
    rather than raising. A file that cannot be opened or read is reported on
    stdout and skipped.

    Args:
        paths: root directory to walk.

    Returns:
        A single string with one entry per file, separated by ``'\\n'``.
        An empty string when no file is found.
    """
    documents = []
    for root, dirs, files in os.walk(paths):
        # Sorting `dirs` in place also fixes the order os.walk descends in.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    documents.append(f.read())
            except (OSError, UnicodeDecodeError):
                # errors='replace' prevents decode errors, so in practice this
                # branch handles I/O failures (permissions, broken links, ...).
                print(f"Error reading file: {file_path}")

    return '\n'.join(documents)

# Characters treated as noise (BOM, tab, word joiner, quotes, newline, ...).
# Presumably meant to be passed as `symbols` to clear_text — no call is
# visible in this part of the notebook, so confirm before relying on it.
spam = ['\ufeff', '\t', '\u2060', '¦', '«', '»', '\n', "\'", "\""]

def clear_vocab(text: str) -> set:
    """Return the set of characters in ``text`` plus the special marker tokens."""
    vocab = set(text)
    vocab.update(('<EOS>', '<BOS>', '<UNK>'))
    return vocab

def clear_text(symbols: list, text: str, min_length: int = 20) -> list:
    """Split ``text`` into sentences, strip unwanted characters, drop short ones.

    Args:
        symbols: characters to remove from every sentence.
        text: raw text; it is split into sentences with ``sent_tokenize``.
        min_length: minimum number of characters a cleaned sentence must have
            to be kept (default 20, the previously hard-coded threshold).

    Returns:
        List of cleaned sentences, in their original order.
    """
    # A set gives constant-time membership tests for the per-character filter.
    excluded = frozenset(symbols)
    cleaned_sentences = []
    for sentence in sent_tokenize(text):
        cleaned = ''.join(char for char in sentence if char not in excluded)
        if len(cleaned) >= min_length:
            cleaned_sentences.append(cleaned)
    return cleaned_sentences

## Загрузка модели для дообучения

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import transformers
from sklearn.model_selection import train_test_split
import time
import pandas as pd
import random
import deepspeed

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Working on {DEVICE}")

# Load the pretrained tokenizer and language model selected by MODEL_NAME,
# and move the model to DEVICE.
model_name_or_path = MODEL_NAME
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)

[2025-05-21 07:29:13,443] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Working on cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Report the model size and the GPU memory in use after loading it.
print(f"The total number of parameters in the model is {model.num_parameters()}")
gpu_mem_info()

The total number of parameters in the model is 125231616
🤖  gpu mem : 672.9/15095.1 mb


## Датасет

In [None]:
DATASET_PATH

'bbc-news-summary/Summaries/tech'

In [None]:
# Read every file of the selected dataset folder and split the result into lines.
text = read_text(DATASET_PATH).split('\n')

In [None]:
# Dump the corpus to a plain-text training file: every input line is followed
# by a separate "[EOS]" marker line.
ds_name = "ds.txt"
# The text was decoded as UTF-8 (and may contain U+FFFD replacement
# characters), so write it back as UTF-8 instead of the locale default.
with open(ds_name, "w", encoding="utf-8") as f:
    for line in text:
        f.write(line + "\n" + "[EOS]" + "\n")

In [None]:
# Register the "[EOS]" separator written into the training file, and declare
# it as the end-of-sequence token together with a dedicated padding token.
tokenizer.add_tokens('[EOS]')
num_added_special = tokenizer.add_special_tokens({
    'eos_token': '[EOS]',
    'pad_token': '<pad>'
})

# New token ids must have rows in the embedding matrix. Grow it when the
# tokenizer now holds more ids than the model can embed; never shrink it, so
# checkpoints that ship with spare rows are left untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the cell output: number of special tokens that were newly added.
num_added_special

0

In [None]:
# Tokenize the training file into fixed-size blocks of 512 token ids.
# TextDataset caches its result next to the file under a name that does not
# depend on the file's content, so a cache from an earlier run (e.g. with a
# different DATASET_PATH1) would be silently reused. Always rebuild it.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=ds_name,
    block_size=512,
    overwrite_cache=True,
)
# Hold out 10% of the blocks for evaluation; the seed keeps the split stable.
train_dataset, eval_dataset = train_test_split(train_dataset,test_size = 0.1,random_state = 42)

# mlm=False selects the causal language-modelling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)



In [None]:
import numpy as np
# Print the tokenizer's vocabulary size and decode four randomly drawn token ids.
vocab_size = tokenizer.vocab_size
print(f"Размер словаря (количество различных токенов): {vocab_size}")
sample_ids = np.random.choice(range(vocab_size), 4)
random_tokens = [tokenizer.decode(token_id) for token_id in sample_ids]
print(f"Случайные примеры токенов: {random_tokens}")

Размер словаря (количество различных токенов): 50257
Случайные примеры токенов: [' волос', ' доцент', ' Дети', ' карди']


In [None]:
# Show the length and the raw token ids of the first training item.
# (The printed message calls it a batch, but a single dataset item is indexed.)
print(f"первый батч из обучающей выборки размера {len(train_dataset[0])}")
print(train_dataset[0])

первый батч из обучающей выборки размера 512
tensor([ 1955, 19748,  3810, 10341,  3115, 32448,  3926,   534,  1366,  1284,
         3735,  1824,  1132, 37411,  1622, 45115,  2929,   593,  5724,    17,
         2692,    87,  3087,  3970, 34898,    23, 23254,    18, 43842,    23,
        23254,  2213, 12986,   463,  1172,   974, 32678,   593,  2369, 21892,
         1092, 48293, 40631,    87,    18,    51,    74,   463,  3405, 15714,
         5959,  5105,  3764,  8218, 34898,    23, 23254,    16, 19670,     9,
         2213, 14530, 42476,   710, 15173,     9,   593, 16332,  4796, 27479,
         1638,    87,   654, 34898,    23, 23254,  2213, 30814,   407,  8326,
        49276, 35707,  1092, 46515,  1284,     6, 12155, 37318,  1075,  1729,
         2751,    16, 21944,   593,   463,   969,  1753, 36975,   654,  7825,
        24369, 41292,    18,  1955,  7825, 28690, 15267,  6504,  1338, 41183,
         9765, 23254,  3135,  4240,  6756, 26084,  1227,  5614,   654,  5614,
        23962,   57

## Обучение

In [None]:


# Training hyper-parameters.
batch_size = 4
n_epochs = 15
training_args = TrainingArguments(
    output_dir="./finetuned",
    overwrite_output_dir=True,
    num_train_epochs=n_epochs,
    # batch_size was previously defined but never passed, so the library
    # default was used instead of the value chosen above.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=10,
    gradient_accumulation_steps=4,
    # Start from batch_size and let the trainer lower it on out-of-memory errors.
    auto_find_batch_size=True,
    report_to="none",
)



In [None]:
# Wire the model, arguments, collator and the train/eval splits into a Trainer.
# No optimizer is passed, so the Trainer builds its own.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset = eval_dataset,
    # optimizers = (torch.optim.NAdam(model.parameters(),lr=1e-5),None) # Optimizer and lr scheduler
)

In [None]:
# Run fine-tuning and keep the object returned by train() for later inspection.
trainer_log = trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


## Генерация

In [None]:
import torch

# Prompt to continue with the fine-tuned model.
text = "The other common type of ink in elections is indelible visible ink"
input_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)

# Take the padding id from the tokenizer instead of hard-coding 50256: the
# pad/eos tokens were redefined earlier in the notebook, so the literal no
# longer identifies either of them.
pad_token_id = tokenizer.pad_token_id

# Attend to every prompt token except padding (if a pad token is defined).
attention_mask = torch.ones_like(input_ids)
if pad_token_id is not None:
    attention_mask[input_ids == pad_token_id] = 0

model.eval()
with torch.no_grad():
    out = model.generate(input_ids,
                         do_sample=True,
                         num_beams=4,
                         temperature=2.5,
                         top_p=0.9,
                         max_length=200,
                         attention_mask=attention_mask,
                         pad_token_id=pad_token_id
                         )

# Only the first returned sequence is shown, so decode just that one.
generated_text = tokenizer.decode(out[0])
print()
print(generated_text)



The other common type of ink in elections is indelible visible ink, which is so popular that the parties are forced to use."It's the first time I had been seeking the performer to be aware of the ink," said the campaign manager of independent electronic gaming firm Crysis.More than 40,000 people in the country have been cast in campaigns in recent months, with the goal of spending just £3.6bn in the way it is used to get the electronic music downloads.Few European electronic music players can be used to download or sell on a computer.One method of ink is a video recorder that can pause downloads and record results, which can be used to record results for different electronic music firms and other people using a computer to send video on the screen.Users of electronic gaming software can also download
