In [None]:
import os

import pandas as pd
import torch
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Notebook-wide accumulators. Re-running this cell resets both to empty.
# `operations_ids` is not read anywhere else in the visible notebook.
operations_ids = list()
# Titles already drawn by the title-sampling cell; used there to skip repeats.
prev_titles = list()

In [None]:
# Read the source CSV from the Kaggle input directory and preview the
# first rows as the cell output.
news_dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/news-for-df/Lenta_dataset.csv',
)
news_dataset.head(n=5)

In [None]:
# Keep only the rows whose `tags` value is NOT one of the excluded tags below.
safe_df = news_dataset.loc[
    ~news_dataset['tags'].isin([
        'Все',
        'Политика',
        'Первая мировая',
        'Россия',
        'Вооружение',
        'Выборы',
        'Киберпреступность',
        'Украина',
        'Молдавия',
        'Преступная Россия',
        'Полиция и спецслужбы',
        'Конфликты',
        'Преступность',
        'Криминал',
        'Оружие',
        'Следствие и суд',
    ])
]

In [None]:
# Draw the subset whose `text` column is later written to train_texts.txt
# (despite its name, `test` is the fine-tuning corpus).
# - random_state pins the draw so a Restart & Run All trains on the same rows.
# - min(...) avoids the ValueError that sample() raises when fewer than 1000
#   rows survive the tag filter.
test = safe_df.sample(n=min(1000, len(safe_df)), random_state=42)

In [None]:
# Show the sampled frame as the cell output.
test

In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned below.
model_name = "ai-forever/rugpt3small_based_on_gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Put the weights on the GPU when CUDA is present; otherwise stay on CPU.
if torch.cuda.is_available():
    model = model.cuda()

# Write the sampled `text` values to a plain-text corpus file, one per line.
# Missing values are dropped first: pandas holds them as float NaN, and the
# original `text + "\n"` raised TypeError on such rows.
with open("train_texts.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{text}\n" for text in test['text'].dropna())

def load_dataset(file_path, tokenizer, block_size=128, overwrite_cache=True):
    """Build a `TextDataset` over a plain-text file.

    Args:
        file_path: Path of the text file to tokenize.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block length passed to `TextDataset` (default 128).
        overwrite_cache: Forwarded to `TextDataset`. `TextDataset` stores the
            tokenized features next to the input file and, when this flag is
            false, reuses that cache instead of re-reading the file. The
            corpus file is regenerated by this notebook, so the default here
            is True to avoid silently training on a stale cache.

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

# Tokenize the corpus file written above into training blocks.
train_file_path = "train_texts.txt"
train_dataset = load_dataset(train_file_path, tokenizer)

# Batch collator configured with mlm=False (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# First fine-tuning pass: a single epoch, batches of 8 per device.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=100000,
    save_total_limit=2,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side.
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')


In [None]:
import torch
from transformers import Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

def load_dataset(file_path, tokenizer, block_size=128, overwrite_cache=True):
    """Build a `TextDataset` over a plain-text file.

    Args:
        file_path: Path of the text file to tokenize.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block length passed to `TextDataset` (default 128).
        overwrite_cache: Forwarded to `TextDataset`. `TextDataset` stores the
            tokenized features next to the input file and, when this flag is
            false, reuses that cache instead of re-reading the file. The
            corpus file is regenerated by this notebook, so the default here
            is True to avoid silently training on a stale cache.

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

# Continue fine-tuning from a checkpoint saved on disk.
# No cell in this notebook writes './new_new_trained_model', so on a fresh
# kernel that directory does not exist and loading it fails. Keep using it
# when it is present (e.g. left over from an earlier manual run), otherwise
# fall back to the './trained_model' directory saved by the first training cell.
model_path = './new_new_trained_model'
if not os.path.isdir(model_path):
    model_path = './trained_model'

# Load tokenizer and model first so the collator and dataset below are built
# with the tokenizer that belongs to the checkpoint being trained.
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Move the model to the GPU when one is available
if torch.cuda.is_available():
    model.cuda()

# Batch collator configured with mlm=False (no masked-LM objective).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

train_file_path = "train_texts.txt"
train_dataset = load_dataset(train_file_path, tokenizer)

# Second pass: three epochs, otherwise the same settings as the first run.
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=100000,
    save_total_limit=2,
    logging_dir='./logs',
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()


# This directory is the one the generation cell loads from.
model.save_pretrained('./new_new_new_trained_model')
tokenizer.save_pretrained('./new_new_new_trained_model')

In [None]:
# Empty accumulator for (title, generated_text) rows; the generation cell
# appends to it. Re-running this cell discards everything collected so far.
generated_result = pd.DataFrame(
    data=None,
    columns=['title', 'generated_text'],
)

In [None]:
# Draw 500 random titles and keep only those not handed out by an earlier run
# of this cell. The draw is unseeded, so each execution yields a new sample.
now_titles = safe_df.title.sample(500)
# Set lookup replaces the repeated linear scan of the growing `prev_titles`
# list; order and duplicates inside the new batch are preserved as before.
seen_titles = set(prev_titles)
titles = [x for x in now_titles if x not in seen_titles]
prev_titles = prev_titles + titles

In [None]:
# Load the fine-tuned checkpoint written by the second training cell.
# (To generate with the base model instead, load
# "ai-forever/rugpt3small_based_on_gpt2" here.)
model_path = './new_new_new_trained_model'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Run on the GPU when CUDA is present.
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
def generate_text(prompt, model, tokenizer,  max_length=1000):
    """Sample one continuation of `prompt` and cut it at the last full stop.

    Args:
        prompt: Text the generation starts from; it is part of the returned
            string because the whole output sequence is decoded.
        model: Object exposing `generate(...)`; if it has a `device`
            attribute, the encoded prompt is moved to that device.
        tokenizer: Object exposing `encode(..., return_tensors='pt')` and
            `decode(..., skip_special_tokens=True)`.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        The decoded text truncated just after its last '.', or the full
        decoded text when it contains no '.'.
    """
    # Follow the model's own device instead of guessing from CUDA
    # availability: a model left on CPU on a CUDA machine would otherwise get
    # a CUDA input tensor and fail with a device mismatch.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoded_input = tokenizer.encode(prompt, return_tensors='pt').to(device)

    output_sequences = model.generate(
        input_ids=encoded_input,
        max_length=max_length,
        temperature=0.7,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1
    )

    text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    # Drop whatever follows the last '.', i.e. a trailing unfinished sentence.
    end_pos = text.rfind('.')
    if end_pos != -1:
        text = text[:end_pos+1]
    return text


# Generate one text per title from index 400 onward in `titles`
# (the slice is empty when `titles` has 400 entries or fewer).
generated = [
    (
        title,
        generate_text(f"Новостная статья: {title}", model, tokenizer, max_length=500),
    )
    for title in titles[400:]
]

# Append this batch to the running results with a fresh 0..n-1 index.
tmp = pd.DataFrame(generated, columns=['title','generated_text'])
generated_result = pd.concat([generated_result, tmp], ignore_index=True)

In [None]:
# Show the accumulated (title, generated_text) rows as the cell output.
generated_result

In [None]:
# Comparison run with the mGPT checkpoint through the high-level pipeline API.
# `pipeline` comes from the import cell at the top of the notebook; it was
# previously never imported, so this cell failed with NameError.
# The task is named explicitly rather than inferred from the model id.
generator = pipeline("text-generation", model="ai-forever/mGPT", max_length=500)
# do_sample=False: continue the first sampled title without sampling.
generator(f"{titles[0]}.", do_sample=False)

In [None]:
# Save the accumulated generations to the Kaggle working directory.
generated_result.to_csv(
    path_or_buf='/kaggle/working/generated_rugpt3_pre_trained_small.csv',
)