In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# 1. Load the checkpoint: model weights and the matching tokenizer
model_name = "gpt2"  # can be swapped for another model, e.g. 'microsoft/DialoGPT-medium'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# 2. Generation through the high-level pipeline API
# Device index 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    pipeline_device = 0
else:
    pipeline_device = -1

generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=pipeline_device)

# Example prompt
prompt = "Question: What is the capital of France?\nAnswer:"

# Sampling settings forwarded to the pipeline call
sampling_options = {
    "max_length": 100,
    "num_return_sequences": 1,
    "pad_token_id": tokenizer.eos_token_id,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,
}
generated = generator(prompt, **sampling_options)

# Show the first (and only) returned sequence, followed by a separator line
first_sequence = generated[0]
print("Сгенерированный ответ:")
print(first_sequence['generated_text'])
print("\n" + "="*50 + "\n")

# 3. Manual generation with explicit control over the sampling parameters
def generate_answer(prompt, model, tokenizer, max_new_tokens=50):
    """Sample a continuation of ``prompt`` and return the decoded text.

    Args:
        prompt: Text the model should continue.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it encodes the prompt and
            decodes the generated ids.
        max_new_tokens: Upper bound on the number of tokens generated on top
            of the prompt.

    Returns:
        The first generated sequence decoded to a string with special tokens
        stripped.
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    # Move the inputs to wherever the model lives: the pipeline created above
    # places the model on the GPU when CUDA is available, and CPU tensors
    # would then make generate() fail with a device mismatch.
    input_ids = encoded['input_ids'].to(model.device)
    # Pass the attention mask explicitly: the pad token equals the EOS token,
    # so it cannot be inferred reliably from the ids alone.
    attention_mask = encoded['attention_mask'].to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,  # same limit as prompt length + max_new_tokens
            temperature=0.8,      # controls how "creative" the sampling is
            top_p=0.9,            # nucleus sampling
            top_k=50,             # restrict sampling to the top-K tokens
            do_sample=True,       # sample instead of greedy decoding
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the manual generation path on a different question
prompt2 = "Question: How does photosynthesis work?\nAnswer:"
generated_text = generate_answer(prompt2, model, tokenizer)
print(
    "Сгенерированный ответ (ручная генерация):",
    generated_text,
    "\n" + "="*50 + "\n",
    sep="\n",
)

# 4. Toy dataset for fine-tuning (example)
# In practice you would load your own data (e.g. a CSV with 'question' and 'answer' columns)
data = [
    dict(question="What is the capital of Japan?", answer="Tokyo is the capital of Japan."),
    dict(question="Who invented the telephone?", answer="Alexander Graham Bell invented the telephone."),
    dict(question="What is 2+2?", answer="2+2 equals 4."),
]

# Render every pair into the prompt format used for training
formatted_data = [
    "Question: {question}\nAnswer: {answer}".format(**item)
    for item in data
]

# 5. Fine-tuning data preparation (simplified)
# A complete fine-tuning run needs more pieces (training arguments, a Trainer, ...).
# Below is only a minimal example of getting the data ready.

from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling

# Keep a plain-text copy of the training examples on disk
with open("train_data.txt", "w", encoding="utf-8") as f:
    for text in formatted_data:
        f.write(text + "\n\n")

# Tokenize each example on its own instead of using TextDataset.
# TextDataset only emits full blocks of `block_size` tokens and drops the
# shorter remainder, so a handful of short Q/A pairs (far fewer than 128
# tokens in total) produces an EMPTY dataset. It is also deprecated.
BLOCK_SIZE = 128
train_dataset = Dataset.from_dict({"text": formatted_data}).map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=BLOCK_SIZE),
    batched=True,
    remove_columns=["text"],
)
assert len(train_dataset) == len(formatted_data), "every formatted example must yield a training row"

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal LM objective (next-token prediction), not masked LM
)

print("Датасет для дообучения подготовлен.")
print("Для полноценного дообучения используйте Trainer из transformers.")

Цикл

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the checkpoint: model weights and the matching tokenizer
model_name = "gpt2"  # or any other suitable model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Make sure a padding token is defined by reusing the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Answer generation helper
def generate_answer(prompt, model, tokenizer, max_new_tokens=100):
    """Sample a continuation of ``prompt`` and return the decoded text.

    Args:
        prompt: Text the model should continue.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; it encodes the prompt and
            decodes the generated ids.
        max_new_tokens: Upper bound on the number of tokens generated on top
            of the prompt.

    Returns:
        The first generated sequence decoded to a string with special tokens
        stripped.
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    # Keep the inputs on the same device as the model so the helper keeps
    # working if the model is later moved to a GPU (e.g. by a pipeline).
    input_ids = encoded['input_ids'].to(model.device)
    # Pass the attention mask explicitly: the pad token equals the EOS token,
    # so it cannot be inferred reliably from the ids alone.
    attention_mask = encoded['attention_mask'].to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,  # same limit as prompt length + max_new_tokens
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Sample questions (replace with your own list)
questions = [
    "What is the capital of France?",
    "Who wrote '1984'?",
    "How does photosynthesis work?",
    # ... and so on, up to 300 questions
]

# Collected question/answer records
results = []
total_questions = len(questions)

# Generate an answer for every question
for position, question in enumerate(questions, start=1):
    prompt = f"Question: {question}\nAnswer:"
    generated_text = generate_answer(prompt, model, tokenizer)

    # Keep only the text that follows the prompt (optional)
    answer = generated_text[len(prompt):].strip()

    record = {"question": question, "generated_answer": answer}
    results.append(record)

    # Optional progress output
    print(f"[{position}/{total_questions}] Question: {question}")
    print(f"Generated Answer: {answer}\n")

# `results` is now a list of dicts with questions and answers;
# it can be stored as CSV, JSON, etc.
import json

serialized_results = json.dumps(results, ensure_ascii=False, indent=2)
with open("generated_answers.json", "w", encoding="utf-8") as f:
    f.write(serialized_results)

print("Ответы сохранены в файл 'generated_answers.json'")

Батчинг + пайплайн

In [None]:
from transformers import pipeline

# Decoder-only models continue from the last token of each row, so batched
# prompts have to be padded on the left; right padding would put pad tokens
# between the prompt and the generated text. This mutates the shared tokenizer.
tokenizer.padding_side = "left"

generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    batch_size=4  # process 4 prompts at a time
)

prompts = [f"Question: {q}\nAnswer:" for q in questions]

# Generate for all prompts in one call
generated_batch = generator(
    prompts,
    max_length=200,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# Extract the answers
results = []
for question, batch_prompt, gen in zip(questions, prompts, generated_batch):
    # For a list of prompts the pipeline returns one list of candidate
    # sequences per prompt, so take the first candidate. Indexing the list
    # with 'generated_text' directly raises TypeError.
    candidate = gen[0] if isinstance(gen, list) else gen
    full_text = candidate['generated_text']
    # The generated text starts with the prompt; keep only what follows it
    answer = full_text[len(batch_prompt):].strip()
    results.append({
        "question": question,
        "generated_answer": answer
    })

файнтюн

In [None]:
import pandas as pd
from datasets import Dataset

# Example data, written as (question, answer) pairs
qa_pairs = [
    ("What is the capital of France?",
     "The capital of France is Paris."),
    ("Who invented the telephone?",
     "Alexander Graham Bell invented the telephone."),
    ("How does photosynthesis work?",
     "Photosynthesis is the process by which green plants use sunlight to synthesize food."),
    ("What is Python used for?",
     "Python is used for web development, data science, AI, and more."),
    ("Who wrote 'Romeo and Juliet'?",
     "William Shakespeare wrote 'Romeo and Juliet'."),
]

# Column-oriented view of the pairs: one list per column
data = {
    "question": [question for question, _ in qa_pairs],
    "answer": [answer for _, answer in qa_pairs],
}

# DataFrame first, then a datasets.Dataset built from it
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

def format_qa(example):
    """Build the training text for one question/answer record.

    Args:
        example: Mapping with ``'question'`` and ``'answer'`` entries.

    Returns:
        A dict with a single ``'text'`` key holding the question and the
        answer rendered as ``"Question: ...\\nAnswer: ..."``.
    """
    question = example['question']
    answer = example['answer']
    rendered = "Question: {}\nAnswer: {}".format(question, answer)
    return {"text": rendered}

# Apply format_qa to every row; the 'text' entry it returns is what the
# tokenization cells below read as examples["text"]
dataset = dataset.map(format_qa)

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling
)
import torch

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of formatted examples, truncated to 512 tokens.

    No padding is applied here: the language-modeling data collator used for
    training pads each training batch to its own longest sequence. Padding
    inside ``map`` would instead pad to the longest row of every map batch
    and store those pad tokens in the dataset.

    Args:
        examples: Batch from ``Dataset.map(batched=True)`` with a ``'text'``
            column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Causal LM objective (mlm=False): the collator pads each batch and builds the labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./qa_model_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    # The string "none" disables all logging integrations. Python `None` does
    # not: in Transformers 4.x it falls back to reporting to every installed
    # integration (wandb, tensorboard, ...).
    report_to="none",
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Train, then store the model and the tokenizer side by side in output_dir
trainer.train()
trainer.save_model()
tokenizer.save_pretrained("./qa_model_finetuned")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import torch

model_name = "microsoft/DialoGPT-medium"  # or any other model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# LoRA configuration
lora_target_modules = ["c_attn", "c_proj", "c_fc"]  # for GPT-2 / GPT-3
lora_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters and report the trainable parameters
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

def tokenize_function(examples):
    """Tokenize a batch of formatted examples, truncated to 512 tokens.

    No padding is applied here: the language-modeling data collator used for
    training pads each training batch to its own longest sequence. Padding
    inside ``map`` would instead pad to the longest row of every map batch
    and store those pad tokens in the dataset.

    Args:
        examples: Batch from ``Dataset.map(batched=True)`` with a ``'text'``
            column.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

from transformers import DataCollatorForLanguageModeling
# Causal LM objective (mlm=False): the collator pads each batch and builds the labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./lora_qa_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    logging_steps=100,
    learning_rate=5e-4,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    # The string "none" disables all logging integrations. Python `None` does
    # not: in Transformers 4.x it falls back to reporting to every installed
    # integration (wandb, tensorboard, ...).
    report_to="none",
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Train, then store the trained model and the tokenizer side by side in output_dir
trainer.train()
trainer.save_model()
tokenizer.save_pretrained("./lora_qa_model")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenization
def tokenize_function(examples):
    """Tokenize a batch of formatted examples, padded to the batch's longest row.

    Args:
        examples: Batch from ``Dataset.map(batched=True)`` with a ``'text'``
            column.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.
    """
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

# batch_size=None hands the whole dataset to tokenize_function in one call, so
# every row is padded to the same length. With the default map batch size
# (1000 rows) a larger dataset would get a different padded length per chunk,
# and the DataLoader's default collation could not stack rows of unequal length.
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=None)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

dataloader = DataLoader(tokenized_dataset, batch_size=2, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.train()
model.to('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(3):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        # Exclude padding from the loss. Padding reuses the EOS token id, so
        # the positions are found through the attention mask, not the token
        # id; label -100 is ignored by the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Store the fine-tuned weights and the tokenizer in the same directory
model.save_pretrained("./manual_finetuned_model")
tokenizer.save_pretrained("./manual_finetuned_model")