# ReRanker с подбором весов

In [None]:
import re
import numpy as np
from scipy.optimize import differential_evolution
from scipy.stats import spearmanr
import pickle
from pathlib import Path
from tqdm.auto import tqdm
from collections import Counter
from rouge_score import rouge_scorer


class TextReRankerWeightsOptimizer:
    def __init__(
        self,
        use_rouge_with_source=True,
        use_extractive_coverage=True,
        use_lead3_overlap=False,
        use_length_simple=True,
        use_compression_ratio=False,
        use_novel_ngrams=False,
        
        length_params=None,
        compression_params=None,
        weights=None,
        device='cuda',
        cache_dir='./reranker_cache'
    ):
        self.use_rouge_with_source = use_rouge_with_source
        self.use_extractive_coverage = use_extractive_coverage
        self.use_lead3_overlap = use_lead3_overlap
        self.use_length_simple = use_length_simple
        self.use_compression_ratio = use_compression_ratio
        self.use_novel_ngrams = use_novel_ngrams
        
        self.length_params = length_params or {'target_min': 50, 'target_max': 70}
        self.compression_params = compression_params or {'optimal_ratio': 0.15, 'sigma': 0.05}
        
        self.weights = weights or self._get_default_weights()
        
        self.device = device
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(exist_ok=True)
        
        # Быстрые scorers
        self._rouge_scorer = None
        self._rouge2_scorer = None  # для целевой метрики
    
    def __getstate__(self):
        state = self.__dict__.copy()
        state['_rouge_scorer'] = None
        state['_rouge2_scorer'] = None
        return state
    
    def __setstate__(self, state):
        self.__dict__.update(state)
    
    def _get_default_weights(self):
        active_metrics = []
        
        if self.use_rouge_with_source:
            active_metrics.append('rouge_with_source')
        if self.use_extractive_coverage:
            active_metrics.append('extractive_coverage')
        if self.use_lead3_overlap:
            active_metrics.append('lead3_overlap')
        if self.use_length_simple:
            active_metrics.append('length_simple')
        if self.use_compression_ratio:
            active_metrics.append('compression_ratio')
        if self.use_novel_ngrams:
            active_metrics.append('novel_ngrams')
        
        if not active_metrics:
            return {}
        
        weight = 1.0 / len(active_metrics)
        return {metric: weight for metric in active_metrics}
    
    def _load_rouge_scorer(self):
        if self._rouge_scorer is None:
            self._rouge_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        return self._rouge_scorer
    
    def _load_rouge2_scorer(self):
        """Для целевой метрики (ROUGE-2)"""
        if self._rouge2_scorer is None:
            self._rouge2_scorer = rouge_scorer.RougeScorer(
                ['rouge1', 'rouge2', 'rougeL'], 
                use_stemmer=True
            )
        return self._rouge2_scorer
    
    # ==================== МЕТРИКИ ====================
    
    @staticmethod
    def _get_ngrams_fast(text, n):
        words = text.lower().split()
        return Counter([' '.join(words[i:i+n]) for i in range(len(words)-n+1)])
    
    def compute_batch(self, candidates, source_text):
        if not candidates:
            return []
        
        # Предвычисление
        source_ngrams = None
        source_bigrams_set = None
        lead3_text = None
        source_len = None
        
        if self.use_extractive_coverage or self.use_novel_ngrams:
            source_ngrams = {
                1: self._get_ngrams_fast(source_text, 1),
                2: self._get_ngrams_fast(source_text, 2),
                3: self._get_ngrams_fast(source_text, 3),
                4: self._get_ngrams_fast(source_text, 4),
            }
        
        if self.use_novel_ngrams:
            source_bigrams_set = set(source_ngrams[2].keys())
        
        if self.use_lead3_overlap:
            sentences = re.split(r'[.!?]+', source_text)
            sentences = [s.strip() for s in sentences if s.strip()]
            lead3_text = ' '.join(sentences[:3]) if len(sentences) >= 3 else source_text
        
        if self.use_compression_ratio:
            source_len = len(source_text.split())
        
        # Вычисление
        all_scores = {}
        
        if self.use_rouge_with_source:
            scorer = self._load_rouge_scorer()
            scores = []
            for candidate in candidates:
                score = scorer.score(source_text, candidate)
                scores.append(score['rougeL'].fmeasure)
            all_scores['rouge_with_source'] = scores
        
        if self.use_extractive_coverage:
            scores = []
            for candidate in candidates:
                ngram_scores = []
                for n in [1, 2, 3, 4]:
                    cand_ngrams = self._get_ngrams_fast(candidate, n)
                    if not cand_ngrams:
                        continue
                    overlap = sum((cand_ngrams & source_ngrams[n]).values())
                    total = sum(cand_ngrams.values())
                    ngram_scores.append(overlap / total)
                scores.append(np.mean(ngram_scores) if ngram_scores else 0.0)
            all_scores['extractive_coverage'] = scores
        
        if self.use_lead3_overlap:
            scorer = self._load_rouge_scorer()
            scores = []
            for candidate in candidates:
                score = scorer.score(lead3_text, candidate)
                scores.append(score['rougeL'].fmeasure)
            all_scores['lead3_overlap'] = scores
        
        if self.use_length_simple:
            target_min = self.length_params['target_min']
            target_max = self.length_params['target_max']
            
            scores = []
            for candidate in candidates:
                words = len(candidate.split())
                if target_min <= words <= target_max:
                    score = 1.0
                elif words < target_min:
                    score = words / target_min
                else:
                    score = max(0, 1 - (words - target_max) / (target_max * 0.5))
                scores.append(score)
            all_scores['length_simple'] = scores
        
        if self.use_compression_ratio:
            optimal = self.compression_params['optimal_ratio']
            sigma = self.compression_params['sigma']
            
            scores = []
            for candidate in candidates:
                cand_len = len(candidate.split())
                ratio = cand_len / source_len if source_len > 0 else 0
                score = np.exp(-((ratio - optimal) ** 2) / (2 * sigma ** 2))
                scores.append(score)
            all_scores['compression_ratio'] = scores
        
        if self.use_novel_ngrams:
            scores = []
            for candidate in candidates:
                cand_bigrams = self._get_ngrams_fast(candidate, 2)
                if not cand_bigrams:
                    scores.append(0.0)
                    continue
                novel = sum(count for bigram, count in cand_bigrams.items() 
                           if bigram not in source_bigrams_set)
                total = sum(cand_bigrams.values())
                scores.append(novel / total if total > 0 else 0.0)
            all_scores['novel_ngrams'] = scores
        
        results = []
        for i in range(len(candidates)):
            candidate_scores = {metric: all_scores[metric][i] for metric in all_scores}
            results.append(candidate_scores)
        
        return results
    
    def compute(self, text, source_text=None, return_individual=True, return_weighted=True):
        batch_results = self.compute_batch([text], source_text)
        scores = batch_results[0]
        
        result = {}
        if return_individual:
            result['scores'] = scores
        
        if return_weighted:
            weighted_score = sum(scores[m] * self.weights.get(m, 0) for m in scores)
            result['weighted_score'] = weighted_score
        
        return result
    
    # ==================== RANKING ====================
    
    def rank_candidates(self, candidates, source_text=None):
        if not candidates:
            return []
        
        batch_scores = self.compute_batch(candidates, source_text)
        
        results = []
        for idx, scores in enumerate(batch_scores):
            weighted_score = sum(scores[m] * self.weights.get(m, 0) for m in scores)
            results.append((idx, weighted_score, scores))
        
        results.sort(key=lambda x: x[1], reverse=True)
        
        return results
    
    def get_best_candidate(self, candidates, source_text=None):
        results = self.rank_candidates(candidates, source_text)
        best_idx = results[0][0]
        return candidates[best_idx]
    
    # ==================== FIT ====================
    
    def fit(
        self,
        candidates_list,
        x_texts,
        y_texts,
        metric='rouge2',
        cache_name=None,
        use_cache=True,
        max_iter=50,
        popsize=15,
        seed=42,
        n_workers=1,
        print_correlations=False
    ):
        if len(candidates_list) != len(x_texts) or len(candidates_list) != len(y_texts):
            raise ValueError(
                f"Length mismatch: candidates_list={len(candidates_list)}, "
                f"x_texts={len(x_texts)}, y_texts={len(y_texts)}"
            )
        
        # Предвычисление метрик
        precomputed_metrics = self._load_or_compute_metrics(
            candidates_list, x_texts, cache_name, use_cache
        )
        
        # КЭШИРУЕМ целевые scores для всех кандидатов
        print("Предвычисление целевой метрики для всех кандидатов...")
        target_scores_cache = self._precompute_target_scores(
            candidates_list, y_texts, metric
        )
        
        metric_names = list(precomputed_metrics[0][0].keys())
        n_metrics = len(metric_names)

        print(f"\nМетрики: {metric_names}")
        print(f"Целевая метрика: {metric}")
        
        if print_correlations:            
            self._print_correlations_fast(
                precomputed_metrics, target_scores_cache,
                metric_names, n_samples=30
            )
        
        # Оптимизация с кэшированными target scores
        best_weights, best_score = self._optimize_weights_fast(
            candidates_list, precomputed_metrics, target_scores_cache,
            metric_names, max_iter, popsize, seed, n_workers
        )
        
        self.weights = {
            metric_names[i]: best_weights[i]
            for i in range(n_metrics)
        }

        print("\n" + "="*60)
        print("РЕЗУЛЬТАТЫ КАЛИБРОВКИ")
        print("="*60)
        print(f"Best {metric}: {best_score:.4f}")
        print(f"\nОптимальные веса:")
        for name, weight in self.weights.items():
            print(f"  {name:20s}: {weight:.4f}")
        print("="*60)

        return self.weights, best_score
    
    def _load_or_compute_metrics(self, candidates_list, x_texts, cache_name, use_cache):
        cache_path = None
        if cache_name and use_cache:
            cache_path = self.cache_dir / f"{cache_name}_metrics.pkl"
        
        if cache_path and cache_path.exists():
            print(f"Загрузка метрик из кэша: {cache_path}")
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        
        print("Предвычисление метрик...")
        precomputed = []
        
        for candidates, x_text in tqdm(list(zip(candidates_list, x_texts))):
            candidate_metrics = self.compute_batch(candidates, x_text)
            precomputed.append(candidate_metrics)
        
        if cache_path:
            with open(cache_path, 'wb') as f:
                pickle.dump(precomputed, f)
            print(f"Метрики сохранены в кэш: {cache_path}")
        
        return precomputed
    
    def _precompute_target_scores(self, candidates_list, y_texts, metric='rouge2'):
        """
        КЛЮЧЕВАЯ ОПТИМИЗАЦИЯ: предвычисляем целевую метрику для всех кандидатов.
        Это делается ОДИН раз, а не на каждом шаге differential_evolution!
        """
        scorer = self._load_rouge2_scorer()
        
        target_scores = []
        
        for candidates, y_text in tqdm(zip(candidates_list, y_texts), 
                                       total=len(candidates_list),
                                       desc="Целевая метрика"):
            candidate_scores = []
            for candidate in candidates:
                score = scorer.score(y_text, candidate)
                
                if metric == 'rouge1':
                    candidate_scores.append(score['rouge1'].fmeasure)
                elif metric == 'rouge2':
                    candidate_scores.append(score['rouge2'].fmeasure)
                elif metric == 'rougeL':
                    candidate_scores.append(score['rougeL'].fmeasure)
                else:
                    raise ValueError(f"Unknown metric: {metric}")
            
            target_scores.append(candidate_scores)
        
        return target_scores
    
    def _print_correlations_fast(self, precomputed_metrics, target_scores_cache,
                                 metric_names, n_samples=30):
        """Быстрая версия с кэшированными target scores"""
        print("\nКорреляция с целевой метрикой:")
        
        metric_spearman = {name: [] for name in metric_names}
        
        for i in range(min(n_samples, len(precomputed_metrics))):
            target_values = target_scores_cache[i]
            
            for metric_name in metric_names:
                metric_values = [
                    precomputed_metrics[i][j][metric_name] 
                    for j in range(len(precomputed_metrics[i]))
                ]
                
                if np.std(metric_values) > 1e-10 and np.std(target_values) > 1e-10:
                    spearman, _ = spearmanr(metric_values, target_values)
                    metric_spearman[metric_name].append(spearman)
        
        for metric_name in metric_names:
            if metric_spearman[metric_name]:
                avg_spearman = np.mean(metric_spearman[metric_name])
                std_spearman = np.std(metric_spearman[metric_name])
                
                if abs(avg_spearman) > 0.4:
                    status = "✓✓✓"
                elif abs(avg_spearman) > 0.25:
                    status = "✓✓"
                elif abs(avg_spearman) > 0.15:
                    status = "✓"
                else:
                    status = "⚠️"
                
                print(f"  {status} {metric_name:25s}: {avg_spearman:+.3f} (±{std_spearman:.3f})")
    
    def _optimize_weights_fast(self, candidates_list, precomputed_metrics, 
                               target_scores_cache, metric_names, 
                               max_iter, popsize, seed, n_workers):
        """Быстрая версия с кэшированными target scores"""
        n_metrics = len(metric_names)
        
        self._opt_precomputed_metrics = precomputed_metrics
        self._opt_target_scores_cache = target_scores_cache
        self._opt_metric_names = metric_names
        self._opt_n_metrics = n_metrics
        
        print(f"\nОптимизация весов (workers={n_workers})...")
        
        result = differential_evolution(
            self._objective_function_fast,
            bounds=[(0, 1)] * n_metrics,
            strategy='best1bin',
            maxiter=max_iter,
            popsize=popsize,
            tol=0.001,
            mutation=(0.5, 1.5),
            recombination=0.7,
            seed=seed,
            workers=n_workers,
            updating='deferred' if n_workers > 1 else 'immediate',
            polish=True,
            disp=True
        )
        
        del self._opt_precomputed_metrics
        del self._opt_target_scores_cache
        del self._opt_metric_names
        del self._opt_n_metrics
        
        weights_raw = result.x
        weights_sum = sum(weights_raw)
        
        if weights_sum < 1e-10:
            print("⚠️ WARNING: Все веса близки к нулю, используем равные веса")
            weights_normalized = np.ones(n_metrics) / n_metrics
        else:
            weights_normalized = weights_raw / weights_sum
        
        best_score = -result.fun
        
        return weights_normalized, best_score
    
    def _objective_function_fast(self, weights):
        """
        БЫСТРАЯ версия: использует кэшированные target scores.
        Не нужно вызывать ROUGE на каждом шаге!
        """
        total_score = 0.0
        
        for metrics_list, target_scores in zip(
            self._opt_precomputed_metrics, 
            self._opt_target_scores_cache
        ):
            # Вычисляем weighted scores для всех кандидатов
            weighted_scores = [
                sum(weights[i] * metrics_dict.get(self._opt_metric_names[i], 0) 
                    for i in range(self._opt_n_metrics))
                for metrics_dict in metrics_list
            ]
            
            # Выбираем лучшего по нашим метрикам
            best_idx = np.argmax(weighted_scores)
            
            # Берём его целевой score из кэша (БЕЗ вызова ROUGE!)
            total_score += target_scores[best_idx]
        
        # Усредняем
        avg_score = total_score / len(self._opt_precomputed_metrics)
        
        return -avg_score

# Метрики

In [6]:
!pip -q install bert_score rouge_score

from collections import Counter
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import warnings

warnings.filterwarnings('ignore')

def exact_match(prediction, reference):
    return prediction.strip().lower() == reference.strip().lower()
# Higher is better (0-1)

def token_f1(prediction, reference):
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0.0
    
    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    
    return f1
# Higher is better (0-1)

def compute_bleu(prediction, reference):
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)
# Higher is better (0-1)

def compute_rouge(prediction, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }
# Higher is better (0-1)

def compute_bertscore(predictions, references):
    P, R, F1 = bert_score(predictions, references, lang='en', verbose=False)
    return {
        'precision': P.mean().item(),
        'recall': R.mean().item(),
        'f1': F1.mean().item()
    }
# Higher is better (0-1)

def compute_meteor(prediction, reference):
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    
    return meteor_score([ref_tokens], pred_tokens)
# Higher is better (0-1)

def perplexity(model, tokenizer, texts):
    total_loss = 0
    total_tokens = 0
    
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt').to(model.device)
        
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs.input_ids)
            loss = outputs.loss
        
        total_loss += loss.item() * inputs.input_ids.size(1)
        total_tokens += inputs.input_ids.size(1)
    
    return np.exp(total_loss / total_tokens)
# Lower is better

def distinct_n(texts, n=2):
    all_ngrams = []
    
    for text in texts:
        tokens = text.lower().split()
        ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
        all_ngrams.extend(ngrams)
    
    if not all_ngrams:
        return 0.0
    
    return len(set(all_ngrams)) / len(all_ngrams)
# Higher is better (0-1, measures diversity)

def self_bleu(texts):
    if len(texts) < 2:
        return 0.0
    
    scores = []
    smoothing = SmoothingFunction().method1
    
    for i, text in enumerate(texts):
        others = texts[:i] + texts[i+1:]
        if not others:
            continue
        
        text_tokens = text.lower().split()
        if not text_tokens:
            continue
        
        refs_tokens = [other.lower().split() for other in others if other.strip()]
        refs_tokens = [ref for ref in refs_tokens if ref]
        
        if not refs_tokens:
            continue
        
        try:
            score = sentence_bleu(refs_tokens, text_tokens, smoothing_function=smoothing)
            scores.append(score)
        except:
            continue
    
    return np.mean(scores) if scores else 0.0
# Lower is better (0-1, measures diversity - lower means more diverse)

predictions = [
    "The cat sat on the mat",
    "A dog runs in the park",
    "She loves reading books"
]

references = [
    "A cat was sitting on the mat",
    "The dog is running",
    "She loves reading books"
]

em_scores = [exact_match(p, r) for p, r in zip(predictions, references)]
f1_scores = [token_f1(p, r) for p, r in zip(predictions, references)]
bleu_scores = [compute_bleu(p, r) for p, r in zip(predictions, references)]

rouge_scores = [compute_rouge(p, r) for p, r in zip(predictions, references)]
rouge_avg = {
    'rouge1': np.mean([s['rouge1'] for s in rouge_scores]),
    'rouge2': np.mean([s['rouge2'] for s in rouge_scores]),
    'rougeL': np.mean([s['rougeL'] for s in rouge_scores])
}

bertscore = compute_bertscore(predictions, references)
meteor_scores = [compute_meteor(p, r) for p, r in zip(predictions, references)]

distinct = distinct_n(predictions, n=2)
sbleu = self_bleu(predictions)

print(f"EM: {np.mean(em_scores):.3f}")
print(f"F1: {np.mean(f1_scores):.3f}")
print(f"BLEU: {np.mean(bleu_scores):.3f}")
print(f"ROUGE-1: {rouge_avg['rouge1']:.3f}")
print(f"ROUGE-2: {rouge_avg['rouge2']:.3f}")
print(f"ROUGE-L: {rouge_avg['rougeL']:.3f}")
print(f"BERTScore F1: {bertscore['f1']:.3f}")
print(f"METEOR: {np.mean(meteor_scores):.3f}")
print(f"Distinct-2: {distinct:.3f}")
print(f"Self-BLEU: {sbleu:.3f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EM: 0.333
F1: 0.672
BLEU: 0.411
ROUGE-1: 0.738
ROUGE-2: 0.455
ROUGE-L: 0.672
BERTScore F1: 0.966
METEOR: 0.684
Distinct-2: 1.000
Self-BLEU: 0.027


# работа с A100

In [None]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,  # BF16
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    'Qwen/Qwen2.5-7B-Instruct',
    quantization_config=bnb_config,
    device_map="auto",  # одна GPU
    attn_implementation="flash_attention_2"  # Flash Attention, ускорение модели, качество не ухудшается
)

args = TrainingArguments(
    output_dir='./output',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=False,
    bf16=True,  # BF16
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

# RAG

In [None]:
class RAG:
    def __init__(self, checkpoint='BAAI/bge-base-en-v1.5', device='cuda'):
        self.model = AutoModel.from_pretrained(checkpoint).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.device = device
        
        self.x_texts = None
        self.y_texts = None
        self.embeddings = None

    def fit(self, x_texts, y_texts, batch_size=32):
        self.x_texts = x_texts
        self.y_texts = y_texts
        all_embeddings = []

        for i in tqdm(range(0, len(x_texts), batch_size), desc='RAG fitting'):
            batch = x_texts[i:i + batch_size]
            
            inputs = self.tokenizer(
                batch,
                max_length=512,
                truncation=True,
                padding='longest',
                return_tensors='pt'
            ).to(self.device)
            
            with torch.no_grad():
                embeddings = self.model(**inputs).last_hidden_state[:, 0]
            
            all_embeddings.append(embeddings.cpu())

        self.embeddings = torch.cat(all_embeddings, dim=0).numpy()
        self.embeddings = self.embeddings / np.linalg.norm(
            self.embeddings, axis=1, keepdims=True
        )

    def predict(self, x_texts, k=3, batch_size=32):
        if isinstance(x_texts, str):
            x_texts = [x_texts]
            single = True
        else:
            single = False

        all_results = []
        
        for i in range(0, len(x_texts), batch_size):
            batch = x_texts[i:i + batch_size]
            
            inputs = self.tokenizer(
                batch,
                max_length=512,
                truncation=True,
                padding='longest',
                return_tensors='pt'
            ).to(self.device)
            
            with torch.no_grad():
                query_embs = self.model(**inputs).last_hidden_state[:, 0]
            
            query_embs = query_embs.cpu().numpy()
            query_embs = query_embs / np.linalg.norm(query_embs, axis=1, keepdims=True)
            
            similarities = np.dot(query_embs, self.embeddings.T)
            
            for j, sims in enumerate(similarities):
                top_k = np.argsort(sims)[-k - len(x_texts):][::-1]
                
                results = []
                for idx in top_k:
                    if self.x_texts[idx] == batch[j]:
                        continue

                    results.append({
                        'x': self.x_texts[idx],
                        'y': self.y_texts[idx],
                        'similarity': float(sims[idx]),
                        'index': int(idx)
                    })
                
                all_results.append(results[:k])
        
        return all_results[0] if single else all_results

# target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'] для LLaMA / Llama-2 / Llama-3 / Mistral / Qwen / Yi

# Форматы данных для LLM

1. Instruction-following (самый популярный)

In [None]:
# Формат Alpaca
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
{response}"""

# Формат ChatML (для chat-моделей)
template = """<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_message}<|im_end|>
<|im_start|>assistant
{assistant_response}<|im_end|>"""

# Формат Llama/Mistral Instruct
template = """<s>[INST] {instruction} [/INST] {response}</s>"""

2. Структура датасета

In [None]:
# Вариант 1: Простой (для SFTTrainer)
dataset = [
    {"text": "### Вопрос: Столица России?\n### Ответ: Москва"},
    {"text": "### Вопрос: 2+2=?\n### Ответ: 4"},
]

# Вариант 2: Разделенный (лучше для контроля)
dataset = [
    {
        "instruction": "Столица России?",
        "response": "Москва"
    },
    {
        "instruction": "2+2=?",
        "response": "4"
    },
]

# Вариант 3: С контекстом
dataset = [
    {
        "instruction": "Суммаризируй текст",
        "input": "Длинный текст...",
        "output": "Краткое содержание"
    }
]

# Маскирование промпта

Способ 1: Автоматический (SFTTrainer)

In [None]:
from trl import SFTTrainer
from datasets import Dataset

# 1. Подготовка данных
data = [
    {"instruction": "Столица России?", "response": "Москва"},
    {"instruction": "Автор 'Война и мир'?", "response": "Лев Толстой"},
]

dataset = Dataset.from_list(data)

# 2. Функция форматирования
def formatting_func(example):
    return f"### Инструкция:\n{example['instruction']}\n\n### Ответ:\n{example['response']}"

# 3. Обучение
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # эффективный batch = 4*4 = 16
    learning_rate=2e-4,
    fp16=True,  # или bf16=True для новых GPU
    logging_steps=10,
    save_strategy="epoch",
    optim="paged_adamw_8bit",  # экономия памяти
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    formatting_func=formatting_func,
    max_seq_length=512,  # максимальная длина последовательности
    peft_config=lora_config,  # из прошлого блока
    args=training_args,
)

trainer.train()

Способ 2: Продвинутый (кастомный Data Collator)

In [None]:
from dataclasses import dataclass
from typing import Dict, List
from transformers import DataCollatorForLanguageModeling

@dataclass
class DataCollatorForCompletionOnlyLM:
    tokenizer: any
    response_template: str = "### Ответ:\n"
    mlm: bool = False
    
    def __call__(self, examples: List[Dict[str, List[int]]]) -> Dict[str, any]:
        batch = {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }
        
        for example in examples:
            # Токенизируем полный текст
            full_text = example["text"]
            tokenized = self.tokenizer(
                full_text,
                truncation=True,
                max_length=512,
                padding=False,
            )
            
            input_ids = tokenized["input_ids"]
            
            # Находим где начинается ответ
            response_token_ids = self.tokenizer.encode(
                self.response_template, 
                add_special_tokens=False
            )
            
            # Ищем шаблон в input_ids
            labels = [-100] * len(input_ids)
            
            for i in range(len(input_ids) - len(response_token_ids)):
                if input_ids[i:i+len(response_token_ids)] == response_token_ids:
                    # Нашли начало ответа
                    response_start = i + len(response_token_ids)
                    labels[response_start:] = input_ids[response_start:]
                    break
            
            batch["input_ids"].append(input_ids)
            batch["attention_mask"].append(tokenized["attention_mask"])
            batch["labels"].append(labels)
        
        # Padding
        from torch.nn.utils.rnn import pad_sequence
        import torch
        
        batch["input_ids"] = pad_sequence(
            [torch.tensor(x) for x in batch["input_ids"]], 
            batch_first=True, 
            padding_value=self.tokenizer.pad_token_id
        )
        batch["attention_mask"] = pad_sequence(
            [torch.tensor(x) for x in batch["attention_mask"]], 
            batch_first=True, 
            padding_value=0
        )
        batch["labels"] = pad_sequence(
            [torch.tensor(x) for x in batch["labels"]], 
            batch_first=True, 
            padding_value=-100
        )
        
        return batch

Способ 3: Готовый инструмент из trl

In [None]:
from trl import DataCollatorForCompletionOnlyLM

# Указываем шаблон ответа
response_template = "### Ответ:\n"

collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template,
    tokenizer=tokenizer,
    mlm=False
)

# Используем в Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    data_collator=collator,
    # ...
)

# TrainingArguments: Что важно

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    # === Основное ===
    output_dir="./results",
    num_train_epochs=3,
    
    # === Batch size ===
    per_device_train_batch_size=4,  # на 1 GPU
    gradient_accumulation_steps=4,   # накапливаем градиенты
    # Реальный batch = 4 * 4 = 16
    
    # === Learning rate ===
    learning_rate=2e-4,  # для LoRA обычно выше: 1e-4 до 5e-4
    lr_scheduler_type="cosine",  # или "linear"
    warmup_steps=100,  # или warmup_ratio=0.1
    
    # === Оптимизация памяти ===
    fp16=True,  # для старых GPU (V100, RTX 2080)
    # bf16=True,  # для новых GPU (A100, RTX 3090+) - лучше чем fp16
    gradient_checkpointing=True,  # экономия памяти за счет скорости
    optim="paged_adamw_8bit",  # 8-bit optimizer от bitsandbytes
    
    # === Логирование ===
    logging_steps=10,
    logging_dir="./logs",
    report_to="tensorboard",  # или "wandb"
    
    # === Сохранение ===
    save_strategy="epoch",  # "steps", "epoch", "no"
    save_total_limit=2,  # храним только 2 последних чекпоинта
    
    # === Eval (если есть val set) ===
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    
    # === Прочее ===
    remove_unused_columns=False,  # важно для SFTTrainer
    dataloader_num_workers=4,  # параллельная загрузка данных
)

# Полный пример: От данных до обученной модели

In [1]:
!pip -q install transformers>=4.38.0 trl>=0.8.0 peft>=0.9.0 bitsandbytes

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer

checkpoint = "mistralai/Mistral-7B-v0.1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_type=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map='auto'
)

prepare_model_for_kbit_training(model)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'
)

train_data = [
    {
        "question": "Переведи на английский: Привет, как дела?",
        "answer": "Hello, how are you?"
    },
    {
        "question": "Реши: 15 * 8",
        "answer": "120"
    }
]
dataset = Dataset.from_list([
    {"text": f"<s>[INST] {item['question']} [/INST] {item['answer']}</s>"}
    for item in train_data
])

args = TrainingArguments(
    optim='paged_adamw_8bit',
    report_to='none',
    output_dir='./result',
    fp16=torch.cuda.is_available()
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
    args=args,
    peft_config=lora_config
)
trainer.train()
trainer.save_model("./qa_model")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
cudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
bigframes 2.12.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
libcugraph-cu12 25.6.0 requires libraft-cu12==25.6.*, but you have libraft-cu12 25.2.0 which is incompatible.
cudf-polars-cu12 25.6.0 requires pylibcudf-cu12==25.6.*, but you have pylibcudf-cu12 25.2.2 which is incompatible.
pylibcugraph-cu12 25.6.0 requires pylibraft-cu12==25.6.*, but you have pylibraft-cu12 25.2.0 which is incompatible.
pylibcugraph-cu12 

2025-11-09 07:06:42.452891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762672002.680153      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762672002.744361      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
  return fn(*args, **kwargs)


Step,Training Loss


In [2]:
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(model, "./qa_model")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

In [None]:
# Подготовка промпта
prompt = "<s>[INST] Привет, как дела? [/INST]"

# Токенизация
inputs = tokenizer(prompt, return_tensors="pt").to(trainer.model.device)

# Генерация
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.2
)

# Декодирование
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [None]:
inputs = tokenizer('<s>[INST] Приветики-пистолетики! [/INST]</s>', return_tensors='pt').to(trainer.model.device)
outputs = model(
    **inputs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetitiin_penalty=1.2
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Подготовка промпта
prompt = "<s>[INST] Привет, как дела? [/INST]"

# Токенизация
inputs = tokenizer(prompt, return_tensors="pt").to(trainer.model.device)

# Генерация
outputs = model.generate(
    **inputs,
    
    # === Длина ===
    max_new_tokens=256,      # максимум новых токенов
    min_new_tokens=10,       # минимум (опционально)
    max_length=512,          # альтернатива: общая длина (prompt + generation)
    
    # === Стратегия декодирования ===
    do_sample=True,          # False = greedy, True = sampling
    
    # === Temperature (креативность) ===
    temperature=0.7,         # 0.1 = консервативно, 1.0 = нормально, 2.0 = креативно
                             # <0.7: факты, код, переводы
                             # 0.7-1.0: обычная генерация
                             # >1.0: креативное письмо
    
    # === Top-p (nucleus sampling) ===
    top_p=0.9,              # рассматриваем топ токенов с суммарной вероятностью 90%
                            # 0.9-0.95: хороший баланс
                            # 0.5: более консервативно
                            # 0.99: почти все токены
    
    # === Top-k sampling ===
    top_k=50,               # рассматриваем только топ-50 токенов
                            # обычно 40-100
                            # 0 = выключено
    
    # === Repetition penalty ===
    repetition_penalty=1.2, # штраф за повторения
                            # 1.0 = нет штрафа
                            # 1.1-1.5: легкий штраф (обычно хорошо)
                            # >1.5: сильный штраф
    
    # # === Stopping criteria ===
    # eos_token_id=tokenizer.eos_token_id,
    # pad_token_id=tokenizer.pad_token_id,
    
    # === Другое ===
    num_return_sequences=1,  # сколько вариантов генерировать
    num_beams=1,            # beam search (1 = выключен)
)

# Декодирование
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Если не хватает памяти

In [None]:
# Уменьшите:
per_device_train_batch_size=2  # было 4
max_seq_length=256  # было 512
r=8  # было 16 в LoRA

# Добавьте:
gradient_checkpointing=True
optim="paged_adamw_8bit"

# Блок 3: Inference, генерация и валидация

Часть 1: Загрузка обученной модели

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto"
)

model = PeftModel.from_pretrained(model, "./qa_model")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model.eval()

Часть 2: Генерация текста

In [6]:
# Подготовка промпта
prompt = "<s>[INST] Hello! [/INST]"

# Токенизация
inputs = tokenizer(prompt, return_tensors="pt").to(trainer.model.device)

# Генерация
outputs = trainer.model.generate(
    **inputs,
    max_new_tokens=100,  # сколько токенов сгенерировать
    do_sample=False,     # greedy decoding
)

# Декодирование
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[INST] Hello! [/INST]

Hi there, I'm new to this forum. My name is Kieran and I am from the United Kingdom (hence my accent). I have been playing TF2 for 5 years now, and have always loved it. It has never failed to give me fun times with friends or strangers alike.
I've just recently got back into trading after a long break, so if you want to trade with someone who knows what they are doing then feel free to send me an offer. Also, any tips would be appreciated as I'm not very good at trading myself xD .
I will also like to join your group, as I think that we could help each other out in trades.


Параметры генерации

In [4]:
outputs = model.generate(
    **inputs,
    
    # === Длина ===
    max_new_tokens=256,      # максимум новых токенов
    min_new_tokens=10,       # минимум (опционально)
    max_length=512,          # альтернатива: общая длина (prompt + generation)
    
    # === Стратегия декодирования ===
    do_sample=True,          # False = greedy, True = sampling
    
    # === Temperature (креативность) ===
    temperature=0.7,         # 0.1 = консервативно, 1.0 = нормально, 2.0 = креативно
                             # <0.7: факты, код, переводы
                             # 0.7-1.0: обычная генерация
                             # >1.0: креативное письмо
    
    # === Top-p (nucleus sampling) ===
    top_p=0.9,              # рассматриваем топ токенов с суммарной вероятностью 90%
                            # 0.9-0.95: хороший баланс
                            # 0.5: более консервативно
                            # 0.99: почти все токены
    
    # === Top-k sampling ===
    top_k=50,               # рассматриваем только топ-50 токенов
                            # обычно 40-100
                            # 0 = выключено
    
    # === Repetition penalty ===
    repetition_penalty=1.2, # штраф за повторения
                            # 1.0 = нет штрафа
                            # 1.1-1.5: легкий штраф (обычно хорошо)
                            # >1.5: сильный штраф
    
    # === Stopping criteria ===
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    
    # === Другое ===
    num_return_sequences=1,  # сколько вариантов генерировать
    num_beams=1,            # beam search (1 = выключен)
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Комбинации параметров для разных задач

In [None]:
# 1. Факты, QA, перевод (нужна точность)
generation_config_precise = {
    "max_new_tokens": 100,
    "do_sample": True,
    "temperature": 0.3,
    "top_p": 0.85,
    "repetition_penalty": 1.1,
}

# 2. Обычная генерация (баланс)
generation_config_balanced = {
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.2,
}

# 3. Креативное письмо
generation_config_creative = {
    "max_new_tokens": 300,
    "do_sample": True,
    "temperature": 1.0,
    "top_p": 0.95,
    "repetition_penalty": 1.15,
}

# 4. Детерминированная генерация (для debug)
generation_config_greedy = {
    "max_new_tokens": 100,
    "do_sample": False,  # greedy decoding
}

# Использование
outputs = model.generate(**inputs, **generation_config_balanced)

Batch генерация

In [None]:
prompts = [
    "<s>[INST] Столица России? [/INST]",
    "<s>[INST] 2+2=? [/INST]",
    "<s>[INST] Кто написал 'Евгений Онегин'? [/INST]"
]

# Токенизация с padding
inputs = tokenizer(
    prompts, 
    return_tensors="pt", 
    padding=True,  # важно!
    truncation=True,
    max_length=512
).to(model.device)

# Генерация
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id
)

# Декодирование
for i, output in enumerate(outputs):
    text = tokenizer.decode(output, skip_special_tokens=True)
    print(f"Prompt {i}: {text}\n")

In [None]:
def generate_response(model, tokenizer, promts):
    inputs = tokenizer(
        promts,
        return_tensors='pt',
        padding='longest',
        truncation=True,
        max_length=512
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        max_new_tokens=256
    )

    texts = []
    for output in outputs:
        output = output[inputs.input_ids.shape[1]:]
        text = tokenizer.decode(output, skip_special_tokens=True)
        texts.append(text)

    return texts

In [None]:
def template_processing(question, answer=None):
    if answer is None:
        return tokenizer.apply_chat_template(
            [{'role': 'user', 'content': f'здесь задача модели:\n\n{question}'}],
            tokenize=False,
            add_generation_promt=True
        )
    else:
        return tokenizer.apply_chat_template(
            [{'role': 'user', 'content': f'здесь задача модели:\n\n{question}'},
             {'role': 'assistant', 'content': answer}],
            tokenize=False
        )

# Часть 3: Валидация в процессе обучения

Вариант 1: Простой callback для генерации примеров

In [None]:
from transformers import TrainerCallback

class GenerationCallback(TrainerCallback):
    def __init__(self, tokenizer, test_prompts, every_n_steps=100):
        self.tokenizer = tokenizer
        self.test_prompts = test_prompts
        self.every_n_steps = every_n_steps
    
    def on_step_end(self, args, state, control, model=None, **kwargs):
        if state.global_step % self.every_n_steps == 0:
            print(f"\n{'='*50}")
            print(f"Generation at step {state.global_step}")
            print(f"{'='*50}")
            
            model.eval()
            for prompt in self.test_prompts:
                inputs = self.tokenizer(prompt, return_tensors="pt").to(model.device)
                
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=50,
                        temperature=0.7,
                        top_p=0.9,
                        do_sample=True
                    )
                
                generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                print(f"\nPrompt: {prompt}")
                print(f"Generated: {generated}")
            
            model.train()
            print(f"{'='*50}\n")

# Использование
test_prompts = [
    "<s>[INST] Столица России? [/INST]",
    "<s>[INST] Что такое Python? [/INST]",
]

trainer = SFTTrainer(
    model=model,
    # ... остальные параметры
    callbacks=[GenerationCallback(tokenizer, test_prompts, every_n_steps=50)]
)

Вариант 2: Validation set с метриками

In [None]:
from datasets import Dataset

# Разделяем данные
train_data = data[:800]
val_data = data[800:]

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# В TrainingArguments добавляем
training_args = TrainingArguments(
    # ...
    evaluation_strategy="steps",  # или "epoch"
    eval_steps=100,               # оценивать каждые 100 шагов
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # добавили validation set
    # ...
)

Вариант 3: Кастомные метрики (ROUGE, BLEU)

In [None]:
import numpy as np
from evaluate import load

# Загружаем метрики
rouge = load('rouge')
bleu = load('bleu')

# Сам генерируй и считай метрики
def evaluate_model(model, val_data):
    rouge = load('rouge')
    predictions = []
    references = []
    
    for item in val_data[:50]:
        prompt = f"<s>[INST] {item['question']} [/INST]"
        
        # Генерируем
        generated = generate_response(model, tokenizer, prompt)
        
        predictions.append(generated)
        references.append(item['answer'])
    
    scores = rouge.compute(predictions=predictions, references=references)
    return scores  # чем больше, тем лучше

########################################## ИЛИ

rouge = evaluate.load('rouge')
def evaluate_model(model, val_data):
    predictions = generate_response(model, tokenizer, [f"<s>[INST] {item['question']} [/INST]" for item in val_data])
    references = [item['answer'] for item in val_data]

    return rouge.compute(predictions=predictions, references=references)

########################################## ИЛИ

rouge = evaluate.load('rouge')
def evaluate_model(model, tokenizer, val_data, batch_size=8):
    predictions = []
    for i in tqdm(range(0, len(val_data), batch_size)):
        batch = [val_data[j] for j in range(i, min(i+batch_size, len(val_data)))]
        batch_predictions = generate_response(
            model, tokenizer,
            [tokenizer.apply_chat_template(
                [{'role': 'user', 'content': item['text']}],
                add_generation_prompt=True, tokenize=False
            ) for item in batch]
        )
        predictions.extend(batch_predictions)
    references = [item['summary'] for item in val_data]
    return rouge.compute(predictions=predictions, references=references)

Вариант 4: Полноценная валидация с генерацией (для олимпиады)

In [None]:
def evaluate_model(model, tokenizer, val_data, num_samples=50):
    """
    Оценка модели на validation set с реальной генерацией
    """
    model.eval()
    results = {
        "rouge1": [],
        "rouge2": [],
        "rougeL": [],
        "exact_match": 0,
    }
    
    rouge_metric = load('rouge')
    
    for i, example in enumerate(val_data[:num_samples]):
        # Формируем промпт
        prompt = f"<s>[INST] {example['instruction']} [/INST]"
        true_response = example['response']
        
        # Генерируем
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )
        
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Убираем промпт из сгенерированного текста
        generated = generated.replace(prompt, "").strip()
        
        # Считаем метрики
        rouge_scores = rouge_metric.compute(
            predictions=[generated],
            references=[true_response]
        )
        
        results["rouge1"].append(rouge_scores["rouge1"])
        results["rouge2"].append(rouge_scores["rouge2"])
        results["rougeL"].append(rouge_scores["rougeL"])
        
        # Exact match (для простых задач типа QA)
        if generated.strip().lower() == true_response.strip().lower():
            results["exact_match"] += 1
    
    # Усредняем
    final_results = {
        "rouge1": np.mean(results["rouge1"]),
        "rouge2": np.mean(results["rouge2"]),
        "rougeL": np.mean(results["rougeL"]),
        "exact_match": results["exact_match"] / num_samples,
    }
    
    model.train()
    return final_results

# Использование
val_results = evaluate_model(model, tokenizer, val_data)
print(val_results)

# Часть 4: Debugging и типичные проблемы

Проблема 1: Модель повторяет промпт

In [None]:
# Проблема:
prompt = "Вопрос: Столица России?"
# Генерация: "Вопрос: Столица России? Вопрос: Столица России? Вопрос..."

# Решение 1: Увеличить repetition_penalty
outputs = model.generate(
    **inputs,
    repetition_penalty=1.5,  # было 1.2
)

# Решение 2: Правильно форматировать промпт (использовать тот же формат что при обучении)
prompt = "<s>[INST] Столица России? [/INST]"  # как в обучении!

# Решение 3: Убрать промпт из вывода
generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = generated.replace(prompt, "").strip()

Проблема 2: Модель генерирует бессмыслицу

In [None]:
# Причины:
# 1. Слишком высокий temperature
temperature=0.5  # вместо 1.5

# 2. Модель недообучена
# Проверьте loss, обучите дольше

# 3. Слишком мало данных
# Нужно минимум 100-500 качественных примеров

# 4. Неправильный формат промпта
# Используйте ТОТ ЖЕ формат что и при обучении!

Проблема 3: Модель обрывается на середине

In [None]:
# Проблема: генерация заканчивается слишком рано

# Решение 1: Увеличить max_new_tokens
max_new_tokens=256  # было 50

# Решение 2: Проверить eos_token
print(f"EOS token: {tokenizer.eos_token}")
print(f"EOS token ID: {tokenizer.eos_token_id}")

# Решение 3: Добавить min_new_tokens
min_new_tokens=20

Проблема 4: Медленная генерация

In [None]:
# Решение 1: Использовать квантизацию
# (уже покрыто выше)

# Решение 2: Уменьшить max_new_tokens
max_new_tokens=100  # было 512

# Решение 3: Использовать greedy вместо sampling
do_sample=False  # быстрее, но менее разнообразно

# Решение 4: Batch inference
# (покрыто выше)

Часть 6: Быстрый inference для олимпиады

In [None]:
class LLMInference:
    def __init__(self, base_model_name, adapter_path, use_4bit=True):
        """Класс для быстрого inference"""
        
        if use_4bit:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=bnb_config,
                device_map="auto"
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
        
        self.model = PeftModel.from_pretrained(self.model, adapter_path)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
        self.model.eval()
    
    def generate(self, prompt, max_new_tokens=100, temperature=0.7, **kwargs):
        """Генерация одного ответа"""
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=kwargs.get('top_p', 0.9),
                do_sample=True,
                repetition_penalty=kwargs.get('repetition_penalty', 1.2),
                pad_token_id=self.tokenizer.pad_token_id,
            )
        
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Убираем промпт
        response = generated.replace(prompt, "").strip()
        return response
    
    def batch_generate(self, prompts, **kwargs):
        """Batch генерация"""
        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.model.device)
        
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=kwargs.get('max_new_tokens', 100),
                temperature=kwargs.get('temperature', 0.7),
                top_p=kwargs.get('top_p', 0.9),
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
            )
        
        results = []
        for i, output in enumerate(outputs):
            generated = self.tokenizer.decode(output, skip_special_tokens=True)
            response = generated.replace(prompts[i], "").strip()
            results.append(response)
        
        return results

# Использование
inferencer = LLMInference(
    base_model_name="mistralai/Mistral-7B-v0.1",
    adapter_path="./final_model"
)

response = inferencer.generate(
    "<s>[INST] Столица России? [/INST]",
    max_new_tokens=50,
    temperature=0.5
)
print(response)

Часть 7: Сохранение и загрузка для submission

In [None]:
# === После обучения ===

# Вариант 1: Сохранить только LoRA адаптер (маленький размер)
trainer.save_model("./lora_adapter")
# Размер: ~10-50 MB

# Вариант 2: Объединить и сохранить полную модель
model = model.merge_and_unload()
model.save_pretrained("./full_model")
tokenizer.save_pretrained("./full_model")
# Размер: ~13 GB для 7B модели

# === Для загрузки ===

# Вариант 1: Загрузить LoRA адаптер
base_model = AutoModelForCausalLM.from_pretrained(...)
model = PeftModel.from_pretrained(base_model, "./lora_adapter")

# Вариант 2: Загрузить полную модель
model = AutoModelForCausalLM.from_pretrained("./full_model")
tokenizer = AutoTokenizer.from_pretrained("./full_model")