In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install gigachat

In [None]:
import json
from sklearn.metrics import accuracy_score, f1_score
from gigachat import GigaChat

data = []
with open('/kaggle/input/medmed/dev.json', 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))
results = []

PROMPT = """Ответь на вопрос учитывая контекст
Контекст: {context}
Вопрос: {question}

Обязательно ответь либо "да", либо "нет"."""

with GigaChat(credentials='myapi', verify_ssl_certs=False) as giga:
    
    for item in data:
        context = item["context"]
        question = item["question"]
        correct_answer = item["answer"]
        
        response_text = PROMPT.format(context=context, question=question)
        print(response_text)
        response = giga.chat(response_text)
        predicted_answer = response.choices[0].message.content.strip().lower()
        
        results.append({
            "context": context,
            "question": question,
            "predicted_answer": predicted_answer,
            "correct_answer": correct_answer
        })

y_true = [item["correct_answer"] for item in data]
y_pred = [item["predicted_answer"] for item in results]

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label="да")

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)


In [2]:
!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft




In [3]:
import json
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, XLMRobertaForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training, TaskType
from accelerate import Accelerator
from torch.utils.data import Dataset

# Инициализация Accelerate
accelerator = Accelerator()

# Загрузка и подготовка данных
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line.strip()) for line in f]
    return data

train_data = load_data('/kaggle/input/medmed/train.json')
dev_data = load_data('/kaggle/input/medmed/dev.json')

# Создание кастомного датасета
class QLoraDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):  # Уменьшение max_length до 512
        self.texts = [f"{item['context']} {item['question']}" for item in data]
        self.labels = [1 if item['answer'].lower() == 'да' else 0 for item in data]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt")
        tokens = {key: val.squeeze(0) for key, val in tokens.items()}  # Убираем первую размерность
        tokens['labels'] = torch.tensor(label, dtype=torch.long)
        return tokens

# Загрузка токенизатора и модели
tokenizer = AutoTokenizer.from_pretrained("sagteam/xlm-roberta-large-sag")
model = XLMRobertaForSequenceClassification.from_pretrained('sagteam/xlm-roberta-large-sag', num_labels=2)

# Применение QLoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "key", "value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    layers_to_transform=[i for i in range(24) if i >= 15]
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Создание датасетов
train_dataset = QLoraDataset(train_data, tokenizer, max_length=512)
dev_dataset = QLoraDataset(dev_data, tokenizer, max_length=512)

# Разделение тренировочного набора на тренировочную и валидационную части

# Настройка аргументов для тренировки
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    report_to="none",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=8,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    optim="adamw_8bit",
    fp16=True,
    learning_rate=2e-4,
)

# Функция для расчета метрик
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds)
    return {"accuracy": accuracy, "f1": f1}

# Инициализация тренера
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Подготовка модели и данных для использования с Accelerator
trainer.model, trainer.train_dataloader, trainer.eval_dataloader = accelerator.prepare(
    trainer.model, trainer.get_train_dataloader(), trainer.get_eval_dataloader()
)

# Обучение модели
trainer.train()

# Финальная оценка на dev наборе
final_eval_results = trainer.evaluate(eval_dataset=dev_dataset)
print(f"Final evaluation results on dev dataset: {final_eval_results}")


2024-08-27 11:08:27.054997: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-27 11:08:27.055154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-27 11:08:27.211278: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at sagteam/xlm-roberta-large-sag and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.6872,0.725142,0.5,0.666667
100,0.734,0.695784,0.5,0.666667
150,0.6832,0.736011,0.5,0.0
200,0.7186,0.719923,0.5,0.666667
250,0.7633,0.705061,0.5,0.0
300,0.7423,0.701221,0.5,0.0
350,0.713,0.697474,0.5,0.0
400,0.8736,0.791031,0.5,0.0
450,0.7349,0.696363,0.5,0.666667
500,0.8721,0.693552,0.492188,0.551724


Final evaluation results on dev dataset: {'eval_loss': 1.22214937210083, 'eval_accuracy': 0.79296875, 'eval_f1': 0.7969348659003832, 'eval_runtime': 16.1643, 'eval_samples_per_second': 15.837, 'eval_steps_per_second': 1.98, 'epoch': 10.0}


In [6]:
model.print_trainable_parameters()


trainable params: 1,936,386 || all params: 561,828,868 || trainable%: 0.3447
