In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random

In [None]:
# Load the training data: keep only the text and its annotation, expose the
# annotation under the name `label`, and make sure it is an integer class id.
# Built as one non-mutating chain so that no column is assigned on a
# column-subset of another frame (avoids pandas' SettingWithCopyWarning and
# `inplace=True` mutation).
train_df = (
    pd.read_csv('SynTrainLLMclass.csv')[['text', 'response']]
    .rename(columns={'response': 'label'})
    .astype({'label': int})
)

# The test file already carries a `label` column; only its dtype is normalised.
test_df = pd.read_csv('MainTest.csv').astype({'label': int})

In [None]:
# Tokenizer of the same checkpoint that is loaded as the model further below
tokenizer = AutoTokenizer.from_pretrained("deepset/gbert-large")


def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the module-level tokenizer.

    The tokenizer is called with truncation enabled and ``max_length``
    padding, both bound to a length of 512.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encoding_options)


In [None]:
# Wrap each DataFrame in a `datasets.Dataset` and tokenize it batch-wise
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

In [None]:
# Restrict both splits to the model inputs plus the label, in torch format
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format("torch", columns=model_columns)


In [None]:
# Seed every RNG *before* the model is built. The classification head added
# on top of the pretrained encoder is initialised randomly at load time, and
# the `seed` given to TrainingArguments is only applied later, when the
# Trainer is constructed - too late to make that initialisation reproducible.
SEED = 18  # same value as TrainingArguments(seed=...) in the training cell
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Binary classifier (num_labels=2) on top of the gbert-large checkpoint
model = AutoModelForSequenceClassification.from_pretrained("deepset/gbert-large", num_labels=2)


In [None]:
# Hyper-parameters of the fine-tuning run.
# `save_strategy` and `load_best_model_at_end` are not set, so the library
# defaults apply to checkpointing.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",  # evaluate on eval_dataset once per epoch
    seed=18,
)

# The Trainer uses the test split as its evaluation set during training
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model
trainer.train()

# Store model weights and tokenizer files in the same directory
export_dir = './MAIN_llm18'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
# Run the fine-tuned model over the test split
predictions = trainer.predict(test_dataset)

# Pick, per row, the index of the highest score along axis 1 as the predicted label
logits = predictions.predictions
pred_labels = np.argmax(logits, axis=1)

# Per-class precision / recall / F1 against the labels in test_df
report = classification_report(test_df['label'], pred_labels)
print(report)

In [None]:
# Gather text, gold label and predicted label per test row and export to CSV.
# The column name 'Predicition' (sic) is kept verbatim because it is part of
# the written file's header.
prediction_columns = {
    'Text': test_df['text'],
    'Labels': test_df['label'],
    'Predicition': pred_labels,
}
pred = pd.DataFrame(prediction_columns)

pred.to_csv('bert_real_18_batch16_e5.csv')