In [None]:
# python 3.7 recommended outside of colab
!pip install transformers
!pip install sentencepiece
!pip install tensorflow
# !pip install --upgrade tensorflow-gpu # on colab

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the semicolon-delimited dataset and drop every row with a missing value.
df = pd.read_csv('./dataset.csv', delimiter=';')
df = df.dropna()
# Cast the third column to int by re-assigning the whole column by label.
# `df.iloc[:, 2] = ...` writes into the existing column in place on recent
# pandas versions, which keeps the old dtype and silently discards the cast.
label_column = df.columns[2]
df[label_column] = df.iloc[:, 2].astype(int)
# Split into training and validation sets (70% train, 30% validation);
# the fixed random_state keeps the split reproducible across runs.
train_df, validation_df = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Turn the "answers" (input text) and "check" (label) columns of each split
# into plain Python lists for the tokenizer and tf.data.
x_train = train_df["answers"].tolist()
y_train = train_df["check"].tolist()
x_val = validation_df["answers"].tolist()
y_val = validation_df["check"].tolist()

In [None]:
from transformers import DebertaV2Tokenizer

# Checkpoint identifier for the pretrained model.
model_name = "microsoft/deberta-v3-base"
# `from_pt` is a model-weight-loading flag and has no meaning for a tokenizer,
# so it is not passed here.
tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)

In [None]:
def _encode(texts):
    """Tokenize a list of texts with padding enabled and truncation to at most 512 tokens."""
    return tokenizer(texts, padding=True, truncation=True, max_length=512)


# Each split is encoded in its own tokenizer call.
x_train_enc = _encode(x_train)
x_val_enc = _encode(x_val)

In [None]:
import tensorflow as tf

# Pair each split's encoded features (as a plain dict) with its labels and
# slice them into per-example tf.data elements.
train_features = dict(x_train_enc)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))

val_features = dict(x_val_enc)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_val))

In [None]:
from transformers import TFDebertaV2ForSequenceClassification, TFTrainer, TFTrainingArguments

# Training configuration, grouped by concern.
training_args = TFTrainingArguments(
    # Output and logging directories
    output_dir="./results_deberta",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Step-based logging, evaluation and saving intervals
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=2000,
)


In [None]:

# Instantiate the model inside the distribution strategy scope supplied by the
# training arguments.
with training_args.strategy.scope():
    # Reuse `model_name` (the checkpoint the tokenizer was loaded from) rather
    # than repeating the literal, so tokenizer and model cannot drift apart.
    # from_pt=True asks transformers to load the PyTorch weights into the TF model.
    model = TFDebertaV2ForSequenceClassification.from_pretrained(model_name, from_pt=True)

# Wire the model, arguments and datasets into the trainer; the validation set
# is used for the periodic evaluation configured in `training_args`.
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Evaluate on the validation set and show the result as the cell output.
val_metrics = trainer.evaluate(val_dataset)
val_metrics

In [None]:
# Sanity check: show the shape of the second element of the prediction output.
val_output = trainer.predict(val_dataset)
val_output[1].shape

In [None]:
#Validation data

import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Run the trainer over the validation set and pull out the raw model outputs
# and the reference labels.
predictions = trainer.predict(val_dataset)
raw_predictions = predictions.predictions
true_labels = predictions.label_ids

# The predicted class is the index of the largest value along axis 1.
predicted_labels = np.argmax(raw_predictions, axis=1)

# Accuracy plus macro-averaged recall, precision and F1.
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='macro')
precision = precision_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(
    f"Accuracy Score: {accuracy}",
    f"Recall Score: {recall}",
    f"Precision Score: {precision}",
    f"F1 Score: {f1}",
    sep="\n",
)

In [None]:
# Load the first test set, drop rows with missing values, then tokenize it and
# wrap it in a tf.data.Dataset in the same way as the train/validation splits.
df_test = pd.read_csv('./test_dataset_1.csv', delimiter=';').dropna()
x_test = df_test["text"].tolist()
y_test = df_test["label"].tolist()
x_test_enc = tokenizer(x_test, truncation=True, max_length=512, padding=True)
test_features = dict(x_test_enc)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [None]:
# AI vs. student texts

import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Run the trainer over the first test set and pull out the raw model outputs
# and the reference labels.
predictions = trainer.predict(test_dataset)
raw_predictions = predictions.predictions
true_labels = predictions.label_ids

# The predicted class is the index of the largest value along axis 1.
predicted_labels = np.argmax(raw_predictions, axis=1)

# Accuracy plus macro-averaged recall, precision and F1.
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='macro')
precision = precision_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(
    f"Accuracy Score: {accuracy}",
    f"Recall Score: {recall}",
    f"Precision Score: {precision}",
    f"F1 Score: {f1}",
    sep="\n",
)

In [None]:
# Load the second test set, drop rows with missing values, then tokenize it and
# wrap it in a tf.data.Dataset in the same way as the first test set.
df_test = pd.read_csv('./test_dataset_2.csv', delimiter=';').dropna()
x_test2 = df_test["text"].tolist()
y_test2 = df_test["label"].tolist()
x_test2_enc = tokenizer(x_test2, truncation=True, max_length=512, padding=True)
test2_features = dict(x_test2_enc)
test_dataset2 = tf.data.Dataset.from_tensor_slices((test2_features, y_test2))

In [None]:
# Daigt

import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Score the model on the second test set. This must use `test_dataset2`
# (built from test_dataset_2.csv); predicting on `test_dataset` here would just
# re-report the first test set's scores.
predictions = trainer.predict(test_dataset2)

# Raw model outputs and the reference labels returned by the trainer.
raw_predictions, true_labels = predictions.predictions, predictions.label_ids

# The predicted class is the index of the largest value along axis 1.
predicted_labels = np.argmax(raw_predictions, axis=1)

# Accuracy plus macro-averaged recall, precision and F1.
f1 = f1_score(true_labels, predicted_labels, average='macro')

accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels, average='macro')
precision = precision_score(true_labels, predicted_labels, average='macro')

print(f"Accuracy Score: {accuracy}")
print(f"Recall Score: {recall}")
print(f"Precision Score: {precision}")
print(f"F1 Score: {f1}")

In [None]:
# Save the trainer's model under ./model_deberta.
trainer.save_model("./model_deberta")