In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd


train_df = pd.read_csv('train.tsv', sep='\t')
test_df = pd.read_csv('test.tsv', sep='\t')

print("Train columns:", train_df.columns.tolist())
print("\nSample data:")
print(train_df.head())
print("\nUnique labels:", train_df[train_df.columns[-1]].unique())

label_mapping = {'NOCUOUS': 0, 'INNOCUOUS': 1}  


text_column = train_df.columns[0]  
label_column = train_df.columns[-1]  

train_df[text_column] = train_df[text_column].astype(str)
test_df[text_column] = test_df[text_column].astype(str)
train_df[label_column] = train_df[label_column].map(label_mapping)
test_df[label_column] = test_df[label_column].map(label_mapping)


train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(test_df)


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    tokenized = tokenizer(
        examples[text_column],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors=None
    )
    
    tokenized['labels'] = examples[label_column]
    
    return tokenized

train_tokenized = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_tokenized = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)


train_tokenized.set_format(type="torch")
val_tokenized.set_format(type="torch")


model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define metrics
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()

eval_results = trainer.evaluate()
print("\nEvaluation Results:")
print(eval_results)


model_save_path = "./ambiguity_detection_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel saved to {model_save_path}")

import json
with open(f"{model_save_path}/label_mapping.json", 'w') as f:
    json.dump(label_mapping, f)

Train columns: ['Unnamed: 0', 'ID', 'Sentence', 'Detected as']

Sample data:
   Unnamed: 0            ID  \
0           1    library#01   
1           2  library#02-1   
2           3  library#02-2   
3           4    library#03   
4           5    library#04   

                                            Sentence Detected as  
0  All material that is stored in the repository ...     NOCUOUS  
1  The Library may want to accept important digit...     NOCUOUS  
2  The Library may want to accept important digit...   INNOCUOUS  
3  Once material has arrived, <referential>it</re...   INNOCUOUS  
4  Allows resources to be reviewed before a decis...   INNOCUOUS  

Unique labels: ['NOCUOUS' 'INNOCUOUS']




Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6872031092643738, 'eval_accuracy': 0.5616438356164384, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.8816, 'eval_samples_per_second': 82.804, 'eval_steps_per_second': 5.671, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6872454881668091, 'eval_accuracy': 0.5616438356164384, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.6806, 'eval_samples_per_second': 107.255, 'eval_steps_per_second': 7.346, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.6902236342430115, 'eval_accuracy': 0.5205479452054794, 'eval_f1': 0.22222222222222224, 'eval_precision': 0.38461538461538464, 'eval_recall': 0.15625, 'eval_runtime': 0.686, 'eval_samples_per_second': 106.408, 'eval_steps_per_second': 7.288, 'epoch': 3.0}
{'train_runtime': 30.0973, 'train_samples_per_second': 13.855, 'train_steps_per_second': 0.897, 'train_loss': 0.697892083062066, 'epoch': 3.0}


  0%|          | 0/5 [00:00<?, ?it/s]


Evaluation Results:
{'eval_loss': 0.6902236342430115, 'eval_accuracy': 0.5205479452054794, 'eval_f1': 0.22222222222222224, 'eval_precision': 0.38461538461538464, 'eval_recall': 0.15625, 'eval_runtime': 0.7187, 'eval_samples_per_second': 101.566, 'eval_steps_per_second': 6.957, 'epoch': 3.0}

Model saved to ./ambiguity_detection_model
