# Import necessary libraries

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load the data to train

In [8]:
# Create the df and load the data inside with two columns
column_names = ['label', 'text']
data = pd.read_fwf('TRAINING_DATA.txt', header=None, names=column_names)

# 17877 rows

In [201]:
# Lets see it
data.head()

Unnamed: 0,label,text
0,1,"Cuando conocí a Janice en 2013 , una familia n..."
1,0,Hwang habló en Sur de este año por Southwest M...
2,1,Usted podría pensar Katy Perry y Robert Pattin...
3,1,Cualquiera que haya volado los cielos del crea...
4,1,"Bueno , este cantante tendrá un LARGO tiempo p..."


# Perform a train_test_split for testing

In [203]:
X = data['text']
y = data['label']

In [204]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [205]:
# Take only a certain amount of data to make training and predicting faster to tune parameters
X_train = X_train[:1500]
X_test = X_test[:1000]
y_train = y_train[:1500]
y_test = y_test[:1000]

# Initialise the tokenizer for BERT

In [2]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

# Preprocess the data for BERT

Define a function for preprocessing

In [3]:
def preprocess_text_for_bert(text):
    # Clean the text: join back into a single string without modifying the content
    text = ' '.join(text.split())
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Add [CLS] and [SEP] tokens
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    return tokens

Preprocess X_train and X_test

In [206]:
# Apply preprocessing to the text
train_encodings = [preprocess_text_for_bert(text) for text in X_train]
test_encodings = [preprocess_text_for_bert(text) for text in X_test]

# Convert tokenized text data into numerical IDs for train and test sets
train_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_encodings]
test_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_encodings]

# Pad sequences to a fixed length for train and test sets
max_length = max(max(len(ids) for ids in train_input_ids), max(len(ids) for ids in test_input_ids))
train_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in train_input_ids]
test_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in test_input_ids]

# Convert input_ids lists into PyTorch tensors for train and test sets
train_input_ids = torch.tensor(train_input_ids)
test_input_ids = torch.tensor(test_input_ids)

# Create attention masks to indicate which tokens are actual words and which are padding tokens for train and test sets
train_attention_masks = torch.where(train_input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))
test_attention_masks = torch.where(test_input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))

# Create labels tensors for train and test sets
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create TensorDatasets for train and test sets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Define the batch size and create dataloaders to feed the BERT model

In [None]:
# Define batch size
batch_size = 32

# Create DataLoaders for train and test sets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Create and initialize the model

In [4]:
# Create the model
class CustomBERTForClassification(nn.Module):
    def __init__(self, num_labels):
        super(CustomBERTForClassification, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('dccuchile/bert-base-spanish-wwm-cased', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits

# Initialize the fine-tuning model
num_labels = 2  # Number of output labels (machine-written, human-written)
model = CustomBERTForClassification(num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define loss function, optimiser, num of epochs and move model to gpu ( Nvidia 4060 )

In [5]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

# Define number of training epochs
num_epochs = 10

# Move model and data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CustomBERTForClassification(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(31002, 768, padding_idx=1)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, o

# Perform the training loop and print loss per epoch

In [208]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_masks)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print average loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_dataloader)}")

Epoch 1, Loss: 0.6350215077400208
Epoch 2, Loss: 0.389578825774345
Epoch 3, Loss: 0.1716717522036522
Epoch 4, Loss: 0.08509116011493384
Epoch 5, Loss: 0.05483895638323528
Epoch 6, Loss: 0.03891185656724934
Epoch 7, Loss: 0.03221550868933109
Epoch 8, Loss: 0.034046128768711645
Epoch 9, Loss: 0.039166630590037305
Epoch 10, Loss: 0.07996122936837058


# Save the trained model if needed

In [211]:
# Save the trained model
# torch.save(model.state_dict(), "test_model.pth")

# Evaluate the model based on accuracy, precision, recall and f1 score at different threshold values

In [210]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to evaluate the model with an adjustable threshold
def evaluate_model_with_threshold(model, dataloader, threshold=0.5):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            outputs = model(input_ids, attention_masks)
            logits = outputs  # Directly use the output tensor
            probabilities = torch.sigmoid(logits)[:, 1]  # Assuming the positive class is at index 1
            predictions = (probabilities > threshold).long()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='binary')
    recall = recall_score(all_labels, all_predictions, average='binary')
    f1 = f1_score(all_labels, all_predictions, average='binary')

    return accuracy, precision, recall, f1


# Evaluate on the validation/test set with a lower threshold to improve recall
threshold = 0.7  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.6  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.5  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.3  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.2  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.1  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")


Performance with 0.7 threshold

Validation Accuracy: 0.67
Validation Precision: 0.6348973607038123
Validation Recall: 0.8424124513618677
Validation F1-Score: 0.7240802675585284

----------------------------------------------

Performance with 0.6 threshold

Validation Accuracy: 0.662
Validation Precision: 0.6253561253561254
Validation Recall: 0.8540856031128404
Validation F1-Score: 0.7220394736842105

----------------------------------------------

Performance with 0.5 threshold

Validation Accuracy: 0.652
Validation Precision: 0.6130790190735694
Validation Recall: 0.8754863813229572
Validation F1-Score: 0.7211538461538461

----------------------------------------------

Performance with 0.3 threshold

Validation Accuracy: 0.619
Validation Precision: 0.5813953488372093
Validation Recall: 0.9241245136186771
Validation F1-Score: 0.7137490608564989

----------------------------------------------

Performance with 0.2 threshold

Validation Accuracy: 0.593
Validation Precision: 0.5608646188

# Here is the performance we get after tuning our model and changing certains parameters (A/B testing manually)

In [None]:
# settings : batch size 32, epoch 10, learning rate 3e-5


# Performance with 0.7 threshold

# Validation Accuracy: 0.67
# Validation Precision: 0.6348973607038123
# Validation Recall: 0.8424124513618677
# Validation F1-Score: 0.7240802675585284

# ----------------------------------------------

# Performance with 0.6 threshold

# Validation Accuracy: 0.662
# Validation Precision: 0.6253561253561254
# Validation Recall: 0.8540856031128404
# Validation F1-Score: 0.7220394736842105

# ----------------------------------------------

# Performance with 0.5 threshold

# Validation Accuracy: 0.652
# Validation Precision: 0.6130790190735694
# Validation Recall: 0.8754863813229572
# Validation F1-Score: 0.7211538461538461

# ----------------------------------------------

# Performance with 0.3 threshold

# Validation Accuracy: 0.619
# Validation Precision: 0.5813953488372093
# Validation Recall: 0.9241245136186771
# Validation F1-Score: 0.7137490608564989

# ----------------------------------------------

# Performance with 0.2 threshold

# Validation Accuracy: 0.593
# Validation Precision: 0.5608646188850968
# Validation Recall: 0.9591439688715954
# Validation F1-Score: 0.7078248384781048

# ----------------------------------------------

# Performance with 0.1 threshold

# Validation Accuracy: 0.539
# Validation Precision: 0.5275181723779855
# Validation Recall: 0.9883268482490273
# Validation F1-Score: 0.6878808395396073

These metrics are only with a model trained on a small ammount of data and should be used to tune parameters.

# Lets now get performance metrics on the whole data set with a train_test split of 0.15

Same process as in the beginning but now with all the rows in the data

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# Apply preprocessing to the text
train_encodings = [preprocess_text_for_bert(text) for text in X_train]
test_encodings = [preprocess_text_for_bert(text) for text in X_test]

# Convert tokenized text data into numerical IDs for train and test sets
train_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_encodings]
test_input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_encodings]

# Pad sequences to a fixed length for train and test sets
max_length = max(max(len(ids) for ids in train_input_ids), max(len(ids) for ids in test_input_ids))
train_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in train_input_ids]
test_input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in test_input_ids]

# Convert input_ids lists into PyTorch tensors for train and test sets
train_input_ids = torch.tensor(train_input_ids)
test_input_ids = torch.tensor(test_input_ids)

# Create attention masks to indicate which tokens are actual words and which are padding tokens for train and test sets
train_attention_masks = torch.where(train_input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))
test_attention_masks = torch.where(test_input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))

# Create labels tensors for train and test sets
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Create TensorDatasets for train and test sets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

In [None]:
# Define batch size
batch_size = 32

# Create DataLoaders for train and test sets
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Initialize the fine-tuning model
num_labels = 2  # Number of output labels (machine-written, human-written)
model = CustomBERTForClassification(num_labels=num_labels)

In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

# Define number of training epochs
num_epochs = 15

# Move model and data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_masks)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print average loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_dataloader)}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 1, Loss: 0.5760435861662815
Epoch 2, Loss: 0.4528866478957628
Epoch 3, Loss: 0.32058884799480436
Epoch 4, Loss: 0.24398892097567257
Epoch 5, Loss: 0.1766134725040511
Epoch 6, Loss: 0.1380147894177782
Epoch 7, Loss: 0.11504603306615824
Epoch 8, Loss: 0.10283022353082504
Epoch 9, Loss: 0.08404091031192557
Epoch 10, Loss: 0.07039247675921376
Epoch 11, Loss: 0.07504533402278628
Epoch 12, Loss: 0.06518921642588746
Epoch 13, Loss: 0.05957319792286542
Epoch 14, Loss: 0.05700705631954694
Epoch 15, Loss: 0.05563991794019545


Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to evaluate the model with an adjustable threshold
def evaluate_model_with_threshold(model, dataloader, threshold=0.5):
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            outputs = model(input_ids, attention_masks)
            logits = outputs  # Directly use the output tensor
            probabilities = torch.sigmoid(logits)[:, 1]  # Assuming the positive class is at index 1
            predictions = (probabilities > threshold).long()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='binary')
    recall = recall_score(all_labels, all_predictions, average='binary')
    f1 = f1_score(all_labels, all_predictions, average='binary')

    return accuracy, precision, recall, f1


# Evaluate on the validation/test set with a lower threshold to improve recall
threshold = 0.7  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.6  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.5  # Adjust this value as needed
val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.3  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.2  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")

print()
print('----------------------------------------------')
print()

threshold = 0.1  # Adjust this value as needed

val_accuracy, val_precision, val_recall, val_f1 = evaluate_model_with_threshold(model, test_dataloader, threshold)
print(f'Performance with {threshold} threshold')
print()
print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-Score: {val_f1}")


Performance with 0.7 threshold

Validation Accuracy: 0.6174496644295302
Validation Precision: 0.6321285140562249
Validation Recall: 0.5808118081180812
Validation F1-Score: 0.6053846153846154

----------------------------------------------

Performance with 0.6 threshold

Validation Accuracy: 0.6260253542132737
Validation Precision: 0.6311475409836066
Validation Recall: 0.6250922509225092
Validation F1-Score: 0.6281053021876158

----------------------------------------------

Performance with 0.5 threshold

Validation Accuracy: 0.6271439224459359
Validation Precision: 0.6236933797909407
Validation Recall: 0.6605166051660517
Validation F1-Score: 0.6415770609318996

----------------------------------------------

Performance with 0.3 threshold

Validation Accuracy: 0.6185682326621924
Validation Precision: 0.6012195121951219
Validation Recall: 0.7276752767527676
Validation F1-Score: 0.6584307178631051

----------------------------------------------

Performance with 0.2 threshold

Validati

In [None]:
# Performance with 0.7 threshold

# Validation Accuracy: 0.6174496644295302
# Validation Precision: 0.6321285140562249
# Validation Recall: 0.5808118081180812
# Validation F1-Score: 0.6053846153846154

# ----------------------------------------------

# Performance with 0.6 threshold

# Validation Accuracy: 0.6260253542132737
# Validation Precision: 0.6311475409836066
# Validation Recall: 0.6250922509225092
# Validation F1-Score: 0.6281053021876158

# ----------------------------------------------

# Performance with 0.5 threshold

# Validation Accuracy: 0.6271439224459359
# Validation Precision: 0.6236933797909407
# Validation Recall: 0.6605166051660517
# Validation F1-Score: 0.6415770609318996

# ----------------------------------------------

# Performance with 0.3 threshold

# Validation Accuracy: 0.6185682326621924
# Validation Precision: 0.6012195121951219
# Validation Recall: 0.7276752767527676
# Validation F1-Score: 0.6584307178631051

# ----------------------------------------------

# Performance with 0.2 threshold

# Validation Accuracy: 0.6111111111111112
# Validation Precision: 0.589041095890411
# Validation Recall: 0.7616236162361624
# Validation F1-Score: 0.664306404892179

# ----------------------------------------------

# Performance with 0.1 threshold

# Validation Accuracy: 0.5969425801640567
# Validation Precision: 0.5698267074413863
# Validation Recall: 0.8250922509225093
# Validation F1-Score: 0.6741031052155563

Now we can use these metrics to predict the expected accuracy and recall of the model when performing with real data

# Lets train the model with the entire data set

In [None]:
# Apply preprocessing to the text
encodings = [preprocess_text_for_bert(text) for text in X]  # Process the entire dataset

# Convert tokenized text data into numerical IDs
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in encodings]

# Pad sequences to a fixed length
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]

# Convert input_ids list into PyTorch tensors
input_ids = torch.tensor(input_ids)

# Create attention masks to indicate which tokens are actual words and which are padding tokens
attention_masks = torch.where(input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))

# Create labels tensor
labels = torch.tensor(y.values)  # Ensure y is the correct variable for labels

# Create a TensorDataset for the entire dataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Define batch size
batch_size = 32

# Create a DataLoader for the dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# The dataloader is now ready to be used for training the model

In [None]:
# Initialize the fine-tuning model
num_labels = 2  # Number of output labels (machine-written, human-written)
model = CustomBERTForClassification(num_labels=num_labels)

In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-5)

# Define number of training epochs
num_epochs = 15

# Move model and data to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_masks)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    # Print average loss for the epoch
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(dataloader)}")

In [None]:
# Save the trained model
#torch.save(model.state_dict(), "MODEL_ALL_DATA_15_EPOCHS.pth")

Training the model with more epochs would probably improve it's performance, since a single training session takes 35 mins on my hardware, i decided to continue.

# Now lets predict and tag new data

And export it to an output.txt file

PS : Keep in mind that the prediction system below includes a threshold value, that way we can choose to maximise Accuracy or Recall

A higher value around 0.5 to 0.6 will maximise Accuracy

A lower value around 0.2 to 0.1 will maximise Recall

In [8]:
def preprocess_text_for_predictions(text):
    # Clean the text: join back into a single string without modifying the content
    text = ' '.join(text.split())
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    # Add [CLS] and [SEP] tokens
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # Convert tokens to numerical IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    return input_ids

In [9]:
# Load the trained model
model.load_state_dict(torch.load('MODEL_ALL_DATA_15_EPOCHS.pth', map_location=device))

# Define the threshold we want
    # Here we picked 0.1 to maximise Recall
best_threshold = 0.1

# Read the input file
with open('REAL_DATA.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Preprocess the sentences
input_ids = [preprocess_text_for_predictions(line.split('\t', 1)[1].strip()) for line in lines]

# Pad sequences to a fixed length
max_length = max(len(ids) for ids in input_ids)
input_ids = [ids + [tokenizer.pad_token_id] * (max_length - len(ids)) for ids in input_ids]

# Convert to PyTorch tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.where(input_ids != tokenizer.pad_token_id, torch.tensor(1), torch.tensor(0))

# Create a TensorDataset and DataLoader
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Function to make predictions with the specified threshold
def predict_with_threshold(model, dataloader, threshold):
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks = batch
            input_ids, attention_masks = input_ids.to(device), attention_masks.to(device)

            outputs = model(input_ids, attention_masks)
            probabilities = torch.sigmoid(outputs)[:, 1]  # Assuming the positive class is at index 1
            predictions = (probabilities > threshold).long()

            all_predictions.extend(predictions.cpu().numpy())
    return all_predictions

# Get predictions for the sentences
predictions = predict_with_threshold(model, dataloader, best_threshold)

# Replace the number 2 with the predicted labels
output_lines = []
for line, prediction in zip(lines, predictions):
    parts = line.strip().split('\t', 1)
    new_line = f"{prediction}\t{parts[1]}\n"
    output_lines.append(new_line)

# Save the modified content to a new file
output_file_path = 'output.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(output_lines)

print(f"Predictions saved to {output_file_path}")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Predictions saved to output.txt
