In [1]:
#!pip3 install datasets

In [2]:
import os
import torch

import numpy as np
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix, roc_auc_score
from datasets import DatasetDict
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from glob import glob
from torch.utils.data import DataLoader

import torch.nn.functional as F

# Device selection: run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [3]:
# @title Customize your key variables here
# Training configuration

# Hyperparameters used by the data loaders, the optimizer and the training loop.
# Maximum token length of an encoded text pair; the tokenizer truncates longer pairs.
MAX_LEN = 200 # @param {type:"integer"}
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 16 # @param {type:"integer"}
# Batch size shared by the validation and test DataLoaders.
VALID_BATCH_SIZE = 16 # @param {type:"integer"}
# Number of passes over the training set.
EPOCHS = 4 # @param {type:"integer"}
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-4 # @param {type:"number"}

In [4]:
# Collect the machine-generated JSONL files. glob() yields paths in an arbitrary,
# filesystem-dependent order, so sort them: the order of these files fixes the
# row order of the concatenated frame, which in turn feeds the seeded
# sample/split steps. Sorting keeps those steps reproducible across machines.
machines_files = sorted(glob('./data/machine/*.jsonl'))
len(machines_files)

13

In [5]:
# Human-written texts, one JSON record per line.
df_human = pd.read_json('./data/human.jsonl', lines=True)

# Machine-generated texts: read every file first and concatenate once.
# Calling pd.concat inside a loop re-copies the accumulated frame on each
# iteration, which is quadratic in the total number of rows.
df_machine = pd.concat([pd.read_json(file, lines=True) for file in machines_files])


# Drop the first '/'-separated component of every id so that human and machine
# records can be matched on the remainder.
# NOTE(review): presumably the leading component names the source/generator - confirm.
df_human["id"] = df_human["id"].str.split('/').str[1:].str.join('/')
df_machine["id"] = df_machine["id"].str.split('/').str[1:].str.join('/')

# Pair each human text with every machine text sharing its id (pd.merge defaults
# to an inner join). After the merge, text_1 is the human text and text_2 the
# machine text, which is what the two target columns encode.
df_combined = pd.merge(df_human, df_machine, on="id", suffixes=("_1", "_2"))
df_combined['target_human'] = 1
df_combined['target_machine'] = 0
df_combined

Unnamed: 0,id,text_1,text_2,target_human,target_machine
0,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Inaugural Address: President Joseph R. Biden J...,1,0
1,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,What should be the focus of the speech? The In...,1,0
2,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address Highlights Triumph o...,1,0
3,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address: A Clarion Call for ...,1,0
4,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,"President Biden Emphasizes Unity, Democracy, a...",1,0
...,...,...,...,...,...
14126,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Long Island Surf Shop Owner Reme...,1,0
14127,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Surf Shop Owner in Hometown Reme...,1,0
14128,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Kind-Hearted Sou...,1,0
14129,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Super Kind-Heart...,1,0


In [6]:
# Pick a reproducible random half of the rows and swap their two text columns,
# so the human-written text does not always sit in the text_1 position.
random_indices = df_combined.sample(frac=0.5, random_state=42).index
swapped_texts = df_combined.loc[random_indices, ['text_2', 'text_1']].values
df_combined.loc[random_indices, ['text_1', 'text_2']] = swapped_texts

# Flip the labels on the swapped rows so they still describe text_1.
df_combined.loc[random_indices, ['target_human']] = 0
df_combined.loc[random_indices, ['target_machine']] = 1

# Collapse both flags into a single (target_human, target_machine) tuple per row
# and drop the columns that are no longer needed.
df_combined['target_tuple'] = list(
    zip(df_combined['target_human'], df_combined['target_machine'])
)
df_combined = df_combined.drop(columns=['id', 'target_human', 'target_machine'])
df_combined

Unnamed: 0,text_1,text_2,target_tuple
0,Inaugural Address: President Joseph R. Biden J...,Inaugural Address by President Joseph R. Biden...,"(0, 1)"
1,Inaugural Address by President Joseph R. Biden...,What should be the focus of the speech? The In...,"(1, 0)"
2,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address Highlights Triumph o...,"(1, 0)"
3,Biden's Inaugural Address: A Clarion Call for ...,Inaugural Address by President Joseph R. Biden...,"(0, 1)"
4,Inaugural Address by President Joseph R. Biden...,"President Biden Emphasizes Unity, Democracy, a...","(1, 0)"
...,...,...,...
14126,Gabby Petito: Long Island Surf Shop Owner Reme...,Gabby Petito case: Surf shop owner in her home...,"(0, 1)"
14127,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Surf Shop Owner in Hometown Reme...,"(1, 0)"
14128,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Kind-Hearted Sou...,"(1, 0)"
14129,Gabby Petito Remembered as a 'Super Kind-Heart...,Gabby Petito case: Surf shop owner in her home...,"(0, 1)"


In [7]:
# First cut: keep 70% of the rows for training and hold out 30%, stratified on
# the label tuple.
# NOTE(review): the split is row-wise, and one human text appears in several
# rows (once per matching machine text), so the same human text can end up in
# more than one split - confirm this is intended.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_combined[["text_1", "text_2"]],
    df_combined["target_tuple"],
    test_size=0.3,
    random_state=43,
    stratify=df_combined["target_tuple"],
)

# Second cut: one third of the hold-out becomes the test set, the rest validation.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=1/3,
    random_state=43,
    stratify=y_holdout,
)

X_train.shape, X_val.shape, X_test.shape

((9891, 2), (2826, 2), (1414, 2))

In [8]:
class AiClassificationDataset(Dataset):
    """Map-style dataset over text pairs and their labels.

    Args:
        dataframe: frame with 'text_1' and 'text_2' columns.
        labels: pandas object aligned row-for-row with ``dataframe``.

    Both inputs are re-indexed from zero so they can be addressed by position.
    """

    def __init__(self, dataframe, labels):
        self.data = dataframe.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pair at position ``index`` as a dict with keys
        'text_1', 'text_2' and 'targets'."""
        row = self.data.iloc[index]
        return {
            'text_1': row['text_1'],
            'text_2': row['text_2'],
            'targets': self.labels.iloc[index],
        }

In [9]:
class AiClassificationCollator:
    """Collate function that turns a list of dataset items into model tensors.

    Args:
        dataset: stored on ``self.data``; not read while collating.
        tokenizer: callable that encodes two parallel lists of strings and
            returns 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: maximum encoded length passed to the tokenizer.
    """

    def __init__(self, dataset, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataset
        self.max_len = max_len

    def __call__(self, input_batch):
        """Encode a batch of items.

        Returns a dict with long tensors 'ids', 'mask' and 'token_type_ids',
        plus a float tensor 'targets'.
        """
        # Transpose the list of per-item dicts into one list per column.
        columns = {
            key: [sample[key] for sample in input_batch]
            for key in input_batch[0]
        }

        # Collapse every run of whitespace in both texts to a single space.
        first_texts, second_texts = (
            [" ".join(raw.split()) for raw in columns[field]]
            for field in ('text_1', 'text_2')
        )

        # Encode the two lists as sentence pairs, padded to the longest pair in
        # the batch and truncated to max_len.
        encoded = self.tokenizer(
            first_texts,
            second_texts,
            max_length=self.max_len,
            padding=True,
            truncation=True,
            return_token_type_ids=True
        )

        def as_long(name):
            return torch.tensor(encoded[name], dtype=torch.long)

        return {
            'ids': as_long('input_ids'),
            'mask': as_long('attention_mask'),
            'token_type_ids': as_long('token_type_ids'),
            'targets': torch.tensor(columns['targets'], dtype=torch.float)
        }

In [10]:
# Load the tokenizer for the same checkpoint name that TransformerClass uses for
# its BERT encoder.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [11]:
# Wrap each split in a Dataset.
training_set = AiClassificationDataset(X_train, y_train)
validation_set = AiClassificationDataset(X_val, y_val)
test_set = AiClassificationDataset(X_test, y_test)


def _loader_params(dataset, batch_size, shuffle):
    """Build the DataLoader keyword arguments for one split."""
    return {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'num_workers': 0,
        'collate_fn': AiClassificationCollator(dataset, tokenizer, MAX_LEN),
    }


# Only the training split is shuffled; validation and test keep their order.
train_params = _loader_params(training_set, TRAIN_BATCH_SIZE, shuffle=True)
val_params = _loader_params(validation_set, VALID_BATCH_SIZE, shuffle=False)
test_params = _loader_params(test_set, VALID_BATCH_SIZE, shuffle=False)

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
test_loader = DataLoader(test_set, **test_params)

In [12]:
class TransformerClass(torch.nn.Module):
    """BERT encoder followed by a small two-layer classification head.

    The head applies dropout to the hidden state of the first token, projects
    it from 768 to 256 features with a GELU activation, and maps the result to
    2 output logits.
    """

    def __init__(self):
        super().__init__()
        # The attribute names l1..l4 become the key prefixes of the state_dict,
        # so they must stay as they are for saved checkpoints to load.
        self.l1 = transformers.BertModel.from_pretrained('google-bert/bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 256)
        self.l4 = torch.nn.Linear(256, 2)

    def forward(self, ids, mask, token_type_ids):
        """Return the 2-way logits for a batch of encoded text pairs."""
        encoder_out = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
        )

        # Hidden state at sequence position 0 of every example.
        first_token = encoder_out.last_hidden_state[:, 0]
        projected = self.l3(self.l2(first_token))
        return self.l4(F.gelu(projected))

In [None]:
# class TransformerClass(torch.nn.Module):
#  def __init__(self):
#     super(TransformerClass, self).__init__()
#     self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
#     self.l2 = torch.nn.Linear(768, 768)
#     self.l3 = torch.nn.Dropout(0.1)
#     self.l4 = torch.nn.CosineSimilarity(dim=1)
#     self.l5 = torch.nn.Linear(1, 1)
    
#  def forward(self, ids_0, mask_0, token_type_ids_0, ids_1, mask_1, token_type_ids_1):
#     last_hidden_state_a = self.l1(ids_0, attention_mask=mask_0, token_type_ids=token_type_ids_0).last_hidden_state[:, 0]
#     last_hidden_state_b = self.l1(ids_1, attention_mask=mask_1, token_type_ids=token_type_ids_1).last_hidden_state[:, 0]
#     x_a, x_b = self.l2(last_hidden_state_a), self.l2(last_hidden_state_b)
#     x_a, x_b = torch.gelu(self.l3(x_a)), torch.gelu(self.l3(x_b))
#     sem_sim = self.l4(x_a, x_b)
#     weighted_sem_sim = self.l5(sem_sim)
#     return weighted_sem_sim

In [14]:
def training_step(input_ids, attention_mask, token_type_ids, y, model, optimizer):
    """Run one optimization step on a single batch.

    Args:
        input_ids, attention_mask, token_type_ids: forwarded positionally to ``model``.
        y: targets handed to ``cross_entropy`` together with the model's logits.
        model: callable producing the logits.
        optimizer: optimizer that is stepped and then has its gradients cleared.

    Returns:
        The mean cross-entropy loss of the batch, as a tensor.
    """
    predictions = model(input_ids, attention_mask, token_type_ids)
    batch_loss = torch.nn.functional.cross_entropy(predictions, y, reduction='mean')

    # Backpropagate, apply the update, then clear the gradients so the next
    # call starts from a clean slate.
    batch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return batch_loss

In [15]:
# Build the classifier on the selected device and attach an Adam optimizer to
# all of its parameters.
model = TransformerClass().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [16]:
def validate():
    """Evaluate the global ``model`` on ``validation_loader``.

    Puts the model in eval mode and iterates the loader without tracking
    gradients.

    Returns:
        (average cross-entropy loss per example, accuracy) over the whole loader.
    """
    model.eval()
    summed_loss = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch in validation_loader:
            ids, mask, segments, gold = (
                batch[key].to(device)
                for key in ('ids', 'mask', 'token_type_ids', 'targets')
            )

            scores = model(ids, mask, segments)
            # Summed (not averaged) per batch so the final mean is per example.
            summed_loss += torch.nn.functional.cross_entropy(scores, gold, reduction='sum').item()

            # Targets carry one column per class; compare arg-max class indices.
            hits = scores.argmax(dim=1) == gold.argmax(dim=1)
            n_correct += hits.sum().item()
            n_seen += gold.size(0)

    return summed_loss / n_seen, n_correct / n_seen


In [17]:
def train(epoch, log_interval=200, save_model_path='./model_weights'):
    """Train the global ``model`` for one epoch, validate it and save a checkpoint.

    Args:
        epoch: zero-based epoch index; used only for log messages and the
            checkpoint file name.
        log_interval: print the running average loss every this many steps.
        save_model_path: directory that receives ``model_epoch_<n>.pth``;
            created when missing.

    Reads the module-level ``model``, ``optimizer``, ``training_loader``,
    ``device`` and ``EPOCHS``.
    """
    model.train()
    running_loss = 0

    for step, data in enumerate(training_loader):
        input_ids = data['ids'].to(device)
        attention_mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['targets'].to(device)

        loss = training_step(input_ids, attention_mask, token_type_ids, targets, model, optimizer)
        running_loss += loss.item()

        if step % log_interval == 0:
            # Mean loss over the steps seen so far in this epoch.
            avg_loss = running_loss / (step + 1)
            print(f"Epoch {epoch + 1}/{EPOCHS}, Step {step + 1}/{len(training_loader)}")
            print(f"  Running Loss: {avg_loss:.4f}")

    avg_train_loss = running_loss / len(training_loader)

    avg_val_loss, val_accuracy = validate()

    print(f"Epoch {epoch + 1}/{EPOCHS} - End of epoch")
    print(f"  Training Loss: {avg_train_loss:.4f}")
    print(f"  Validation Loss: {avg_val_loss:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")

    # exist_ok=True creates the directory if needed in a single call, instead
    # of a separate existence check that can race with another process.
    os.makedirs(save_model_path, exist_ok=True)

    model_save_path = os.path.join(save_model_path, f"model_epoch_{epoch + 1}.pth")
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

# Run the full schedule: each call trains for one epoch, validates, and writes
# a checkpoint named after the epoch.
for epoch in range(EPOCHS):
    train(epoch)

Epoch 1/4, Step 1/619
  Running Loss: 0.6878
Epoch 1/4, Step 201/619
  Running Loss: 0.2178
Epoch 1/4, Step 401/619
  Running Loss: 0.1649
Epoch 1/4, Step 601/619
  Running Loss: 0.1311
Epoch 1/4 - End of epoch
  Training Loss: 0.1275
  Validation Loss: 0.0371
  Validation Accuracy: 0.9919
Model saved to ./model_weights\model_epoch_1.pth
Epoch 2/4, Step 1/619
  Running Loss: 0.0393
Epoch 2/4, Step 201/619
  Running Loss: 0.0286
Epoch 2/4, Step 401/619
  Running Loss: 0.0255
Epoch 2/4, Step 601/619
  Running Loss: 0.0247
Epoch 2/4 - End of epoch
  Training Loss: 0.0244
  Validation Loss: 0.0171
  Validation Accuracy: 0.9958
Model saved to ./model_weights\model_epoch_2.pth
Epoch 3/4, Step 1/619
  Running Loss: 0.0005
Epoch 3/4, Step 201/619
  Running Loss: 0.0311
Epoch 3/4, Step 401/619
  Running Loss: 0.0474
Epoch 3/4, Step 601/619
  Running Loss: 0.0427
Epoch 3/4 - End of epoch
  Training Loss: 0.0421
  Validation Loss: 0.0340
  Validation Accuracy: 0.9904
Model saved to ./model_weight

In [22]:
# Rebuild the architecture, then restore the weights written after the last epoch.
model = TransformerClass()

# Derive the file name from EPOCHS so it always matches the last checkpoint
# train() writes, instead of hard-coding the epoch number.
model_save_path = os.path.join('./model_weights', f"model_epoch_{EPOCHS}.pth")
# map_location lets a checkpoint saved on the GPU load on a CPU-only machine.
# weights_only=True limits unpickling to tensors and plain containers, which
# avoids executing arbitrary pickled code and silences torch.load's warning.
model.load_state_dict(torch.load(model_save_path, map_location=device, weights_only=True))

model.to(device)

  model.load_state_dict(torch.load(model_save_path))


TransformerClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [23]:
def test(test_loader, model, device):
    model.eval()
    test_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for data in test_loader:
            input_ids = data['ids'].to(device)
            attention_mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            # Forward pass
            logits = model(input_ids, attention_mask, token_type_ids)
          
            test_loss += torch.nn.functional.cross_entropy(logits, targets, reduction='sum').item()

            # Predicciones
            preds = torch.argmax(logits, dim=1)
            correct_predictions += (preds == targets.argmax(dim=1)).sum().item()
            total_predictions += targets.size(0)

    avg_test_loss = test_loss / total_predictions
    accuracy = correct_predictions / total_predictions
    return avg_test_loss, accuracy


# Score the restored model on the held-out test split and report both metrics.
test_loss, test_accuracy = test(test_loader, model, device)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.0319
Test Accuracy: 0.9880
