In [66]:
import os
import json
import torch

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix, roc_auc_score
from datasets import DatasetDict
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from glob import glob
from torch.utils.data import DataLoader

# Device selection: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [61]:
# @title Customize your key variables here
# Training configuration. The trailing "# @param" markers look like Colab-style
# form annotations, so the assignment lines are kept exactly as written.

# Defining some key variables that will be used later on in the training
# MAX_LEN is passed to the collator as the tokenizer's max_length.
MAX_LEN = 200 # @param {type:"integer"}
# Batch sizes handed to the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 16 # @param {type:"integer"}
VALID_BATCH_SIZE = 16 # @param {type:"integer"}
# Number of epochs; presumably consumed by the outer training loop — TODO confirm.
EPOCHS = 100 # @param {type:"integer"}
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 1e-4 # @param {type:"number"}

In [59]:
# Collect every machine-generated JSONL file. glob() returns paths in an
# arbitrary, filesystem-dependent order, so sort them: the row order of the
# concatenated frame feeds the seeded shuffle/split steps, which are only
# reproducible if that order is stable.
machines_files = sorted(glob('./data/machine/*.jsonl'))
len(machines_files)

13

In [49]:
# Human-written texts: one JSON record per line.
df_human = pd.read_json('./data/human.jsonl', lines=True)

# Machine-generated texts: read every file once and concatenate in a single
# call (concatenating inside a loop copies the growing frame on every pass).
if not machines_files:
    raise FileNotFoundError("No machine-generated files found in ./data/machine/")
df_machine = pd.concat(
    [pd.read_json(file, lines=True) for file in machines_files],
    ignore_index=True,
)

# Drop the first '/'-separated component of each id so that human and machine
# records can be joined on the remaining part of the id.
df_human["id"] = df_human["id"].str.split('/').str[1:].str.join('/')
df_machine["id"] = df_machine["id"].str.split('/').str[1:].str.join('/')

# One row per (human text, machine text) pair: text_1 is human, text_2 is machine.
df_combined = pd.merge(df_human, df_machine, on="id", suffixes=("_1", "_2"))
df_combined['target_human'] = 1
df_combined['target_machine'] = 0
df_combined

Unnamed: 0,id,text_1,text_2,target_human,target_machine
0,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Inaugural Address: President Joseph R. Biden J...,1,0
1,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,What should be the focus of the speech? The In...,1,0
2,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address Highlights Triumph o...,1,0
3,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address: A Clarion Call for ...,1,0
4,news-2021-01-01-2021-12-31-bideninauguration/a...,Inaugural Address by President Joseph R. Biden...,"President Biden Emphasizes Unity, Democracy, a...",1,0
...,...,...,...,...,...
14126,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Long Island Surf Shop Owner Reme...,1,0
14127,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Surf Shop Owner in Hometown Reme...,1,0
14128,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Kind-Hearted Sou...,1,0
14129,news-2021-01-01-2021-12-31-wyominggabbypetito/...,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Super Kind-Heart...,1,0


In [50]:
# Swap the two texts in a seeded random half of the rows so that the human
# text is not always in the first position; the targets are flipped to match.
random_indices = df_combined.sample(frac=0.5, random_state=42).index

swapped_texts = df_combined.loc[random_indices, ['text_2', 'text_1']].to_numpy()
df_combined.loc[random_indices, ['text_1', 'text_2']] = swapped_texts
df_combined.loc[random_indices, 'target_human'] = 0
df_combined.loc[random_indices, 'target_machine'] = 1

# (1, 0) -> text_1 is the human text; (0, 1) -> text_2 is the human text.
df_combined['target_tuple'] = list(
    zip(df_combined['target_human'], df_combined['target_machine'])
)
df_combined = df_combined.drop(columns=['id', 'target_human', 'target_machine'])
df_combined

Unnamed: 0,text_1,text_2,target_tuple
0,Inaugural Address: President Joseph R. Biden J...,Inaugural Address by President Joseph R. Biden...,"(0, 1)"
1,Inaugural Address by President Joseph R. Biden...,What should be the focus of the speech? The In...,"(1, 0)"
2,Inaugural Address by President Joseph R. Biden...,Biden's Inaugural Address Highlights Triumph o...,"(1, 0)"
3,Biden's Inaugural Address: A Clarion Call for ...,Inaugural Address by President Joseph R. Biden...,"(0, 1)"
4,Inaugural Address by President Joseph R. Biden...,"President Biden Emphasizes Unity, Democracy, a...","(1, 0)"
...,...,...,...
14126,Gabby Petito: Long Island Surf Shop Owner Reme...,Gabby Petito case: Surf shop owner in her home...,"(0, 1)"
14127,Gabby Petito case: Surf shop owner in her home...,Gabby Petito: Surf Shop Owner in Hometown Reme...,"(1, 0)"
14128,Gabby Petito case: Surf shop owner in her home...,Gabby Petito Remembered as a 'Kind-Hearted Sou...,"(1, 0)"
14129,Gabby Petito Remembered as a 'Super Kind-Heart...,Gabby Petito case: Surf shop owner in her home...,"(0, 1)"


In [51]:
# 70 / 20 / 10 split: hold out 30% first, then cut a third of that hold-out
# off as the test set. Both cuts are stratified on the target tuple.
# NOTE(review): the same human text appears in several rows (one per machine
# file), so pairs sharing it can land in different splits — confirm whether
# that leakage is acceptable.
pair_texts = df_combined[["text_1", "text_2"]]
pair_targets = df_combined["target_tuple"]

X_train, X_holdout, y_train, y_holdout = train_test_split(
    pair_texts,
    pair_targets,
    test_size=0.3,
    random_state=43,
    stratify=pair_targets,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=1/3,
    random_state=43,
    stratify=y_holdout,
)

X_train.shape, X_val.shape, X_test.shape

((9891, 2), (2826, 2), (1414, 2))

In [62]:
class AiClassificationDataset(Dataset):
    """Map-style dataset over text pairs and their targets.

    Args:
        dataframe: pandas DataFrame with the columns 'text_1' and 'text_2'.
        labels: pandas Series holding one target per row of `dataframe`
            (a DataFrame with a 'target_tuple' column is also accepted).

    Each item is a dict with the keys 'text_1', 'text_2' and 'targets'.
    """

    def __init__(self, dataframe, labels):
        # Accept a one-column frame as well as a plain Series of targets.
        if isinstance(labels, pd.DataFrame):
            labels = labels['target_tuple']
        self.data = dataframe
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Positional access is required: after a shuffled split the index
        # labels are no longer 0..n-1, while samplers hand out positions.
        return {
            'text_1': self.data['text_1'].iloc[index],
            'text_2': self.data['text_2'].iloc[index],
            'targets': self.labels.iloc[index]
        }

In [65]:
class AiClassificationCollator(Dataset):
    """Collate function that tokenizes a batch of text pairs.

    Instances are meant to be passed as `collate_fn` to a DataLoader. The
    `Dataset` base class is kept only so existing references keep working;
    nothing in this class relies on it.

    Args:
        dataset: the dataset the collator is attached to (stored, not read here).
        tokenizer: callable tokenizer accepting two lists of texts; it must
            support `len()` and return an object exposing `input_ids`,
            `attention_mask` and `token_type_ids`.
        max_len: maximum number of tokens per encoded pair.
    """

    def __init__(self, dataset, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataset
        self.max_len = max_len
        self.vocabulary = len(self.tokenizer)

    def __call__(self, input_batch):
        """Turn a list of dataset items into a dict of batch tensors.

        Args:
            input_batch: list of dicts with the keys 'text_1', 'text_2', 'targets'.

        Returns:
            dict with 'ids', 'mask', 'token_type_ids' (long tensors) and
            'targets' (float tensor).
        """
        comment_text_1 = [item['text_1'] for item in input_batch]
        comment_text_2 = [item['text_2'] for item in input_batch]
        targets = [item['targets'] for item in input_batch]

        # truncation=True is required for max_length to take effect; without
        # it, over-long pairs are passed through untruncated.
        inputs = self.tokenizer(
            comment_text_1,
            comment_text_2,
            max_length=self.max_len,
            truncation=True,
            padding=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs.input_ids, dtype=torch.long),
            'mask': torch.tensor(inputs.attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(inputs.token_type_ids, dtype=torch.long),
            'targets': torch.tensor(targets, dtype=torch.float)
        }

In [68]:
# Load the tokenizer for the "google-bert/bert-base-uncased" checkpoint — the
# same checkpoint name the model class loads its BERT encoder from.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [69]:
# Wrap the splits as datasets. Note that `testing_set` is built from the
# validation split (X_val / y_val).
training_set = AiClassificationDataset(X_train, y_train)
testing_set = AiClassificationDataset(X_val, y_val)

# DataLoader settings: only the training loader shuffles.
train_params = dict(
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=AiClassificationCollator(training_set, tokenizer, MAX_LEN),
)

test_params = dict(
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    collate_fn=AiClassificationCollator(testing_set, tokenizer, MAX_LEN),
)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class TransformerClass(torch.nn.Module):
    """BERT encoder followed by a two-layer classification head.

    The encoder output is mean-pooled over the non-padding tokens, passed
    through dropout, a linear layer with GELU, and a final linear layer that
    produces the logits.

    Args:
        model_name: checkpoint name handed to `BertModel.from_pretrained`.
        dropout: dropout probability applied to the pooled embedding.
        hidden_dim: width of the intermediate linear layer.
        num_labels: number of output logits.
    """

    def __init__(self, model_name='google-bert/bert-base-uncased', dropout=0.3,
                 hidden_dim=256, num_labels=2):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(model_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding it.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, hidden_dim)
        self.l4 = torch.nn.Linear(hidden_dim, num_labels)

    @staticmethod
    def _masked_mean(hidden_states, mask):
        """Average `hidden_states` over dim 1, counting only positions where mask == 1.

        Args:
            hidden_states: tensor of shape (batch, tokens, hidden).
            mask: tensor of shape (batch, tokens) with 1 for real tokens, 0 for padding.

        Returns:
            Tensor of shape (batch, hidden).
        """
        weights = mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        # clamp avoids a division by zero for a row made only of padding.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, ids, mask, token_type_ids):
        """Return the logits for a batch of encoded text pairs.

        Args:
            ids: token ids, shape (batch, tokens).
            mask: attention mask, shape (batch, tokens).
            token_type_ids: segment ids, shape (batch, tokens).

        Returns:
            Tensor of shape (batch, num_labels) with unnormalized logits.
        """
        last_hidden_state = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        ).last_hidden_state

        # Mean of the token embeddings, ignoring padding positions.
        sentence_embedding = self._masked_mean(last_hidden_state, mask)

        hidden_output = torch.nn.functional.gelu(self.l3(self.l2(sentence_embedding)))
        output = self.l4(hidden_output)

        return output

In [None]:
def loss(logits, y):
    """Mean cross-entropy between `logits` and the targets `y`.

    Args:
        logits: unnormalized scores, shape (batch, classes).
        y: targets accepted by `torch.nn.functional.cross_entropy`; the
            collator in this notebook builds them as float tensors of shape
            (batch, classes).

    Returns:
        Scalar tensor with the batch-averaged loss.
    """
    return torch.nn.functional.cross_entropy(logits, y, reduction='mean')


def training_step(input_ids, attention_mask, token_type_ids, y, model, optimizer):
    """Run one optimization step on a single batch.

    Args:
        input_ids, attention_mask, token_type_ids: tensors forwarded to `model`.
        y: targets passed to `loss`.
        model: callable taking (input_ids, attention_mask, token_type_ids)
            and returning logits.
        optimizer: optimizer over the model's parameters.

    Returns:
        The batch loss as a Python float.
    """
    logits = model(input_ids, attention_mask, token_type_ids)

    # Use a distinct local name: assigning to `loss` here would shadow the
    # module-level loss() function and fail before it could be called.
    batch_loss = loss(logits, y)
    batch_loss.backward()

    optimizer.step()
    optimizer.zero_grad()

    return batch_loss.item()
    

In [None]:
# Instantiate the classifier on the selected device and create its optimizer.
model = TransformerClass().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def train(epoch):
    """Train the global `model` for one pass over `training_loader`.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    for batch in training_loader:
        # The model lives on `device`, so every batch tensor must be moved there.
        step_loss = training_step(
            batch['ids'].to(device),
            batch['mask'].to(device),
            batch['token_type_ids'].to(device),
            batch['targets'].to(device),
            model,
            optimizer,
        )
        # Report the value returned by the step; the module-level name `loss`
        # is a function, not the batch loss. Nothing is printed if the step
        # does not return a value.
        if step_loss is not None:
            print(f'Epoch: {epoch}, Loss:  {float(step_loss)}')