# Data Preparation Dependencies

In [1]:
import json
import os
import torch
import pandas as pd

# Data Preparation

In [2]:
def create_datasets(default_directory: str, remove_bad_files: bool = True) -> dict:
    """
    Preprocess the data and labels into labelled pairs of consecutive paragraphs.

    The folder layout read is ``<default_directory>/<difficulty>/<split>/`` with
    difficulty in (easy, medium, hard) and split in (train, validation). Every
    ``<name>.txt`` file is paired with ``truth-<name>.json`` from the same folder.
    The text file holds one paragraph per line and the JSON object's ``"changes"``
    list must hold exactly one label per pair of neighbouring paragraphs.

    Args:
        default_directory: Default directory for both training and validation data
        remove_bad_files: When True (default, the historical behaviour) a document
            whose label count does not match its paragraph count is deleted from
            disk together with its truth file. When False the files are left in
            place. Such documents are skipped in both cases.

    Returns:
        A dictionary with the keys "train" and "validation"; each value is a list
        of ``[paragraph1, paragraph2, label]`` rows

    Raises:
        FileNotFoundError: If an expected folder or a truth file is missing
    """

    # Defining dictionary
    data_dict = {
        "train": [],
        "validation": []
    }

    # Iterate through folders
    for split in ["train", "validation"]:
        for difficulty in ["easy", "medium", "hard"]:
            current_directory = os.path.join(default_directory, difficulty, split)

            # Sorted so the row order is reproducible; os.listdir order is arbitrary
            for filename in sorted(os.listdir(current_directory)):
                # Only work on .txt files
                if not filename.endswith(".txt"):
                    continue

                text_path = os.path.join(current_directory, filename)
                # Swap only the trailing extension, not every ".txt" inside the name
                stem = filename[:-len(".txt")]
                label_path = os.path.join(current_directory, "truth-" + stem + ".json")

                # Text file: one paragraph per line
                with open(text_path, encoding="utf-8") as f:
                    paragraphs = f.read().strip().split("\n")
                # Labels: a missing "changes" entry is treated as "no labels"
                with open(label_path, encoding="utf-8") as f:
                    truth = json.load(f)
                labels = truth.get("changes") or []

                # Badly formatted documents must be skipped, otherwise the pairing
                # below would either run past the end of `labels` or attach labels
                # to the wrong paragraph pairs
                if len(labels) != len(paragraphs) - 1:
                    if remove_bad_files:
                        os.remove(text_path)
                        os.remove(label_path)
                        print("Removed bad formatted files")
                    else:
                        print("Skipped bad formatted files")
                    continue

                # Fill up data_dict with (previous paragraph, next paragraph, label)
                data_dict[split].extend(
                    [first, second, label]
                    for first, second, label in zip(paragraphs, paragraphs[1:], labels)
                )
    return data_dict


In [9]:
# Root folder that contains the easy/medium/hard sub-folders read by create_datasets
default_directory = "../pan24-multi-author-analysis"
data_dict = create_datasets(default_directory)

In [10]:
# Tabular view of the training pairs; preview the first five rows
df = pd.DataFrame(
    data=data_dict.get("train"),
    columns=["paragraph1", "paragraph2", "label"],
)
df.head(5)

Unnamed: 0,paragraph1,paragraph2,label
0,Just like who knew the Taliban would be just a...,Also the EU wasn’t born out of a “let’s preven...,1
1,Also the EU wasn’t born out of a “let’s preven...,Their money? What money did they have before t...,1
2,Their money? What money did they have before t...,Well...yeah actually. It kinda sucks to listen...,1
3,"In general, be courteous to others. Debate/dis...",Sigh. There are so many right wing monsters we...,1
4,Sigh. There are so many right wing monsters we...,r/politics is currently accepting new moderato...,1


In [11]:
# Persist both splits as CSV files in the working directory.
# The row index is written too (pandas default), as the first, unnamed column.
train_df = pd.DataFrame(
    data=data_dict.get("train"),
    columns=["paragraph1", "paragraph2", "label"],
)
validation_df = pd.DataFrame(
    data=data_dict.get("validation"),
    columns=["paragraph1", "paragraph2", "label"],
)

train_df.to_csv(path_or_buf="train.csv")
validation_df.to_csv(path_or_buf="validation.csv")

# Custom Datasets

In [5]:
# # Dependencies
# from torch.utils.data import Dataset, DataLoader

# # Creating custom dataset
# class CustomParagraphsDataset(Dataset):
#     """ 
#     Dataset containing pairs of data points 

#     Args:
#         Dataset

#     Return:
#         Sample dictionary
#     """

#     # Initialize the variables
#     def __init__(self, train=True, transform=None):
#         columns = ["paragraph1", "paragraph2", "label"]
#         self.dataframe = pd.DataFrame(data_dict.get("train"), columns=columns) if train else pd.DataFrame(data_dict.get("validation"), columns=columns)
#         self.transform = transform

#     # Function for returning size of dataset
#     def __len__(self):
#         return len(self.dataframe)

#     # Function for retrieving item
#     def __getitem__(self, idx):
#         # Convert index to list
#         if torch.is_tensor(idx):
#             idx = idx.tolist()
        
#         # Defining sample
#         sample = {
#             "paragraph1": self.dataframe.iloc[idx]["paragraph1"],
#             "paragraph2": self.dataframe.iloc[idx]["paragraph2"],
#             "label": self.dataframe.iloc[idx]["label"]
#         }

#         # Transform check
#         if self.transform:
#             sample = self.transform(sample)
        
#         return sample

# TEST START ------

In [6]:
# from datasets.dataset_dict import DatasetDict
# from datasets import Dataset

# columns = ["paragraph1", "paragraph2", "label"]

# d = {
#     "train": Dataset.from_dict({
#         "paragraph1": pd.DataFrame(data_dict.get("train"), columns=columns)["paragraph1"],
#         "paragraph2": pd.DataFrame(data_dict.get("train"), columns=columns)["paragraph2"],
#         "label": pd.DataFrame(data_dict.get("train"), columns=columns)["label"]
#     })
# }

# raw = DatasetDict(d)

In [7]:
# train_dataset = CustomParagraphsDataset()
# validation_dataset = CustomParagraphsDataset(train=False)

In [8]:
# Importing dataset creation dependencies
from datasets import DatasetDict, Dataset

# Defining column names
columns = ["paragraph1", "paragraph2", "label"]

# Creating raw dataset: each split's DataFrame is built once and handed to
# Dataset.from_dict as a {column name: Series} mapping
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict(
        pd.DataFrame(data_dict.get(split_name), columns=columns).to_dict(orient="series")
    )
    for split_name in ("train", "validation")
})

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Display the splits with their feature columns and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 51962
    })
    validation: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 11194
    })
})

# Tokenizing and Encoder

In [10]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Checkpoint name whose tokenizer is loaded
reference = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=reference)

In [11]:
def tokenize_function(sample):
    """
    Tokenize the two paragraph columns of a sample as a text pair.

    Args:
        sample: Mapping that provides the "paragraph1" and "paragraph2" entries

    Returns:
        The result of calling the module-level tokenizer with truncation enabled
    """
    first_paragraph = sample["paragraph1"]
    second_paragraph = sample["paragraph2"]
    return tokenizer(first_paragraph, second_paragraph, truncation=True)

In [12]:
# Collator that pads each batch using the tokenizer
collator = DataCollatorWithPadding(tokenizer)
# Tokenize every split, passing batches of samples to tokenize_function
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 51962/51962 [00:04<00:00, 12713.07 examples/s]
Map: 100%|██████████| 11194/11194 [00:00<00:00, 13510.84 examples/s]


In [13]:
# Testing mapping: list the column names of every split after tokenization
print(tokenized_datasets.column_names)

{'train': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'attention_mask'], 'validation': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'attention_mask']}


In [14]:
# ONLY FOR WHEN NOT USING THE TRAINER API
#
# Post process removal: drop the raw text columns, rename "label" to "labels" and
# return torch tensors. Each step is guarded so that running this cell a second
# time is a no-op instead of failing on columns that are already gone.
for key in tokenized_datasets.keys():
    split_dataset = tokenized_datasets[key]

    text_columns = [
        name for name in ("paragraph1", "paragraph2")
        if name in split_dataset.column_names
    ]
    if text_columns:
        split_dataset = split_dataset.remove_columns(text_columns)

    if "label" in split_dataset.column_names:
        split_dataset = split_dataset.rename_column("label", "labels")

    tokenized_datasets[key] = split_dataset.with_format("torch")

'\nONLY FOR WHEN NOT USING THE TRAINER API\n'

In [15]:
# Inspect the training split.
# NOTE(review): the stored output still lists paragraph1/paragraph2/label, which
# suggests it was captured before the post-processing cell was run - confirm.
tokenized_datasets["train"]

Dataset({
    features: ['paragraph1', 'paragraph2', 'label', 'input_ids', 'attention_mask'],
    num_rows: 51962
})

# Trainer

In [16]:
from transformers import AutoModelForSequenceClassification

def setup_model(MODEL_NAME: str, num_labels: int):
    """
    Load a checkpoint through AutoModelForSequenceClassification.

    Args:
        MODEL_NAME: Passed to from_pretrained as pretrained_model_name_or_path
        num_labels: Passed to from_pretrained as num_labels

    Returns:
        The object returned by AutoModelForSequenceClassification.from_pretrained
    """
    load_kwargs = {
        "pretrained_model_name_or_path": MODEL_NAME,
        "num_labels": num_labels,
    }
    return AutoModelForSequenceClassification.from_pretrained(**load_kwargs)

In [17]:
MODEL_NAME = "roberta-base"
# One output class per distinct value found in the label column of `df`
num_labels = len(df["label"].unique())
model = setup_model(MODEL_NAME, num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Arguments for training
from transformers import TrainingArguments

# Definition of training arguments
args = TrainingArguments(
    output_dir="trainer",
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    # Was 2e-3, roughly 100x above the range normally used to fine-tune a
    # pretrained transformer (1e-5 to 5e-5; the library default is 5e-5). A step
    # size that large typically destroys the pretrained weights.
    learning_rate=2e-5,
)

In [27]:
# Loading Trainer dependency
from transformers import Trainer

def setup_trainer():
    """
    Assemble a Trainer from the notebook-level objects.

    Reads the module-level names model, args, tokenized_datasets, collator and
    tokenizer; the "train" split is used for training and the "validation"
    split for evaluation.

    Returns:
        The constructed Trainer instance
    """
    trainer_kwargs = dict(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=collator,
        tokenizer=tokenizer,
    )
    return Trainer(**trainer_kwargs)

In [28]:
# Show which device the training arguments resolved to
args.device

device(type='mps')

In [29]:
# Setting up and training trainer: build the Trainer from the notebook-level
# objects, then start the training run.
# NOTE(review): the stored output ends in a KeyboardInterrupt, so the captured
# run appears to have been stopped manually before finishing.
trainer = setup_trainer()
trainer.train()

  0%|          | 1/8120 [03:41<500:26:29, 221.90s/it]
  0%|          | 29/32480 [00:46<7:24:28,  1.22it/s] 

KeyboardInterrupt: 