# Data Preperation Dependencies

In [1]:
import json
import os

# Data preperation

In [37]:
def create_datasets(default_directory: str):
    """
    Preprocess the data and labels to provide text pair

    Args:
        default_directory: Default directory for both training and validation data
    
    Returns:
        A dictionary contained processed training and validation sets
    """

    # Defining dictionary
    data_dict = {
        "train": [],
        "validation": []
    }

    # Iterate through folders
    for split in ["train", "validation"]:
        for difficulty in ["easy", "medium", "hard"]:
            # Difficulty dict
            difficulty_dict = os.path.join(default_directory, difficulty)
            # Set current directory [train, validation]
            current_directory = os.path.join(difficulty_dict, split)
            
            # Iterate over all filenames
            for filename in os.listdir(current_directory):
                # Only work on .txt files
                if filename.endswith(".txt"):
                    text_path = os.path.join(current_directory, filename)
                    label_path = os.path.join(current_directory, "truth-" + filename.replace(".txt", ".json"))

                    # Open an process the files
                    # Text files
                    with open(text_path) as f:
                        text = f.read()
                    paragraphs = text.strip().split("\n")
                    # Labels
                    with open(label_path) as f:
                        object = json.load(f)
                    labels = object.get("changes")
                    
                    # print(paragraphs)
                    # print(labels)

                    # Error handling by removing badly formatted files
                    if len(labels) != len(paragraphs)-1:
                        os.remove(text_path)
                        os.remove(label_path)
                        print("Removed bad formatted files")

                    # Split each paragraph into tokens
                    processed_paragraphs = [paragraph.split() for paragraph in paragraphs]
                    
                    # Fill up data_dict
                    for i in range(1, len(paragraphs)):
                        #print((paragraphs[i-1], paragraphs[i], labels[i-1]))
                        data_dict[split].append({"Sent1":paragraphs[i-1], "Sent2":paragraphs[i], "Label":labels[i-1]})
    return data_dict


In [38]:
default_directory = "../pan24-multi-author-analysis"
data_dict = create_datasets(default_directory=default_directory)

In [45]:
print(len(data_dict.get("train")))
data_dict.get("train")[6]

51962


{'Sent1': "Only after he does as much as he can to burn the GOP to the ground. I'd much rather see a harsh primary and RNC electorate count with him pushing election lies about their primary and then have him narrowly not get the ticket and then have him start a MAGA party with some down ballot MAGA people who he picked who also lost their GOP primaries who get like 10% to 20% of the vote leading to the GOP losing states that are normally lighter red like NC, TX, great lakes states, etc...",
 'Sent2': "I hate the idea. He almost won in 2020 and all the conservatives will fall in line and vote for him again in 2024 regardless of anything they say -- I would expect him to get even more votes in 2024 than he got in 2020. It's too dangerous for him to get anywhere near the presidency again. I think he will lose, but I don't want him to have the chance.",
 'Label': 1}

In [40]:
import pandas as pd
train_df = pd.DataFrame(data_dict.get("train"), columns=["Sent1", "Sent2", "Label"])

In [41]:
train_df

Unnamed: 0,Sent1,Sent2,Label
0,Just like who knew the Taliban would be just a...,Also the EU wasn’t born out of a “let’s preven...,1
1,Also the EU wasn’t born out of a “let’s preven...,Their money? What money did they have before t...,1
2,Their money? What money did they have before t...,Well...yeah actually. It kinda sucks to listen...,1
3,"In general, be courteous to others. Debate/dis...",Sigh. There are so many right wing monsters we...,1
4,Sigh. There are so many right wing monsters we...,r/politics is currently accepting new moderato...,1
...,...,...,...
51957,Which part of your comment or the one you’re r...,"Okay, you’re a company and you raise your pric...",0
51958,"Okay, you’re a company and you raise your pric...",It's greed. During the pandemic everyone rose ...,1
51959,It's greed. During the pandemic everyone rose ...,Seriously! When banks give interest free loans...,0
51960,Seriously! When banks give interest free loans...,It’s defo a ‘goody people will be desperate to...,1


In [42]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [43]:
def tokenize_function(dataset):
    return tokenizer(
        df_split["Sent1"],
        df_split["Sent2"],
        truncation=True,
        padding=True
    )

In [25]:
tokenized_training_dataset = train_df.map(tokenize_function(df_split=train_df))

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

# Model preperation

In [106]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [107]:
def create_model(MODEL_NAME: str):
    """
    Creates a RoBERTa model for the writing-style classification

    Args:
        model_name: Name of pre-trained model
    Returns:
        Model and tokenizer
    """
    model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
    return model, tokenizer

In [96]:
MODEL_NAME = "roberta-base"
model, tokenizer = create_model(MODEL_NAME=MODEL_NAME)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [100]:
train_encodings = tokenizer(
    data_dict.get("train")[0][0],
    truncuation=True,
    padding=True
)
train_encodings

Keyword arguments {'truncuation': True} not recognized.


{'input_ids': [0, 6785, 101, 54, 1467, 5, 7597, 74, 28, 95, 25, 1099, 71, 5, 382, 2468, 66, 9, 4035, 25, 137, 4, 1801, 10, 319, 9, 44014, 56, 57, 1765, 13, 5, 382, 7, 989, 13, 107, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Saving hyperparameters

In [81]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="roberta_trainer",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=1e-3
)

In [94]:
train_model()

True
