# Dependencies

In [1]:
import json
import os
import torch
import pandas as pd

# Dataset Creation

In [2]:
def create_datasets(default_directory: str, remove_malformed: bool = True):
    """
    Build labelled pairs of consecutive paragraphs for the train and validation splits.

    For every ``<difficulty>/<split>/<name>.txt`` file, the text is split into
    paragraphs on newlines and paired with the ``"changes"`` list read from the
    sibling ``truth-<name>.json`` file. Each pair of consecutive paragraphs
    becomes one record ``[paragraph_i, paragraph_i+1, label_i, difficulty]``.

    Args:
        default_directory: Root directory that contains the ``easy``, ``medium``
            and ``hard`` folders, each with ``train`` and ``validation`` subfolders.
        remove_malformed: If True (default, the historical behaviour), a document
            whose number of labels is not ``len(paragraphs) - 1`` is deleted from
            disk together with its truth file. If False, such a document is
            only skipped and the files are left in place.

    Returns:
        A dictionary with the keys ``"train"`` and ``"validation"``, each mapping
        to a list of ``[paragraph1, paragraph2, label, difficulty]`` records.

    Raises:
        FileNotFoundError: If a ``<difficulty>/<split>`` directory or the truth
            file of a ``.txt`` document does not exist.
    """

    # One list of records per split
    data_dict = {
        "train": [],
        "validation": []
    }

    # Iterate through folders
    for split in ["train", "validation"]:
        for difficulty in ["easy", "medium", "hard"]:
            # Layout on disk is <root>/<difficulty>/<split>
            current_directory = os.path.join(default_directory, difficulty, split)

            # os.listdir returns entries in arbitrary order; sorting keeps the
            # record order (and any seeded sampling done later) reproducible.
            for filename in sorted(os.listdir(current_directory)):
                # Only work on .txt files
                if not filename.endswith(".txt"):
                    continue

                text_path = os.path.join(current_directory, filename)
                # Swap only the extension; str.replace would also rewrite a
                # ".txt" that happens to occur inside the base name.
                label_name = "truth-" + os.path.splitext(filename)[0] + ".json"
                label_path = os.path.join(current_directory, label_name)

                # Text file: one paragraph per line. The encoding is explicit so
                # the result does not depend on the platform's default locale.
                with open(text_path, encoding="utf-8") as f:
                    paragraphs = f.read().strip().split("\n")

                # Labels: one entry per boundary between consecutive paragraphs
                with open(label_path, encoding="utf-8") as f:
                    truth = json.load(f)
                labels = truth.get("changes")

                # A missing "changes" entry or a count mismatch marks the
                # document as badly formatted.
                if labels is None or len(labels) != len(paragraphs) - 1:
                    if remove_malformed:
                        os.remove(text_path)
                        os.remove(label_path)
                        print("Removed bad formatted files")
                    else:
                        print("Skipped bad formatted files")
                    continue

                # Fill up data_dict with consecutive paragraph pairs
                for i in range(1, len(paragraphs)):
                    data_dict[split].append(
                        [paragraphs[i - 1], paragraphs[i], labels[i - 1], difficulty]
                    )
    return data_dict


In [3]:
# Root folder of the dataset, relative to this notebook
default_directory = "../pan24-multi-author-analysis"
data = create_datasets(default_directory=default_directory)

# Both splits share the same record layout produced by create_datasets
pair_columns = ["paragraph1", "paragraph2", "label", "difficulty"]
train_df = pd.DataFrame(data["train"], columns=pair_columns)
validation_df = pd.DataFrame(data["validation"], columns=pair_columns)

# Report the number of paragraph pairs per split
for frame in (train_df, validation_df):
    print(len(frame))

Removed bad formatted files
Removed bad formatted files
Removed bad formatted files
Removed bad formatted files
Removed bad formatted files
Removed bad formatted files
51962
11194


In [4]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing the training split.
os.makedirs('../FINAL-DATA', exist_ok=True)
train_df.to_csv('../FINAL-DATA/train.csv', index=False)

In [5]:
def rebalance_data(df, difficulty_col='difficulty', label_col='label', random_state=42):
    """
    Downsample every label to the size of the rarest label, per difficulty.

    The frame is processed one difficulty level at a time. Within a level,
    each label value is randomly downsampled to the count of the least
    frequent label of that level, so all labels end up equally represented.

    Args:
        df: DataFrame containing at least `difficulty_col` and `label_col`.
        difficulty_col: Name of the column holding the difficulty level.
        label_col: Name of the column holding the label to balance.
        random_state: Seed passed to the sampler so repeated runs return the
            same rows. Pass None for non-reproducible sampling.

    Returns:
        A new DataFrame with all original columns, the balanced rows of every
        difficulty concatenated, and a fresh 0..n-1 index. An input without
        any rows yields an empty frame with the same columns.
    """
    dfs_balanced = []

    for difficulty in df[difficulty_col].unique():
        df_difficulty = df[df[difficulty_col] == difficulty]
        label_count = df_difficulty[label_col].value_counts()
        if label_count.empty:
            # Nothing to balance for this level (no rows with a usable label)
            continue
        min_count = int(label_count.min())
        # GroupBy.sample keeps every column, including the label column, which
        # groupby(...).apply(...) no longer guarantees on recent pandas versions.
        df_balanced = df_difficulty.groupby(label_col).sample(
            n=min_count, random_state=random_state
        )
        dfs_balanced.append(df_balanced)

    if not dfs_balanced:
        # pd.concat raises on an empty list; return an empty frame instead
        return df.iloc[0:0].reset_index(drop=True)

    balanced_df = pd.concat(dfs_balanced).reset_index(drop=True)
    return balanced_df

In [6]:
def create_test_set(df, n_samples_per_difficulty=1865):
    """
    Split off a held-out test set with a fixed number of rows per difficulty.

    For every distinct value in the ``difficulty`` column,
    `n_samples_per_difficulty` rows are drawn with a fixed seed (42). The
    sampled rows form the test set and are removed from the returned
    remainder, so the two frames never share a row.

    Args:
        df: DataFrame with a ``difficulty`` column.
        n_samples_per_difficulty: Number of rows to draw for each difficulty.

    Returns:
        A tuple ``(remaining_df, test_df)``, both with a fresh 0..n-1 index.

    Raises:
        ValueError: If a difficulty has fewer rows than
            `n_samples_per_difficulty` (raised by ``DataFrame.sample``).
    """
    # Use a unique positional index so that index labels identify exactly one
    # row each, whatever index the caller's frame carried.
    df = df.reset_index(drop=True)

    test_samples = [
        df[df['difficulty'] == difficulty].sample(n=n_samples_per_difficulty, random_state=42)
        for difficulty in df['difficulty'].unique()
    ]

    if not test_samples:
        # No difficulty levels at all: nothing to sample, and pd.concat would raise
        return df, df.iloc[0:0]

    sampled = pd.concat(test_samples)
    # Drop by the ORIGINAL labels of the sampled rows. Resetting the index of
    # the test frame first would drop the first len(test) rows instead and
    # leave the sampled rows in the remainder.
    remaining_df = df.drop(index=sampled.index).reset_index(drop=True)
    test_df = sampled.reset_index(drop=True)

    return remaining_df, test_df

# Carve the test set out of the validation split. Note that this rebinds
# validation_df, so running the cell again splits the already reduced frame.
validation_df, test_df = create_test_set(validation_df)

# Report the size of each resulting split
for frame in (validation_df, test_df):
    print(len(frame))

5599
5595


In [7]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing the validation and test splits.
os.makedirs('../FINAL-DATA', exist_ok=True)
validation_df.to_csv('../FINAL-DATA/validation.csv', index=False)
test_df.to_csv('../FINAL-DATA/test.csv', index=False)

In [8]:
# balanced_train_df = rebalance_data(df=train_df)
# balanced_validation_df = rebalance_data(df=validation_df)
# print(len(balanced_train_df))
# print(len(balanced_validation_df))

In [9]:
# balanced_train_df.to_csv('../FINAL-DATA/train_balanced.csv', index=False)
# balanced_validation_df.to_csv('../FINAL-DATA/validation_balanced.csv', index=False)
# test_df.to_csv('../FINAL-DATA/test.csv', index=False)