# PAN Data Loader

In this notebook we load the PAN AV data, split the relevant columns, and save the result as two separate data frames.

In [26]:
import json
import os

import pandas as pd

In [4]:
def read_jsonl(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-empty line of the file is parsed as one JSON document and
    becomes one row of the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.jsonl`` file to read.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line; an empty frame when the file has no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-empty line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8 so non-ASCII text loads the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # record and would otherwise make json.loads fail.
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """Write a DataFrame to disk as JSON Lines, one row per line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to serialise; column names become the JSON keys.
    output_file_path : str or path-like
        Destination file. It is created, or overwritten if it already exists.

    Returns
    -------
    None
    """
    # `to_dict(orient="records")` converts each column with its own dtype.
    # `iterrows()` packs every row into a single Series, which upcasts integer
    # values to floats whenever the row also contains a float column.
    records = data.to_dict(orient='records')

    # json.dump escapes non-ASCII characters by default, so the output is pure
    # ASCII; the explicit encoding just removes any dependence on the locale.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for record in records:
            json.dump(record, file)
            file.write('\n')

In [28]:
# Root folder of the PAN datasets, relative to this notebook. Change this one
# value if your copy of the data lives elsewhere (e.g. on a mounted drive).
base_loc = "../../../../datasets/PAN/"

# Inputs read by this notebook: the raw text pairs and their ground truth.
file_loc = f"{base_loc}PAN 21/pan21-raw.jsonl"
truth_loc = f"{base_loc}PAN 21/pan21-truth.jsonl"

# Outputs written by this notebook: one JSONL file per side of each pair.
known_loc = f"{base_loc}PAN 21/pan21-known.jsonl"
unknown_loc = f"{base_loc}PAN 21/pan21-unknown.jsonl"

# Folder for the optional per-text .txt samples.
txt_samples = f"{base_loc}PAN 21/sample_texts/"

In [10]:
# Load the raw PAN 21 pairs file into a DataFrame (one row per JSON line).
df = read_jsonl(file_loc)

In [11]:
# Preview the raw frame: each row has an `id`, a `fandoms` list and a `pair` list of texts.
df

Unnamed: 0,id,fandoms,pair
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,"[Death Note, Thor]","[talk because they hadn""t been exposed to comm..."
1,b9326101-6352-56dd-9d1b-1f41466897b7,"[Samurai Deeper Kyo, Yu Yu Hakusho]","[Zazuki nodded his head and got to his feet, k..."
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,"[West Side Story, Martin Mystery]","[""Oh we did lots of special things. On Christm..."
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,"[Get Backers, Yu Yu Hakusho]","[""Hey now, at least Shido brings home some mon..."
4,cb4054b1-d422-58d6-a137-dcfc70100df6,"[My Hero Academia/僕のヒーローアカデミア, Skins]","[It was a mere five minutes"" walk from third y..."
...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,"[Mentalist, Thor]",[They zipped through the formalities before fl...
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,"[Janet Evanovich, Gallagher Girls]","[""You""re taking me to a spa?"" I asked eyeing h..."
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,"[Skulduggery Pleasant series, Yu Yu Hakusho]",[Vanessa: Hahaa! Shudder: Still here. This is ...
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,"[Jackie Chan Adventures, Justice League]","[bountiful breasts pressed against his chest, ..."


In [12]:
# Load the ground-truth file into a DataFrame (one row per JSON line).
truth = read_jsonl(truth_loc)

In [13]:
# Preview the ground truth: each row has an `id`, a `same` flag and an `authors` list.
truth

Unnamed: 0,id,same,authors
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,True,"[2072507, 2072507]"
1,b9326101-6352-56dd-9d1b-1f41466897b7,True,"[1404943, 1404943]"
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,False,"[2406271, 929947]"
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,True,"[189771, 189771]"
4,cb4054b1-d422-58d6-a137-dcfc70100df6,True,"[965626, 965626]"
...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,True,"[2264834, 2264834]"
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,False,"[1144417, 8582246]"
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,False,"[2733573, 1421642]"
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,True,"[787585, 787585]"


In [15]:
# Attach the ground-truth columns to each pair by matching on `id`. The left
# join keeps every row of `df`, even one without a matching truth record.
combined_df = df.merge(truth, on='id', how='left')

In [20]:
def pan_corpus_splitter(df):
    """Split a frame of paired records into one frame per side of the pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column plus 'fandoms', 'pair' and 'authors'
        columns whose cells are all two-element lists.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two frames with columns 'id', 'author', 'fandom' and 'text'. The first
        is built from element 0 of every list cell, the second from element 1.

    Raises
    ------
    ValueError
        If a required column is missing, or a cell of 'fandoms', 'pair' or
        'authors' is not a list of exactly two elements.
    """
    paired_columns = ('fandoms', 'pair', 'authors')

    # Guard: every required column has to be present.
    if any(name not in df.columns for name in ('id',) + paired_columns):
        raise ValueError("DataFrame must contain 'id', 'fandoms', 'authors', and 'pair' columns")

    def is_two_item_list(cell):
        return isinstance(cell, list) and len(cell) == 2

    # Guard: every cell of the paired columns has to be a two-element list.
    for name in paired_columns:
        if not all(df[name].apply(is_two_item_list)):
            raise ValueError("'fandoms', 'authors', and 'pair' columns must contain lists with two elements each")

    def take_side(position):
        # Build one output frame from the `position`-th element of each list cell.
        side = df[['id']].copy()
        side['author'] = df['authors'].apply(lambda cell: cell[position])
        side['fandom'] = df['fandoms'].apply(lambda cell: cell[position])
        side['text'] = df['pair'].apply(lambda cell: cell[position])
        return side

    return take_side(0), take_side(1)

In [21]:
# Split every pair into two frames: `x` takes the first element of each
# author/fandom/text list, `y` takes the second.
x, y = pan_corpus_splitter(combined_df)

In [22]:
# Preview the first side of each pair.
x

Unnamed: 0,id,author,fandom,text
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,2072507,Death Note,"talk because they hadn""t been exposed to commu..."
1,b9326101-6352-56dd-9d1b-1f41466897b7,1404943,Samurai Deeper Kyo,"Zazuki nodded his head and got to his feet, kn..."
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,2406271,West Side Story,"""Oh we did lots of special things. On Christma..."
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,189771,Get Backers,"""Hey now, at least Shido brings home some mone..."
4,cb4054b1-d422-58d6-a137-dcfc70100df6,965626,My Hero Academia/僕のヒーローアカデミア,"It was a mere five minutes"" walk from third ye..."
...,...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,2264834,Mentalist,They zipped through the formalities before fli...
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,1144417,Janet Evanovich,"""You""re taking me to a spa?"" I asked eyeing hi..."
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,2733573,Skulduggery Pleasant series,Vanessa: Hahaa! Shudder: Still here. This is b...
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,787585,Jackie Chan Adventures,"bountiful breasts pressed against his chest, i..."


In [23]:
# Preview the second side of each pair.
y

Unnamed: 0,id,author,fandom,text
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,2072507,Thor,"Your brother, Loki wants to say, and instantly..."
1,b9326101-6352-56dd-9d1b-1f41466897b7,1404943,Yu Yu Hakusho,"""Her name is Sayaka. She""s with Botan."" He ans..."
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,929947,Martin Mystery,"""Get it out! Get it out!' Martin was trying ha..."
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,189771,Yu Yu Hakusho,"Ookami Sarah Yay, and that is it so far. The n..."
4,cb4054b1-d422-58d6-a137-dcfc70100df6,965626,Skins,"""Go on, Liv. I wanna take a picture of you thr..."
...,...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,2264834,Thor,"He didn""t prevail. His brother was now in our ..."
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,8582246,Gallagher Girls,"You""re the fear, I don""t care""Cause I""ve never..."
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,1421642,Yu Yu Hakusho,Authoress starts dancing around happily. Darkf...
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,787585,Justice League,"""Have you ever heard of the Turing test?"" ""No ..."


In [24]:
# Save the first side of each pair as the "known" JSONL file.
write_jsonl(x, known_loc)

In [25]:
# Save the second side of each pair as the "unknown" JSONL file.
write_jsonl(y, unknown_loc)

In [27]:
def save_text_samples(combined_df, folder_location, sample_size=100, random_state=42):
    """Write a random sample of texts to individual ``.txt`` files.

    Each sampled row is written to ``<folder_location>/<id>.txt`` as UTF-8.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Frame to sample from. Despite the name, it must contain an 'id' column
        (used as the file name) and a 'text' column (used as the file content).
    folder_location : str or path-like
        Output folder; created, including parents, if it does not exist.
    sample_size : int, default 100
        Number of rows to sample. Capped at the number of rows available, so
        a frame smaller than ``sample_size`` is written out in full.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the selection is reproducible.

    Returns
    -------
    None
    """
    # Ensure the folder exists
    os.makedirs(folder_location, exist_ok=True)

    # Sampling without replacement raises if more rows are requested than
    # exist, so cap the request at the size of the frame.
    n_rows = sample_size if sample_size is None else min(sample_size, len(combined_df))
    sample_df = combined_df.sample(n=n_rows, random_state=random_state)

    # Save each sampled text to its own file, named after the row's id
    for text_id, text in zip(sample_df['id'], sample_df['text']):
        file_path = os.path.join(folder_location, f"{text_id}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text)

In [29]:
# Optional: uncomment to write a sample of the `x` texts to `txt_samples` as .txt files.
# save_text_samples(x, txt_samples)