In [135]:
import pandas as pd
import numpy as np

In [136]:
# Execute the shared helper script so that its definitions become available here.
# Later cells call read_jsonl_file and save_as_jsonl, neither of which is defined
# in this notebook — presumably this script provides them (TODO confirm).
%run "../read_and_write_docs.py"

In [137]:
# Folder that holds the PAN 21 files (relative path); every file name below is appended to it.
base_loc = "../../../../datasets/PAN/PAN 21/"

# Preprocessed known/unknown document files that are read in the next cell.
known_preprocessed = base_loc + "pan21-known-preprocessed.jsonl"
unknown_preprocessed = base_loc + "pan21-unknown-preprocessed.jsonl"

# Location of the reference file.
reference_loc = base_loc + "pan21-reference.jsonl"

In [138]:
# Load the two preprocessed JSONL files. read_jsonl_file is not defined in this
# notebook; the functions below treat its result as a pandas DataFrame with
# 'author', 'fandom' and 'text' columns.
known_df = read_jsonl_file(known_preprocessed)
unknown_df = read_jsonl_file(unknown_preprocessed)

In [139]:
def combine_deduplicate_filter_and_sample(known_df, unknown_df, sample_size=3):
    """Pool two document frames, de-duplicate them and keep a fixed-size sample per author.

    The two frames are concatenated, rows that repeat the same
    ('author', 'fandom', 'text') triple are dropped, and authors are split
    into two groups by how many distinct documents they have left:

    * authors with at least ``sample_size`` documents contribute exactly
      ``sample_size`` randomly chosen rows to the first return value;
    * all rows of the other authors are returned untouched as the second value.

    Row counts are printed after every stage.

    Parameters
    ----------
    known_df, unknown_df : pandas.DataFrame
        Frames that both contain 'author', 'fandom' and 'text' columns.
    sample_size : int, default 3
        Number of documents kept per author. It is also the minimum number of
        documents an author needs in order to be kept at all.

    Returns
    -------
    sampled_df : pandas.DataFrame
        ``sample_size`` rows per retained author, grouped by author in sorted
        order, with a fresh 0..n-1 index.
    authors_below_minimum : pandas.DataFrame
        De-duplicated rows of authors with fewer than ``sample_size`` documents
        (original concatenation index preserved).
    """
    # Step 1: Report the size of both inputs
    print("Number of rows in known_df:", known_df.shape[0])
    print("Number of rows in unknown_df:", unknown_df.shape[0])

    # Step 2: Append the datasets
    combined_df = pd.concat([known_df, unknown_df], ignore_index=True)
    print("Number of rows after appending:", combined_df.shape[0])

    # Step 3: Drop duplicates based on 'author', 'fandom', and 'text' columns
    deduplicated_df = combined_df.drop_duplicates(subset=['author', 'fandom', 'text'])
    print("Number of rows after dropping duplicates:", deduplicated_df.shape[0])

    # Step 4: Split authors by document count. The threshold follows sample_size
    # (it used to be hard-coded to 3), so every retained author is guaranteed to
    # have enough rows to sample from. Rows with a missing author get a NaN count
    # and therefore land in neither frame.
    docs_per_author = deduplicated_df['author'].map(deduplicated_df['author'].value_counts())
    authors_below_minimum = deduplicated_df[docs_per_author < sample_size]
    filtered_df = deduplicated_df[docs_per_author >= sample_size]
    print(f"Number of rows after filtering authors with less than {sample_size} documents:", filtered_df.shape[0])

    # Step 5: Sample each author separately. Every group is sampled with the same
    # fixed seed, which reproduces the selection of the previous groupby.apply call
    # without relying on its deprecated include_groups=True behaviour.
    sampled_groups = [
        group.sample(n=sample_size, random_state=1)
        for _, group in filtered_df.groupby('author')
    ]
    if sampled_groups:
        sampled_df = pd.concat(sampled_groups, ignore_index=True)
    else:
        # No author qualified: return an empty frame that still has the columns.
        sampled_df = filtered_df.iloc[0:0].reset_index(drop=True)

    # Step 6: Report the final size
    print("Number of rows after sampling:", sampled_df.shape[0])

    return sampled_df, authors_below_minimum

In [140]:
# Build the per-author sample with the default sample_size of 3.
# NOTE: `combined_df` here is the de-duplicated, sampled frame returned by the
# function, not the raw concatenation of the two inputs. `potential_impostors`
# holds the rows of authors that had too few documents; it is saved in the last cell.
combined_df, potential_impostors = combine_deduplicate_filter_and_sample(known_df, unknown_df)

Number of rows in known_df: 19999
Number of rows in unknown_df: 19999
Number of rows after appending: 39998
Number of rows after dropping duplicates: 38949
Number of rows after filtering authors with less than 3 documents: 5739
Number of rows after sampling: 3027


  sampled_df = filtered_df.groupby('author').apply(lambda x: x.sample(n=sample_size, random_state=1), include_groups=True)


In [141]:
# Display the sampled frame returned by combine_deduplicate_filter_and_sample.
combined_df

Unnamed: 0,id,author,fandom,text
0,9ebf6a04-0207-58fd-b13f-0adaf8d71045,1000199,Soul Eater,"Dampness permeated everything - her skin , her..."
1,bcf8cf45-ed0c-52b6-8044-81180b7c067f,1000199,Full Moon wo Sagashite,Thanks so much to everyone who's reviewed so f...
2,31e114ab-ad16-5618-a2d2-77e341b1833f,1000199,Full Moon wo Sagashite,'She's doing well . If you're wondering . Your...
3,778ba107-2595-5245-8894-09fb9897ed48,1003238,Yu Yu Hakusho,'Don't you hello Koto me' She growled on the o...
4,778ba107-2595-5245-8894-09fb9897ed48,1003238,Karin,'Ok . Tomorow are you going to help Oni - chan...
...,...,...,...,...
3022,cb04d325-b5d5-5bb2-9f69-062213397e3e,997768,Saving Private Ryan,"7 June 1944 Dear Bonnie , Hey there , little s..."
3023,cb04d325-b5d5-5bb2-9f69-062213397e3e,997768,Troy,me . Where are Hector and Andromache ? And Sca...
3024,598d77fe-d337-505a-b969-8d1d27ac7934,9991,Spider-Man,"'Oh , that's Mary - Jane .' 'Is she your girlf..."
3025,235a6dc1-f2d0-55ca-866a-2c1180066563,9991,Spider-Man,never wanted to see hurt . Peter Parker seemed...


In [142]:
def split_authors_evenly(combined_df, train_prop=0.6):
    """Split a frame into train and test parts by author, using an even author count in each.

    The unique authors are shuffled with a fixed seed and the first block goes
    to training, the next block to testing. Both block sizes are forced to be
    even numbers whose sum does not exceed the number of authors, so a few
    authors may end up in neither part.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Frame with an 'author' column.
    train_prop : float, default 0.6
        Target share of authors assigned to the training part (0 to 1).

    Returns
    -------
    train_df, test_df : pandas.DataFrame
        All rows of the selected authors, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_prop`` is outside the interval [0, 1].
    """
    if not 0 <= train_prop <= 1:
        raise ValueError(f"train_prop must be between 0 and 1, got {train_prop}.")

    def _to_even(target):
        # Round to the nearest integer; an odd result moves up if rounding went up,
        # otherwise down. The "otherwise" branch also covers a target that is
        # exactly an odd integer, which previously slipped through unchanged.
        rounded = round(target)
        if rounded % 2 != 0:
            rounded += 1 if rounded > target else -1
        return rounded

    # Step 1: Get a unique list of authors (copied so the shuffle below works on
    # a private array)
    authors = np.array(combined_df['author'].unique())

    # Step 2: Calculate the target number of authors for each part
    total_authors = len(authors)
    train_size = total_authors * train_prop
    test_size = total_authors - train_size

    # Step 3: Round both targets to even counts
    rounded_train_size = _to_even(train_size)
    rounded_test_size = _to_even(test_size)

    # Step 4: If rounding up pushed the total past the available authors, shrink
    # both parts by one pair, never going below zero
    if (rounded_train_size + rounded_test_size) > total_authors:
        rounded_train_size = max(rounded_train_size - 2, 0)
        rounded_test_size = max(rounded_test_size - 2, 0)

    print(f"""Total Authors: {total_authors}\nTrain Authors: {rounded_train_size}\nTest Authors: {rounded_test_size}\nTotal Sample: {rounded_train_size + rounded_test_size}""")

    # Step 5: Shuffle the authors. This seeds NumPy's global generator, so any
    # later code that relies on the global state is affected as well.
    np.random.seed(1)  # For reproducibility
    np.random.shuffle(authors)

    # Step 6: Split the authors into training and testing sets
    train_authors = authors[:rounded_train_size]
    test_authors = authors[rounded_train_size:rounded_train_size + rounded_test_size]

    # Step 7: Create two DataFrames based on the training and testing authors
    train_df = combined_df[combined_df['author'].isin(train_authors)].reset_index(drop=True)
    test_df = combined_df[combined_df['author'].isin(test_authors)].reset_index(drop=True)

    # Print the number of rows in each DataFrame
    print("Number of rows in training DataFrame:", train_df.shape[0])
    print("Number of rows in testing DataFrame:", test_df.shape[0])

    return train_df, test_df

In [143]:
# Split the sampled frame by author with the default train_prop of 0.6; an author's
# rows all go to the same part.
train_df, test_df = split_authors_evenly(combined_df)

Total Authors: 1009
Train Authors: 604
Test Authors: 404
Total Sample: 1008
Number of rows in training DataFrame: 1812
Number of rows in testing DataFrame: 1212


In [144]:
# Display the training part produced by split_authors_evenly.
train_df

Unnamed: 0,id,author,fandom,text
0,9ebf6a04-0207-58fd-b13f-0adaf8d71045,1000199,Soul Eater,"Dampness permeated everything - her skin , her..."
1,bcf8cf45-ed0c-52b6-8044-81180b7c067f,1000199,Full Moon wo Sagashite,Thanks so much to everyone who's reviewed so f...
2,31e114ab-ad16-5618-a2d2-77e341b1833f,1000199,Full Moon wo Sagashite,'She's doing well . If you're wondering . Your...
3,2db8a01a-0f42-5445-87b7-cf68bdb3b2b8,1003327,Nightwalker,Yayoi had hurriedly pushed them into a conceal...
4,7980ca32-6762-5352-90f1-051bf7833fcb,1003327,Carpathian Series,His feral visitor gripped his empty glass tigh...
...,...,...,...,...
1807,41e1f416-4998-55d3-8b14-396fa1ec960a,995163,Zoey 101,"and gave her a deep , passionate kiss . James ..."
1808,41e1f416-4998-55d3-8b14-396fa1ec960a,995163,Sonny with a Chance,"3:18 a . m ., ' a doctor mumbled walking away ..."
1809,9d06d94f-1c69-54e6-bad0-aaac0ad126e4,996460,Sonny with a Chance,'mam .' As soon as Ms . Monroe's footsteps cou...
1810,7e87eeea-ed1f-58dd-b9b7-1a15db8e7f25,996460,High School Musical,"happiness depended on it . First , let's back ..."


In [145]:
def split_df_for_same_and_different_authors(df):
    """Arrange documents into two row-aligned frames of same-author and different-author pairs.

    Every author must have exactly three rows. Two of them are picked at random
    and placed on the same row position of X and Y (a same-author pair). The
    third row of every author goes into a pool that is shuffled and cut in
    half; the first half is appended to X and the second half to Y. Each author
    contributes only one row to the pool, so these trailing pairs always
    combine two different authors.

    Row i of X and row i of Y form one pair and share the same 1-based
    'sample_id'. If the number of authors is odd, the pool cannot be paired
    completely; the one unpaired row is dropped (and reported) so that X and Y
    keep the same length.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'author' column and exactly three rows per author.

    Returns
    -------
    x, y : pandas.DataFrame
        Two frames of equal length with a leading 'sample_id' column.

    Raises
    ------
    ValueError
        If any author does not have exactly three rows.
    """
    # Ensure reproducibility: group.sample below draws from NumPy's global generator
    np.random.seed(1)

    # Collect one-row pieces and concatenate once at the end; concatenating inside
    # the loop copies the growing frame on every iteration.
    same_author_x = []
    same_author_y = []
    leftover_pool = []

    # Group by author and process each group
    for author, group in df.groupby('author'):
        if len(group) != 3:
            raise ValueError(f"Each author should have exactly 3 rows. Found {len(group)} rows for author {author}.")

        # A fresh positional index makes the "third row" lookup independent of
        # whatever index labels the caller's frame carries; sampling is by
        # position, so the selection itself is unaffected.
        group = group.reset_index(drop=True)

        # Pick the two rows that form this author's same-author pair
        pair = group.sample(2)
        third = group.drop(index=pair.index)

        same_author_x.append(pair.iloc[0:1])
        same_author_y.append(pair.iloc[1:2])
        leftover_pool.append(third)

    # Empty input: fall back to empty frames that keep the input's columns
    no_rows = df.iloc[0:0]
    df_x = pd.concat(same_author_x) if same_author_x else no_rows
    df_y = pd.concat(same_author_y) if same_author_y else no_rows
    remaining_df = pd.concat(leftover_pool) if leftover_pool else no_rows

    # Shuffle the pool and split it evenly between X and Y
    remaining_df = remaining_df.sample(frac=1, random_state=1).reset_index(drop=True)
    half_remaining = len(remaining_df) // 2
    if len(remaining_df) % 2 != 0:
        print("Odd number of authors: dropping 1 unpaired row so that X and Y stay aligned.")

    x = pd.concat([df_x, remaining_df.iloc[:half_remaining]], ignore_index=True)
    y = pd.concat([df_y, remaining_df.iloc[half_remaining:2 * half_remaining]], ignore_index=True)

    # Add sample_id columns (1-based row position, identical in X and Y)
    x.insert(0, 'sample_id', x.index + 1)
    y.insert(0, 'sample_id', y.index + 1)

    # Print the number of rows in each resulting DataFrame
    print("Number of rows in DataFrame X:", x.shape[0])
    print("Number of rows in DataFrame Y:", y.shape[0])

    return x, y

In [146]:
# Build the paired X/Y frames for the training authors; row i of train_x and
# row i of train_y share the same sample_id.
train_x, train_y = split_df_for_same_and_different_authors(train_df)

Number of rows in DataFrame X: 906
Number of rows in DataFrame Y: 906


In [147]:
# Build the paired X/Y frames for the test authors in the same way.
test_x, test_y = split_df_for_same_and_different_authors(test_df)

Number of rows in DataFrame X: 606
Number of rows in DataFrame Y: 606


In [148]:
# Write every result frame to "<base_loc>pan21-<stem>.jsonl", i.e. into the same
# folder the inputs were read from. potential_impostors holds the rows of authors
# that had too few documents to be sampled.
for _stem, _frame in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("test_x", test_x),
    ("test_y", test_y),
    ("potential_impostors", potential_impostors),
):
    save_as_jsonl(_frame, f"{base_loc}pan21-{_stem}.jsonl")