In [4]:
import pandas as pd
import numpy as np

In [5]:
# Execute the shared helper script so the names it defines become available here.
# NOTE(review): read_jsonl_file, used when loading the data, is not defined in this
# notebook and presumably comes from this script — confirm.
%run "../read_and_write_docs.py"

In [6]:
# Root folder of the PAN 21 dataset, relative to this notebook.
base_loc = "../../../../datasets/PAN/PAN 21/"

# JSONL files under the dataset root: the two preprocessed splits and the reference file.
known_preprocessed, unknown_preprocessed, reference_loc = (
    f"{base_loc}pan21-known-preprocessed.jsonl",
    f"{base_loc}pan21-unknown-preprocessed.jsonl",
    f"{base_loc}pan21-reference.jsonl",
)

In [7]:
# Load the preprocessed known and unknown document sets from their JSONL files.
# NOTE(review): read_jsonl_file is not defined in this notebook; the table rendered
# for known_df suggests it returns a pandas DataFrame — confirm against the helper.
known_df = read_jsonl_file(known_preprocessed)
unknown_df = read_jsonl_file(unknown_preprocessed)

In [11]:
# Preview the known documents (id, author, fandom, text); the rendered table is abbreviated.
known_df

Unnamed: 0,id,author,fandom,text
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,2072507,Death Note,talk because they hadn't been exposed to commu...
1,b9326101-6352-56dd-9d1b-1f41466897b7,1404943,Samurai Deeper Kyo,"Zazuki nodded his head and got to his feet , k..."
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,2406271,West Side Story,'Oh we did lots of special things . On Christm...
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,189771,Get Backers,"'Hey now , at least Shido brings home some mon..."
4,cb4054b1-d422-58d6-a137-dcfc70100df6,965626,My Hero Academia/僕のヒーローアカデミア,It was a mere five minutes' walk from third ye...
...,...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,2264834,Mentalist,They zipped through the formalities before fli...
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,1144417,Janet Evanovich,'You're taking me to a spa ?' I asked eyeing h...
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,2733573,Skulduggery Pleasant series,Vanessa : Hahaa ! Shudder : Still here . This ...
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,787585,Jackie Chan Adventures,"bountiful breasts pressed against his chest , ..."


In [17]:
def _draw_eligible_row(rng, author_codes, excluded_codes, max_tries=32):
    """Return a uniformly random row position whose author code is not excluded.

    Candidates are drawn uniformly over all rows and rejected while their author
    is excluded, which is equivalent to sampling uniformly from the rows of the
    remaining authors without rebuilding a filtered frame for every draw.
    The caller must guarantee that at least one eligible row exists.
    """
    for row in rng.integers(0, len(author_codes), size=max_tries):
        if author_codes[row] not in excluded_codes:
            return int(row)

    # Rare fallback when most rows are excluded: enumerate the eligible rows once.
    eligible_rows = np.flatnonzero(~np.isin(author_codes, list(excluded_codes)))
    return int(rng.choice(eligible_rows))


def pan_ref_sample(known_df, unknown_df, n, seed=None):
    """Sample ``n`` reference documents by distinct other authors for each known id.

    For every unique ``id`` in ``known_df``, the authors attached to that id in
    either frame are excluded, and ``n`` rows are drawn from the concatenation of
    ``known_df`` and ``unknown_df`` so that no author is picked twice for the
    same id. Each draw is uniform over the rows of the authors still eligible.

    Parameters
    ----------
    known_df, unknown_df : pandas.DataFrame
        Frames with at least the columns ``id``, ``author`` and ``fandom``.
    n : int
        Number of distinct reference authors to sample per id.
    seed : int, optional
        Seed for reproducible sampling. A local NumPy ``Generator`` is used, so
        the global NumPy random state is left untouched; the exact draws for a
        given seed therefore differ from code that seeds ``np.random`` globally.

    Returns
    -------
    pandas.DataFrame
        One row per (id, sampled document) with columns ``id``, ``fandom``,
        ``author`` and ``sampled_doc_id``; ``fandom`` and ``author`` describe the
        sampled document. Rows are grouped by id in ``known_df`` order.

    Raises
    ------
    ValueError
        If fewer than ``n`` distinct other authors are available for some id.
    """
    rng = np.random.default_rng(seed)

    # Pool of candidate reference documents: every known and unknown row.
    appended_df = pd.concat([known_df, unknown_df], ignore_index=True)
    row_doc_ids = appended_df['id'].to_numpy()
    row_authors = appended_df['author'].to_numpy()
    row_fandoms = appended_df['fandom'].to_numpy()

    # Integer code per author so exclusion checks are cheap set lookups
    # (missing authors all share the sentinel code -1).
    author_codes, _ = pd.factorize(row_authors)
    n_distinct_authors = len(np.unique(author_codes))

    # Authors attached to each id across both frames, computed once up front
    # instead of re-filtering both frames for every id.
    own_codes_by_id = {}
    for doc_id, code in zip(row_doc_ids, author_codes):
        own_codes_by_id.setdefault(doc_id, set()).add(code)

    unique_ids = known_df['id'].unique()
    total_ids = len(unique_ids)
    # At least 1 so the progress check cannot divide by zero with fewer than 10 ids.
    progress_step = max(1, total_ids // 10)

    ids_out, fandoms_out, authors_out, sampled_docs_out = [], [], [], []

    for i, current_id in enumerate(unique_ids):
        # Start from the id's own authors; sampled authors are added as we go.
        excluded_codes = set(own_codes_by_id.get(current_id, ()))

        available = n_distinct_authors - len(excluded_codes)
        if n > available:
            raise ValueError(
                f"Cannot sample {n} distinct authors for id {current_id!r}: "
                f"only {available} other authors available"
            )

        for _ in range(n):
            row = _draw_eligible_row(rng, author_codes, excluded_codes)
            excluded_codes.add(author_codes[row])

            ids_out.append(current_id)
            fandoms_out.append(row_fandoms[row])
            authors_out.append(row_authors[row])
            sampled_docs_out.append(row_doc_ids[row])

        # Print progress
        if (i + 1) % progress_step == 0:
            print(f"{((i + 1) / total_ids) * 100:.0f}% complete")

    return pd.DataFrame({
        'id': ids_out,
        'fandom': fandoms_out,
        'author': authors_out,
        'sampled_doc_id': sampled_docs_out,
    })

In [18]:
# Sample 100 documents by distinct other authors for each known id, with a fixed seed.
result = pan_ref_sample(known_df, unknown_df, n=100, seed=42)

KeyboardInterrupt: 