In [1]:
import os
from pathlib import Path

import pandas as pd

import combine_rephrased
import read_and_write_docs

In [2]:
# Root of the locally synced Google Drive folder (note the trailing slash:
# callers append "datasets/..." directly). The default is machine-specific, so
# set the G_DRIVE_BASE_LOC environment variable to point elsewhere instead of
# editing this cell.
g_drive_base_loc = os.environ.get(
    "G_DRIVE_BASE_LOC",
    "/Users/user/Library/CloudStorage/GoogleDrive-benjcross1995@gmail.com/My Drive/",
)

In [3]:
# Load the rephrased + preprocessed corpus from the repo-relative datasets folder.
# Google Drive alternative:
#   rephrased_preprocessed = f"{g_drive_base_loc}datasets/blogger_new_algorithm/rephrased_preprocessed.jsonl"
rephrased_preprocessed = (
    "../../../datasets/blogger_new_algorithm/"
    "rephrased_preprocessed.jsonl"
)
preprocessed_df = read_and_write_docs.read_jsonl_file(rephrased_preprocessed)

In [4]:
# Collect the rephrased-sentence files to process.
# Google Drive alternative:
#   rephrased_sentences_loc = f"{g_drive_base_loc}datasets/blogger_new_algorithm/rephrased_sentences/"
rephrased_sentences_loc = "../../../datasets/blogger_new_algorithm/phi_rephrased"

# iterdir() yields entries in arbitrary, platform-dependent order, so sort to
# get a reproducible processing order. Only .jsonl files are kept so that stray
# entries in the folder (e.g. .DS_Store) are not handed to the JSONL reader.
files = sorted(
    file
    for file in Path(rephrased_sentences_loc).iterdir()
    if file.is_file() and file.suffix == ".jsonl"
)

In [5]:
def process_and_combine(old_df, file_paths, num_impostors=1000):
    """
    Combine each rephrased file with its matching rows from ``old_df``.

    For every path in ``file_paths`` the file is read with
    ``read_and_write_docs.read_jsonl_file``, the rows of ``old_df`` whose
    'doc_id' equals the first 'doc_id' in that file are selected, and both
    frames are passed to ``combine_rephrased.chunk_single_rephrased``. The
    per-file results are concatenated into one DataFrame with a fresh index.

    Parameters:
    - old_df: DataFrame with a 'doc_id' column. A filtered copy is handed to
      the combine function for each file.
    - file_paths: Iterable of file paths to process (may be empty).
    - num_impostors: Forwarded unchanged to
      ``combine_rephrased.chunk_single_rephrased``.

    Returns:
    - result_df: Combined DataFrame, or an empty DataFrame when ``file_paths``
      yields no files.

    Raises:
    - ValueError: If a file has no 'doc_id' column or contains no rows.
    """

    result_dfs = []

    for file_path in file_paths:

        # Read the new DataFrame from the file
        new_df = read_and_write_docs.read_jsonl_file(file_path)

        # Ensure 'doc_id' is in the DataFrame
        if 'doc_id' not in new_df.columns:
            raise ValueError(f"The file {file_path} does not contain a 'doc_id' column.")

        # A file without rows has no doc_id to match on; fail with a clear
        # message instead of a bare IndexError from indexing the first value.
        if new_df.empty:
            raise ValueError(f"The file {file_path} does not contain any rows.")

        # Only the first doc_id in the file is used for matching.
        # NOTE(review): files look like one document each (doc_<id>.jsonl) — confirm.
        new_doc_id = new_df['doc_id'].iloc[0]

        # Filter the old DataFrame for the doc_id (copy so the slice is independent)
        filtered_old_df = old_df[old_df['doc_id'] == new_doc_id].copy()

        print(f"Sampling Doc: {new_doc_id}")
        # Combine the filtered old DataFrame with the new DataFrame
        combined_df = combine_rephrased.chunk_single_rephrased(filtered_old_df, new_df, num_impostors=num_impostors)

        result_dfs.append(combined_df)

    # pd.concat raises ValueError on an empty list, so handle "no files" explicitly
    if not result_dfs:
        return pd.DataFrame()

    # Concatenate all the result DataFrames into a single DataFrame
    return pd.concat(result_dfs, ignore_index=True)

# Example usage:
# Assuming read_and_write_docs and combine_rephrased are imported (see the imports at the top of the notebook)
# result_df = process_and_combine(preprocessed_df, files, num_impostors=1000)


In [6]:
# Display the discovered input files as a sanity check before processing them.
files

[WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_114599.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_18516.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_213621.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_253553.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_288775.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_299096.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_349665.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_357437.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_401638.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_402496.jsonl'),
 WindowsPath('../../../datasets/blogger_new_algorithm/phi_rephrased/doc_415925.jsonl'),
 WindowsPath('../../../datasets/b

In [7]:
# Run the combine step over every discovered file; num_impostors is left at
# the function's default.
result = process_and_combine(preprocessed_df, files)

Sampling Doc: 114599
Sampling Doc: 18516
Sampling Doc: 213621
Sampling Doc: 253553
Sampling Doc: 288775
Sampling Doc: 299096
Sampling Doc: 349665
Sampling Doc: 357437
Sampling Doc: 401638
Sampling Doc: 402496
Sampling Doc: 415925
Sampling Doc: 435880
Sampling Doc: 463289
Sampling Doc: 490831
Sampling Doc: 501802
Sampling Doc: 509577
Sampling Doc: 546828
Sampling Doc: 569896
Sampling Doc: 585623
Sampling Doc: 676573


In [8]:
# Save the combined result into the same blogger_new_algorithm dataset folder
# the inputs were read from.
read_and_write_docs.save_as_jsonl(result, "../../../datasets/blogger_new_algorithm/phi_impostor_paragraphs_noqual.jsonl")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/user/Library/CloudStorage/GoogleDrive-benjcross1995@gmail.com/My Drive/datasets/blogger_new_algorithm/phi_impostor_paragraphs_noqual.jsonl'