In [1]:
from pathlib import Path

import pandas as pd

import combine_rephrased
import read_and_write_docs

In [2]:
# NOTE(review): machine-specific absolute path. In the cells visible here it is
# only referenced by commented-out dataset locations; the active paths are relative.
g_drive_base_loc = "/Users/user/Library/CloudStorage/GoogleDrive-benjcross1995@gmail.com/My Drive/"

In [3]:
# Rephrased preprocessed data
# Alternative (Google Drive) location, currently disabled:
# rephrased_preprocessed = f"{g_drive_base_loc}datasets/blogger_new_algorithm/rephrased_preprocessed.jsonl"
rephrased_preprocessed = "../../../datasets/blogger/raw/rephrased_preprocessed.jsonl"
# Loaded through the project helper; later cells index the result by its 'doc_id' column.
preprocessed_df = read_and_write_docs.read_jsonl_file(rephrased_preprocessed)

In [4]:
# Get the list of .jsonl files in the rephrased sentences location.
# Earlier locations are kept (disabled) for reference:
# rephrased_sentences_loc = f"{g_drive_base_loc}datasets/blogger_new_algorithm/rephrased_sentences/"
# rephrased_sentences_loc = "../../../datasets/blogger/phi_rephrased"
# rephrased_sentences_loc = "../../../datasets/blogger/batch_4o"
rephrased_sentences_loc = "../../../datasets/blogger/batch_4o_parascore"

# Path.iterdir() yields directory entries in arbitrary order, so sort the paths
# to make `files` (and anything indexed from it, e.g. files[0]) reproducible.
files = sorted(
    file
    for file in Path(rephrased_sentences_loc).iterdir()
    if file.is_file() and file.suffix == '.jsonl'
)

In [5]:
# Show every preprocessed row that belongs to document 552933.
preprocessed_df.loc[preprocessed_df['doc_id'].eq(552933)]

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id
2730,552933,3363676,male,23,Internet,Libra,"22,May,2004",Let me explain the problem science has with Je...,1
2731,552933,3363676,male,23,Internet,Libra,"22,May,2004",The atheist professor of philosophy pauses bef...,2
2732,552933,3363676,male,23,Internet,Libra,"22,May,2004","Youre a Christian, arent you, son?",3
2733,552933,3363676,male,23,Internet,Libra,"22,May,2004","Yes sir, the student says.",4
2734,552933,3363676,male,23,Internet,Libra,"22,May,2004",So you believe in God?,5
...,...,...,...,...,...,...,...,...,...
2819,552933,3363676,male,23,Internet,Libra,"22,May,2004","According to the rules of empirical, testable,...",90
2820,552933,3363676,male,23,Internet,Libra,"22,May,2004","What do you say to that, son?",91
2821,552933,3363676,male,23,Internet,Libra,"22,May,2004","Nothing, the student replies.",92
2822,552933,3363676,male,23,Internet,Libra,"22,May,2004",I only have my faith.,93


In [6]:
def process_and_combine(old_df, file_paths, num_impostors=1000, score_table=False, score_col=None):
    """
    Combine each rephrased-sentence file with the matching rows of ``old_df``.

    For every path in ``file_paths`` the file is read with
    ``read_and_write_docs.read_jsonl_file``, its 'original'/'rephrased' columns
    are renamed to 'original_sentence'/'result', and ``old_df`` is filtered to
    the first valid ``doc_id`` found in the file. The pair is then passed to
    ``combine_rephrased.chunk_single_rephrased`` (or
    ``chunk_single_rephrased_with_scores`` when ``score_table`` is truthy) and
    the per-file outputs are concatenated.

    Parameters:
    - old_df: The old DataFrame to filter; must have a 'doc_id' column.
    - file_paths: Iterable of file paths to process.
    - num_impostors: Parameter for the combine function.
    - score_table: Does the rephrased table have a score column.
    - score_col: If table has score column what is the column name.

    Returns:
    - result_df: Combined DataFrame (an empty DataFrame when ``file_paths``
      yields no files).

    Raises:
    - ValueError: If a file has no 'doc_id' column, or none of its 'doc_id'
      values can be parsed as a number.
    """

    result_dfs = []

    for file_path in file_paths:

        # Read the new DataFrame from the file
        new_df = read_and_write_docs.read_jsonl_file(file_path)

        new_df = new_df.rename(columns={
            'original': 'original_sentence',
            'rephrased': 'result'})

        # Validate BEFORE touching the column: indexing a missing column would
        # raise a bare KeyError and this check would never be reached.
        if 'doc_id' not in new_df.columns:
            raise ValueError(f"The file {file_path} does not contain a 'doc_id' column.")

        # Convert columns to numeric (unparseable values become NaN)
        new_df['doc_id'] = pd.to_numeric(new_df['doc_id'], errors='coerce')
        new_df['chunk_id'] = pd.to_numeric(new_df['chunk_id'], errors='coerce')

        # Take the first doc_id that survived coercion; int(NaN) would fail
        # with an unhelpful message, as would indexing an empty array.
        valid_doc_ids = new_df['doc_id'].dropna().unique()
        if len(valid_doc_ids) == 0:
            raise ValueError(f"The file {file_path} does not contain a valid numeric 'doc_id'.")
        new_doc_id = int(valid_doc_ids[0])

        # Filter the old DataFrame for the doc_id
        filtered_old_df = old_df[old_df['doc_id'] == new_doc_id].copy()
        print(f"Sampling Doc: {new_doc_id}")

        if score_table:
            # Score-aware variant also needs the name of the score column
            combined_df = combine_rephrased.chunk_single_rephrased_with_scores(
                filtered_old_df, new_df, score_col, num_impostors=num_impostors)
        else:
            # Combine the filtered old DataFrame with the new DataFrame
            combined_df = combine_rephrased.chunk_single_rephrased(
                filtered_old_df, new_df, num_impostors=num_impostors)

        # Append the combined DataFrame to the list of result DataFrames
        result_dfs.append(combined_df)

    # pd.concat raises ValueError on an empty list; return an empty frame instead
    if not result_dfs:
        return pd.DataFrame()

    # Concatenate all the result DataFrames into a single DataFrame
    result_df = pd.concat(result_dfs, ignore_index=True)

    return result_df

# Example usage:
# Assumes the read_and_write_docs and combine_rephrased modules are imported
# result_df = process_and_combine(preprocessed_df, files, num_impostors=1000)
# result_df = process_and_combine(preprocessed_df, files, score_table=True, score_col='parascore_free')


In [7]:
# One-file subset of `files` for a quick trial run.
# NOTE(review): the call cell below passes `files`, not `test` — confirm this variable is still needed.
test = [files[0]]

In [10]:
# Process every file in `files` through the score-aware combine path,
# passing 'parascore_free' as the score column name.
result = process_and_combine(preprocessed_df, files, num_impostors=1000,
                             score_table=True, score_col='parascore_free')

Sampling Doc: 350617
Sampling Doc: 202213
Sampling Doc: 184355
Sampling Doc: 214228
Sampling Doc: 526719
Sampling Doc: 334376
Sampling Doc: 341238
Sampling Doc: 530975
Sampling Doc: 678838
Sampling Doc: 481359
Sampling Doc: 175880
Sampling Doc: 343349
Sampling Doc: 297972
Sampling Doc: 617540
Sampling Doc: 543206
Sampling Doc: 448015
Sampling Doc: 579188
Sampling Doc: 671397
Sampling Doc: 357437
Sampling Doc: 26252
Sampling Doc: 429760
Sampling Doc: 445957
Sampling Doc: 349665
Sampling Doc: 323518
Sampling Doc: 274759
Sampling Doc: 483447
Sampling Doc: 452954
Sampling Doc: 82243
Sampling Doc: 623524
Sampling Doc: 306438
Sampling Doc: 124522
Sampling Doc: 125908
Sampling Doc: 207710
Sampling Doc: 542822
Sampling Doc: 288775
Sampling Doc: 506923
Sampling Doc: 110133
Sampling Doc: 597056
Sampling Doc: 621278
Sampling Doc: 397139
Sampling Doc: 560732
Sampling Doc: 488822
Sampling Doc: 178480
Sampling Doc: 398279
Sampling Doc: 148224
Sampling Doc: 279772
Sampling Doc: 116471
Sampling Doc: 5

In [11]:
# Inspect the combined output (pandas truncates the display of long frames).
result

Unnamed: 0,doc_id,rephrased,average_score
0,350617,What need drives the Poet to seek a voice for ...,0.738706
1,350617,What is the necessity for the Poet to have a v...,0.782081
2,350617,What compels the Poet to need a voice in order...,0.756976
3,350617,Why must the Poet have a voice in order to exp...,0.756603
4,350617,Why is it vital for the Poet to have a voice w...,0.766310
...,...,...,...
95995,496050,"Alright, examine this. Let’s be rational. It's...",0.726287
95996,496050,"Listen, look here. Let’s be clear-headed. Gett...",0.741269
95997,496050,"Alright, here’s the thing. Let’s maintain a re...",0.740079
95998,496050,"Hey, check this out. Let’s show some sensibili...",0.737653


In [13]:
# Write the combined result to disk through the project's JSONL helper.
read_and_write_docs.save_as_jsonl(result, "../../../datasets/blogger/gpt-4o-impostors-qual.jsonl")