In [3]:
from pathlib import Path
import os
import sys
import pandas as pd
import re

## Import other modules without running main

In [4]:
# Dynamically determine the current script's directory and add the parent directory to sys.path
current_dir = Path().resolve()
parent_dir = current_dir.parent
sys.path.append(str(parent_dir))

# Import combine (ensure that it has the required functions)
import combine_rephrased
import read_and_write_docs

In [5]:
def create_temp_doc_id(input_text):
    # Extract everything between the brackets
    match = re.search(r'\[(.*?)\]', input_text)
    if match:
        extracted_text = match.group(1)
        # Replace all punctuation and spaces with "_"
        cleaned_text = re.sub(r'[^\w]', '_', extracted_text)
        # Replace multiple underscores with a single "_"
        final_text = re.sub(r'_{2,}', '_', cleaned_text)
        return final_text.lower()
    return None

def process_and_combine(old_df, file_paths, num_impostors=1000, score_table=False, score_col=None, seed=None):
    """
    Process a list of file paths, filter the old DataFrame based on doc_id, 
    and combine the results into a single DataFrame.
    
    Parameters:
    - old_df: The old DataFrame to filter.
    - file_paths: List of file paths to process.
    - num_impostors: Parameter for the combine function.
    - score_table: Does the rephrased table have a score column.
    - score_col: If table has score column what is the column name.
    
    Returns:
    - result_df: Combined DataFrame.
    """
    
    result_dfs = []
    
    for file_path in file_paths:

        # Read the new DataFrame from the file
        new_df = read_and_write_docs.read_jsonl_file(file_path)
        
        new_df = new_df.rename(columns={
            'original': 'original_sentence', 
            'rephrased': 'result'})

        # Convert columns to numeric
        new_df['doc_id'] = pd.to_numeric(new_df['doc_id'], errors='coerce')
        new_df['chunk_id'] = pd.to_numeric(new_df['chunk_id'], errors='coerce')
        
        # Ensure 'doc_id' is in the DataFrame
        if 'doc_id' not in new_df.columns:
            raise ValueError(f"The file {file_path} does not contain a 'doc_id' column.")
        
        # Get the unique doc_id from the new DataFrame
        new_doc_id = int(new_df['doc_id'].unique()[0])

        # Filter the old DataFrame for the doc_id
        filtered_old_df = old_df[old_df['doc_id'] == new_doc_id].copy()
        print(f"Sampling Doc: {new_doc_id}")

        if score_table == False:
            # Combine the filtered old DataFrame with the new DataFrame
            combined_df = combine_rephrased.chunk_single_rephrased(filtered_old_df, new_df, num_impostors=num_impostors,
                                                                   seed=seed)
        else:
            combined_df = combine_rephrased.chunk_single_rephrased_with_scores_list(filtered_old_df, new_df,
                                                                                    score_col, num_impostors=num_impostors,
                                                                                    seed=seed)
        
        # Append the combined DataFrame to the list of result DataFrames
        result_dfs.append(combined_df)
    
    # Concatenate all the result DataFrames into a single DataFrame
    result_df = pd.concat(result_dfs, ignore_index=True)
    
    return result_df

In [6]:
rephrased_preprocessed = "/Volumes/BCross/datasets/author_verification/training/The Telegraph/known_processed.jsonl"
df = read_and_write_docs.read_jsonl(rephrased_preprocessed)

In [7]:
df['doc_id'] = df['doc_id'].apply(create_temp_doc_id)

In [8]:
df

Unnamed: 0,corpus,doc_id,chunk_id,author,texttype,sentence
0,The Telegraph,adrianbridge_text_1,1,AdrianBridge,known,Its fame and popularity means that the Oktober...
1,The Telegraph,adrianbridge_text_1,2,AdrianBridge,known,"That said, it is still possible to plan a trip..."
2,The Telegraph,adrianbridge_text_1,3,AdrianBridge,known,Oktoberfest guide Just when does the Oktoberfe...
3,The Telegraph,adrianbridge_text_1,4,AdrianBridge,known,"Although the festival concludes in October, mo..."
4,The Telegraph,adrianbridge_text_1,5,AdrianBridge,known,The original Oktoberfest in October 1810 was h...
...,...,...,...,...,...,...
6477,The Telegraph,hughmorris_text_3,14,HughMorris,known,"According to French newspaper Le Figaro, Dutch..."
6478,The Telegraph,hughmorris_text_3,15,HughMorris,known,"The centre, planned for a 230-metre, man-made ..."
6479,The Telegraph,hughmorris_text_3,16,HughMorris,known,The real people who have the money to go this ...
6480,The Telegraph,hughmorris_text_3,17,HughMorris,known,"In the meantime, a planning application for th..."


In [9]:
df = df[df['doc_id'] == 'gabywood_text_2']
df

Unnamed: 0,corpus,doc_id,chunk_id,author,texttype,sentence
4672,The Telegraph,gabywood_text_2,1,GabyWood,known,"This week, a photography studio advertised a s..."
4673,The Telegraph,gabywood_text_2,2,GabyWood,known,Though they complained that such advertisement...
4674,The Telegraph,gabywood_text_2,3,GabyWood,known,There are two aspects to the difficulty.
4675,The Telegraph,gabywood_text_2,4,GabyWood,known,"One is cultural, to do with the way we now exp..."
4676,The Telegraph,gabywood_text_2,5,GabyWood,known,The first is a knotty problem that feminists h...
4677,The Telegraph,gabywood_text_2,6,GabyWood,known,Photographs of women have been retouched for p...
4678,The Telegraph,gabywood_text_2,7,GabyWood,known,Why display a dress with a model that the dres...
4679,The Telegraph,gabywood_text_2,8,GabyWood,known,Why make actresses look so different on film p...
4680,The Telegraph,gabywood_text_2,9,GabyWood,known,But this is the visual currency now.
4681,The Telegraph,gabywood_text_2,10,GabyWood,known,"Ask any glossy magazine editor, and she will t..."


In [10]:
parascore_loc = '/Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_parascore/'
parascore_files = [
    f for f in os.listdir(parascore_loc)
    if os.path.isfile(os.path.join(parascore_loc, f)) and f.endswith('.jsonl')
]

In [11]:
test = parascore_files[0]

In [12]:
result = process_and_combine(df, test, num_impostors=1000,
                             score_table=True, score_col='parascore_free')

AttributeError: module 'read_and_write_docs' has no attribute 'read_jsonl_file'