In [95]:
from pathlib import Path
import os
import sys
import pandas as pd
import re
import random

## Import other modules without running main

In [90]:
# Dynamically determine the current script's directory and add the parent directory to sys.path
current_dir = Path().resolve()
parent_dir = current_dir.parent
sys.path.append(str(parent_dir))

# Import combine (ensure that it has the required functions)
import combine_rephrased
import read_and_write_docs

In [91]:
def create_temp_doc_id(input_text):
    # Extract everything between the brackets
    match = re.search(r'\[(.*?)\]', input_text)
    if match:
        extracted_text = match.group(1)
        # Replace all punctuation and spaces with "_"
        cleaned_text = re.sub(r'[^\w]', '_', extracted_text)
        # Replace multiple underscores with a single "_"
        final_text = re.sub(r'_{2,}', '_', cleaned_text)
        return final_text.lower()
    return None

def process_and_combine(old_df, file_paths, num_impostors=1000, score_table=False, score_col=None, seed=None):
    """
    Process a list of file paths, filter the old DataFrame based on doc_id, 
    and combine the results into a single DataFrame.
    
    Parameters:
    - old_df: The old DataFrame to filter.
    - file_paths: List of file paths to process.
    - num_impostors: Parameter for the combine function.
    - score_table: Does the rephrased table have a score column.
    - score_col: If table has score column what is the column name.
    
    Returns:
    - result_df: Combined DataFrame.
    """

    if seed is not None:
        random.seed(seed)
        
    result_dfs = []
    
    for file_path in file_paths:

        # Read the new DataFrame from the file
        new_df = read_and_write_docs.read_jsonl(file_path)

        new_df = new_df.rename(columns={
            'original': 'original_sentence', 
            'rephrased': 'result'})

        # Convert columns to numeric
        # new_df['doc_id'] = pd.to_numeric(new_df['doc_id'], errors='coerce')
        new_df['chunk_id'] = pd.to_numeric(new_df['chunk_id'], errors='coerce')
        
        # Ensure 'doc_id' is in the DataFrame
        if 'doc_id' not in new_df.columns:
            raise ValueError(f"The file {file_path} does not contain a 'doc_id' column.")
        
        # Get the unique doc_id from the new DataFrame
        new_doc_id = new_df['doc_id'].unique()[0]

        # Filter the old DataFrame for the doc_id
        filtered_old_df = old_df[old_df['doc_id'] == new_doc_id].copy()

        print(f"Sampling Doc: {new_doc_id}")

        if score_table == False:
            # Combine the filtered old DataFrame with the new DataFrame
            combined_df = combine_rephrased.chunk_single_rephrased(filtered_old_df, new_df, num_impostors=num_impostors,
                                                                   seed=seed)
        else:
            combined_df = combine_rephrased.chunk_single_rephrased_with_scores_list(filtered_old_df, new_df,
                                                                                    score_col, num_impostors=num_impostors,
                                                                                    seed=seed)
        
        # Append the combined DataFrame to the list of result DataFrames
        result_dfs.append(combined_df)
    
    # Concatenate all the result DataFrames into a single DataFrame
    result_df = pd.concat(result_dfs, ignore_index=True)
    
    return result_df

In [92]:
rephrased_preprocessed = "/Volumes/BCross/datasets/author_verification/training/TripAdvisor/known_processed.jsonl"
df = read_and_write_docs.read_jsonl(rephrased_preprocessed)

In [93]:
df['doc_id'] = df['doc_id'].apply(create_temp_doc_id)
df=df.copy()
df.rename(columns={'sentence': 'text'}, inplace=True)

In [94]:
df.head(2)

Unnamed: 0,corpus,doc_id,chunk_id,author,texttype,text
0,TripAdvisor,,1,acbw_8,known,I would like to have given this B&B a fantasti...
1,TripAdvisor,,2,acbw_8,known,"If you stay in this B&B, just make sure you ar..."


In [83]:
parascore_loc = '/Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_parascore/'
parascore_loc = '/Users/user/Documents/temp_datasets/author_verification/training/The Telegraph/batch_parascore/'
parascore_files = [
    f'{parascore_loc}{f}' for f in os.listdir(parascore_loc)
    if os.path.isfile(os.path.join(parascore_loc, f)) and f.endswith('.jsonl')
]
parascore_files = sorted(parascore_files)

In [84]:
result = process_and_combine(df, parascore_files, num_impostors=1000,
                             score_table=True, score_col='parascore_free',
                            seed=42)

Sampling Doc: adrianbridge_text_1
Sampling Doc: adrianbridge_text_2
Sampling Doc: aislinnlaing_text_1
Sampling Doc: aislinnlaing_text_2
Sampling Doc: alanhansen_text_1
Sampling Doc: alanhansen_text_3
Sampling Doc: alansmith_text_2
Sampling Doc: alansmith_text_3
Sampling Doc: alantitchmarsh_text_1
Sampling Doc: alantitchmarsh_text_2
Sampling Doc: alantovey_text_1
Sampling Doc: alantovey_text_2
Sampling Doc: alasdairreid_text_2
Sampling Doc: alasdairreid_text_3
Sampling Doc: alastairgood_text_1
Sampling Doc: alastairgood_text_3
Sampling Doc: alastairsmart_text_1
Sampling Doc: alastairsmart_text_2
Sampling Doc: alastairsooke_text_2
Sampling Doc: alastairsooke_text_3
Sampling Doc: alexjames_text_1
Sampling Doc: alexjames_text_3
Sampling Doc: aliceaudley_text_1
Sampling Doc: aliceaudley_text_3
Sampling Doc: alicephilipson_text_2
Sampling Doc: alicephilipson_text_3
Sampling Doc: alicevincent_text_1
Sampling Doc: alicevincent_text_3
Sampling Doc: allisonpearson_text_1
Sampling Doc: allisonpea

In [86]:
result

Unnamed: 0,doc_id,rephrased,average_score,score_list,original_sentence_list,rephrased_sentence_list
0,adrianbridge_text_1,"As a result of its popularity, Oktoberfest dra...",0.761738,"[0.7057556509971619, 0.7109548449516296, 0.655...",[Its fame and popularity means that the Oktobe...,"[As a result of its popularity, Oktoberfest dr..."
1,adrianbridge_text_1,"Due to its popularity, Oktoberfest serves as a...",0.752489,"[0.6755099296569824, 0.7786570191383362, 0.809...",[Its fame and popularity means that the Oktobe...,"[Due to its popularity, Oktoberfest serves as ..."
2,adrianbridge_text_1,The Oktoberfest's renown and attractiveness dr...,0.760922,"[0.672859787940979, 0.6357008814811707, 0.7936...",[Its fame and popularity means that the Oktobe...,[The Oktoberfest's renown and attractiveness d...
3,adrianbridge_text_1,Oktoberfest's significant recognition and allu...,0.757237,"[0.707846462726593, 0.7063670754432678, 0.6801...",[Its fame and popularity means that the Oktobe...,[Oktoberfest's significant recognition and all...
4,adrianbridge_text_1,Oktoberfest's significant recognition and allu...,0.774004,"[0.707846462726593, 0.6910657286643982, 0.6697...",[Its fame and popularity means that the Oktobe...,[Oktoberfest's significant recognition and all...
...,...,...,...,...,...,...
204995,hughmorris_text_3,Selecting players for school football teams of...,0.761087,"[0.6976029872894287, 0.8282140493392944, 0.818...",[Schoolboy football team selection is a crude ...,[Selecting players for school football teams o...
204996,hughmorris_text_3,Team selection for schoolboy football is marke...,0.767584,"[0.8182606101036072, 0.8468731641769409, 0.791...",[Schoolboy football team selection is a crude ...,[Team selection for schoolboy football is mark...
204997,hughmorris_text_3,The approach to selecting players for schoolbo...,0.767634,"[0.7359892129898071, 0.8274188041687012, 0.779...",[Schoolboy football team selection is a crude ...,[The approach to selecting players for schoolb...
204998,hughmorris_text_3,Selecting teams for schoolboy football is a ra...,0.766722,"[0.8032898902893066, 0.8074444532394409, 0.857...",[Schoolboy football team selection is a crude ...,[Selecting teams for schoolboy football is a r...


In [109]:
result.head(5)

Unnamed: 0,doc_id,rephrased,average_score,score_list,original_sentence_list,rephrased_sentence_list
0,adrianbridge_text_1,"As a result of its popularity, Oktoberfest dra...",0.761738,"[0.7057556509971619, 0.7109548449516296, 0.655...",[Its fame and popularity means that the Oktobe...,"[As a result of its popularity, Oktoberfest dr..."
1,adrianbridge_text_1,"Due to its popularity, Oktoberfest serves as a...",0.752489,"[0.6755099296569824, 0.7786570191383362, 0.809...",[Its fame and popularity means that the Oktobe...,"[Due to its popularity, Oktoberfest serves as ..."
2,adrianbridge_text_1,The Oktoberfest's renown and attractiveness dr...,0.760922,"[0.672859787940979, 0.6357008814811707, 0.7936...",[Its fame and popularity means that the Oktobe...,[The Oktoberfest's renown and attractiveness d...
3,adrianbridge_text_1,Oktoberfest's significant recognition and allu...,0.757237,"[0.707846462726593, 0.7063670754432678, 0.6801...",[Its fame and popularity means that the Oktobe...,[Oktoberfest's significant recognition and all...
4,adrianbridge_text_1,Oktoberfest's significant recognition and allu...,0.774004,"[0.707846462726593, 0.6910657286643982, 0.6697...",[Its fame and popularity means that the Oktobe...,[Oktoberfest's significant recognition and all...


In [None]:
result_loc = "/Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_impostors.jsonl"
read_and_write_docs.write_jsonl(result, result_loc)

In [124]:
def top_n_impostors(df, n, remove_scores=False, remove_sentence_lists=False):
    """
    Groups the dataframe by 'doc_id' and selects the top n rows per group based on 'average_score'.
    
    Parameters:
        df (pd.DataFrame): Input dataframe with 'doc_id' and 'average_score' columns.
        n (int): Number of top rows to select per group.
        remove_scores (bool): If True, removes 'average_score' and 'score_list' columns from the result.

    Returns:
        pd.DataFrame: Filtered dataframe with top n rows per 'doc_id' by 'average_score'.
    """
    grouped = df.groupby('doc_id', group_keys=False)
    top_n = grouped.apply(lambda x: x.nlargest(n, 'average_score'))
    result = top_n.reset_index(drop=True)
    
    if remove_scores:
        result = result.drop(columns=['average_score', 'score_list'], errors='ignore')

    if remove_sentence_lists:
        result = result.drop(columns=['original_sentence_list', 'rephrased_sentence_list'], errors='ignore')
    
    return result

In [125]:
top_500 = top_n_impostors(result, 500, remove_scores=True, remove_sentence_lists=True)

  top_n = grouped.apply(lambda x: x.nlargest(n, 'average_score'))


In [126]:
top_500.head(5)

Unnamed: 0,doc_id,rephrased
0,adrianbridge_text_1,Oktoberfest's widespread appeal and recognitio...
1,adrianbridge_text_1,"The Oktoberfest enjoys a high level of fame, m..."
2,adrianbridge_text_1,The fame of the Oktoberfest ensures it is a ma...
3,adrianbridge_text_1,"Due to its renowned reputation, Oktoberfest at..."
4,adrianbridge_text_1,Its celebrity status means Oktoberfest easily ...


In [127]:
top_500_loc = "/Volumes/BCross/datasets/author_verification/training/The Telegraph/batch_sentence_impostors_top_500.jsonl"
read_and_write_docs.write_jsonl(top_500, top_500_loc)                    

In [52]:
df['doc_id'] = 'ambroseevans_pritchard_text_1'