This notebook completes all of the steps to create a sample of the blogger corpus with even same and different authors, then it preprocesses the text, chunks it and gathers the metadata. The script saves at each point. I have not functionised it yet but could be done for a larger sample or for the PAN data which is effectively the same.

In [168]:
import pandas as pd

import read_and_write_docs
import preprocessing
import combine_sentences
import combine_rephrased

In [169]:
# Function to count words in text
def count_words(text):
    return len(text.split())

In [170]:
raw_loc = "/Users/user/Downloads/blogtext.csv"
save_base_loc = "/Users/user/Documents/GitHub/paraphrase_py/data/blogger_new_algorithm"

# Raw for split data, then preprocessed is converted to sentences and combined is
# chunked by word count
known_raw_loc = f"{save_base_loc}/known_raw.jsonl"
known_preprocessed_loc = f"{save_base_loc}/known_preprocessed.jsonl"
known_combined_loc = f"{save_base_loc}/known_combined.jsonl"
known_final_loc = f"{save_base_loc}/known_final.jsonl"

unknown_raw_loc = f"{save_base_loc}/unknown_raw.jsonl"
unknown_preprocessed_loc = f"{save_base_loc}/unknown_preprocessed.jsonl"
unknown_combined_loc = f"{save_base_loc}/unknown_combined.jsonl"
unknown_final_loc = f"{save_base_loc}/unknown_final.jsonl"

rephrased_preprocessed_loc = f"{save_base_loc}/rephrased_preprocessed.jsonl"

metadata_loc = f"{save_base_loc}/metadata.jsonl"

In [171]:
df = pd.read_csv(raw_loc)
# Remove any whitespace from the column names
df.columns = df.columns.str.strip()

In [172]:
df['word_count'] = df['text'].apply(count_words)
df['author_id'] = df['id']
df['id'] = df.index
df = df[['id', 'author_id'] + [col for col in df.columns if col not in ['id', 'author_id']]]

In [173]:
filtered_df = df[df['word_count'] > 1000]

In [175]:
# Filter IDs with more than one row
multi_row_ids = filtered_df['author_id'].value_counts()
multi_row_ids = multi_row_ids[multi_row_ids > 1].index

# Sample 5 IDs that have more than one row
common_ids = multi_row_ids.to_series().sample(10, random_state=1)

In [176]:
# Ensure we have at least 2 rows for each common_id in the main dataframe
common_rows = filtered_df[filtered_df['author_id'].isin(common_ids)]

# Separate rows for common IDs into x_common and y_common ensuring different rows for each
x_common = common_rows.groupby('author_id').apply(lambda group: group.sample(1, random_state=2)).reset_index(drop=True)

# Ensure remaining rows for common IDs are used in y_common
y_common = common_rows[~common_rows.index.isin(x_common.index)].groupby('author_id').apply(lambda group: group.sample(1, random_state=5)).reset_index(drop=True)

  x_common = common_rows.groupby('author_id').apply(lambda group: group.sample(1, random_state=2)).reset_index(drop=True)
  y_common = common_rows[~common_rows.index.isin(x_common.index)].groupby('author_id').apply(lambda group: group.sample(1, random_state=5)).reset_index(drop=True)


In [179]:
# Sample remaining unique IDs for x and y
remaining_ids = filtered_df[~filtered_df['author_id'].isin(common_ids)]['author_id'].drop_duplicates()
x_unique_ids = remaining_ids.sample(10, random_state=4)
y_unique_ids = remaining_ids[~remaining_ids.isin(x_unique_ids)].sample(10, random_state=5)

# Extract a random row for each unique ID for x and y
x_unique = filtered_df[filtered_df['author_id'].isin(x_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=6)).reset_index(drop=True)
y_unique = filtered_df[filtered_df['author_id'].isin(y_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=7)).reset_index(drop=True)

# Combine common and unique rows
x = pd.concat([x_common, x_unique]).reset_index(drop=True)
y = pd.concat([y_common, y_unique]).reset_index(drop=True)

  x_unique = filtered_df[filtered_df['author_id'].isin(x_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=6)).reset_index(drop=True)
  y_unique = filtered_df[filtered_df['author_id'].isin(y_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=7)).reset_index(drop=True)


In [180]:
x = x.drop(columns="word_count")

In [181]:
y = y.drop(columns="word_count")

In [184]:
# Save the blogger docs

In [185]:
read_and_write_docs.save_as_jsonl(x, known_raw_loc)
read_and_write_docs.save_as_jsonl(y, unknown_raw_loc)

In [186]:
# Run the preprocessing functions

In [187]:
known = preprocessing.apply_sentence_split(x)
known = preprocessing.split_rows_by_word_count(known, num_words=250)

unknown = preprocessing.apply_sentence_split(y)
unknown = preprocessing.split_rows_by_word_count(unknown, num_words=250)

read_and_write_docs.save_as_jsonl(known, known_preprocessed_loc)
read_and_write_docs.save_as_jsonl(unknown, unknown_preprocessed_loc)

In [188]:
# Combine sentences

In [189]:
known_combined = combine_sentences.concatenate_sentences(known, length_threshold=500, threshold_type='word')
unknown_combined = combine_sentences.concatenate_sentences(unknown, length_threshold=500, threshold_type='word')

read_and_write_docs.save_as_jsonl(known_combined, known_combined_loc)
read_and_write_docs.save_as_jsonl(unknown_combined, unknown_combined_loc)

In [190]:
def process_dataframe(df, filter_type):
    """
    Process the dataframe to add row numbers and chunk_count from the first row within each doc_id group.
    Then filter based on the filter_type.

    Parameters:
    df (pd.DataFrame): The input dataframe with columns 'id', 'chunk_id', 'subchunk_id', and 'chunk_count'.
    filter_type (str): The filter criteria, can be 'known', 'unknown', or 'rephrased'.

    Returns:
    pd.DataFrame: The processed and filtered dataframe.
    """
    # Sort the dataframe by id, chunk_id, subchunk_id
    sorted_df = df.sort_values(by=['id', 'chunk_id', 'subchunk_id'])

    # Assign row numbers within each id group
    sorted_df['row_number'] = sorted_df.groupby('id').cumcount() + 1

    # Find the chunk_count where row_number == 1 within each group
    chunk_count_first_row = sorted_df[sorted_df['row_number'] == 1][['id', 'chunk_count']]
    chunk_count_first_row = chunk_count_first_row.rename(columns={'chunk_count': 'chunk_count_first_row'})

    # Merge the chunk_count_first_row back into the original dataframe
    result = pd.merge(sorted_df, chunk_count_first_row, on='id', how='left')

    # Apply the filter based on filter_type
    if filter_type in ['known', 'unknown']:
        columns_to_drop = ['index', 'chunk_id', 'subchunk_id', 'input_length', 'chunk_count',
                           'original_sentence', 'row_number', 'chunk_count_first_row']
        result = result[result['row_number'] == 1]
        result = result.drop(columns=columns_to_drop)
        result = result.rename(columns={'id': 'doc_id'})
    elif filter_type == 'rephrased':
        columns_to_drop = ['index','word_count', 'chunk_id', 'subchunk_id', 'input_length',
                           'chunk_count', 'text', 'chunk_count_first_row']
        result = result[result['row_number'] <= result['chunk_count_first_row']]
        result = result.drop(columns=columns_to_drop)
        result = result.rename(columns={'row_number': 'chunk_id', 'original_sentence': 'text',
                                       'id': 'doc_id'})
        result = result.reset_index(drop=True)
        # Adjust the index to start from 1
        result.index = result.index + 1
    else:
        raise ValueError("Invalid filter_type. Must be 'known', 'unknown', or 'rephrased'.")

    return result

In [191]:
unknown_final = process_dataframe(unknown_combined, 'unknown')
known_final = process_dataframe(known_combined, 'known')
rephrased_final = process_dataframe(unknown_combined, 'rephrased')

read_and_write_docs.save_as_jsonl(known_final, known_final_loc)
read_and_write_docs.save_as_jsonl(unknown_final, unknown_final_loc)
read_and_write_docs.save_as_jsonl(rephrased_final, rephrased_preprocessed_loc)

In [192]:
# Rename columns in x and y
known_final.rename(columns={'author_id': 'author_known', 'topic': 'topic_known', 'doc_id': 'doc_id_known'}, inplace=True)
unknown_final.rename(columns={'author_id': 'author_unknown', 'topic': 'topic_unknown', 'doc_id': 'doc_id_unknown'}, inplace=True)

author_id_x = x['author_id'].tolist()
author_id_y = y['author_id'].tolist()

known_data = known_final.sort_values(by=['author_known'],
                                    key=lambda col: col.map({val: i for i, val in enumerate(author_id_x)}))

unknown_data = unknown_final.sort_values(by=['author_unknown'],
                                        key=lambda col: col.map({val: i for i, val in enumerate(author_id_y)}))

known_data['sample_id'] = range(1,len(known_data) + 1)
unknown_data['sample_id'] = range(1,len(unknown_data) + 1)

# Merge x and y on sample_id
metadata = pd.merge(known_data, unknown_data, on='sample_id', how='inner')

metadata['same_author'] = metadata['author_known'] == metadata['author_unknown']
metadata = metadata[['sample_id', 'doc_id_known', 'doc_id_unknown', 'author_known',
                     'author_unknown', 'topic_known', 'topic_unknown', 'same_author']]


In [194]:
read_and_write_docs.save_as_jsonl(metadata, metadata_loc)

In [196]:
metadata

Unnamed: 0,sample_id,doc_id_known,doc_id_unknown,author_known,author_unknown,topic_known,topic_unknown,same_author
0,1,114606,114599,27603,27603,Advertising,Advertising,True
1,2,509664,509577,993945,993945,HumanResources,HumanResources,True
2,3,402493,402496,1796990,1796990,Sports-Recreation,Sports-Recreation,True
3,4,357508,357437,2534568,2534568,Education,Education,True
4,5,676576,676573,2876684,2876684,Technology,Technology,True
5,6,349669,349665,3152540,3152540,Arts,Arts,True
6,7,463281,463289,3492066,3492066,Student,Student,True
7,8,490838,490831,3835771,3835771,Student,Student,True
8,9,288777,288775,3911836,3911836,indUnk,indUnk,True
9,10,18515,18516,4160528,4160528,Student,Student,True
