This notebook runs every step needed to build a sample of the blogger corpus containing an equal number of same-author and different-author document pairs. It then preprocesses the text, chunks it, and gathers the metadata. The output is saved to disk after each stage. The steps have not been wrapped into functions yet, but that could be done to process a larger sample or the PAN data, which has effectively the same structure.

In [1]:
import os

import pandas as pd

import read_and_write_docs
import preprocessing
import combine_sentences

In [2]:
def count_words(text):
    """Count the whitespace-separated words in ``text``.

    Parameters
    ----------
    text : str or None or float
        The text to measure. Missing values (``None`` or float ``NaN``, which
        is how pandas represents an empty CSV cell) are treated as empty text.

    Returns
    -------
    int
        Number of tokens produced by ``str.split()`` with no arguments, i.e.
        runs of non-whitespace characters. ``0`` for empty or missing text.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids calling .split() on a float, which would raise AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(text.split())

In [3]:
# Input/output locations.
# The defaults are the original local paths. To run the notebook on another
# machine without editing this cell, set the BLOGGER_RAW_LOC and/or
# BLOGGER_SAVE_BASE_LOC environment variables before starting the kernel.
raw_loc = os.environ.get("BLOGGER_RAW_LOC", "/Users/user/Downloads/blogtext.csv")
save_base_loc = os.environ.get(
    "BLOGGER_SAVE_BASE_LOC",
    "/Users/user/Documents/GitHub/paraphrase_py/data/blogger",
)

# Three files are written per side (known / unknown):
#   *_raw          - the sampled documents as split into known/unknown
#   *_preprocessed - the documents converted to sentences
#   *_combined     - the sentences chunked by word count
known_raw_loc = f"{save_base_loc}/known_raw.jsonl"
known_preprocessed_loc = f"{save_base_loc}/known_preprocessed.jsonl"
known_combined_loc = f"{save_base_loc}/known_combined.jsonl"

unknown_raw_loc = f"{save_base_loc}/unknown_raw.jsonl"
unknown_preprocessed_loc = f"{save_base_loc}/unknown_preprocessed.jsonl"
unknown_combined_loc = f"{save_base_loc}/unknown_combined.jsonl"

# One metadata row per known/unknown sample pair.
metadata_loc = f"{save_base_loc}/metadata.jsonl"

In [4]:
# Load the raw blog corpus and normalise the header: the column names in the
# CSV may carry stray whitespace, so strip it from every label.
df = pd.read_csv(raw_loc)
df = df.rename(columns=str.strip)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/user/Downloads/blogtext.csv'

In [None]:
# Word count of every post (whitespace-separated tokens, via count_words).
df['word_count'] = df['text'].apply(count_words)

# Keep the raw 'id' column as 'author_id' and reuse 'id' for the row label so
# each post gets its own identifier.
# Guarded so the cell is safe to re-run: without the guard, a second run would
# copy the row labels into 'author_id' and lose the author information.
if 'author_id' not in df.columns:
    df['author_id'] = df['id']
    df['id'] = df.index

# Move the two identifier columns to the front; all other columns keep their
# existing relative order.
leading_cols = ['id', 'author_id']
df = df[leading_cols + [col for col in df.columns if col not in leading_cols]]

In [None]:
# Keep only posts with strictly more than 1000 words.
is_long_post = df['word_count'].gt(1000)
filtered_df = df.loc[is_long_post]

In [None]:
# Inspect the posts that passed the word_count > 1000 filter.
filtered_df

In [None]:
# Filter IDs with more than one row
multi_row_ids = filtered_df['author_id'].value_counts()
multi_row_ids = multi_row_ids[multi_row_ids > 1].index

# Sample 5 IDs that have more than one row
common_ids = multi_row_ids.to_series().sample(5, random_state=1)

In [None]:
# Ensure we have at least 2 rows for each common_id in the main dataframe
common_rows = filtered_df[filtered_df['author_id'].isin(common_ids)]

# Separate rows for common IDs into x_common and y_common ensuring different rows for each
x_common = common_rows.groupby('author_id').apply(lambda group: group.sample(1, random_state=2)).reset_index(drop=True)

# Ensure remaining rows for common IDs are used in y_common
y_common = common_rows[~common_rows.index.isin(x_common.index)].groupby('author_id').apply(lambda group: group.sample(1, random_state=5)).reset_index(drop=True)

In [None]:
# Inspect the post sampled for each common author on the known (x) side.
x_common

In [None]:
# Inspect the post sampled for each common author on the unknown (y) side.
y_common

In [None]:
# Sample remaining unique IDs for x and y
remaining_ids = filtered_df[~filtered_df['author_id'].isin(common_ids)]['author_id'].drop_duplicates()
x_unique_ids = remaining_ids.sample(5, random_state=4)
y_unique_ids = remaining_ids[~remaining_ids.isin(x_unique_ids)].sample(5, random_state=5)

# Extract a random row for each unique ID for x and y
x_unique = filtered_df[filtered_df['author_id'].isin(x_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=6)).reset_index(drop=True)
y_unique = filtered_df[filtered_df['author_id'].isin(y_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=7)).reset_index(drop=True)

# Combine common and unique rows
x = pd.concat([x_common, x_unique]).reset_index(drop=True)
y = pd.concat([y_common, y_unique]).reset_index(drop=True)

In [None]:
# word_count was only needed for filtering; remove it from the known sample.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
x = x.drop(columns="word_count", errors="ignore")

In [None]:
# word_count was only needed for filtering; remove it from the unknown sample.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
y = y.drop(columns="word_count", errors="ignore")

In [None]:
# Inspect the known-side sample (common-author rows followed by unique-author rows).
x

In [None]:
# Inspect the unknown-side sample (common-author rows followed by unique-author rows).
y

In [None]:
# Save the blogger docs

In [None]:
# Write the sampled documents for each side to their "raw" locations
# (known first, then unknown).
for _docs, _destination in ((x, known_raw_loc), (y, unknown_raw_loc)):
    read_and_write_docs.save_as_jsonl(_docs, _destination)

In [None]:
# Run the preprocessing functions

In [None]:
def _sentence_rows(docs, num_words=250):
    """Run the two project preprocessing steps on ``docs`` and return the result.

    NOTE(review): going by the helper names, this splits each document into
    sentences and then splits rows that exceed ``num_words`` words. Confirm
    the exact behaviour in the ``preprocessing`` module.
    """
    split_docs = preprocessing.apply_sentence_split(docs)
    return preprocessing.split_rows_by_word_count(split_docs, num_words=num_words)


# Same pipeline for both sides.
known = _sentence_rows(x)
unknown = _sentence_rows(y)

# Save the preprocessed output for each side.
read_and_write_docs.save_as_jsonl(known, known_preprocessed_loc)
read_and_write_docs.save_as_jsonl(unknown, unknown_preprocessed_loc)

In [None]:
# Combine sentences

In [None]:
# Both sides are combined with the same settings: a threshold of 250 measured
# in words.
_combine_options = {'length_threshold': 250, 'threshold_type': 'word'}

known_combined = combine_sentences.concatenate_sentences(known, **_combine_options)
unknown_combined = combine_sentences.concatenate_sentences(unknown, **_combine_options)

# Save the combined output for each side.
read_and_write_docs.save_as_jsonl(known_combined, known_combined_loc)
read_and_write_docs.save_as_jsonl(unknown_combined, unknown_combined_loc)

In [None]:
# Number of combined rows per (author_id, topic) on each side, with the
# columns renamed so the two sides can sit next to each other after the merge.
known_counts = (
    known_combined.groupby(['author_id', 'topic']).size()
    .reset_index(name='row_count_known')
    .rename(columns={'author_id': 'author_known', 'topic': 'topic_known'})
)
unknown_counts = (
    unknown_combined.groupby(['author_id', 'topic']).size()
    .reset_index(name='row_count_unknown')
    .rename(columns={'author_id': 'author_unknown', 'topic': 'topic_unknown'})
)

# groupby sorts by author_id, but the known/unknown pairing is defined by the
# row order of x and y, so restore that order before numbering the samples.
author_id_x = x['author_id'].tolist()
author_id_y = y['author_id'].tolist()

known_order = {author: position for position, author in enumerate(author_id_x)}
unknown_order = {author: position for position, author in enumerate(author_id_y)}

known_counts = known_counts.sort_values(by=['author_known'],
                                        key=lambda col: col.map(known_order))
unknown_counts = unknown_counts.sort_values(by=['author_unknown'],
                                            key=lambda col: col.map(unknown_order))

# The pairing below is purely positional. If a document was lost during
# preprocessing, or an author produced more than one (author, topic) group,
# every later pair would be shifted and same_author would be wrong without any
# error. Fail loudly instead.
if (known_counts['author_known'].tolist() != author_id_x
        or unknown_counts['author_unknown'].tolist() != author_id_y):
    raise ValueError(
        "Combined documents no longer line up one-to-one with the sampled "
        "x / y rows; cannot pair known and unknown samples by position."
    )

# Assigning a range is positional, so sample_id follows the sorted row order.
known_counts['sample_id'] = range(1, len(known_counts) + 1)
unknown_counts['sample_id'] = range(1, len(unknown_counts) + 1)

# Pair the i-th known document with the i-th unknown document.
metadata = pd.merge(known_counts, unknown_counts, on='sample_id', how='inner')

metadata['same_author'] = metadata['author_known'] == metadata['author_unknown']
# Each known row is compared with each unknown row of the same sample.
metadata['total_comparisons'] = metadata['row_count_known'] * metadata['row_count_unknown']
metadata = metadata[['sample_id', 'author_known', 'author_unknown', 'same_author',
                     'topic_known', 'topic_unknown', 'row_count_known', 'row_count_unknown',
                     'total_comparisons']]


In [None]:
# Inspect the per-sample metadata table before it is saved.
metadata

In [None]:
# Write the metadata table to metadata_loc using the project's save_as_jsonl helper.
read_and_write_docs.save_as_jsonl(metadata, metadata_loc)