This notebook carries out all of the steps needed to create a sample of the Blogger corpus containing an even number of same-author and different-author document pairs. It also preprocesses the text, chunks it, and gathers the metadata, saving the output at each stage. The pipeline has not yet been wrapped into functions, but this could be done to handle a larger sample or the PAN data, which is effectively the same.

In [9]:
import pandas as pd
import numpy as np

import read_and_write_docs
import preprocessing
import combine_sentences
import combine_rephrased

In [10]:
# Function to count words in text
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Missing values (``None`` or a float NaN, which is what ``pd.read_csv``
    yields for an empty field) count as zero words instead of raising
    ``AttributeError``.

    Parameters
    ----------
    text : str or None or float
        The text to measure.

    Returns
    -------
    int
        Number of tokens produced by ``text.split()``; 0 for missing values.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(text.split())

In [11]:
def sample_dataframe(filtered_df, sample_num, random_state=None):
    """Sample document pairs: same-author pairs first, then different-author pairs.

    Stage 1 walks the ``(author_id, topic)`` groups in ``groupby`` order and,
    for every group holding at least two distinct ``id`` values, draws two
    different documents. It stops once ``sample_num // 2`` pairs exist. An
    author with several eligible topics can therefore contribute several pairs.

    Stage 2 removes every author used in stage 1 and then, topic by topic,
    draws pairs of documents written by two different authors on the same
    topic. Each author is used at most once per topic, and the stage stops
    after ``sample_num // 2`` pairs.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain the columns ``id``, ``author_id`` and ``topic``.
    sample_num : int
        Target number of pairs overall (half same-author, half different-author).
        Fewer pairs are returned when the data cannot supply them.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before, so ``np.random.seed`` still controls the result. An int or a
        ``RandomState`` makes the call reproducible on its own.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``x`` and ``y``; row ``i`` of ``x`` is paired with row ``i`` of ``y``.
        Both have a fresh 0..n-1 index and the columns of ``filtered_df``.
        Empty frames are returned when no pair can be formed.
    """
    half = sample_num // 2

    # Resolve the randomness source once. With random_state=None both NumPy and
    # pandas fall back to the global NumPy state.
    if random_state is None:
        rng = np.random
        sample_state = None
    elif isinstance(random_state, np.random.RandomState):
        rng = sample_state = random_state
    else:
        rng = sample_state = np.random.RandomState(random_state)

    # Rows are collected in lists and concatenated once at the end; growing a
    # DataFrame with concat inside the loop is quadratic.
    x_rows = []
    y_rows = []
    authors_sampled = set()

    # Stage 1: same author, same topic, two different documents.
    for (author_id, _topic), group in filtered_df.groupby(['author_id', 'topic']):
        unique_docs = group['id'].unique()
        if len(unique_docs) < 2:
            continue
        doc1, doc2 = rng.choice(unique_docs, 2, replace=False)
        x_rows.append(group[group['id'] == doc1].sample(1, random_state=sample_state))
        y_rows.append(group[group['id'] == doc2].sample(1, random_state=sample_state))
        authors_sampled.add(author_id)
        if len(x_rows) >= half:
            break

    # Stage 2: different authors, same topic, excluding authors used in stage 1.
    remaining_df = filtered_df[~filtered_df['author_id'].isin(authors_sampled)]
    different_author_pairs_count = 0

    for _topic, group in remaining_df.groupby('topic'):
        if different_author_pairs_count >= half:
            break
        unique_authors = group['author_id'].unique()
        while different_author_pairs_count < half and len(unique_authors) >= 2:
            author1, author2 = rng.choice(unique_authors, 2, replace=False)
            x_rows.append(group[group['author_id'] == author1].sample(1, random_state=sample_state))
            y_rows.append(group[group['author_id'] == author2].sample(1, random_state=sample_state))
            different_author_pairs_count += 1
            # Each author is used at most once within a topic.
            unique_authors = unique_authors[
                (unique_authors != author1) & (unique_authors != author2)
            ]

    def _stack(rows):
        # pd.concat raises on an empty list, so fall back to an empty frame
        # that keeps the input's columns.
        if not rows:
            return filtered_df.iloc[0:0].reset_index(drop=True)
        return pd.concat(rows).reset_index(drop=True)

    # Cap at sample_num rows.
    x = _stack(x_rows).head(sample_num)
    y = _stack(y_rows).head(sample_num)

    return x, y

def create_metadata(x, y):
    """Build one metadata row per (x, y) pair.

    Rows are paired by position: the i-th row of ``x`` and the i-th row of
    ``y`` both receive ``sample_id`` i (1-based) and are joined on it.

    Side effect: a ``sample_id`` column is written onto the ``x`` and ``y``
    frames passed in, so the caller's frames carry it afterwards.

    Parameters
    ----------
    x, y : pandas.DataFrame
        Each needs the columns ``id``, ``author_id`` and ``topic``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id``, ``doc_id_x``, ``doc_id_y``, ``author_id_x``,
        ``author_id_y``, ``topic_x``, ``topic_y``, ``same_author``,
        ``same_topic``.
    """
    # Positional pair identifier, written onto the caller's frames.
    x['sample_id'] = range(1, len(x) + 1)
    y['sample_id'] = range(1, len(y) + 1)

    def _side(frame, suffix):
        # sample_id first, `id` exposed as `doc_id`, and every column except
        # sample_id tagged with the side's suffix.
        ordered = ['sample_id'] + [name for name in frame.columns if name != 'sample_id']
        tagged = frame[ordered].rename(columns={'id': 'doc_id'}).add_suffix(suffix)
        return tagged.rename(columns={'sample_id' + suffix: 'sample_id'})

    left = _side(x, '_x')
    right = _side(y, '_y')

    # Inner join on the shared pair identifier.
    joined = left.merge(right, on='sample_id')

    # Flag whether the two documents share an author / a topic.
    joined = joined.assign(
        same_author=joined['author_id_x'] == joined['author_id_y'],
        same_topic=joined['topic_x'] == joined['topic_y'],
    )

    keep = ['sample_id', 'doc_id_x', 'doc_id_y', 'author_id_x',
            'author_id_y', 'topic_x', 'topic_y', 'same_author', 'same_topic']
    return joined[keep]


In [12]:
# Input: the raw corpus CSV, read with pd.read_csv.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
raw_loc = "/Users/user/Downloads/blogtext.csv"
# Base directory from which every output path below is built.
save_base_loc = "/Users/user/Documents/datasets/blogger"

# Raw for split data, then preprocessed is converted to sentences and combined is
# chunked by word count
# NOTE(review): `raw_lsave_loc` looks like a typo for `raw_save_loc`; it is not
# used in the visible cells, so confirm before renaming.
raw_lsave_loc = f"{save_base_loc}/raw.jsonl"

# Outputs for the `x` side of each pair ("known" documents), one file per stage.
# `known_final_loc` is not written by the visible cells.
known_raw_loc = f"{save_base_loc}/known_raw.jsonl"
known_preprocessed_loc = f"{save_base_loc}/known_preprocessed.jsonl"
known_combined_loc = f"{save_base_loc}/known_combined.jsonl"
known_final_loc = f"{save_base_loc}/known_final.jsonl"

# Outputs for the `y` side of each pair ("unknown" documents), one file per stage.
# `unknown_final_loc` is not written by the visible cells.
unknown_raw_loc = f"{save_base_loc}/unknown_raw.jsonl"
unknown_preprocessed_loc = f"{save_base_loc}/unknown_preprocessed.jsonl"
unknown_combined_loc = f"{save_base_loc}/unknown_combined.jsonl"
unknown_final_loc = f"{save_base_loc}/unknown_final.jsonl"

# Not used by the visible cells; presumably read or written by a later
# rephrasing step (TODO confirm).
rephrased_preprocessed_loc = f"{save_base_loc}/rephrased_preprocessed.jsonl"

# Destination of the pair-level metadata table.
metadata_loc = f"{save_base_loc}/metadata.jsonl"

In [13]:
# Load the raw corpus, then normalise the header by trimming whitespace
# around every column name.
df = pd.read_csv(raw_loc)
df.columns = df.columns.map(str.strip)

In [14]:
# Derive the working columns:
#   word_count - number of whitespace-separated tokens in `text`
#   author_id  - the value of the corpus's original `id` column
#   id         - the row index, used as the per-document identifier
# The guard makes the cell safe to re-run. Without it, a second run would copy
# the row index (already stored in `id`) over the real author ids.
if 'author_id' not in df.columns:
    df = df.assign(
        word_count=df['text'].apply(count_words),
        author_id=df['id'],
        id=df.index,
    )
    # Put the two identifiers first and keep the other columns in their existing order.
    df = df[['id', 'author_id'] + [col for col in df.columns if col not in ('id', 'author_id')]]

In [15]:
# Keep only documents whose word count is strictly above the threshold.
MIN_WORD_COUNT = 1000
filtered_df = df.loc[df['word_count'] > MIN_WORD_COUNT]

In [16]:
# Seed NumPy's global generator so that Restart & Run All draws the same pairs
# every time. With no explicit random_state, both np.random.choice and
# DataFrame.sample read from this global state.
np.random.seed(42)

# Draw the pairs, build the pair-level metadata table and save it.
x, y = sample_dataframe(filtered_df, 200)
metadata = create_metadata(x, y)
read_and_write_docs.save_as_jsonl(metadata, metadata_loc)

In [17]:
# word_count was only needed for filtering. errors="ignore" keeps the cell safe
# to re-run: a second run would otherwise raise KeyError because the column is
# already gone.
x = x.drop(columns="word_count", errors="ignore")
y = y.drop(columns="word_count", errors="ignore")

In [18]:
# Save the blogger docs

In [19]:
# Write each side of the sampled pairs to its raw JSONL file (x -> known, y -> unknown).
for frame, destination in ((x, known_raw_loc), (y, unknown_raw_loc)):
    read_and_write_docs.save_as_jsonl(frame, destination)

In [20]:
# Run the preprocessing functions

In [21]:
# Sentence-split each side, then pass the result through
# split_rows_by_word_count with num_words=250.
known = preprocessing.split_rows_by_word_count(
    preprocessing.apply_sentence_split(x),
    num_words=250,
)
unknown = preprocessing.split_rows_by_word_count(
    preprocessing.apply_sentence_split(y),
    num_words=250,
)

# Save the preprocessed output of both sides.
read_and_write_docs.save_as_jsonl(known, known_preprocessed_loc)
read_and_write_docs.save_as_jsonl(unknown, unknown_preprocessed_loc)

In [22]:
# Combine sentences

In [23]:
# Both sides are combined with the same settings, so define them once.
combine_options = {'length_threshold': 500, 'threshold_type': 'word'}

known_combined = combine_sentences.concatenate_sentences(known, **combine_options)
unknown_combined = combine_sentences.concatenate_sentences(unknown, **combine_options)

# Save the combined output of both sides.
read_and_write_docs.save_as_jsonl(known_combined, known_combined_loc)
read_and_write_docs.save_as_jsonl(unknown_combined, unknown_combined_loc)