This notebook completes all of the steps to create a sample of the blogger corpus with even same and different authors, then it preprocesses the text, chunks it and gathers the metadata. The script saves at each point. I have not functionised it yet but could be done for a larger sample or for the PAN data which is effectively the same.

In [159]:
import pandas as pd

import read_and_write_docs
import preprocessing
import combine_sentences

In [160]:
# Function to count words in text
def count_words(text):
    return len(text.split())

In [161]:
raw_loc = "/Users/user/Downloads/blogtext.csv"
save_base_loc = "/Users/user/Documents/GitHub/paraphrase_py/data/blogger"

# Raw for split data, then preprocessed is converted to sentences and combined is
# chunked by word count
known_raw_loc = f"{save_base_loc}/known_raw.jsonl"
known_preprocessed_loc = f"{save_base_loc}/known_preprocessed.jsonl"
known_combined_loc = f"{save_base_loc}/known_combined.jsonl"

unknown_raw_loc = f"{save_base_loc}/unknown_raw.jsonl"
unknown_preprocessed_loc = f"{save_base_loc}/unknown_preprocessed.jsonl"
unknown_combined_loc = f"{save_base_loc}/unknown_combined.jsonl"

metadata_loc = f"{save_base_loc}/metadata.jsonl"

In [162]:
df = pd.read_csv(raw_loc)
# Remove any whitespace from the column names
df.columns = df.columns.str.strip()

In [163]:
df['word_count'] = df['text'].apply(count_words)
df['author_id'] = df['id']
df['id'] = df.index
df = df[['id', 'author_id'] + [col for col in df.columns if col not in ['id', 'author_id']]]

In [164]:
filtered_df = df[df['word_count'] > 1000]

In [165]:
filtered_df

Unnamed: 0,id,author_id,gender,age,topic,sign,date,text,word_count
2,2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,4326
48,48,3581210,male,33,InvestmentBanking,Aquarius,"23,July,2004","Just so you know, this blog isn'...",2029
146,146,3705830,male,25,Non-Profit,Cancer,"24,July,2004",&nbsp; aside: you know you've done wel...,1033
192,192,3389918,female,37,indUnk,Aquarius,"23,May,2004",I had this conversation with a man the ...,1093
219,219,3429420,male,15,Student,Aquarius,"23,May,2004",The young elf ducked as a blade shimmer...,1721
...,...,...,...,...,...,...,...,...,...
680937,680937,1673216,male,17,Student,Taurus,"28,November,2003",First survey ever to be posted he...,2711
680944,680944,1673216,male,17,Student,Taurus,"01,December,2003",Donnie (1:00:06 AM): but the real...,2554
680986,680986,1673216,male,17,Student,Taurus,"17,June,2004",Good god...Never save this much h...,2764
681014,681014,3789932,female,24,Non-Profit,Capricorn,"28,July,2004",I know it was controversial.&nbsp;...,1933


In [166]:
# Filter IDs with more than one row
multi_row_ids = filtered_df['author_id'].value_counts()
multi_row_ids = multi_row_ids[multi_row_ids > 1].index

# Sample 5 IDs that have more than one row
common_ids = multi_row_ids.to_series().sample(5, random_state=1)

In [167]:
# Ensure we have at least 2 rows for each common_id in the main dataframe
common_rows = filtered_df[filtered_df['author_id'].isin(common_ids)]

# Separate rows for common IDs into x_common and y_common ensuring different rows for each
x_common = common_rows.groupby('author_id').apply(lambda group: group.sample(1, random_state=2)).reset_index(drop=True)

# Ensure remaining rows for common IDs are used in y_common
y_common = common_rows[~common_rows.index.isin(x_common.index)].groupby('author_id').apply(lambda group: group.sample(1, random_state=5)).reset_index(drop=True)

  x_common = common_rows.groupby('author_id').apply(lambda group: group.sample(1, random_state=2)).reset_index(drop=True)
  y_common = common_rows[~common_rows.index.isin(x_common.index)].groupby('author_id').apply(lambda group: group.sample(1, random_state=5)).reset_index(drop=True)


In [168]:
x_common

Unnamed: 0,id,author_id,gender,age,topic,sign,date,text,word_count
0,114606,27603,male,24,Advertising,Sagittarius,"28,July,2004","Sitting in a caf in Madrid, refle...",1081
1,357508,2534568,male,17,Education,Leo,"09,June,2004","Normally, when someone sees a problem i...",1118
2,463281,3492066,female,34,Student,Cancer,"03,July,2004",I went over to what once was my grandparent...,1383
3,490838,3835771,female,24,Student,Scorpio,"06,August,2004",Today is just what I needed... a day ju...,1352
4,18515,4160528,female,16,Student,Leo,"09,August,2004",Have you ever wondered which...,1893


In [169]:
y_common

Unnamed: 0,id,author_id,gender,age,topic,sign,date,text,word_count
0,114599,27603,male,24,Advertising,Sagittarius,"28,July,2004",So John and I made our way over t...,1134
1,357437,2534568,male,17,Education,Leo,"10,June,2004",I'm taking James' idea.....qu...,1581
2,463289,3492066,female,34,Student,Cancer,"25,July,2004",As Terry Pratchett once wrote in Hitchhike...,2098
3,490831,3835771,female,24,Student,Scorpio,"09,August,2004",urlLink I should have better things...,1335
4,18516,4160528,female,16,Student,Leo,"09,August,2004",Kelz~ We really got close th...,1020


In [170]:
# Sample remaining unique IDs for x and y
remaining_ids = filtered_df[~filtered_df['author_id'].isin(common_ids)]['author_id'].drop_duplicates()
x_unique_ids = remaining_ids.sample(5, random_state=4)
y_unique_ids = remaining_ids[~remaining_ids.isin(x_unique_ids)].sample(5, random_state=5)

# Extract a random row for each unique ID for x and y
x_unique = filtered_df[filtered_df['author_id'].isin(x_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=6)).reset_index(drop=True)
y_unique = filtered_df[filtered_df['author_id'].isin(y_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=7)).reset_index(drop=True)

# Combine common and unique rows
x = pd.concat([x_common, x_unique]).reset_index(drop=True)
y = pd.concat([y_common, y_unique]).reset_index(drop=True)

  x_unique = filtered_df[filtered_df['author_id'].isin(x_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=6)).reset_index(drop=True)
  y_unique = filtered_df[filtered_df['author_id'].isin(y_unique_ids)].groupby('author_id').apply(lambda group: group.sample(1, random_state=7)).reset_index(drop=True)


In [171]:
x = x.drop(columns="word_count")

In [172]:
y = y.drop(columns="word_count")

In [173]:
x

Unnamed: 0,id,author_id,gender,age,topic,sign,date,text
0,114606,27603,male,24,Advertising,Sagittarius,"28,July,2004","Sitting in a caf in Madrid, refle..."
1,357508,2534568,male,17,Education,Leo,"09,June,2004","Normally, when someone sees a problem i..."
2,463281,3492066,female,34,Student,Cancer,"03,July,2004",I went over to what once was my grandparent...
3,490838,3835771,female,24,Student,Scorpio,"06,August,2004",Today is just what I needed... a day ju...
4,18515,4160528,female,16,Student,Leo,"09,August,2004",Have you ever wondered which...
5,655926,2313610,female,17,Student,Scorpio,"06,July,2004","'If you love something, set it fr..."
6,582281,2990004,male,27,indUnk,Aries,"12,May,2004",Painting your bedroom at 3am blow...
7,73745,3354644,male,16,indUnk,Scorpio,"29,June,2004",I guess sometimes in life it's ...
8,317525,3385427,male,24,Technology,Capricorn,"20,May,2004",*I WROTE THIS SHORT STORY AS AN ESSAY ...
9,15111,3898365,male,16,Student,Sagittarius,"18,July,2004",Amy : how many horses do you have ...


In [174]:
y

Unnamed: 0,id,author_id,gender,age,topic,sign,date,text
0,114599,27603,male,24,Advertising,Sagittarius,"28,July,2004",So John and I made our way over t...
1,357437,2534568,male,17,Education,Leo,"10,June,2004",I'm taking James' idea.....qu...
2,463289,3492066,female,34,Student,Cancer,"25,July,2004",As Terry Pratchett once wrote in Hitchhike...
3,490831,3835771,female,24,Student,Scorpio,"09,August,2004",urlLink I should have better things...
4,18516,4160528,female,16,Student,Leo,"09,August,2004",Kelz~ We really got close th...
5,599114,3421454,female,26,indUnk,Cancer,"07,August,2004","Well, that totally sucked. I just spent..."
6,566677,3579704,female,37,Arts,Aries,"05,August,2004",I went hiking last night up a small mountai...
7,70157,3632184,female,23,indUnk,Aries,"04,August,2004",Oh man. I really did have too many dr...
8,443226,3877064,female,23,indUnk,Libra,"14,July,2004","Today, I got my car's oil cha..."
9,440789,3967971,female,27,indUnk,Scorpio,"20,July,2004",Its summer here in gold old California ...


In [175]:
# Save the blogger docs

In [176]:
read_and_write_docs.save_as_jsonl(x, known_raw_loc)
read_and_write_docs.save_as_jsonl(y, unknown_raw_loc)

In [177]:
# Run the preprocessing functions

In [178]:
known = preprocessing.apply_sentence_split(x)
known = preprocessing.split_rows_by_word_count(known, num_words=250)

unknown = preprocessing.apply_sentence_split(y)
unknown = preprocessing.split_rows_by_word_count(unknown, num_words=250)

read_and_write_docs.save_as_jsonl(known, known_preprocessed_loc)
read_and_write_docs.save_as_jsonl(unknown, unknown_preprocessed_loc)

In [179]:
# Combine sentences

In [180]:
known_combined = combine_sentences.concatenate_sentences(known, length_threshold=250, threshold_type='word')
unknown_combined = combine_sentences.concatenate_sentences(unknown, length_threshold=250, threshold_type='word')

read_and_write_docs.save_as_jsonl(known_combined, known_combined_loc)
read_and_write_docs.save_as_jsonl(unknown_combined, unknown_combined_loc)

In [182]:
# Calculate row counts for x and y by sample_id, author_id, and topic
known_counts = known_combined.groupby(['author_id', 'topic']).size().reset_index(name='row_count_known')
unknown_counts = unknown_combined.groupby(['author_id', 'topic']).size().reset_index(name='row_count_unknown')

# Rename columns in x and y
known_counts.rename(columns={'author_id': 'author_known', 'topic': 'topic_known'}, inplace=True)
unknown_counts.rename(columns={'author_id': 'author_unknown', 'topic': 'topic_unknown'}, inplace=True)


author_id_x = x['author_id'].tolist()
author_id_y = y['author_id'].tolist()

known_counts = known_counts.sort_values(by=['author_known'],
                                        key=lambda col: col.map({val: i for i, val in enumerate(author_id_x)}))

unknown_counts = unknown_counts.sort_values(by=['author_unknown'],
                                            key=lambda col: col.map({val: i for i, val in enumerate(author_id_y)}))

known_counts['sample_id'] = range(1,len(known_counts) + 1)
unknown_counts['sample_id'] = range(1,len(unknown_counts) + 1)

# Merge x and y on sample_id
metadata = pd.merge(known_counts, unknown_counts, on='sample_id', how='inner')

metadata['same_author'] = metadata['author_known'] == metadata['author_unknown']
metadata['total_comparisons'] = metadata['row_count_known'] * metadata['row_count_unknown']
metadata = metadata[['sample_id', 'author_known', 'author_unknown', 'same_author',
                     'topic_known', 'topic_unknown', 'row_count_known', 'row_count_unknown',
                     'total_comparisons']]


In [183]:
metadata

Unnamed: 0,sample_id,author_known,author_unknown,same_author,topic_known,topic_unknown,row_count_known,row_count_unknown,total_comparisons
0,1,27603,27603,True,Advertising,Advertising,49,55,2695
1,2,2534568,2534568,True,Education,Education,33,121,3993
2,3,3492066,3492066,True,Student,Student,40,69,2760
3,4,3835771,3835771,True,Student,Student,76,102,7752
4,5,4160528,4160528,True,Student,Student,94,88,8272
5,6,2313610,3421454,False,Student,indUnk,84,63,5292
6,7,2990004,3579704,False,indUnk,Arts,76,106,8056
7,8,3354644,3632184,False,indUnk,indUnk,79,45,3555
8,9,3385427,3877064,False,Technology,indUnk,181,63,11403
9,10,3898365,3967971,False,Student,indUnk,127,59,7493


In [184]:
read_and_write_docs.save_as_jsonl(metadata, metadata_loc)