## Boenninghoff Preprocessing Steps

The following function was taken from Boenninghoff's preprocessing code for the PAN data.

In [89]:
import pandas as pd
import re

In [90]:
# Execute the shared helper script in this notebook's namespace. The
# read_jsonl_file and save_as_jsonl helpers used in later cells are presumably
# defined there — TODO confirm.
%run "../read_and_write_docs.py"

In [91]:
# Relative path of the folder holding the PAN 21 files.
base_loc = "../../../../datasets/PAN/PAN 21/"

# Raw JSONL inputs.
known_loc = f"{base_loc}pan21-known.jsonl"
unknown_loc = f"{base_loc}pan21-unknown.jsonl"

# Destinations for the preprocessed copies of those inputs.
known_preprocessed = f"{base_loc}pan21-known-preprocessed.jsonl"
unknown_preprocessed = f"{base_loc}pan21-unknown-preprocessed.jsonl"

In [92]:
# Ordered (pattern, replacement) pairs. Order matters: every rule is applied to
# the output of the one before it. All patterns are raw strings so that regex
# escapes such as \. are not parsed as (invalid) Python string escapes.
_PREPROCESS_RULES = (
    # Every double quote becomes a single quote.
    ('"', "'"),
    # A run of two or more single quotes becomes two space-separated quotes, padded with spaces.
    (r"''+", " ' ' "),
    # A run of two or more hyphens becomes ' -- '.
    (r"--+", " -- "),
    # A run of two or more dots becomes ' .. '.
    (r"\.\.+", " .. "),
    # A run of two or more exclamation marks becomes ' !! '.
    (r"!!+", " !! "),
    # A run of two or more commas becomes ' ,, '.
    (r",,+", " ,, "),
    # A comma directly followed by a single quote gets a space inserted between them.
    (r",'", ", '"),
    # A comma followed by two or more tildes becomes ' ~~ ' (the comma is dropped).
    # The pattern requires the leading comma, so a bare run of tildes is NOT
    # matched. NOTE(review): possibly a typo in the reference code; kept as-is
    # so the output stays identical to it.
    (r",~~+", " ~~ "),
    # As written, this matches only the literal three characters '//+' and
    # replaces them with ' /\/\ '. NOTE(review): the reference code may have
    # intended something else here; kept as-is for fidelity.
    (r"/\/\+", r" /\/\ "),
    # A character repeated four or more times in a row is cut down to three.
    # '.' does not match a newline, so runs of newlines are untouched.
    (r"((.)\2{2})\2+", r"\1"),
    # letter + one of ! ? - : , + letter  ->  spaces on both sides of the punctuation.
    (r"([a-zA-Z])([!?\-:,])([a-zA-Z])", r"\1 \2 \3"),
    # letter directly followed by one of ! ? . - : , ( )  ->  space before the punctuation.
    (r"([a-zA-Z])([!?.\-:,\(\)])", r"\1 \2"),
    # one of ! ? . - : , ( ) directly followed by a letter  ->  space after the punctuation.
    (r"([!?.\-:,\(\)])([a-zA-Z])", r"\1 \2"),
)


def preprocess_doc(doc):
    """Normalise quotes and repeated punctuation and space out punctuation in a text.

    The regex substitutions in ``_PREPROCESS_RULES`` are applied one after the
    other with ``re.sub``; each substitution sees the result of the previous
    one. The rules reproduce the substitutions of the preprocessing function
    this notebook copies, so the patterns themselves must not be "corrected"
    without accepting that the output will diverge from that reference.

    The function is not idempotent: for example a run of exclamation marks is
    rewritten to ' !! ', which matches the same rule again on a second pass
    and gains additional padding spaces.

    Args:
        doc: The text to preprocess, as a ``str``.

    Returns:
        The preprocessed text as a new ``str``.

    Raises:
        TypeError: Raised by ``re.sub`` if ``doc`` is not a string (or bytes-like).
    """
    for pattern, replacement in _PREPROCESS_RULES:
        doc = re.sub(pattern, replacement, doc)
    return doc

In [93]:
# Read the known split first, then the unknown split, from their JSONL files.
known_df, unknown_df = (read_jsonl_file(path) for path in (known_loc, unknown_loc))

In [94]:
# Run preprocess_doc over every entry of the 'text' column in both frames.
# The column is overwritten, so re-running this cell on its own would feed
# already-preprocessed text through preprocess_doc again; reload the data first.
for frame in (known_df, unknown_df):
    frame['text'] = frame['text'].apply(preprocess_doc)

In [95]:
# Write each frame to its '-preprocessed' JSONL destination (known first, then unknown).
for frame, out_path in (
    (known_df, known_preprocessed),
    (unknown_df, unknown_preprocessed),
):
    save_as_jsonl(frame, out_path)

In [96]:
# Display known_df, whose 'text' column was overwritten by the preprocessing cell
# (the rendered output is a truncated view of the frame).
known_df

Unnamed: 0,id,author,fandom,text
0,c28e8b03-c02a-5184-b58a-12dd28b8ca74,2072507,Death Note,talk because they hadn't been exposed to commu...
1,b9326101-6352-56dd-9d1b-1f41466897b7,1404943,Samurai Deeper Kyo,"Zazuki nodded his head and got to his feet , k..."
2,e2ac4453-bf54-53f2-bf68-6caae6aacded,2406271,West Side Story,'Oh we did lots of special things . On Christm...
3,a5e9a289-0999-5764-b597-dc1bf8c21ede,189771,Get Backers,"'Hey now , at least Shido brings home some mon..."
4,cb4054b1-d422-58d6-a137-dcfc70100df6,965626,My Hero Academia/僕のヒーローアカデミア,It was a mere five minutes' walk from third ye...
...,...,...,...,...
19994,7b226a98-3d82-50c1-b6f3-ee682f580d27,2264834,Mentalist,They zipped through the formalities before fli...
19995,eed8a22b-c7cd-5adb-b530-88b9d5d30202,1144417,Janet Evanovich,'You're taking me to a spa ?' I asked eyeing h...
19996,07d9c529-6fd2-5cd1-b46f-bcfad1d58503,2733573,Skulduggery Pleasant series,Vanessa : Hahaa ! Shudder : Still here . This ...
19997,88066615-7a7a-5f5b-adb9-f8c67a4c59a6,787585,Jackie Chan Adventures,"bountiful breasts pressed against his chest , ..."
