In [1]:
import json
import os
import spacy
import nltk
import re

import pandas as pd

from nltk.tokenize import sent_tokenize

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
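# One-time setup: uncomment to download the NLTK 'punkt_tab' tokenizer data
# (recent NLTK releases need it for sent_tokenize; confirm for your installed version).
# nltk.download('punkt_tab')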


## Load the data

In [3]:
# Root directory of the author-verification datasets (absolute, machine-specific path).
base_loc = "/Volumes/BCross/datasets/author_verification"

# Sub-directory of base_loc to look at in the exploratory cells.
test_or_training = "test"

# Directory that is searched for files; the trailing slash is kept.
base_file_type_loc = "/".join((base_loc, test_or_training, ""))

In [4]:
def list_files(location, exact_name=None):
    """
    Recursively collect the paths of files below a directory.

    Parameters:
    - location (str): The directory whose tree is walked.
    - exact_name (str, optional): If given (and non-empty), only files whose
      name equals this string exactly are returned (e.g., "known_raw.jsonl").
      If omitted, None, or empty, every file is returned.

    Returns:
    - list: Paths built by joining each walked directory with the file name,
      in the order os.walk yields them.
    """
    # A falsy exact_name disables the filter; otherwise the name must match exactly.
    return [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(location)
        for file_name in files
        if not exact_name or file_name == exact_name
    ]


In [5]:
def read_jsonl(file_path):
    """
    Reads a JSONL file (one JSON value per line) into a pandas DataFrame.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame with one row per non-blank line of the file.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank or whitespace-only lines (e.g. a trailing newline) hold no
            # record; json.loads would raise on them.
            if not line.strip():
                continue
            parsed_line = json.loads(line)
            # A line holding a single-element list is unwrapped to its only element.
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                parsed_line = parsed_line[0]
            records.append(parsed_line)

    return pd.DataFrame(records)

In [6]:
def write_jsonl(data, output_file_path):
    """
    Serialises a pandas DataFrame as JSONL: one JSON object per row, one row per line.

    Parameters:
    - data: The pandas DataFrame to save.
    - output_file_path: Destination path; an existing file is overwritten.
    """
    with open(output_file_path, 'w') as sink:
        # Each row becomes a {column: value} dict, dumped on its own line.
        sink.writelines(
            json.dumps(record.to_dict()) + '\n'
            for _, record in data.iterrows()
        )

In [7]:
nlp = spacy.load("en_core_web_sm")

# Custom entity patterns for abbreviations that end in a period (e.g. "Capt.").
# Each pattern is a list of per-token tests; LOWER compares a token's lowercased text.
# NOTE(review): the single-token patterns only fire if the tokenizer keeps the
# trailing period attached to the word (e.g. "capt." as one token) - confirm on
# sample text for the entries that matter.
abbreviation_patterns = [
    # Personal titles
    {"label": "PERSON", "pattern": [{"LOWER": "capt."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "dr."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "mr."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "mrs."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "prof."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "rev."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "sr."}]},
    {"label": "PERSON", "pattern": [{"LOWER": "jr."}]},

    # Common Latin abbreviations
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "e.g."}]},  # For example
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "i.e."}]},  # That is
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "etc."}]},  # Et cetera
    # "et al." is two words, so it needs a multi-token pattern: a single-token
    # LOWER test can never equal text that contains a space. Both variants are
    # listed because the tokenizer may or may not split the period off "al.".
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "et"}, {"LOWER": "al."}]},  # And others
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "et"}, {"LOWER": "al"}, {"ORTH": "."}]},
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "a.m."}]},  # Ante meridiem
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "p.m."}]},  # Post meridiem
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "vs."}]},   # Versus
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "cf."}]},   # Compare
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "viz."}]},  # Namely
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "ca."}]},   # Circa

    # Additional abbreviations
    # {"label": "ABBREVIATION", "pattern": [{"LOWER": "no."}]},  # Number - Ignore
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "vol."}]},  # Volume
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "pp."}]},   # Pages
    {"label": "ABBREVIATION", "pattern": [{"LOWER": "fig."}]},  # Figure
]

# Register the patterns in an entity ruler placed before the "ner" component.
ruler = nlp.add_pipe("entity_ruler", before="ner", name="abbreviation_ruler")
ruler.add_patterns(abbreviation_patterns)

In [8]:
def process_text(text):
    """
    Split text into sentences with named entities hidden from the sentence splitter.

    Entities found by the spaCy pipeline are swapped for "__ENTITY<n>__" placeholders
    using plain substring replacement, the masked text is split with NLTK's
    sent_tokenize, and the placeholders are swapped back in each sentence.

    Note: a later cell in this notebook defines process_text again; once that cell
    has run, its definition replaces this one.

    Parameters:
    - text (str): The text to split.

    Returns:
    - list: The sentences, with the original entity text restored.
    """
    # Step 1: run NER and give each distinct entity text a placeholder
    doc = nlp(text)
    placeholder_for = {}
    for idx, ent in enumerate(doc.ents):
        placeholder_for[ent.text] = f"__ENTITY{idx}__"

    # Step 2: hide every occurrence of each entity behind its placeholder
    masked_text = text
    for original, token in placeholder_for.items():
        masked_text = masked_text.replace(original, token)

    # Steps 3 and 4: split into sentences, then put the entity text back
    restored_sentences = []
    for sentence in sent_tokenize(masked_text):
        for original, token in placeholder_for.items():
            sentence = sentence.replace(token, original)
        restored_sentences.append(sentence)

    return restored_sentences

In [9]:
def process_text(text):
    """
    Split text into sentences while shielding named entities from the sentence splitter.

    Entities found by the spaCy pipeline are replaced with "__ENTITY<n>__"
    placeholders, the masked text is split with NLTK's sent_tokenize, and the
    placeholders are then replaced by the original entity text in every sentence.

    Because a masked entity hides its own punctuation, an entity such as "Dr." or
    "U.S." no longer triggers a split after its final period.

    Parameters:
    - text (str): The input text to process, including named entities.

    Returns:
    - list: A list of sentences with named entities restored to their original form.
    """
    # Step 1: Perform Named Entity Recognition (NER)
    doc = nlp(text)

    # Map each distinct entity text to its placeholder
    entities = {ent.text: f"__ENTITY{idx}__" for idx, ent in enumerate(doc.ents)}

    # Step 2: Mask named entities, longest first so an entity contained in a
    # longer one cannot break the longer match.
    masked_text = text
    for entity, placeholder in sorted(entities.items(), key=lambda item: -len(item[0])):
        # Lookarounds instead of \b: \b needs a word character at the edge of the
        # match, so it never matches entities that start or end with punctuation
        # ("Dr.", "U.S.", "$5") when they are surrounded by spaces.
        whole_entity = r'(?<!\w)' + re.escape(entity) + r'(?!\w)'
        masked_text = re.sub(whole_entity, placeholder, masked_text)

    # Step 3: Perform Sentence Splitting using NLTK
    sentences = sent_tokenize(masked_text)

    # Step 4: Restore named entities in one pass per sentence
    originals = {placeholder: entity for entity, placeholder in entities.items()}
    placeholder_pattern = re.compile(r'__ENTITY\d+__')

    def restore(match):
        # A function replacement inserts the entity text verbatim; a replacement
        # string would interpret backslashes in it as escapes or group references.
        return originals.get(match.group(0), match.group(0))

    return [placeholder_pattern.sub(restore, sentence) for sentence in sentences]

In [10]:
def preprocess_and_process_text(text):
    """
    Sentence-split text with process_text while protecting two fragile character
    sequences: every backslash and every literal "(??)" is swapped for a placeholder
    before processing and swapped back afterwards.

    Parameters:
    - text (str): The original text to process.

    Returns:
    - list: The sentences returned by process_text, with each backslash and each
      "(??)" restored exactly as it appeared in the input.
    """
    backslash_token = "__BACKSLASH_PLACEHOLDER__"
    ref_error_token = "__REF_ERROR__"

    # Preprocess step: hide the sequences before NER and sentence splitting
    text = text.replace("\\", backslash_token)
    text = text.replace("(??)", ref_error_token)

    # Now apply the process_text function
    processed_sentences = process_text(text)

    # Undo both substitutions on the same sentence. One backslash went in, so
    # exactly one backslash comes back out.
    return [
        sentence.replace(backslash_token, "\\").replace(ref_error_token, "(??)")
        for sentence in processed_sentences
    ]

In [11]:
def apply_ner(df, text_column="text"):
    """
    Apply Named Entity Recognition (NER) to a text column of a DataFrame
    and return the unique named entities.

    Side effect: adds (or overwrites) an 'entities' column on df holding, for
    each row, the list of entity strings found in that row's text.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the text data.
    - text_column (str, optional): The name of the column that contains the
      text data. Defaults to "text".

    Returns:
    - list: The unique named entities found in the text column, in order of
      first appearance.
    """
    # Function to extract entities from a given text
    def extract_entities(text):
        doc = nlp(text)  # Perform NER using spaCy
        return [ent.text for ent in doc.ents]  # Extract entities as a list

    # Apply NER to the text column and store the per-row entity lists
    df['entities'] = df[text_column].apply(extract_entities)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (set ordering of strings is not).
    unique_entities = list(dict.fromkeys(
        entity for row_entities in df['entities'] for entity in row_entities
    ))

    return unique_entities


In [12]:
# Every file named exactly "known_raw.jsonl" below the selected directory.
file_list = list_files(location=base_file_type_loc, exact_name="known_raw.jsonl")

In [13]:
# Bare expression: the notebook shows the list of discovered paths as this cell's output.
file_list

['/Volumes/BCross/datasets/author_verification/test/StackExchange/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Amazon/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/The Telegraph/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Yelp/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Wiki/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/All-the-news/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/IMDB/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Reddit/known_raw.jsonl',
 "/Volumes/BCross/datasets/author_verification/test/Koppel's Blogs/known_raw.jsonl",
 '/Volumes/BCross/datasets/author_verification/test/Perverted Justice/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/TripAdvisor/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/ACL/known_raw.jsonl',
 '/Volumes/BCross/datasets/author_verific

In [14]:
# Sentence-split every "known_raw.jsonl" file under both data directories and
# write the result next to its source as "known_processed.jsonl".
data_types = ['training', 'test']
base_loc = "/Volumes/BCross/datasets/author_verification"

raw_file_name = "known_raw.jsonl"
processed_file_name = "known_processed.jsonl"
output_columns = ['corpus', 'doc_id', 'chunk_id', 'author', 'texttype', 'sentence']

for d_type in data_types:

    base_file_type_loc = f"{base_loc}/{d_type}/"

    file_list = list_files(base_file_type_loc, raw_file_name)

    for file in file_list:

        print(f"Processing File: {file}")

        # Swap only the file name: a str.replace over the whole path would also
        # rewrite a directory whose name happens to contain 'raw.jsonl'.
        new_file_loc = os.path.join(os.path.dirname(file), processed_file_name)

        df = read_jsonl(file)

        # assign() returns a new frame, so the raw frame stays untouched; the
        # "sentence" column holds one list of sentences per document.
        sentences_df = df.assign(sentence=df["text"].apply(preprocess_and_process_text))

        # One row per sentence. The column is already named "sentence", so no
        # rename is needed before selecting the output columns.
        expanded_df = sentences_df.explode("sentence").reset_index(drop=True)

        # chunk_id: 1-based position of each sentence row within its doc_id
        expanded_df["chunk_id"] = expanded_df.groupby("doc_id").cumcount() + 1

        expanded_df = expanded_df[output_columns]

        write_jsonl(expanded_df, new_file_loc)

        print("    File Processed.")

Processing File: /Volumes/BCross/datasets/author_verification/training/StackExchange/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/Amazon/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/The Telegraph/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/Yelp/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/Wiki/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/All-the-news/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/IMDB/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/Reddit/known_raw.jsonl
    File Processed.
Processing File: /Volumes/BCross/datasets/author_verification/training/Kop

In [15]:
# file_list = ['/Volumes/BCross/datasets/author_verification/test/IMDB/known_raw.jsonl']

In [16]:
# # Disabled diagnostic: for each file, count the rows whose text contains "(??)"
# for file in file_list:
#     print(f"Processing file: {file}")
#     try:
#         # Load the file into a DataFrame
#         df = read_jsonl(file)
#         df_copy = df.copy()
        
#         # Apply the filter
#         filtered_df = df_copy[df_copy['text'].str.contains("(??)", na=False, regex=False)]
        
#         # Check the number of rows after filtering
#         num_rows = len(filtered_df)
#         print(f"Number of rows after filtering: {num_rows}")
        
#         if num_rows > 0:
#             print(f"{file} has {num_rows} rows after filtering.")
#     except Exception as e:
#         print(f"Error processing file {file}: {e}")

In [17]:
# Disabled copy of the cell-14 processing loop, run over whatever file_list currently holds.
# for file in file_list:

#     print(f"Processing File: {file}")
    
#     new_file_loc = file.replace('raw.jsonl', 'processed.jsonl')

#     df = read_jsonl(file)
#     df_copy = df.copy()
    
#     # Apply the function to the DataFrame
#     df_copy["sentence"] = df_copy["text"].apply(preprocess_and_process_text)

#     # Expand the processed sentences into separate rows if needed
#     expanded_df = df_copy.explode("sentence").reset_index(drop=True)

#     # Step 3: Add chunk_id column (sequential numbering grouped by doc_id)
#     expanded_df["chunk_id"] = expanded_df.groupby("doc_id").cumcount() + 1

#     # Rename the "processed_sentences" column for clarity
#     expanded_df = expanded_df.rename(columns={"processed_sentences": "sentence"})
    
#     expanded_df = expanded_df[['corpus', 'doc_id', 'chunk_id', 'author', 'texttype', 'sentence']]

#     write_jsonl(expanded_df, new_file_loc)

#     print("    File Processed.")