In [1]:
import csv
import re
from collections import OrderedDict
import json
import spacy

nlp = spacy.load('en_core_web_sm')

def load_gendered_terms(csv_filename):
    """Load gendered terms from a CSV file into a dictionary.

    Every row with at least two columns contributes one entry: the first
    column (whitespace-stripped and lower-cased) is the gendered term, the
    second column (whitespace-stripped) is its replacement. Rows with fewer
    than two columns or with an empty term are skipped. If a term occurs
    more than once, the last row wins.

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        dict: Lower-cased gendered term -> replacement text.

    Raises:
        ValueError: If the file cannot be opened, decoded or parsed. The
            underlying exception is attached as ``__cause__``.
    """
    gendered_terms = {}
    try:
        # newline='' is what the csv module expects from its input file.
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise become part of the first term.
        with open(csv_filename, 'r', newline='', encoding='utf-8-sig') as csvfile:
            for row in csv.reader(csvfile):
                if len(row) < 2:
                    continue
                term = row[0].strip().lower()
                # An empty term can never be a meaningful lookup key and
                # would turn into a match-everywhere regex downstream.
                if term:
                    gendered_terms[term] = row[1].strip()
    except Exception as e:
        # Kept broad on purpose: callers get a single ValueError for any
        # loading problem; the original error stays reachable via chaining.
        raise ValueError(f"Error loading gendered terms from {csv_filename}: {e}") from e
    return gendered_terms

def adjust_capitalization(original, replacement):
    """Carry the capitalization pattern of ``original`` over to ``replacement``.

    Args:
        original: The matched word or phrase as it appears in the text.
        replacement: The replacement term.

    Returns:
        str: ``replacement`` fully upper-cased when ``original`` is all
        upper case; with its first character upper-cased when ``original``
        starts with an upper-case character; otherwise unchanged. An empty
        ``original`` or ``replacement`` yields ``replacement`` unchanged.
    """
    if not original or not replacement:
        return replacement
    if original.isupper():
        return replacement.upper()
    if original[0].isupper():
        # Only touch the first character: str.capitalize() would also
        # lower-case the rest and damage replacements such as "CEO".
        # Checking the first character (rather than istitle()) also covers
        # sentence-initial phrases like "Lady policeman".
        return replacement[0].upper() + replacement[1:]
    return replacement

def prioritize_terms(terms):
    """Order gendered terms so that phrases with more words come first.

    Terms with the same number of whitespace-separated words keep their
    relative order from ``terms``.

    Args:
        terms: Mapping of gendered term -> replacement.

    Returns:
        OrderedDict: The same entries, longest phrases (by word count) first.
    """
    def word_count(entry):
        phrase, _ = entry
        return len(phrase.split())

    ranked = list(terms.items())
    ranked.sort(key=word_count, reverse=True)
    return OrderedDict(ranked)

def is_within_quotes(text, start, end):
    """Tell whether the span ``text[start:end]`` sits inside double quotes.

    The span counts as quoted when an odd number of ``"`` characters
    precedes it and an odd number follows it.

    Args:
        text: The full text being inspected.
        start: Start offset of the span.
        end: End offset (exclusive) of the span.

    Returns:
        bool: True if the span is enclosed by double quotes, else False.
    """
    quotes_before = text.count('"', 0, start)
    if quotes_before % 2 == 0:
        return False
    quotes_after = text.count('"', end)
    return quotes_after % 2 == 1

def replace_gender_adj_noun_pairs(doc, gendered_terms):
    """Collapse "<gender word> <noun>" pairs into a gender-neutral term.

    A pair is a token whose lower-cased text is one of the gender words
    below, immediately followed by a token tagged ``NOUN``. The pair is
    replaced by ``gendered_terms["<gender word> <noun>"]`` when that entry
    exists and is non-empty, otherwise by the noun alone.

    Args:
        doc: Tokenized text (a spaCy ``Doc``); tokens must expose ``text``,
            ``pos_``, ``idx`` and ``whitespace_``.
        gendered_terms: Mapping of lower-cased phrase -> replacement.

    Returns:
        tuple: ``(revised_text, corrections)``. ``revised_text`` keeps the
        original spacing of every untouched token. Each correction is a
        dict with ``word_index`` (index of the gender word),
        ``original_text`` (the lower-cased pair), ``replacements``,
        ``character_offset`` and ``character_endset`` (start and end of the
        pair, taken from the tokens' ``idx``).
    """
    gender_adjectives = {"male", "female", "lady", "gentlemen", "boy", "girl"}  # Include "boy" and "girl"

    revised_parts = []
    corrections = []
    skip_next = False
    token_count = len(doc)

    for i, token in enumerate(doc):
        if skip_next:
            # This token was the noun of a pair handled one step earlier.
            skip_next = False
            continue

        next_token = doc[i + 1] if i + 1 < token_count else None
        is_pair = (
            next_token is not None
            and token.text.lower() in gender_adjectives
            and next_token.pos_ == "NOUN"
        )
        if not is_pair:
            # Re-attach the token's own trailing whitespace instead of
            # joining with " ", so punctuation does not gain a space.
            revised_parts.append(token.text + token.whitespace_)
            continue

        compound_phrase = f"{token.text.lower()} {next_token.text.lower()}"
        # Prefer the explicit dictionary entry; fall back to the bare noun.
        replacement = gendered_terms.get(compound_phrase) or next_token.text
        corrections.append({
            "word_index": i,
            "original_text": compound_phrase,
            "replacements": replacement,
            "character_offset": token.idx,
            "character_endset": next_token.idx + len(next_token.text)
        })
        # The replacement stands in for both tokens, so it inherits the
        # whitespace that followed the noun.
        revised_parts.append(replacement + next_token.whitespace_)
        skip_next = True

    return "".join(revised_parts), corrections


def main_gfl(text, terms_csv='gendered_terms.csv'):
    """Replace gendered terms in the text with gender-neutral terms.

    The text is rewritten in two passes:

    1. ``replace_gender_adj_noun_pairs`` collapses "<gender word> <noun>"
       pairs found in the spaCy parse of ``text``.
    2. Every term loaded from ``terms_csv`` (phrases with more words first)
       is matched case-insensitively on word boundaries in the output of
       pass 1 and replaced, with the capitalization of the match carried
       over. Matches located between double quotes are left alone.

    Args:
        text: The input text.
        terms_csv: Path of the CSV file of gendered terms and replacements.

    Returns:
        dict: ``original_text``, ``revised_text`` and ``corrections`` (the
        corrections of pass 1 followed by those of pass 2).

    Raises:
        ValueError: Propagated from ``load_gendered_terms`` when the CSV
            file cannot be loaded.
    """
    # Load and prioritize gendered terms
    gendered_terms = prioritize_terms(load_gendered_terms(terms_csv))

    # Process text using spaCy for tokenization
    doc = nlp(text)

    # Step 1: Handle gender term followed by noun
    revised_text, noun_corrections = replace_gender_adj_noun_pairs(doc, gendered_terms)

    # Step 2: Apply main gendered term replacement logic
    corrections = []

    # Replace exact matches (including hyphenated terms) using regex
    for phrase, replacement in gendered_terms.items():
        # An empty phrase would compile to r'\b\b', a zero-width pattern that
        # matches at every word boundary and injects the replacement there.
        if not phrase:
            continue

        # Regex to match full word/phrase boundaries, case-insensitive
        pattern = re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE)

        # Find all matches and replace them one by one with correct capitalization
        matches = list(pattern.finditer(revised_text))  # Collect matches first to avoid conflicts
        for match in reversed(matches):  # Process in reverse to avoid offset issues
            original = match.group(0)  # The matched text

            # Check if the match is inside double quotes
            if is_within_quotes(revised_text, match.start(), match.end()):
                continue  # Skip if the match is inside quotes

            adjusted_replacement = adjust_capitalization(original, replacement)

            # Replace text
            revised_text = (
                revised_text[:match.start()] +
                adjusted_replacement +
                revised_text[match.end():]
            )

            # Map match offsets to token indices in the original doc
            # NOTE(review): the match offsets are positions in revised_text,
            # while the tokens come from the parse of the original text. Once
            # an earlier edit has changed the text length the two coordinate
            # systems differ, so word_index can name the wrong token or stay
            # None. Confirm the intended reference frame.
            match_start = match.start()
            match_end = match.end()

            word_index = None
            for i, token in enumerate(doc):
                token_start = token.idx
                token_end = token.idx + len(token)

                # Check if the match fully or partially overlaps this token
                if token_start <= match_start < token_end or token_start < match_end <= token_end:
                    word_index = i
                    break

            # Track correction details with offsets
            # NOTE(review): here character_endset is the end of the
            # replacement in revised_text, whereas the pass-1 corrections
            # report the end of the original span. Confirm which one
            # consumers expect.
            corrections.append({
                "word_index": word_index,
                "original_text": original,
                "replacements": adjusted_replacement,
                "character_offset": match.start(),
                "character_endset": match.start() + len(adjusted_replacement)
            })

    # Combine noun corrections and gender term corrections
    all_corrections = noun_corrections + corrections

    return {
        "original_text": text,
        "revised_text": revised_text,
        "corrections": all_corrections
    }

# Demo: run the gender-fair rewrite on a sample sentence and pretty-print the
# report. Uses the default terms file, so 'gendered_terms.csv' must be
# readable from the current working directory.
text = """A Lady policeman and he wants to become a male doctor"""
output = main_gfl(text)
print(json.dumps(output, indent=4))


{
    "original_text": "A Lady policeman and he wants to become a male doctor",
    "revised_text": "A police officer and he wants to become a doctor",
    "corrections": [
        {
            "word_index": 1,
            "original_text": "lady policeman",
            "replacements": "police officer",
            "character_offset": 2,
            "character_endset": 16
        },
        {
            "word_index": 9,
            "original_text": "male doctor",
            "replacements": "doctor",
            "character_offset": 42,
            "character_endset": 53
        }
    ]
}


In [None]:
import spacy

def replace_pronouns(text, name_pronoun_map, pronoun_options=None):
    """
    Replace pronouns in a text based on a mapping of names to pronouns.

    A pronoun is attributed to the closest named entity that ends before it
    and whose text (case-insensitively) is a key of ``name_pronoun_map``.
    This is a positional heuristic, not coreference resolution: the
    attributed name is not necessarily the pronoun's real antecedent.
    Verbs are not re-inflected after a replacement.

    Args:
        text (str): The input text to process.
        name_pronoun_map (dict): A dictionary mapping names to pronoun types (e.g., "male", "female", "gender_fair").
        pronoun_options (dict, optional): A dictionary mapping each pronoun type to a
            ``{role: pronoun}`` dictionary. Defaults to predefined options with the roles
            "nsubj", "dobj", "poss", "poss_pronoun" and "reflexive".

    Returns:
        dict: A dictionary containing the original text ("original_text"), the modified
        text ("modified_text"), and "replaced_words": one entry per replaced pronoun with
        the original word, the replacement, the token index and the character span of the
        original word.

    Raises:
        KeyError: If a pronoun type used in ``name_pronoun_map`` is missing from
            ``pronoun_options`` or lacks the role needed for a replacement.
    """
    if pronoun_options is None:
        pronoun_options = {
            "male": {
                "nsubj": "he",  # Subject pronoun
                "dobj": "him",  # Object pronoun
                "poss": "his",  # Possessive adjective
                "poss_pronoun": "his",  # Possessive pronoun
                "reflexive": "himself"  # Reflexive pronoun
            },
            "female": {
                "nsubj": "she",
                "dobj": "her",
                "poss": "her",
                "poss_pronoun": "hers",
                "reflexive": "herself"
            },
            "gender_fair": {
                "nsubj": "they",
                "dobj": "them",
                "poss": "their",
                "poss_pronoun": "theirs",
                "reflexive": "themselves"
            }
        }

    # Loading the spaCy pipeline is expensive; load it on first use and keep
    # it on the function object so later calls reuse it.
    nlp = getattr(replace_pronouns, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        replace_pronouns._nlp = nlp
    doc = nlp(text)

    # Reverse map: surface form -> every role it can play. One form may
    # serve several roles ("her" is object and possessive, "his" is
    # possessive adjective and possessive pronoun), so all are kept and
    # the right one is chosen per token. The most recently seen role is
    # kept last in the list.
    roles_by_form = {}
    for pronouns in pronoun_options.values():
        for role, form in pronouns.items():
            roles = roles_by_form.setdefault(form.lower(), [])
            if role in roles:
                roles.remove(role)
            roles.append(role)

    def pick_role(token):
        """Return the grammatical role of a pronoun token, or None if unknown."""
        roles = roles_by_form.get(token.text.lower())
        if not roles:
            return None
        if len(roles) > 1 and "poss" in roles:
            # The dependency label "poss" marks a possession modifier
            # ("her project"); any other use takes the non-possessive role.
            if token.dep_ == "poss":
                return "poss"
            return [role for role in roles if role != "poss"][-1]
        return roles[-1]

    def match_case(source, candidate):
        """Give ``candidate`` the capitalization pattern of ``source``."""
        if len(source) > 1 and source.isupper():
            return candidate.upper()
        if source[:1].isupper():
            return candidate[:1].upper() + candidate[1:]
        return candidate

    # Process the text and replace pronouns
    name_to_category = {name.lower(): category for name, category in name_pronoun_map.items()}

    # Mapped named entities in document order: (end token index, category).
    mapped_entities = [
        (ent.end, name_to_category[ent.text.lower()])
        for ent in doc.ents
        if ent.text.lower() in name_to_category
    ]

    replaced_text = []
    replaced_words = []

    for token in doc:
        replacement = None
        if token.pos_ == "PRON":
            # Closest mapped entity that ends before this pronoun.
            antecedent = None
            for entry in mapped_entities:
                if entry[0] <= token.i:
                    antecedent = entry

            role = pick_role(token) if antecedent is not None else None
            if role is not None:
                candidate = pronoun_options[antecedent[1]][role]
                # Only replace if the pronoun is different from the preferred pronoun
                if candidate.lower() != token.text.lower():
                    replacement = match_case(token.text, candidate)

        if replacement is None:
            replaced_text.append(token.text_with_ws)
            continue

        replaced_text.append(replacement + token.whitespace_)
        replaced_words.append({
            "original_word": token.text,
            "replaced_word": replacement,
            "word_index": token.i,
            "char_offset": token.idx,
            "char_end_offset": token.idx + len(token.text)
        })

    # Tokens carry their own trailing whitespace, so a plain join restores spacing
    return {
        "original_text": text,
        "modified_text": "".join(replaced_text),
        "replaced_words": replaced_words
    }

# Demo: pronouns attributed to Mary are switched to gender-fair forms, while
# pronouns attributed to John stay male.
text = "John said he would help Mary with her project because she needed him."
name_pronoun_map = {
    "John": "male",
    "Mary": "gender_fair"
}

import json
# One call is enough; the result is shown both as a raw dict and as JSON.
result = replace_pronouns(text, name_pronoun_map)
print(result)
print(json.dumps(result, indent=4))

{'original_text': 'John said he would help Mary with her project because she needed him.', 'modified_text': 'John said he would help Mary with their project because they needed them.', 'replaced_words': [{'original_word': 'her', 'replaced_word': 'their', 'word_index': 7, 'char_offset': 34, 'char_end_offset': 37}, {'original_word': 'she', 'replaced_word': 'they', 'word_index': 10, 'char_offset': 54, 'char_end_offset': 57}, {'original_word': 'him', 'replaced_word': 'them', 'word_index': 12, 'char_offset': 65, 'char_end_offset': 68}]}
{
    "original_text": "John said he would help Mary with her project because she needed him.",
    "modified_text": "John said he would help Mary with their project because they needed them.",
    "replaced_words": [
        {
            "original_word": "her",
            "replaced_word": "their",
            "word_index": 7,
            "char_offset": 34,
            "char_end_offset": 37
        },
        {
            "original_word": "she",
         