In [1]:
import spacy
import csv
import re
from collections import OrderedDict
import json

nlp = spacy.load('en_core_web_sm')

DEFAULT_PRONOUNS = {"subject": "they", "object": "them", "possessive": "their", "reflexive": "themselves"}

def revise_text_with_preferences(text, terms_csv='gendered_terms.csv', pronoun_preferences=None):
    """
    Revise the text to make it gender-fair and apply preferred pronouns.

    Args:
        text (str): The original text.
        terms_csv (str): CSV file with gendered terms and their replacements.
        pronoun_preferences (dict): A dictionary of names and their preferred pronouns.

    Returns:
        dict: A dictionary with the original text, revised text, and corrections.
    """
    if pronoun_preferences is None:
        pronoun_preferences = {}

    # Load gendered terms
    gendered_terms = prioritize_terms(load_gendered_terms(terms_csv))
    doc = nlp(text)

    corrections = []
    revised_text = text
    applied_indices = set()  # Track processed tokens to avoid duplication

    # Step 1: Identify names and apply pronoun preferences (or default pronouns)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            preferred_pronouns = pronoun_preferences.get(ent.text, DEFAULT_PRONOUNS)

            for token in doc:
                # Skip tokens already processed
                if token.i in applied_indices:
                    continue

                # Check if the token is a pronoun and needs replacement
                if is_within_quotes(token, doc):
                    continue

                replacement = check_pronoun_replacement(token, preferred_pronouns)

                # Apply the replacement
                if replacement:
                    revised_text, corrections = apply_replacement(
                        revised_text, token, replacement, corrections
                    )
                    applied_indices.add(token.i)

    # Step 2: Apply default pronouns for remaining unmatched pronouns
    for token in doc:
        if token.i in applied_indices:
            continue

        # Skip tokens within quotes
        if is_within_quotes(token, doc):
            continue

        # Check if the token is a pronoun and needs replacement
        replacement = check_pronoun_replacement(token, DEFAULT_PRONOUNS)

        # Apply the replacement
        if replacement:
            adjusted_replacement = adjust_capitalization(token.text, replacement)
            revised_text, corrections = apply_replacement(
                revised_text, token, adjusted_replacement, corrections
            )
            applied_indices.add(token.i)

    # Step 3: Apply gender-neutral replacements (only for remaining terms)
    for phrase, replacement in gendered_terms.items():
        pattern = re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE)

        for match in pattern.finditer(revised_text):
            # Skip if it's already processed
            if any(
                match.start() <= correction["character_offset"] < match.end()
                for correction in corrections
            ):
                continue

            # Skip terms within quotes
            if is_within_quotes_from_indices(match.start(), match.end(), revised_text):
                continue

            # Adjust replacement capitalization
            original = match.group(0)
            adjusted_replacement = adjust_capitalization(original, replacement)

            # Apply the replacement
            revised_text = (
                revised_text[:match.start()] +
                adjusted_replacement +
                revised_text[match.end():]
            )
            corrections.append({
                "original_text": original,
                "replacements": adjusted_replacement,
                "character_offset": match.start(),
                "character_endset": match.end()
            })

    return {
        "original_text": text,
        "revised_text": revised_text,
        "corrections": corrections
    }

def load_gendered_terms(csv_filename):
    """Load gendered terms from a CSV file into a dictionary."""
    gendered_terms = {}
    try:
        with open(csv_filename, 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                if len(row) >= 2:
                    gendered_terms[row[0].lower()] = row[1]
    except Exception as e:
        raise ValueError(f"Error loading gendered terms from {csv_filename}: {e}")
    return gendered_terms

def adjust_capitalization(original, replacement):
    """Adjust capitalization of the replacement text to match the original."""
    if original.isupper():
        return replacement.upper()
    elif original.istitle():
        return replacement.capitalize()
    return replacement

def prioritize_terms(terms):
    """Prioritize longer terms first to avoid partial matches."""
    return OrderedDict(
        sorted(terms.items(), key=lambda item: len(item[0].split()), reverse=True)
    )

def check_pronoun_replacement(token, pronouns):
    """Check if a pronoun needs to be replaced based on preferences."""
    if token.tag_ in {"PRP", "PRP$"}:
        if token.text.lower() in {"she", "he", "they"}:
            if token.text.lower() != pronouns["subject"]:
                return pronouns["subject"]
        elif token.text.lower() in {"her", "him", "them"}:
            if token.text.lower() != pronouns["object"]:
                return pronouns["object"]
        elif token.text.lower() in {"hers", "his", "theirs"}:
            if token.text.lower() != pronouns["possessive"]:
                return pronouns["possessive"]
        elif token.text.lower() in {"herself", "himself", "themselves"}:
            if token.text.lower() != pronouns.get("reflexive", "themselves"):
                return pronouns.get("reflexive", "themselves")
    return None

def apply_replacement(revised_text, token, replacement, corrections):
    """Apply the replacement to the text and log the correction."""
    revised_text = (
        revised_text[:token.idx] +
        replacement +
        revised_text[token.idx + len(token.text):]
    )
    corrections.append({
        "word_index": token.i,
        "original_text": token.text,
        "replacements": replacement,
        "character_offset": token.idx,
        "character_endset": token.idx + len(token.text)
    })
    return revised_text, corrections

def is_within_quotes(token, doc):
    """Check if a token is within quotation marks."""
    quote_indices = []
    for i, t in enumerate(doc):
        if t.text in {'"', '\'"', '“', '”'}:
            quote_indices.append(i)

    for start, end in zip(quote_indices[::2], quote_indices[1::2]):
        if start <= token.i <= end:
            return True

    return False

def is_within_quotes_from_indices(start, end, text):
    """Check if a text segment is within quotes using character indices."""
    quote_indices = [m.start() for m in re.finditer(r'"|\'"|“|”', text)]
    for q_start, q_end in zip(quote_indices[::2], quote_indices[1::2]):
        if q_start <= start and end <= q_end:
            return True

    return False

# Example usage
text = """Sarah want to eat salad also he said \"becoming policeman is a dream\" A female policeman came and he was happy."""
pronoun_preferences = {
    "Sarah": {"subject": "she", "object": "her", "possessive": "hers"},
    "police officer": {"subject": "he", "object": "her", "possessive": "hers"}

}

output = revise_text_with_preferences(text, 'gendered_terms.csv', pronoun_preferences)
print(json.dumps(output, indent=4))


{
    "original_text": "Sarah want to eat salad also he said \"becoming policeman is a dream\" A female policeman came and he was happy.",
    "revised_text": "Sarah want to eat salad also she said \"becoming policeman is a dream\" A female police officer came andshee was happy.",
    "corrections": [
        {
            "word_index": 6,
            "original_text": "he",
            "replacements": "she",
            "character_offset": 29,
            "character_endset": 31
        },
        {
            "word_index": 20,
            "original_text": "he",
            "replacements": "she",
            "character_offset": 97,
            "character_endset": 99
        },
        {
            "original_text": "policeman",
            "replacements": "police officer",
            "character_offset": 79,
            "character_endset": 88
        }
    ]
}


In [32]:
import csv
import re
from collections import OrderedDict
import json
import spacy

nlp = spacy.load('en_core_web_sm')

def load_gendered_terms(csv_filename): 
        """Load gendered terms from a CSV file into a dictionary."""
        gendered_terms = {}
        try:
            with open(csv_filename, 'r') as csvfile:
                reader = csv.reader(csvfile)
                for row in reader:
                    if len(row) >= 2:
                        gendered_terms[row[0].lower()] = row[1]
        except Exception as e:
            raise ValueError(f"Error loading gendered terms from {csv_filename}: {e}")
        return gendered_terms

def adjust_capitalization(original, replacement):
    """Preserve capitalization of the original word/phrase in the replacement."""
    if original.isupper():
        return replacement.upper()
    elif original.istitle():
        return replacement.capitalize()
    return replacement
def prioritize_terms(terms):
    """Sort gendered terms by phrase length in descending order."""
    return OrderedDict(
        sorted(terms.items(), key=lambda item: len(item[0].split()), reverse=True)
    )
def is_within_quotes(text, start, end):
    """Check if the match is inside double quotes."""
    before = text[:start]
    after = text[end:]
    return before.count('"') % 2 == 1 and after.count('"') % 2 == 1
def replace_gender_adj_noun_pairs(doc, gendered_terms):
    """Replace gender terms followed by nouns based on prioritized terms."""
    revised_tokens = []
    corrections = []
    skip_next = False
    gender_adjectives = {"male", "female", "lady", "gentlemen", "boy", "girl"}  # Include "boy" and "girl"
    for i, token in enumerate(doc):
        if skip_next:
            skip_next = False
            continue
        term = token.text.lower()
        next_token = doc[i + 1] if i + 1 < len(doc) else None
        # Check if the current term is a gender adjective
        if term in gender_adjectives and next_token and next_token.pos_ == "NOUN":
            compound_phrase = f"{term} {next_token.text.lower()}"
            replacement = gendered_terms.get(compound_phrase, None)
            if replacement:
                # Replace the compound phrase explicitly if it's in the dictionary
                corrections.append({
                    "word_index": i,
                    "original_text": compound_phrase,
                    "replacements": replacement,
                    "character_offset": token.idx,
                    "character_endset": next_token.idx + len(next_token.text),
                    "original_character_endset":next_token.idx + len(next_token.text) -1  # Original word's endset
                })
                revised_tokens.append(replacement)
                skip_next = True  # Skip the next token as it was part of the compound phrase
                continue
            else:
                # If no explicit replacement, replace with just the noun
                corrections.append({
                    "word_index": i,
                    "original_text": compound_phrase,
                    "replacements": next_token.text,
                    "character_offset": token.idx,
                    "character_endset": next_token.idx + len(next_token.text),
                    "original_character_endset":next_token.idx + len(next_token.text) -1  
                })
                revised_tokens.append(next_token.text)
                skip_next = True
                continue
        revised_tokens.append(token.text)
    revised_text = " ".join(revised_tokens)
    return revised_text, corrections
def main_gfl(text, terms_csv='gendered_terms.csv'):
    """Replace gendered terms in the text with gender-neutral terms."""
    # Load and prioritize gendered terms
    gendered_terms = prioritize_terms(load_gendered_terms(terms_csv))
    # Process text using spaCy for tokenization
    doc = nlp(text)
    # Step 1: Handle gender term followed by noun
    revised_text, noun_corrections = replace_gender_adj_noun_pairs(doc, gendered_terms)
    # Step 2: Apply main gendered term replacement logic
    corrections = []
    # Replace exact matches (including hyphenated terms) using regex
    for phrase, replacement in gendered_terms.items():
        # Regex to match full word/phrase boundaries, case-insensitive
        pattern = re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE)
        # Find all matches and replace them one by one with correct capitalization
        matches = list(pattern.finditer(revised_text))  # Collect matches first to avoid conflicts
        for match in reversed(matches):  # Process in reverse to avoid offset issues
            original = match.group(0)  # The matched text
            # Check if the match is inside double quotes
            if is_within_quotes(revised_text, match.start(), match.end()):
                continue  # Skip if the match is inside quotes
            adjusted_replacement = adjust_capitalization(original, replacement)
            # Replace text
            revised_text = (
                revised_text[:match.start()] +
                adjusted_replacement +
                revised_text[match.end():]
            )
            # Map match offsets to token indices in the original doc
            match_start = match.start()
            match_end = match.end()
            word_index = None
            for i, token in enumerate(doc):
                token_start = token.idx
                token_end = token.idx + len(token)
                # Check if the match fully or partially overlaps this token
                if token_start <= match_start < token_end or token_start < match_end <= token_end:
                    word_index = i
                    break
            # Track correction details with offsets
            corrections.append({
                "word_index": word_index,
                "original_text": original,
                "replacements": adjusted_replacement,
                "character_offset": match.start(),
                "character_endset": match.start() + len(adjusted_replacement),
                "original_character_endset": match.end()  # Original word's endset
            })
    # Combine noun corrections and gender term corrections
    all_corrections = noun_corrections + corrections
    return {
        "original_text": text,
        "revised_text": revised_text,
        "corrections": all_corrections
    }

text = """Firemen and policemen are heroes""" 
output = main_gfl(text)
print(json.dumps(output, indent=4))

# todo

{
    "original_text": "Firemen and policemen are heroes",
    "revised_text": "Firefighter and police officer are heroes",
    "corrections": [
        {
            "word_index": 2,
            "original_text": "policemen",
            "replacements": "police officer",
            "character_offset": 12,
            "character_endset": 26,
            "original_character_endset": 21
        },
        {
            "word_index": 0,
            "original_text": "Firemen",
            "replacements": "Firefighter",
            "character_offset": 0,
            "character_endset": 11,
            "original_character_endset": 7
        }
    ]
}
