In [1]:
import csv
import re
from collections import OrderedDict
import json
import spacy

nlp = spacy.load('en_core_web_sm')

In [10]:

def load_gendered_terms(csv_filename):
    """Load gendered terms from a CSV file into a dictionary.

    The first column of each row is the gendered term and the second column
    is its replacement. Rows with fewer than two columns are skipped and
    any columns beyond the second are ignored. If a term appears more than
    once, the last row wins.

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        dict mapping each lower-cased term to its replacement (the
        replacement is stored exactly as written in the file).

    Raises:
        ValueError: If the file cannot be opened or parsed. The underlying
            exception is chained as ``__cause__``.
    """
    gendered_terms = {}
    try:
        # newline='' lets the csv module handle line endings (including
        # newlines embedded in quoted fields) itself. 'utf-8-sig' reads
        # plain UTF-8 and also strips a leading BOM, which would otherwise
        # become part of the first term and make it unmatchable.
        with open(csv_filename, 'r', newline='', encoding='utf-8-sig') as csvfile:
            for row in csv.reader(csvfile):
                if len(row) >= 2:
                    gendered_terms[row[0].lower()] = row[1]
    except Exception as e:
        # Callers only need to handle ValueError; chaining keeps the real
        # cause (missing file, decode error, ...) visible in the traceback.
        raise ValueError(f"Error loading gendered terms from {csv_filename}: {e}") from e
    return gendered_terms

def adjust_capitalization(original, replacement):
    """Carry the capitalization pattern of ``original`` over to ``replacement``.

    Args:
        original: The matched word or phrase as it appeared in the text.
        replacement: The replacement word or phrase.

    Returns:
        ``replacement`` upper-cased if ``original`` is all upper case; with
        its first character upper-cased if ``original`` is title-cased;
        otherwise ``replacement`` unchanged.
    """
    if original.isupper():
        return replacement.upper()
    if original.istitle():
        # Only touch the first character: str.capitalize() would also
        # lower-case the rest and mangle acronyms or proper nouns inside
        # the replacement (e.g. "EU" -> "eu").
        return replacement[:1].upper() + replacement[1:]
    return replacement

def prioritize_terms(terms):
    """Return the terms as an OrderedDict with multi-word phrases first.

    Entries are ordered by the number of whitespace-separated words in the
    key, largest first; entries with the same word count keep the relative
    order they had in ``terms``.
    """
    def word_count(entry):
        phrase, _replacement = entry
        return len(phrase.split())

    ranked = list(terms.items())
    ranked.sort(key=word_count, reverse=True)
    return OrderedDict(ranked)

def is_within_quotes(text, start, end):
    """Tell whether the span ``text[start:end]`` sits inside double quotes.

    The span counts as quoted when an odd number of ``"`` characters
    precedes it and an odd number follows it.
    """
    quotes_before = text.count('"', 0, start)
    if quotes_before % 2 != 1:
        return False
    quotes_after = text.count('"', end)
    return quotes_after % 2 == 1

def replace_gender_adj_noun_pairs(doc, gendered_terms):
    """Replace a gender adjective followed by a noun with a neutral form.

    A pair is a token whose lower-cased text is one of the gender
    adjectives below, immediately followed by a token tagged ``NOUN``.
    The pair is replaced by the ``gendered_terms`` entry for the
    lower-cased two-word phrase when one exists (and is non-empty);
    otherwise by the noun alone. If the adjective started with a capital
    letter, the first letter of the replacement is capitalised too.

    Args:
        doc: A tokenised document (spaCy ``Doc``); each token is read for
            ``text``, ``pos_``, ``idx`` and ``whitespace_``.
        gendered_terms: Mapping of lower-cased phrases to replacements.

    Returns:
        ``(revised_text, corrections)``. ``revised_text`` keeps the
        original spacing of the document. Each correction is a dict with
        ``word_index`` (index of the adjective token), ``original_text``
        (the lower-cased phrase), ``replacements``, and the
        ``character_offset`` / ``character_endset`` of the pair in the
        text ``doc`` was built from.
    """
    gender_adjectives = {"male", "female", "lady", "gentlemen", "boy", "girl"}

    pieces = []
    corrections = []
    skip_next = False

    for i, token in enumerate(doc):
        if skip_next:
            # This token was the noun of a pair that was already replaced.
            skip_next = False
            continue

        next_token = doc[i + 1] if i + 1 < len(doc) else None
        is_pair = (
            next_token is not None
            and token.text.lower() in gender_adjectives
            and next_token.pos_ == "NOUN"
        )
        if not is_pair:
            # Re-attach the token's own trailing whitespace instead of
            # joining on " ", which would detach punctuation ("here .").
            pieces.append(token.text + token.whitespace_)
            continue

        compound_phrase = f"{token.text.lower()} {next_token.text.lower()}"
        # Prefer an explicit dictionary entry; fall back to the bare noun.
        replacement = gendered_terms.get(compound_phrase) or next_token.text
        if token.text[:1].isupper():
            # The adjective carried the capital (e.g. sentence start), so
            # move it onto whatever now stands in its place.
            replacement = replacement[:1].upper() + replacement[1:]

        corrections.append({
            "word_index": i,
            "original_text": compound_phrase,
            "replacements": replacement,
            "character_offset": token.idx,
            "character_endset": next_token.idx + len(next_token.text)
        })
        # The whitespace that followed the noun now follows the replacement.
        pieces.append(replacement + next_token.whitespace_)
        skip_next = True

    revised_text = "".join(pieces)
    return revised_text, corrections


def replace_names_with_pronouns(text, name_pronouns):
    """Replace names in the text with their preferred pronouns.

    Every token whose lower-cased text is a key of ``name_pronouns`` is
    replaced by the mapped pronoun. Any other token tagged ``PROPN`` is
    replaced by ``"they"``.

    Args:
        text: The text to process; it is tokenised with the module-level
            ``nlp`` pipeline.
        name_pronouns: Mapping of lower-cased names to pronouns.

    Returns:
        ``(revised_text, corrections)``. ``revised_text`` keeps the
        original spacing of ``text``. Each correction is a dict with
        ``original_text``, ``replacement`` and the ``character_offset`` /
        ``character_endset`` of the token in ``text``.
    """
    doc = nlp(text)
    corrections = []
    pieces = []

    for token in doc:
        word = token.text.lower()
        if word in name_pronouns:
            pronoun = name_pronouns[word]
        elif token.pos_ == "PROPN":
            # Default to a gender-neutral pronoun for unknown names.
            # NOTE(review): PROPN is not limited to person names (titles,
            # places, ... are tagged PROPN too) and every token of a
            # multi-token name is replaced separately — confirm this is
            # intended before relying on it.
            pronoun = "they"
        else:
            # Keep the token together with its own trailing whitespace so
            # punctuation is not detached from the preceding word.
            pieces.append(token.text + token.whitespace_)
            continue

        corrections.append({
            "original_text": token.text,
            "replacement": pronoun,
            "character_offset": token.idx,
            "character_endset": token.idx + len(token.text)
        })
        pieces.append(pronoun + token.whitespace_)

    revised_text = "".join(pieces)
    return revised_text, corrections

def main_gfl_with_pronouns(text, gendered_terms, name_pronouns):
    """Main function to process text for gender fairness with names and pronouns.

    The text goes through three stages, each working on the output of the
    previous one:

    1. names are replaced with pronouns (``replace_names_with_pronouns``);
    2. gender adjective + noun pairs are replaced
       (``replace_gender_adj_noun_pairs``);
    3. every remaining occurrence of a ``gendered_terms`` phrase that is
       not inside double quotes is replaced, matching case-insensitively
       on word boundaries and preserving the capitalization of the match.

    Args:
        text: The text to process.
        gendered_terms: Mapping of phrases to replacements.
        name_pronouns: Mapping of lower-cased names to pronouns.

    Returns:
        dict with ``original_text``, ``revised_text`` and ``corrections``
        (name corrections, then adjective/noun corrections, then phrase
        corrections). Character offsets in each correction refer to the
        text as it was when that stage ran, not necessarily to ``text``.
        Adjective/noun corrections use the key ``replacements``; the
        others use ``replacement``.
    """
    # Replace names with pronouns
    text_with_pronouns, name_corrections = replace_names_with_pronouns(text, name_pronouns)

    # Step 1: Handle gender term followed by noun
    revised_text, noun_corrections = replace_gender_adj_noun_pairs(nlp(text_with_pronouns), gendered_terms)

    # Step 2: Apply main gendered term replacement logic
    corrections = []

    # Longer phrases go first so that a short term cannot rewrite part of
    # a multi-word phrase before the phrase itself has been tried.
    for phrase, replacement in prioritize_terms(gendered_terms).items():
        if not phrase.strip():
            # An empty phrase would compile to r'\b\b', which matches at
            # every word boundary and would scatter the replacement
            # throughout the text.
            continue
        pattern = re.compile(rf'\b{re.escape(phrase)}\b', re.IGNORECASE)
        matches = list(pattern.finditer(revised_text))
        # Work right-to-left so earlier match positions stay valid while
        # the text is being rewritten.
        for match in reversed(matches):
            original = match.group(0)
            if is_within_quotes(revised_text, match.start(), match.end()):
                continue
            adjusted_replacement = adjust_capitalization(original, replacement)
            revised_text = revised_text[:match.start()] + adjusted_replacement + revised_text[match.end():]
            corrections.append({
                "original_text": original,
                "replacement": adjusted_replacement,
                "character_offset": match.start(),
                "character_endset": match.end()
            })

    # Combine all corrections
    all_corrections = name_corrections + noun_corrections + corrections

    return {
        "original_text": text,
        "revised_text": revised_text,
        "corrections": all_corrections
    }

In [11]:
# Demo cell: run the pipeline on a short sample and pretty-print the result
# as JSON.
# NOTE(review): main_gfl is not defined in the cells shown here (only
# main_gfl_with_pronouns is) — presumably it comes from another cell; confirm.
text = """Mr. Smith is here. He is a great teacher. Mrs. Johnson and she loves to bake."""
output = main_gfl(text)
print(json.dumps(output, indent=4))

{
    "original_text": "Mr. Smith is here. He is a great teacher. Mrs. Johnson and she loves to bake.",
    "revised_text": "Mr. Smith is here . They is a great teacher . Mrs. Johnson and they loves to bake .",
    "corrections": [
        {
            "word_index": 5,
            "original_text": "He",
            "replacements": "They",
            "character_offset": 20,
            "character_endset": 24
        },
        {
            "word_index": 15,
            "original_text": "she",
            "replacements": "they",
            "character_offset": 63,
            "character_endset": 67
        }
    ]
}
