In [18]:
import re
import spacy

In [19]:
# Load a pre-trained spaCy model
# If you don't have it, run: python -m spacy download en_core_web_sm
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model 'en_core_web_sm'...")

In [20]:
equation_list = [
    "(current month-end discount factor / prior month-end discount factor - 1) * 12",
    "(1 + monthly rate) ^ 12",
    "(Prior Month Ending Loan Count) * Gross Default SMM",
    "(1 - (1 - Curtail SMM)^12)",
    "WAC - (Note Rate - Coupon Rate)",
    "Future Non-Earning FCL Completion Amount * (Foreclosure Convery % + Foreclosure Deed in Lieu %) * (Presale Months + Postsale Months)",
    "Net Default Count * (1-(Recovery% + Payoff% + Modification%))",
    "Foreclosure Count *(1-On-Time Refereral%)",
    "this is pure description, no math operators",
    "this is pure description, no math operators, for month-end",
]

In [21]:
def extract_variables_regex(equation):
    """
    Extracts variables from an equation string using regular expressions.
    Returns an empty list if no math operators are found.
    """
    if not re.search(r"[\+\-\*/\^\(\)]", equation):
        return []
        
    equation_with_commas = re.sub(r"[\+\*/\^\(\)]", ',', equation)
    final_comma_separated = re.sub(r'(?<=[\s,])-(?=[\s,])|(?<=[\s,])-|-(?=[\s,])', ',', equation_with_commas)

    potential_vars = final_comma_separated.split(',')
    
    variables = []
    for var in potential_vars:
        cleaned_var = var.strip()
        if cleaned_var and not cleaned_var.isnumeric():
            variables.append(cleaned_var)
            
    return variables

In [22]:
def extract_variables_spacy(equation):
    """
    Extracts variables from an equation string using spaCy's noun chunking.
    """
    # Replace operators and numbers with commas to act as delimiters
    cleaned_equation = re.sub(r'[\(\)\/\*\+\^0-9]|\s-\s', ',', equation)
    # Collapse multiple commas into a single one
    cleaned_equation = re.sub(r',+', ',', cleaned_equation)
    
    doc = nlp(cleaned_equation)
    # Extract noun phrases and clean up any extra whitespace or commas
    variables = [chunk.text.strip(" ,") for chunk in doc.noun_chunks if chunk.text.strip(" ,")]
    return variables

In [23]:
def extract_variables_combined(equation):
    """
    Combines and validates results from regex and spaCy methods to find the best variable list.
    - Identifies high-confidence variables found by both methods.
    - Validates unique variables from each method against the high-confidence list to avoid partial matches.
    - Merges the validated lists to produce a comprehensive result.
    """
    # Regex check for operators is a good guard clause
    if not re.search(r"[\+\-\*/\^\(\)]", equation):
        return "Description (not an equation)"

    # Extract variables using both methods
    spacy_vars = set(extract_variables_spacy(equation))
    regex_vars = set(extract_variables_regex(equation))
    
    # Find high-confidence variables (intersection)
    high_confidence_vars = spacy_vars.intersection(regex_vars)
    
    # Find unique variables for each method
    unique_spacy_vars = spacy_vars - regex_vars
    unique_regex_vars = regex_vars - spacy_vars
    
    final_vars = set(high_confidence_vars)
    
    # Validate unique spaCy variables
    for svar in unique_spacy_vars:
        # Add if it's not a substring of a high-confidence variable
        if not any(svar in hvar for hvar in high_confidence_vars):
            final_vars.add(svar)
            
    # Validate unique regex variables
    for rvar in unique_regex_vars:
        # Add if it's not a substring of a high-confidence or already added spaCy variable
        if not any(rvar in fvar for fvar in final_vars):
            final_vars.add(rvar)
            
    return list(final_vars)

# Process each equation and print the results
for eq in equation_list:
    print(f"Equation: {eq}")
    variables = extract_variables_combined(eq)
    print(f"Variables: {variables}\n")

Equation: (current month-end discount factor / prior month-end discount factor - 1) * 12
Variables: ['current month-end discount factor', 'prior month-end discount factor']

Equation: (1 + monthly rate) ^ 12
Variables: ['monthly rate']

Equation: (Prior Month Ending Loan Count) * Gross Default SMM
Variables: ['Gross Default SMM', 'Prior Month Ending Loan Count']

Equation: (1 - (1 - Curtail SMM)^12)
Variables: ['Curtail SMM']

Equation: WAC - (Note Rate - Coupon Rate)
Variables: ['Note Rate', 'Coupon Rate', 'WAC']

Equation: Future Non-Earning FCL Completion Amount * (Foreclosure Convery % + Foreclosure Deed in Lieu %) * (Presale Months + Postsale Months)
Variables: ['Postsale Months', 'Foreclosure Deed in Lieu %', 'Foreclosure Deed', 'Future Non-Earning FCL Completion Amount', 'Lieu', 'Foreclosure Convery %', 'Presale Months']

Equation: Net Default Count * (1-(Recovery% + Payoff% + Modification%))
Variables: ['Modification%', 'Payoff%', 'Net Default Count', '-,Recovery%']

Equation: 