In [None]:
import pandas as pd
import spacy
from fuzzywuzzy import fuzz

# Load the small English spaCy pipeline used for lemmatization
nlp = spacy.load("en_core_web_sm")

# Read the taxonomy reference sheet
taxonomy_df = pd.read_excel('Task1.xlsx', sheet_name='Taxonomy')

# Turn each taxonomy column into a plain list with empty cells dropped.
# NOTE: the 'Symptom Condition ' column is looked up with its trailing space.
(
    root_causes,
    symptom_conditions,
    symptom_components,
    fix_conditions,
    fix_components,
) = (
    taxonomy_df[column].dropna().tolist()
    for column in (
        'Root Cause',
        'Symptom Condition ',
        'Symptom Component',
        'Fix Condition',
        'Fix Component',
    )
)

def get_lemmas(text):
    """Return the lemma of every token spaCy finds in the lower-cased text."""
    lemmas = []
    for token in nlp(text.lower()):
        lemmas.append(token.lemma_)
    return lemmas

def lemmatize_term(term):
    """Return the phrase as space-joined lemmas.

    Hyphens are replaced by spaces and the phrase is lower-cased before it
    is handed to spaCy.
    """
    normalized = term.replace("-", " ").lower()
    return " ".join(token.lemma_ for token in nlp(normalized))

# Lemmatize every taxonomy entry once up front; each *_lem list is
# index-aligned with the list of original labels it was built from.
root_causes_lem = list(map(lemmatize_term, root_causes))
symptom_conditions_lem = list(map(lemmatize_term, symptom_conditions))
symptom_components_lem = list(map(lemmatize_term, symptom_components))
fix_conditions_lem = list(map(lemmatize_term, fix_conditions))
fix_components_lem = list(map(lemmatize_term, fix_components))


def is_match(term_lem, combined_lemmas, combined_text, threshold=70):
    """Decide whether a lemmatized taxonomy term occurs in a piece of text.

    A term without a space must appear verbatim among the text's lemmas.
    A term containing a space is scored against the whole lemmatized text
    with ``fuzz.token_set_ratio``.

    Args:
        term_lem: Lemmatized taxonomy term.
        combined_lemmas: Lemmas of the text being searched.
        combined_text: The text the fuzzy score is computed against; callers
            pass the same lemmas joined by single spaces.
        threshold: Score a multi-word term has to exceed (strictly) to count
            as a match. Defaults to 70, the cutoff that was previously
            hard-coded.

    Returns:
        True if the term is considered present, otherwise False.
    """
    if ' ' not in term_lem:
        # Single word: exact lemma membership, no fuzzy scoring.
        return term_lem in combined_lemmas
    return fuzz.token_set_ratio(term_lem, combined_text) > threshold

def extract_matches(row, targets_lem, labels, max_items=3):
    """Extract up to max_items taxonomy labels that match the row's text.

    The "Complaint", "Cause" and "Correction" fields are joined with single
    spaces and lemmatized. Terms are tested in the order given; a term is
    skipped when every one of its tokens already belongs to a previously
    accepted term.

    Args:
        row: Mapping or Series with "Complaint", "Cause" and "Correction".
        targets_lem: Lemmatized taxonomy terms.
        labels: Original labels, index-aligned with ``targets_lem``.
        max_items: Number of slots in the result.

    Returns:
        A ``pd.Series`` of length ``max_items`` holding the accepted labels,
        padded with ``None``.
    """
    # Empty Excel cells arrive as NaN/None; treat them as empty text instead
    # of letting string concatenation raise TypeError.
    parts = (row[field] for field in ("Complaint", "Cause", "Correction"))
    text = " ".join("" if pd.isna(part) else str(part) for part in parts)

    found = []
    # Nothing can match blank text, so skip the NLP pass entirely.
    if max_items > 0 and text.strip():
        combined_lemmas = get_lemmas(text)
        combined_text = " ".join(combined_lemmas)
        covered_tokens = set()

        for target_lem, label in zip(targets_lem, labels):
            if not is_match(target_lem, combined_lemmas, combined_text):
                continue
            target_tokens = set(target_lem.split())
            if target_tokens.issubset(covered_tokens):
                continue
            found.append(label)
            covered_tokens.update(target_tokens)
            if len(found) >= max_items:
                break

    found += [None] * (max_items - len(found))
    return pd.Series(found)

def detect_root_cause(row):
    """Detect the root cause from the row's 'Cause' field.

    Args:
        row: Mapping or Series with a "Cause" entry.

    Returns:
        The first label in ``root_causes`` whose lemmatized form matches the
        cause text, or "Not Mentioned" when nothing matches or the field is
        missing or blank.
    """
    text = row["Cause"]
    # An empty Excel cell arrives as NaN/None, which has no .lower();
    # report it as "Not Mentioned" instead of crashing.
    if pd.isna(text):
        return "Not Mentioned"
    text = str(text)
    if not text.strip():
        return "Not Mentioned"

    combined_lemmas = get_lemmas(text)
    combined_text = " ".join(combined_lemmas)
    for label, rc_lem in zip(root_causes, root_causes_lem):
        if is_match(rc_lem, combined_lemmas, combined_text):
            return label
    return "Not Mentioned"

def extract_fix_components_prioritized(row, max_items=3):
    """Extract fix components, giving priority to the row's symptom components.

    Slot ``i`` is first filled with "Symptom Component {i+1}" when that value
    is also a known fix component. Slots still empty are then filled, in
    taxonomy order, with fix components matched in the combined
    "Complaint" / "Cause" / "Correction" text.

    Args:
        row: Mapping or Series with "Complaint", "Cause", "Correction" and,
            optionally, "Symptom Component 1..max_items".
        max_items: Number of slots in the result.

    Returns:
        A ``pd.Series`` of length ``max_items``; unfilled slots are ``None``.
    """
    result = [None] * max_items
    used_components = set()

    # Priority matching using symptom components
    for i in range(max_items):
        symptom_component = row.get(f"Symptom Component {i+1}")
        if (
            isinstance(symptom_component, str)
            and symptom_component
            and symptom_component in fix_components
        ):
            result[i] = symptom_component
            used_components.add(symptom_component)

    # Empty Excel cells arrive as NaN/None; treat them as empty text instead
    # of letting string concatenation raise TypeError.
    parts = (row[field] for field in ("Complaint", "Cause", "Correction"))
    text = " ".join("" if pd.isna(part) else str(part) for part in parts)

    open_slots = [i for i, value in enumerate(result) if value is None]
    # Nothing left to fill, or nothing to search in: skip the NLP pass.
    if not open_slots or not text.strip():
        return pd.Series(result)

    combined_lemmas = get_lemmas(text)
    combined_text = " ".join(combined_lemmas)

    # Fill the remaining positions from matches in the full text
    for target_lem, label in zip(fix_components_lem, fix_components):
        if not open_slots:
            break
        # Checking used_components here also keeps a label that is repeated
        # in the taxonomy from occupying more than one slot.
        if label in used_components:
            continue
        if not is_match(target_lem, combined_lemmas, combined_text):
            continue
        result[open_slots.pop(0)] = label
        used_components.add(label)

    return pd.Series(result)

def align_conditions(df, condition_prefix, component_prefix):
    """Back-fill missing condition cells for slots 2 and 3.

    For each row, when slot N has a component but no condition, the condition
    is copied from the nearest lower-numbered slot that has one. The frame is
    modified in place and also returned.
    """
    for row_label, record in df.iterrows():
        for slot in (2, 3):
            target_col = f"{condition_prefix} {slot}"
            has_component = pd.notna(record[f"{component_prefix} {slot}"])
            if not has_component or pd.notna(record[target_col]):
                continue
            # Walk back through the earlier slots, nearest first.
            earlier_values = (
                record[f"{condition_prefix} {previous}"]
                for previous in range(slot - 1, 0, -1)
            )
            inherited = next(
                (value for value in earlier_values if pd.notna(value)), None)
            if inherited is not None:
                df.at[row_label, target_col] = inherited
    return df

# Read the task rows and reduce each order timestamp to its calendar date
task_df = pd.read_excel('Task1.xlsx', sheet_name='Task')
task_df["Order Date"] = task_df["Order Date"].dt.date

# Root cause is derived from the 'Cause' text of each row
task_df["Root Cause"] = task_df.apply(detect_root_cause, axis=1)

# Three slots each for symptom conditions, symptom components and fix
# conditions; the taxonomy lists are forwarded to extract_matches via args.
task_df[["Symptom Condition 1", "Symptom Condition 2", "Symptom Condition 3"]] = task_df.apply(
    extract_matches, axis=1, args=(symptom_conditions_lem, symptom_conditions))

task_df[["Symptom Component 1", "Symptom Component 2", "Symptom Component 3"]] = task_df.apply(
    extract_matches, axis=1, args=(symptom_components_lem, symptom_components))

task_df[["Fix Condition 1", "Fix Condition 2", "Fix Condition 3"]] = task_df.apply(
    extract_matches, axis=1, args=(fix_conditions_lem, fix_conditions))

# Fix components are tagged last because they read the
# "Symptom Component N" columns written above.
task_df[["Fix Component 1", "Fix Component 2", "Fix Component 3"]] = task_df.apply(
    extract_fix_components_prioritized, axis=1)

# Give every tagged component a condition where one is missing
task_df = align_conditions(task_df, "Symptom Condition", "Symptom Component")
task_df = align_conditions(task_df, "Fix Condition", "Fix Component")

# Source columns, then the root cause, then condition/component pairs per
# slot: all symptom pairs first, followed by all fix pairs.
output_columns = [
    "Primary Key", "Order Date", "Product Category", "Complaint", "Cause", "Correction",
    "Root Cause",
] + [
    f"{stage} {kind} {slot}"
    for stage in ("Symptom", "Fix")
    for slot in (1, 2, 3)
    for kind in ("Condition", "Component")
]

final_df = task_df[output_columns]
final_df.to_excel("first_task.xlsx", index=False)

Original: we had RETIGHTEN ALL P CLIPS, NUTS, AND BOLTS AS NECESSARYALSO FOUND TWO BULKHEAD CONNECTORS, NOT FULLY LOCKED LEFT REAR OF MACHINE AND ONE ON RIGHT BOOMRUNNING TESTS NO FAULT
Corrected: We had TIGHTEN ALL P CLIPS, NUTS, AND BOLTS AS NECESSARIANS FOUND TWO BULKHEAD CONNECTORS, NOT FULLY LOCKED LEFT REAR OF MACHINE AND ONE ON RIGHT OUTRUNNING TESTS NO FAULT
