In [7]:
# !python -m spacy download en_core_web_sm
# !pip install rapidfuzz


In [None]:
import pandas as pd
import spacy
from collections import Counter, defaultdict
from rapidfuzz import fuzz

# -------------------- 1. Load Final Labelled Queries -------------------- #
def load_final_labelled_queries(filepath):
    """Read the labelled-queries CSV and check that it has the expected columns.

    Args:
        filepath: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame, unmodified.

    Raises:
        ValueError: If ``cleaned_query`` or ``topic_label_final`` is not a column.
    """
    required_columns = ('cleaned_query', 'topic_label_final')
    queries = pd.read_csv(filepath)
    has_all_required = all(name in queries.columns for name in required_columns)
    if not has_all_required:
        raise ValueError("Missing required columns in uploaded file.")
    return queries

# -------------------- 2. Extract Entities -------------------- #
def extract_entities(
    df,
    fuzzy_threshold=70,
    entities_path="identified_entities.csv",
    no_entities_path="queries_with_no_entities.csv",
):
    """Extract spaCy named entities and fuzzy keyword-bucket matches from queries.

    Each value of ``df['cleaned_query']`` is lowercased and then:

    * run through the ``en_core_web_sm`` spaCy pipeline, recording every
      ``doc.ents`` entry as an ``(entity label, entity text)`` pair;
    * compared against every keyword of every custom bucket with
      ``fuzz.partial_ratio``; a score ``>= fuzzy_threshold`` counts as a match
      for that ``(bucket, keyword)`` pair.

    Side effects: writes the entity summary to ``entities_path``, writes the
    rows of ``df`` that produced no entity at all to ``no_entities_path``, and
    prints the sorted unique entity values and entity types.

    Args:
        df: DataFrame with a ``cleaned_query`` column.
        fuzzy_threshold: Minimum ``partial_ratio`` score for a keyword match.
        entities_path: Destination CSV for the entity summary.
        no_entities_path: Destination CSV for queries without any entity.

    Returns:
        DataFrame with columns ``Entity Type``, ``Entity Value``, ``Frequency``
        and ``Query Rows``, sorted by ``Frequency`` in descending order.
    """
    nlp = spacy.load("en_core_web_sm")

    # Define domain-specific keyword buckets
    CUSTOM_BUCKETS = {
        # Account-related
        "account_creation": [
            "create account", "open account", "create user account", "create online account",
            "open new account", "sign up", "register account", "create profile"
        ],
        "account_switching": [
            "switch account", "switch user", "use another account", "change to different account",
            "switch profile", "use different profile"
        ],
        "account_deletion": [
            "delete account", "close account", "terminate account", "cancel account", "remove account"
        ],
        "account_recovery": [
            "recover account", "recover password", "reset password", "forgot password", "retrieve pin",
            "reset pin", "recover key", "password reset"
        ],
        "account_edit": [
            "edit account", "update account", "change account info", "modify account details",
            "correct personal data", "update personal info"
        ],
        "user_profile_edit": [
            "edit profile", "update profile", "change profile info", "modify user profile"
        ],

        # Order-related
        "order_creation_purchase": [
            "make purchase", "buy item", "place order", "purchase item", "order item"
        ],
        "order_modification": [
            "change order", "modify order", "edit order", "add item to order", "remove item from order"
        ],
        "order_cancellation": [
            "cancel order", "order cancellation", "stop order", "terminate order"
        ],
        "order_tracking": [
            "track order", "order status", "check order status", "order eta"
        ],

        # Delivery-related
        "delivery_options": [
            "delivery options", "shipping options", "available delivery methods", "change delivery option"
        ],
        "delivery_address": [
            "change shipping address", "update delivery address", "edit shipping address", "correct address"
        ],
        "delivery_eta": [
            "delivery time", "estimated delivery", "when will item arrive", "delivery date", "how long delivery takes"
        ],

        # Payment-related
        "payment_methods": [
            "payment methods", "payment options", "accepted payment", "list payment methods"
        ],
        "payment_issues": [
            "payment failed", "cannot pay", "payment error", "payment problem", "transaction failed"
        ],

        # Refund-related
        "refund_request": [
            "request refund", "get refund", "ask for refund", "money back", "refund of money"
        ],
        "refund_status": [
            "refund status", "check refund", "track refund", "refund progress", "refund update"
        ],
        "refund_policy": [
            "refund policy", "return policy", "money back policy", "refund process", "return process"
        ],

        # Complaints & feedback
        "feedback_submission": [
            "leave review", "leave feedback", "submit feedback", "submit review", "give feedback"
        ],
        "complaint_submission": [
            "file complaint", "lodge complaint", "customer complaint", "submit claim", "make claim"
        ],

        # Support interactions
        "human_agent_request": [
            "speak to agent", "talk to representative", "chat with operator", "contact human agent"
        ],
        "customer_service_contact": [
            "contact customer service", "customer service number", "reach customer service", "customer support"
        ],
        "support_hours": [
            "customer service hours", "when is support available", "support availability"
        ],

        # Newsletter-related
        "newsletter_subscribe": [
            "subscribe newsletter", "sign up newsletter", "join mailing list"
        ],
        "newsletter_unsubscribe": [
            "unsubscribe newsletter", "cancel newsletter subscription", "stop receiving newsletter"
        ],
        "newsletter_status": [
            "newsletter status", "check newsletter subscription"
        ],

        # Invoice-related
        "invoice_request": [
            "get invoice", "request invoice", "find invoice", "retrieve invoice"
        ],
        "invoice_history": [
            "past invoices", "previous invoice", "invoice from last month", "old invoice"
        ],

        # Meta / instruction
        "instruction_request": [
            "how to", "can i", "could you", "i need to know", "tell me how"
        ]
    }

    # Lowercase every keyword once instead of once per (query, keyword) pair.
    bucket_keywords = {
        bucket: [kw.lower() for kw in keywords]
        for bucket, keywords in CUSTOM_BUCKETS.items()
    }

    ent_counter = Counter()                                  # (label, text) -> occurrences
    ent_to_rows = defaultdict(set)                           # (label, text) -> row numbers
    custom_counter = defaultdict(Counter)                    # bucket -> keyword -> matches
    custom_to_rows = defaultdict(lambda: defaultdict(set))   # bucket -> keyword -> row numbers
    rows_with_entity = set()                                 # 0-based positions in df
    entity_values = set()
    entity_types = set()

    for idx, query in enumerate(df['cleaned_query']):
        if not isinstance(query, str):
            if pd.isna(query):
                # Empty CSV cells load as NaN; there is nothing to extract, so
                # the row is left to fall into the "no entities" output.
                continue
            query = str(query)

        query_lower = query.lower()
        # NOTE(review): +2 presumably turns the 0-based position into the line
        # number in the source CSV (1-based plus a header row) -- confirm.
        row_number = idx + 2
        doc = nlp(query_lower)
        found_entity = False

        for ent in doc.ents:
            key = (ent.label_, ent.text.lower())
            ent_counter[key] += 1
            ent_to_rows[key].add(row_number)
            entity_values.add(key[1])
            entity_types.add(ent.label_)
            found_entity = True

        for bucket, keywords in bucket_keywords.items():
            for kw in keywords:
                if fuzz.partial_ratio(query_lower, kw) >= fuzzy_threshold:
                    custom_counter[bucket][kw] += 1
                    custom_to_rows[bucket][kw].add(row_number)
                    entity_values.add(kw)
                    entity_types.add(bucket)
                    found_entity = True

        if found_entity:
            rows_with_entity.add(idx)

    # Build the summary from a single record list (spaCy entities first, then
    # bucket matches) rather than concatenating one small frame per bucket.
    records = [
        (etype, evalue, count, sorted(ent_to_rows[(etype, evalue)]))
        for (etype, evalue), count in ent_counter.items()
    ]
    records.extend(
        (bucket, value, count, sorted(custom_to_rows[bucket][value]))
        for bucket, counts in custom_counter.items()
        for value, count in counts.items()
    )
    full_entity_df = pd.DataFrame(
        records, columns=["Entity Type", "Entity Value", "Frequency", "Query Rows"]
    )
    full_entity_df = full_entity_df.sort_values("Frequency", ascending=False)
    full_entity_df.to_csv(entities_path, index=False)

    # Save rows with no entities (positions are 0-based, hence iloc)
    no_entity_positions = [
        position for position in range(len(df)) if position not in rows_with_entity
    ]
    df.iloc[no_entity_positions].to_csv(no_entities_path, index=False)

    print("\n🔍 List of unique extracted entity values:")
    print(sorted(entity_values))
    print("\n📘 List of unique entity types:")
    print(sorted(entity_types))

    return full_entity_df

# -------------------- 3. Run Full Pipeline -------------------- #
def run_step_4_pipeline():
    """Load the final labelled queries, extract entities, and report the outputs.

    Returns:
        The entity DataFrame produced by ``extract_entities``.
    """
    labelled_queries = load_final_labelled_queries("clustered_labelled_queries_final.csv")
    extracted = extract_entities(labelled_queries)
    status_messages = (
        "\n✅ Entities extracted and saved to identified_entities.csv",
        "📄 Queries with no entities saved to queries_with_no_entities.csv",
    )
    for message in status_messages:
        print(message)
    return extracted

# -------------------- 4. Execute -------------------- #
# Run the whole step when this cell/script is executed directly. The result is
# bound to the module-level name `entity_df`, which the next cell displays.
if __name__ == "__main__":
    entity_df = run_step_4_pipeline()



🔍 List of unique extracted entity values:
['1', '1 months ago', '1 purchases ago', '10 months ago', '10 purchases ago', '11', '11 months ago', '2', '2 months ago', '3', '3 purchases ago', '4 purchases', '5', '6 months ago', '6 purchases', '7 months ago', '8 months ago', '9 months ago', '9 purchases ago', 'accepted payment', 'add item to order', 'april', 'ask for refund', 'august', 'available delivery methods', 'buy item', 'can i', 'cancel account', 'cancel newsletter subscription', 'cancel order', 'cannot pay', 'change account info', 'change delivery option', 'change order', 'change shipping address', 'change to different account', 'chat with operator', 'check newsletter subscription', 'check order status', 'check refund', 'close account', 'complaint', 'contact customer service', 'contact human agent', 'correct address', 'correct personal data', 'could you', 'create account', 'create online account', 'create profile', 'create user account', 'customer complaint', 'customer service hour

In [9]:
print("\n📊 Entity DataFrame:")
entity_df  # Rich display of the whole entity DataFrame (pandas abbreviates long frames to head/tail rows)




📊 Entity DataFrame:


Unnamed: 0,Entity Type,Entity Value,Frequency,Query Rows
65,instruction_request,can i,1281,"[2, 19, 42, 57, 58, 107, 108, 134, 136, 148, 1..."
66,instruction_request,how to,803,"[39, 40, 41, 42, 131, 132, 133, 134, 256, 257,..."
120,invoice_request,get invoice,508,"[44, 48, 79, 80, 85, 116, 144, 145, 166, 167, ..."
46,account_creation,open account,448,"[2, 37, 68, 69, 70, 71, 72, 73, 91, 114, 123, ..."
89,refund_request,get refund,444,"[13, 14, 15, 16, 39, 40, 43, 50, 52, 53, 81, 8..."
...,...,...,...,...
143,refund_policy,return process,1,[5812]
142,refund_policy,refund process,1,[5812]
138,refund_status,refund progress,1,[5812]
164,payment_issues,payment failed,1,[5493]
