In [4]:
# !python -m spacy download en_core_web_sm
# !pip install rapidfuzz

In [None]:
import pandas as pd
import spacy
from collections import Counter, defaultdict
from rapidfuzz import fuzz

# -------------------- 1. Load Final Labelled Queries -------------------- #
def load_final_labelled_queries(filepath):
    df = pd.read_csv(filepath)
    if 'cleaned_query' not in df.columns or 'topic_label_final' not in df.columns:
        raise ValueError("Missing required columns in uploaded file.")
    return df

# -------------------- 2. Extract Entities -------------------- #
def extract_entities(df):
    nlp = spacy.load("en_core_web_sm")

    # Define domain-specific keyword buckets
    CUSTOM_BUCKETS = {
    "payment_method": ["credit card", "debit card", "paypal", "net banking", "upi", "wallet", "cash", "cod", "bank transfer"],
    "delivery_option": ["standard delivery", "express delivery", "pickup", "home delivery", "shipping method", "delivery time", "eta", "estimated delivery"],
    "order_reference": ["cancel order", "order status", "track order", "modify order"],
    "invoice_reference": ["invoice", "invoice number", "last invoice"],
    "account_action": ["account deletion", "account recovery", "account creation", "edit profile", "reset password", "register"],
    "refund_request": ["refund request", "refund policy", "cancellation policy", "cancelled", "cancellation fee"],
    "date_reference": ["last month", "January", "august", "months ago", "days ago"],
    "recipient_person": ["my mom", "my wife", "my dad", "my daughter"],
    "support_channel": ["agent", "customer support", "live chat", "human agent", "talk to human"],
    "newsletter_action": ["newsletter", "subscribe", "unsubscribe", "mailing list"],
    "purchase_help": ["buy", "purchase", "payment failed", "transaction error", "cannot pay"],
    "general_complaint": ["problem", "issue", "report", "complaint", "feedback", "claim", "escalate"],
    # additional refined buckets as needed
}


    FUZZY_MATCH_THRESHOLD = 70
    ent_counter = Counter()
    ent_to_rows = defaultdict(set)
    custom_counter = defaultdict(Counter)
    custom_to_rows = defaultdict(lambda: defaultdict(set))
    rows_with_entity = set()
    entity_list = set()
    entity_type_list = set()

    for idx, query in enumerate(df['cleaned_query']):
        query_lower = query.lower()
        doc = nlp(query_lower)
        found_entity = False

        for ent in doc.ents:
            key = (ent.label_, ent.text.lower())
            ent_counter[key] += 1
            ent_to_rows[key].add(idx + 2)
            entity_list.add(ent.text.lower())
            entity_type_list.add(ent.label_)
            found_entity = True

        for bucket, keywords in CUSTOM_BUCKETS.items():
            for kw in keywords:
                score = fuzz.partial_ratio(query_lower, kw.lower())
                if score >= FUZZY_MATCH_THRESHOLD:
                    custom_counter[bucket][kw.lower()] += 1
                    custom_to_rows[bucket][kw.lower()].add(idx + 2)
                    entity_list.add(kw.lower())
                    entity_type_list.add(bucket)
                    found_entity = True

        if found_entity:
            rows_with_entity.add(idx)

    # Convert counters to DataFrames
    spacy_df = pd.DataFrame([
        (etype, evalue, count, sorted(list(ent_to_rows[(etype, evalue)])))
        for (etype, evalue), count in ent_counter.items()
    ], columns=["Entity Type", "Entity Value", "Frequency", "Query Rows"])

    custom_dfs = []
    for bucket, c in custom_counter.items():
        df_bucket = pd.DataFrame([
            (bucket, val, count, sorted(list(custom_to_rows[bucket][val])))
            for val, count in c.items()
        ], columns=["Entity Type", "Entity Value", "Frequency", "Query Rows"])
        custom_dfs.append(df_bucket)

    full_entity_df = pd.concat([spacy_df] + custom_dfs, ignore_index=True)
    full_entity_df = full_entity_df.sort_values("Frequency", ascending=False)
    full_entity_df.to_csv("identified_entities.csv", index=False)

    # Save rows with no entities
    all_indices = set(range(len(df)))
    no_entity_indices = sorted(list(all_indices - rows_with_entity))
    no_entity_df = df.iloc[no_entity_indices]
    no_entity_df.to_csv("queries_with_no_entities.csv", index=False)

    print("\n🔍 List of unique extracted entity values:")
    print(sorted(entity_list))
    print("\n📘 List of unique entity types:")
    print(sorted(entity_type_list))

    return full_entity_df

# -------------------- 3. Run Full Pipeline -------------------- #
def run_step_4_pipeline():
    df = load_final_labelled_queries("clustered_labelled_queries_final.csv")
    entity_df = extract_entities(df)
    print("\n✅ Entities extracted and saved to identified_entities.csv")
    print("📄 Queries with no entities saved to queries_with_no_entities.csv")
    return entity_df

# -------------------- 4. Execute -------------------- #
if __name__ == "__main__":
    entity_df = run_step_4_pipeline()



🔍 List of unique extracted entity values:
['1', '1 months ago', '1 purchases ago', '10 months ago', '10 purchases ago', '11', '11 months ago', '2', '2 months ago', '3', '3 purchases ago', '4 purchases', '5', '6 months ago', '6 purchases', '7 months ago', '8 months ago', '9 months ago', '9 purchases ago', 'account', 'agent', 'april', 'assistant', 'august', 'buy', 'cancel order', 'cancellation', 'cancellation policy', 'cancelled', 'cannot pay', 'cash', 'chargeback', 'chatbot', 'claim', 'cod', 'complaint', 'credit card', 'customer support', 'debit card', 'december', 'delayed', 'delete account', 'delivered', 'delivery time', 'edit profile', 'eight months ago', 'eleven months ago', 'email updates', 'estimated delivery', 'eta', 'express delivery', 'february', 'fee', 'feedback', 'five purchases ago', 'forgot password', 'forgot pin', 'four months ago', 'home delivery', 'hours', 'human agent', 'invoice', 'issue', 'item', 'january', 'july', 'june', 'last month', 'login', 'mailing list', 'need h

In [6]:
print("\n📊 Entity DataFrame:")
entity_df  # Display first 10 rows of the entity DataFrame




📊 Entity DataFrame:


Unnamed: 0,Entity Type,Entity Value,Frequency,Query Rows
48,product,order,889,"[3, 4, 23, 25, 26, 27, 29, 30, 32, 36, 49, 51,..."
46,product,account,786,"[2, 37, 66, 67, 68, 69, 70, 71, 72, 73, 91, 11..."
91,support_term,assistant,731,"[18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 2..."
50,product,invoice,645,"[8, 9, 10, 11, 44, 48, 57, 67, 79, 80, 85, 86,..."
70,refund_term,refunded,605,"[6, 7, 12, 13, 14, 15, 16, 39, 40, 43, 50, 52,..."
...,...,...,...,...
43,DATE,two months ago,1,[4441]
19,DATE,7 months ago,1,[493]
15,MONEY,3 purchases ago,1,[480]
79,newsletter,email updates,1,[7]
