## Phase 2 – Intent Mapping

### Step 1: Load Cleaned Data

We start Phase 2 by importing the pre-processed, version-controlled Amazon Reviews dataset created in Phase 1.
This ensures all downstream intent mapping uses consistent, normalized data.

- **File:** `chatbot_amazon_cleaned_v1.csv`


In [None]:
import os
import sys

# Put the notebook's parent directory on sys.path before the project imports
# below run (presumably that is where `utils` and `notebook_setup` live —
# confirm against the repository layout).
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)
    
from utils.path_config import get_data_path
from utils.text_cleaning import clean_review_df
from utils.data_saving import save_with_rolling_backup
# NOTE(review): a later cell defines its own inspect_potential_fp, which
# shadows the one imported here.
from utils.intent_mapping import apply_intent_mapping, inspect_potential_fp
import notebook_setup

# NOTE(review): os and sys are already imported at the top of this cell;
# re-importing them is harmless but redundant.
import os
import sys

# Same sys.path treatment for ../models, ahead of the `shared` import below.
models_path = os.path.abspath("../models")
if models_path not in sys.path:
    sys.path.append(models_path)

from shared import load_and_prepare_data, split_data, get_label_names


In [7]:
import notebook_setup

Project root set to: D:\0) Abhay\04) SRH University Study Docs\Advance Programming\Python Files\Case Study Files
sys.path updated, project-level imports ready.


In [20]:
import pandas as pd
import os
import re

from utils.path_config import get_data_path
from utils.text_cleaning import clean_review_df
from utils.data_saving import save_with_rolling_backup

In [9]:
# Phase 2, Step 1: Import Cleaned Data
# Load the cleaned Amazon reviews dataset from Phase 1.
# get_data_path is imported from utils.path_config; presumably it resolves the
# file name against the project's data directory — confirm in that module.
df = pd.read_csv(get_data_path("chatbot_amazon_cleaned_v1.csv"))

# Quick check: columns and shape
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)

# Display a preview of the first few rows (kept as the last expression so the
# notebook renders it as the cell output)
df.head()


Columns: ['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewText', 'summary', 'unixReviewTime', 'category', 'reviewText_imputed']
Shape: (75000, 10)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewText,summary,unixReviewTime,category,reviewText_imputed
0,1.0,True,"04 10, 2017",A26PO1B2Q2G1CS,014789302X,I didn't like this product it smudged all unde...,One Star,1491782400,Beauty,False
1,5.0,True,"09 14, 2010",A3BEXXPM77KR5Z,1620213982,This product is great!! I nvr thought it would...,Great product,1284422400,Beauty,False
2,4.0,True,"04 17, 2018",A2XBDWQ3Z8GZM,1620213982,Works great.,Four Stars,1523923200,Beauty,False
3,5.0,True,"03 15, 2018",A2RL2YV966PEF8,1620213982,Very nice quality. Highly recommend.,Quality piece,1521072000,Beauty,False
4,2.0,True,"02 7, 2018",AHEV0FVXWXHS3,1620213982,It's a nice stand. Well built and good foundat...,It's a nice stand. Well built and good foundation,1517961600,Beauty,False


### Step 2: Define Intent Mapping Keywords

We explicitly define our intent categories and associated keyword triggers as a Python dictionary.
This forms the basis of our rule-based intent mapping engine, ensuring consistency and maintainability.

- **Each intent:** Has a list of phrases or words likely to indicate that intent in user text.
- **Future:** This dictionary can be easily extended, merged, or re-mapped as project needs evolve.

In [None]:
# Phase 2, Step 2: Define Intent Mapping Keywords


# ---- INTENT KEYWORDS DICTIONARY ----
# Maps each intent name to the phrases that trigger it.
# Convention: every keyword is written in lowercase, because the matchers in
# this notebook compare keywords against lowercased text; a keyword containing
# an uppercase letter (e.g. "I need help") could never match.
# Insertion order matters only for readability; lookups are by key.
intent_keywords = {
    "PRODUCT_SEARCH": [
        "buy", "purchase", "order", "looking for", "find", "shop", "browse", "search"
    ],
    "VIEW_PRODUCT_DETAILS": [
        "detail", "feature", "specification", "info", "information",
        "specs", "dimensions", "product description", "technical info",
        "more about this", "full description", "material", "breakdown",
        "in-depth details", "weight", "size", "how big", "how small",
        "product data"
    ],
    "CHECK_SALE_ITEMS": [
        "sale", "deal", "discount", "promotion", "offer", "offers",
        "any discounts today", "running promotion", "seasonal sale",
        "weekly deal", "flash sale", "student discount", "today's offer",
        "coupon available", "redeem offer"
    ],
    "ADD_TO_CART": [
        "add to cart", "add this", "put in cart", "add item"
    ],
    "VIEW_CART": [
        "my cart", "shopping cart", "what's in cart", "view cart", "see my cart"
    ],
    "REMOVE_FROM_CART": [
        "remove from cart", "delete from cart", "take out", "remove item"
    ],
    "CLEAR_CART": [
        "empty cart", "clear cart", "remove all", "clear all"
    ],
    "TRACK_ORDER": [
        "track order", "shipment", "shipping status", "where is my order",
        "delivered", "not delivered", "late delivery", "track my item"
    ],
    "MULTI_ORDER_TRACK": [
        "track my orders", "multiple orders", "all orders", "past orders"
    ],
    "CANCEL_ORDER": [
        "cancel order", "abort order", "stop order", "don't want it anymore"
    ],
    "RETURN_ITEM": [
        "return", "refund", "exchange", "replace", "send back"
    ],
    "VIEW_RETURNS": [
        "returned", "returns history", "my returns"
    ],
    "CONNECT_TO_AGENT": [
        "support", "talk to human", "customer care", "help center", "agent", "representative",
        "contact support", "talk to someone", "live agent", "real person", "speak with support",
        "chat with human", "i need help", "connect me to someone", "get assistance"
    ],
    "FAQ_RETURN_POLICY": [
        "return policy", "how to return", "returns allowed", "can i return"
    ],
    "FAQ_SHIPPING_TIME": [
        "shipping time", "how long delivery", "delivery time", "when will it arrive", "expected delivery"
    ],
    "FAQ_COD": [
        "cash on delivery", "cod available", "pay on delivery", "cod option"
    ],
    "FAQ_PAYMENT_METHODS": [
        "payment option", "credit card", "debit card", "upi", "paypal", "apple pay", "how to pay"
    ],
    "PRODUCT_AVAILABILITY": [
        "in stock", "out of stock", "available", "currently available"
    ],
    "VIEW_RECOMMENDATIONS": [
        "recommend", "suggest for me", "similar products", "you may like"
    ],
    "PRODUCT_EXPLAINABILITY": [
        "why recommend", "explain recommendation", "why this", "based on what"
    ],
    "SENTIMENT_PRAISE": [
        "amazing", "fantastic", "excellent", "outstanding", "love it",
        "highly recommend", "best purchase", "would buy again",
        "great product", "superb", "really good"
    ],
    "SENTIMENT_COMPLAINT": [
        "broken", "worst", "never again", "not working",
        "pathetic", "useless", "hate", "regret", "terrible", "awful",
        "damaged", "late", "disappointed"
    ],
    "GENERIC_GREETING": [
        "hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening"
    ],
    "GOODBYE": [
        "bye", "goodbye", "see you", "thank you", "thanks", "see ya", "later"
    ],
    "OTHER": [
        "blablabla", "xyz product feedback", "what is this even", "not related to purchase",
        "your interface is weird", "i have a random question", "i want to report something else",
        "your name is funny", "do you like ice cream", "what's 2 + 2",
        "how's the weather", "nothing", "random message", "nonsense input",
        "banana", "i was just typing", "no idea", "whatever", "this is not helpful", "makes no sense"
    ]
}

print(f"{len(intent_keywords)} intent categories loaded.")


24 intent categories loaded.


### Step 3: Precision-First Rule-Based Intent Mapping

To maximize labeling reliability, we assign a rule-based intent *only* when exactly one intent is matched.
Unmatched texts are labeled 'OTHER' and texts that match more than one intent are labeled 'AMBIGUOUS'; both are left for downstream handling via ML or advanced logic.
This ensures high precision, reduces noise, and simplifies downstream model training and QA.


In [24]:
# Phase 2, Step 3: Baseline Rule-Based Intent Mapping (No Fuzzy Matching Yet)
import re

def is_greeting(text, greeting_keywords):
    """Return True when ``text`` opens with one of ``greeting_keywords``.

    A keyword only counts at the very start of the text (leading
    punctuation/whitespace is skipped) and must end on a word boundary, so
    "hi there" matches the keyword "hi" while "this is nice" and "highly"
    do not. Matching is case-insensitive.

    Args:
        text: Text to inspect. Non-string values never match.
        greeting_keywords: Iterable of greeting words or phrases.

    Returns:
        bool: True if the text starts with any greeting keyword.
    """
    if not isinstance(text, str):
        return False

    text_lc = text.lower().strip()
    for kw in greeting_keywords:
        # The text is lowercased, so the keyword has to be lowercased as well;
        # otherwise a keyword such as "Hello" could never match.
        kw_lc = kw.lower()
        if not kw_lc:
            # An empty keyword would reduce the pattern to "starts with a
            # word" and flag nearly every text as a greeting.
            continue
        # Only match at the very start of string (optionally allow punctuation/whitespace before)
        if re.match(rf"^(?:\W*){re.escape(kw_lc)}\b", text_lc):
            return True
    return False

def map_intent_conservative_contextual(text, intent_keywords, default_intent="OTHER", ambiguous_intent="AMBIGUOUS"):
    """Map ``text`` to an intent only when exactly one intent matches.

    GENERIC_GREETING is detected through ``is_greeting`` (keyword at the start
    of the text); every other intent matches when any of its keywords occurs
    as a substring of the lowercased text. Matching is case-insensitive.

    Args:
        text: Text to classify. Non-string values return ``default_intent``.
        intent_keywords: Dict mapping intent name -> list of keyword strings.
        default_intent: Label returned when no intent matches.
        ambiguous_intent: Label returned when more than one intent matches.

    Returns:
        str: The single matched intent, ``ambiguous_intent`` or
        ``default_intent``.
    """
    if not isinstance(text, str):
        return default_intent

    text_lc = text.lower().strip()
    matched = set()

    # Special handling for greeting: position-sensitive, so it gets its own check
    greeting_keywords = intent_keywords.get("GENERIC_GREETING", [])
    if is_greeting(text, greeting_keywords):
        matched.add("GENERIC_GREETING")

    # Handle all other intents (except GENERIC_GREETING) by substring search.
    # NOTE(review): substring search also fires inside longer words ("late" in
    # "plate", "upi" in "stupid") and on overlapping keywords ("return" vs
    # "returned"); word-boundary matching may be worth evaluating.
    for intent, keywords in intent_keywords.items():
        if intent == "GENERIC_GREETING":
            continue
        # Keywords are lowercased because the text is; empty keywords are
        # skipped since "" is a substring of every text.
        if any(kw and kw.lower() in text_lc for kw in keywords):
            matched.add(intent)

    if not matched:
        return default_intent
    if len(matched) > 1:
        return ambiguous_intent
    return next(iter(matched))


### Three-Tier, Confidence-Tagged Rule-Based Intent Assignment

We assign intent labels using a multi-stage process:
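- **Tier 1:** 'reviewText' and 'summary' are each mapped on their own. If both agree on the same intent, the label is tagged `tier1_both` (high confidence). If only one field yields an intent, it is tagged `tier1_review` or `tier1_summary`. If the two fields yield different intents, the row is labeled 'AMBIGUOUS' and tagged `tier1_conflict`.
- **Tier 2:** If neither field yields an intent, assign based on the merged text (`tier2_merged`, medium confidence).
- **Tier 3:** All remaining ambiguous or unassigned cases are labeled 'AMBIGUOUS' and routed for ML/model-based handling (`tier3_model`, low confidence).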

Only 'tier1_both' labels are trusted as rule-based gold standard; all others are flagged for advanced processing.


In [25]:
# Three-Tier, Confidence-Tagged Intent Assignment
# Use the new context-sensitive function everywhere you previously called map_intent_conservative
def assign_intent_three_tier(row, intent_keywords):
    """Label one row with an intent and a confidence tag.

    The review text and the summary are classified separately with
    ``map_intent_conservative_contextual``; the merged text is only consulted
    when neither field produces a usable intent.

    Args:
        row: Mapping-like row providing 'reviewText' and 'summary'
            (non-string values are treated as empty text).
        intent_keywords: Dict mapping intent name -> list of keyword strings.

    Returns:
        tuple[str, str]: ``(intent_label, confidence_tag)`` where the tag is
        one of "tier1_both", "tier1_review", "tier1_summary",
        "tier1_conflict", "tier2_merged" or "tier3_model".
    """
    # Labels that do not count as a usable rule-based intent.
    unresolved = ("OTHER", "AMBIGUOUS")

    review_text = row['reviewText']
    if not isinstance(review_text, str):
        review_text = ""
    summary_text = row['summary']
    if not isinstance(summary_text, str):
        summary_text = ""

    intent_review = map_intent_conservative_contextual(review_text, intent_keywords)
    intent_summary = map_intent_conservative_contextual(summary_text, intent_keywords)
    review_resolved = intent_review not in unresolved
    summary_resolved = intent_summary not in unresolved

    # Tier 1: decide from the two fields taken individually.
    if review_resolved and summary_resolved:
        if intent_review == intent_summary:
            return intent_review, "tier1_both"
        return "AMBIGUOUS", "tier1_conflict"
    if review_resolved:
        return intent_review, "tier1_review"
    if summary_resolved:
        return intent_summary, "tier1_summary"

    # Tier 2: neither field resolved, so try both texts joined together.
    merged_text = f"{review_text}. {summary_text}".strip(" .")
    intent_merged = map_intent_conservative_contextual(merged_text, intent_keywords)
    if intent_merged in unresolved:
        # Tier 3: nothing usable from the rules; hand over to the model.
        return "AMBIGUOUS", "tier3_model"
    return intent_merged, "tier2_merged"


# Apply the function to your DataFrame.
# assign_intent_three_tier returns an (intent, confidence_tag) tuple per row;
# wrapping it in pd.Series makes apply() produce two columns, which are
# assigned to 'intent_label' and 'confidence_tag' in that order.
# NOTE(review): building a pd.Series for every row is slow on large frames —
# apply(..., result_type="expand") may be a cheaper equivalent; confirm
# before switching.
df[['intent_label', 'confidence_tag']] = df.apply(
    lambda row: pd.Series(assign_intent_three_tier(row, intent_keywords)),
    axis=1
)

# Check the breakdown: row count per confidence tag, then a fixed-seed
# sample of 10 labeled rows for a quick eyeball check
print(df['confidence_tag'].value_counts(dropna=False))
print(df[['intent_label', 'confidence_tag']].sample(10, random_state=42))


confidence_tag
tier3_model       51514
tier1_review      16724
tier1_summary      3689
tier1_both         2306
tier1_conflict      767
Name: count, dtype: int64
               intent_label confidence_tag
26837             AMBIGUOUS    tier3_model
2592              AMBIGUOUS    tier3_model
18359  VIEW_RECOMMENDATIONS   tier1_review
73292             AMBIGUOUS    tier3_model
60127             AMBIGUOUS    tier3_model
71737   SENTIMENT_COMPLAINT   tier1_review
26473             AMBIGUOUS    tier3_model
1040              AMBIGUOUS    tier3_model
32622             AMBIGUOUS    tier3_model
44679        PRODUCT_SEARCH   tier1_review


#### Results: Three-Tier Rule-Based Intent Mapping

- **Gold standard (tier1_both):** 2,306 reviews with high-confidence rule-based intent, ideal for model training and benchmarking.
- **Silver/ambiguous (tier1_review, tier1_summary, tier1_conflict):** Require advanced model or manual review for accurate intent assignment.
- **Model-only (tier3_model):** Majority of reviews are routed to ML/NLP pipeline, ensuring high overall precision and recall.

This split guarantees reliable labels for performance validation, while maintaining scalability and flexibility in chatbot deployment.

#### Manual Inspection of Rule-Based Intent Assignment

Randomly sampling reviews from each confidence group helps us:
- Spot missed intent keywords and improve the rule dictionary
- Understand the nature of ambiguous cases for model design
- Validate the precision and coverage of our current logic


In [27]:
def inspect_potential_fp(intent, tag="tier1_review", n=10, data=None):
    """
    Show a sample of reviews assigned a specific intent label
    within a specific confidence tag group (e.g., tier1_review).

    Args:
        intent: Intent label to filter on (value of 'intent_label').
        tag: Confidence tag to filter on (value of 'confidence_tag').
        n: Maximum number of rows to print.
        data: DataFrame to inspect; must provide 'reviewText', 'summary',
            'intent_label' and 'confidence_tag'. Defaults to the notebook's
            global ``df`` so existing calls keep working.

    Returns:
        None. The sample (or a "No examples found." notice) is printed.
    """
    # Fall back to the notebook-level frame only when no frame is passed in,
    # so the helper can also be used on filtered copies or test frames.
    frame = df if data is None else data

    subset = frame[(frame['confidence_tag'] == tag) & (frame['intent_label'] == intent)]
    print(f"\n--- Possible false positives for {intent} in {tag} ---")
    if subset.empty:
        print("No examples found.")
        return
    display_cols = ['reviewText', 'summary', 'intent_label', 'confidence_tag']
    # Never request more rows than exist; the fixed seed keeps the sample
    # reproducible between runs.
    sample_size = min(n, len(subset))
    print(subset[display_cols].sample(sample_size, random_state=123).to_string(index=False))

# Check for remaining false positives for greetings, once per tier-1 tag
# (review-only, summary-only, and both fields agreeing)
inspect_potential_fp("GENERIC_GREETING", "tier1_review", n=10)
inspect_potential_fp("GENERIC_GREETING", "tier1_summary", n=10)
inspect_potential_fp("GENERIC_GREETING", "tier1_both", n=10)

# Check for praise/complaint as needed (review-only matches)
inspect_potential_fp("SENTIMENT_PRAISE", "tier1_review", n=10)
inspect_potential_fp("SENTIMENT_COMPLAINT", "tier1_review", n=10)



--- Possible false positives for GENERIC_GREETING in tier1_review ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   reviewText                                        summary     intent_label confidence_tag
                                                                                                                                                                                                                                                                    