In [1]:
# ==============================================================================
#  SETUP: INSTALL AND IMPORT LIBRARIES
# ==============================================================================
# Installs all necessary libraries.
!pip install pandas openpyxl transformers torch tqdm textblob
!pip install huggingface_hub[hf_xet]
!python -m textblob.download_corpora

import pandas as pd
import os
from datetime import datetime
from transformers import pipeline
from tqdm.auto import tqdm
from textblob import TextBlob

# Register tqdm with pandas so that DataFrames gain .progress_apply(), which
# the categorization step uses; `desc` is the label shown on the progress bar.
tqdm.pandas(desc="Categorizing Verbatims")

# ==============================================================================
#  🔴 USER CONFIGURATION 🔴
# ==============================================================================
# Values in this section are the ones a user is expected to edit before running.

# --- Input File Details ---
# Excel workbook to analyse; it is read with pd.read_excel().
FILE_PATH = r'D:/Z - Neverwonderland/2 - Prince Sarcawesum/1 - DESIGN/MEDIAWARE ARTS AND PRINTS/AAA_Corporate Material/Data Analysis/Verbatim Analysis/Dummy Verbatim Dataset.xlsx'  # 👈 CHANGE THIS
# Column holding the free-text verbatims; rows where it is missing are dropped.
TEXT_COLUMN = 'Verbatim'   # 👈 CHANGE THIS

# --- Output File Details ---
# Folder that receives the result workbook; it is created if it does not exist.
OUTPUT_FOLDER_PATH = r'C:/Users/EJG/Documents/Analysis_Results' # 👈 Set your desired output folder
# The next three values are embedded in the output filename:
#   <date>_<PROGRAM_NAME>_<KPIS_IN_SCOPE>_<LOBS_IN_SCOPE>_Verbatim_Analysis_v<major>.<minor>.xlsx
PROGRAM_NAME = "Program_1"
KPIS_IN_SCOPE = "KPI"
LOBS_IN_SCOPE = "LOB"
# Major version used in the filename; the minor version is incremented
# automatically until an unused filename is found.
MAJOR_VERSION = 1

# --- Classification Settings ---
# Minimum classifier score a label needs (score >= threshold) to be assigned.
CLASSIFICATION_THRESHOLD = 0.60

# --- Define Your Categories and Sub-Categories ---
# Maps each main category label to the sub-category labels that are scored
# only when that main category matched. Labels are passed to the zero-shot
# classifier verbatim, so their wording is part of the program's behaviour.
CATEGORIES = {
    "Interaction with Agent or Staff": [
        "Agent's communication and listening skills",
        "Agent's knowledge and problem-solving ability",
        "Efficiency and speed of call handling",
        "Attitude, empathy, and professionalism of the agent",
        "Representative's sales skills or pressure",
    ],
    "Company Process or Policy Issue": [
        "Confusion or disagreement with a company policy",
        "The resolution process was too complex or long",
        "The time it took to resolve the issue",
        "Problems with a follow-up or return contact",
        "Difficulty with the sign-up or onboarding process",
    ],
    "Technical System or Tool Problem": [
        "A software bug, glitch, or error message",
        "The system, app, or website was slow and unresponsive",
        "The tool or software was difficult to use or understand",
        "Difficulty navigating or finding information on the website",
        "The application or website crashed or froze",
    ],
    "Feedback on the Product Itself": [
        "The quality, a defect, or damage of the product",
        "A suggestion or request for a new product feature",
        "Feedback on the price, cost, or value for money",
        "The design, appearance, or ease of use of the product",
        "The product was out of stock or unavailable",
    ],
    "Call Environment or Connection Issue": [
        "Loud background noise during the interaction",
        "Poor audio quality, static, or a bad connection",
        "External factors outside of the agent's control",
    ],
}

# ==============================================================================
#  CORE LOGIC (No need to edit below this line)
# ==============================================================================

# --- 1. Load the Zero-Shot Classification Model ---
# Build a Hugging Face zero-shot-classification pipeline backed by the
# facebook/bart-large-mnli model. Any failure is reported and leaves
# `classifier` set to None, which makes the categorization step skip itself.
print("Loading Zero-Shot Classification model...")
try:
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    print("✅ Model loaded successfully.")
except Exception as e:
    # Deliberately broad: whatever goes wrong while loading the model, the
    # script reports it and continues without a classifier instead of crashing.
    print(f"Error loading model: {e}")
    classifier = None

# --- 2. Load and Prepare Data ---
# Read the verbatims workbook. When the file is missing, continue with a
# single illustrative row so the remaining steps can still run.
try:
    df = pd.read_excel(FILE_PATH)
except FileNotFoundError:
    print("⚠️ Warning: File not found. Loading dummy data.")
    df = pd.DataFrame(
        {TEXT_COLUMN: ["The agent was knowledgeable, but the new billing statement is very confusing."]}
    )
else:
    print(f"\nSuccessfully loaded {len(df)} rows from '{FILE_PATH}'.")

# Rows with no verbatim text cannot be classified, so they are discarded.
df = df.dropna(subset=[TEXT_COLUMN])

# --- 3. Define the Classification and Extraction Functions ---
def get_multi_label_predictions(text, labels, threshold):
    """Return the candidate labels whose zero-shot score reaches ``threshold``.

    Args:
        text: The verbatim to classify. Anything that is not a non-blank
            string yields an empty result without calling the model.
        labels: Candidate labels to score against ``text``.
        threshold: Minimum score (inclusive) a label needs to be returned.

    Returns:
        A list of the qualifying labels, in the order the classifier reports
        them; empty when nothing qualifies.
    """
    # Blank text carries nothing to classify, and the pipeline rejects an
    # empty candidate list, so both cases short-circuit before the model runs.
    if not isinstance(text, str) or not text.strip() or not labels:
        return []
    results = classifier(text, candidate_labels=labels, multi_label=True)
    # 'labels' and 'scores' are parallel lists, so pair them up directly.
    return [
        label
        for label, score in zip(results['labels'], results['scores'])
        if score >= threshold
    ]

def extract_key_phrases(text):
    """Extract noun phrases from a text to suggest new sub-categories.

    Args:
        text: The verbatim to mine for phrases.

    Returns:
        Up to three noun phrases joined by ", ". An empty string is returned
        when ``text`` is not a non-blank string or has no noun phrases.
    """
    # TextBlob raises TypeError for non-string input (e.g. None or NaN cells),
    # and blank text has no phrases to offer, so return early in both cases.
    if not isinstance(text, str) or not text.strip():
        return ""
    # Limit to the first 3 phrases for brevity; joining an empty sequence
    # already yields "", so no separate empty check is needed.
    return ", ".join(TextBlob(text).noun_phrases[:3])

def categorize_row(row, text_column, category_map, threshold):
    """Assign main categories and sub-categories to a single row.

    The text is first scored against the main categories (the keys of
    ``category_map``). For every main category that matched, the text is then
    scored against that category's sub-category labels. When a main category
    matched but no sub-category did, noun phrases from the text are offered
    as a "SUGGESTION: ..." entry instead.

    Args:
        row: Row (or mapping) from which ``text_column`` is read.
        text_column: Key of the field holding the verbatim text.
        category_map: Mapping of main category label to its sub-category labels.
        threshold: Minimum score a label needs to count as a match.

    Returns:
        A ``(category_str, subcategory_str)`` tuple of comma-separated labels.
        ``category_str`` is "Uncategorized" when no main category matched;
        ``subcategory_str`` is "" when there is nothing to report.
    """
    verbatim = row[text_column]
    top_level = get_multi_label_predictions(verbatim, list(category_map), threshold)

    # Nothing matched at the top level, so there is nothing to drill into.
    if not top_level:
        return "Uncategorized", ""

    sub_hits = []
    for name in top_level:
        candidates = category_map.get(name, [])
        if candidates:
            sub_hits += get_multi_label_predictions(verbatim, candidates, threshold)

    # If a main category was found but NO sub-categories were, suggest new ones.
    if not sub_hits:
        suggestion = extract_key_phrases(verbatim)
        if suggestion:
            sub_hits.append(f"SUGGESTION: {suggestion}")

    return ", ".join(top_level), ", ".join(sub_hits)

# --- 4. Apply Categorization to the DataFrame ---
def _next_versioned_path(folder, base_filename, major_version):
    """Return the first unused ``<base_filename>_vMM.mm.xlsx`` path in ``folder``.

    The minor version starts at 0 and is incremented until no file with the
    resulting name exists, so results of earlier runs are never overwritten.
    """
    minor_version = 0
    while True:
        version_str = f"v{major_version:02d}.{minor_version:02d}"
        # normpath yields one consistent separator style; os.path.join alone
        # keeps the "/" of the configured folder and appends "\" on Windows.
        candidate = os.path.normpath(
            os.path.join(folder, f"{base_filename}_{version_str}.xlsx")
        )
        if not os.path.exists(candidate):
            return candidate
        minor_version += 1


# The model loader signals failure with None, so test for that explicitly
# rather than relying on the truthiness of the pipeline object.
if classifier is not None and not df.empty:
    print(f"\nStarting categorization with a threshold of {CLASSIFICATION_THRESHOLD:.2f}...")
    # categorize_row returns a (category, sub-category) pair per row;
    # result_type='expand' spreads that pair over the two new columns.
    df[['Category', 'Sub-Category']] = df.progress_apply(
        lambda row: categorize_row(row, TEXT_COLUMN, CATEGORIES, CLASSIFICATION_THRESHOLD),
        axis=1, result_type='expand'
    )

    # --- 5. Review, Summarize, and Save Results ---
    print("\n--- Categorization Complete ---")

    # Feature: In-Notebook Preview of the results
    # (display() is not imported here; it is expected from the notebook environment.)
    print("Result Preview:")
    display(df[[TEXT_COLUMN, 'Category', 'Sub-Category']].head())

    # Feature: Count of blanks / uncategorized items
    total_rows = len(df)
    uncategorized_count = int((df['Category'] == 'Uncategorized').sum())
    categorized_count = total_rows - uncategorized_count
    categorization_rate = (categorized_count / total_rows) * 100 if total_rows > 0 else 0

    print("\n--- Categorization Summary ---")
    print(f"Total Verbatims Analyzed: {total_rows}")
    print(f"Successfully Categorized:   {categorized_count}")
    print(f"Uncategorized ('Blanks'):   {uncategorized_count}")
    print(f"Categorization Rate:        {categorization_rate:.2f}%")

    # Feature: Dynamic & Versioned Filename
    os.makedirs(OUTPUT_FOLDER_PATH, exist_ok=True)
    current_date = datetime.now().strftime('%Y-%m-%d')
    base_filename = f"{current_date}_{PROGRAM_NAME}_{KPIS_IN_SCOPE}_{LOBS_IN_SCOPE}_Verbatim_Analysis"
    full_path = _next_versioned_path(OUTPUT_FOLDER_PATH, base_filename, MAJOR_VERSION)

    # Save the final DataFrame to the unique, versioned Excel file
    df.to_excel(full_path, index=False)
    print(f"\n✅ Successfully saved categorized results to:")
    print(full_path)
else:
    print("\nSkipping categorization due to model loading or data issues.")

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0
Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\EJG\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


Loading Zero-Shot Classification model...
✅ Model loaded successfully.

Successfully loaded 50 rows from 'D:/Z - Neverwonderland/2 - Prince Sarcawesum/1 - DESIGN/MEDIAWARE ARTS AND PRINTS/AAA_Corporate Material/Data Analysis/Verbatim Analysis/Dummy Verbatim Dataset.xlsx'.

Starting categorization with a threshold of 0.60...


Categorizing Verbatims:   0%|          | 0/50 [00:00<?, ?it/s]


--- Categorization Complete ---
Result Preview:


Unnamed: 0,Verbatim,Category,Sub-Category
0,"""The website was a mess, couldn't find the spe...","Interaction with Agent or Staff, Technical Sys...",Difficulty navigating or finding information o...
1,"""My new laptop overheated almost immediately a...","Interaction with Agent or Staff, Feedback on t...","The quality, a defect, or damage of the produc..."
2,"""I tried to use the live chat, but no one ever...","Interaction with Agent or Staff, Call Environm...",External factors outside of the agent's control
3,"""The delivery was two days late and the box wa...",Interaction with Agent or Staff,SUGGESTION: hope
4,"""I returned a broken phone and haven't receive...","Interaction with Agent or Staff, Call Environm...",



--- Categorization Summary ---
Total Verbatims Analyzed: 50
Successfully Categorized:   47
Uncategorized ('Blanks'):   3
Categorization Rate:        94.00%

✅ Successfully saved categorized results to:
C:/Users/EJG/Documents/Analysis_Results\2025-09-18_Program_1_KPI_LOB_Verbatim_Analysis_v01.03.xlsx
