# Cell 1: Setup and Master Configuration
**This cell handles all installations, imports, model downloads, and contains a single configuration section for all your settings.**

In [None]:
# ==============================================================================
#  1. SETUP: INSTALL AND IMPORT LIBRARIES
# ==============================================================================
# For local environments (like Anaconda), uncomment these lines to install libraries.
# !pip install pandas openpyxl transformers torch tqdm textblob seaborn matplotlib nltk scikit-learn spacy gensim wordcloud
# !pip install huggingface_hub[hf_xet]
# !pip install --upgrade Pillow==9.5.0

# --- Import Libraries ---
import pandas as pd
import os
import re
import string
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
from transformers import pipeline
from tqdm.auto import tqdm
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, bigrams, trigrams
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from openpyxl.drawing.image import Image
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

# --- Download NLP Models ---
# This section ensures all necessary data models are available.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('vader_lexicon', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('brown', quiet=True) # For TextBlob
!python -m textblob.download_corpora --quiet
!python -m spacy download en_core_web_md --quiet

# Register tqdm for use with pandas .progress_apply()
tqdm.pandas(desc="Processing Verbatims")

# ==============================================================================
#  2. 🔴 MASTER USER CONFIGURATION 🔴
# ==============================================================================

# --- Input File Details ---
# For Kaggle, the path is typically '/kaggle/input/your-dataset-name/your-file-name.xlsx'
# FILE_PATH is the Excel workbook read with pd.read_excel in the data-loading cell.
FILE_PATH = r'/kaggle/input/on-survey-20250922-02/EMEA CSAT BR0922.xlsx'
# Name of the column that holds the free-text comment; it is the input to text
# cleaning, sentiment scoring and zero-shot categorisation.
TEXT_COLUMN_NAME = 'star_rating_comment'

# --- Output File Details ---
# Folder that receives every saved chart (PNG) and the final Excel report.
OUTPUT_FOLDER_PATH = r'/kaggle/working/' # Correct for Kaggle
# The next three values are embedded in the report filename and are also added
# to the custom stop-word set.
PROGRAM_NAME = "On"
KPIS_IN_SCOPE = "EMEA"
LOBS_IN_SCOPE = "20250922"
# Major part of the "vMM.mm" version tag in the report filename; the minor part
# is auto-incremented when a file with the same name already exists.
MAJOR_VERSION = 1 # Manually change for new datasets or category versions

# --- Analysis Settings ---
# Extra words removed during text cleaning, on top of NLTK's English stop words.
# The cleaning step lower-cases the text before comparing tokens against this
# set, so every entry must be lower-case to have any effect.
CUSTOM_STOP_WORDS = {
    # --- Generic Business & Service Terms ---
    'company', 'business', 'service', 'services', 'product', 'products', 'team', 'customer',
    'customers', 'client', 'clients', 'staff', 'agent', 'representative', 'organization',
    'experience', 'issue', 'issues', 'problem', 'problems', 'question', 'questions',
    'feedback', 'inquiry', 'inquiries', 'request', 'requests', 'ticket', 'case', 'account',

    # --- Common Vague Fillers & Conversational Terms ---
    'also', 'really', 'actually', 'always', 'just', 'like', 'im', 'ive', 'thing', 'things',
    'something', 'anything', 'everything', 'well', 'get', 'got', 'getting', 'would', 'could',
    'should', 'make', 'made', 'one', 'even', 'since', 'every', 'time', 'times', 'day', 'days',
    'week', 'weeks', 'month', 'months', 'year', 'years', 'lot',

    # --- Politeness, Greetings, & Inquiries ---
    'please', 'help', 'hello', 'hi', 'hey', 'thank', 'thanks', 'appreciate', 'regards', 'best',
    'know', 'see', 'want', 'wanted', 'looking', 'wondering', 'information', 'info', 'details',

    # --- Placeholders for Your Specific Names ---
    # Lower-cased on purpose: the configured names (e.g. "EMEA") would otherwise
    # never match the lower-cased tokens and would stay in the cleaned text.
    str(PROGRAM_NAME).lower(), str(KPIS_IN_SCOPE).lower(), str(LOBS_IN_SCOPE).lower(),

    # --- Prepositions ---
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at',
    'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'by', 'down', 'during',
    'for', 'from', 'in', 'inside', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'out',
    'outside', 'over', 'past', 'since', 'through', 'throughout', 'to', 'toward', 'under',
    'underneath', 'until', 'up', 'upon', 'with', 'within', 'without',
}

# Minimum zero-shot score a label needs before it is assigned to a verbatim.
CLASSIFICATION_THRESHOLD = 0.45 # Confidence score (0.0 to 1.0) for categorization

# --- Define Your Zero-Shot Categories and Sub-Categories (Granular Version) ---
# Two-level taxonomy used by categorize_row:
#   * each key is a top-level category and is passed verbatim to the zero-shot
#     classifier as a candidate label;
#   * each value lists that category's sub-category labels, which are only
#     scored for verbatims that already matched the parent category.
# The label wording is model input, so editing a string changes the
# classification results.
CATEGORIES = {
    # --- People Driven Categories ---
    'Interaction with Call Center Agent': [
        "Call agent's communication and listening skills",
        "Call agent's knowledge and problem-solving ability",
        "Call agent's attitude empathy and professionalism",
        "Efficiency and speed of call handling or resolution",
    ],
    'Interaction with In-Store Staff': [
        "In-store staff's helpfulness and attitude",
        "Staff's product knowledge and ability to answer questions",
        "Availability and attentiveness of staff in the store",
        "Efficiency of in-store processes like checkout or returns",
    ],
    'Interaction with Field Technician': [
        "Technician's professionalism, timeliness, and communication",
        "Technician's skill and ability to fix the issue",
        "Cleanliness and care taken by the technician in the home",
        "Explanation of work performed by the technician",
    ],

    # --- Process Driven Category ---
    'Company Process or Policy Issue': [
        'Confusion or disagreement with a company policy',
        'The overall process was too complex or had too many steps',
        'The total time it took to resolve the issue',
        'Problems with a follow-up, callback, or promised contact',
    ],

    # --- Technical and System Categories ---
    'Website or Online Portal Issue': [
        "Website was slow, lagging, or unresponsive",
        'Difficulty navigating or finding information on the website',
        'A website bug, glitch, or error message',
        'The website crashed, froze, or was unavailable',
    ],
    'Mobile Application Issue': [
        'The mobile app was slow or had poor performance',
        'A bug or error in the mobile app',
        'The mobile app crashed or froze',
        'The mobile app was difficult to use or understand',
    ],
    'Communication Channel Quality': [
        'Poor audio quality, static, or bad phone connection',
        'Loud background noise during a call',
        'Issues with the live chat tool or functionality',
        'Problems with email communication or response times',
    ],

    # --- Product Driven Category ---
    'Feedback on the Product Itself': [
        'The quality, a defect, or damage of the product',
        'A suggestion or request for a new product feature',
        'Feedback on the price, cost, or value for money',
        'The design, appearance, or general ease of use of the product',
    ]
}

# Final confirmation that the setup/configuration cell ran to the end.
print("✅ Setup complete. All libraries and models are ready.")

# Cell 2: Data Loading, Cleaning, and Sentiment Analysis
**This cell prepares your core DataFrame by loading, cleaning, and running sentiment analysis.**

In [None]:
# # --- 1. Load Data Safely ---
# # This block will attempt to load your file. If it fails, it will print an error and stop.
# try:
#     df = pd.read_excel(FILE_PATH)
#     print(f"✅ Successfully loaded {len(df)} rows from '{FILE_PATH}'.")
    
#     # --- 2. Text Cleaning ---
#     stop_words = set(stopwords.words('english')).union(CUSTOM_STOP_WORDS)
#     lemmatizer = WordNetLemmatizer()
#     def clean_text(text):
#         if not isinstance(text, str): return ""
#         text = text.lower()
#         text = re.sub(r'[\d\n]', '', text)
#         text = text.translate(str.maketrans('', '', string.punctuation))
#         tokens = word_tokenize(text.strip())
#         cleaned_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
#         return " ".join(cleaned_tokens)
    
#     df.dropna(subset=[TEXT_COLUMN_NAME], inplace=True)
#     df['cleaned_text'] = df[TEXT_COLUMN_NAME].apply(clean_text)
    
#     # --- 3. Sentiment Analysis ---
#     sia = SentimentIntensityAnalyzer()
#     df['sentiment_compound'] = df[TEXT_COLUMN_NAME].apply(lambda x: sia.polarity_scores(x)['compound'])
#     def categorize_sentiment(compound):
#         if compound >= 0.05: return 'Positive'
#         if compound <= -0.05: return 'Negative'
#         return 'Neutral'
#     df['sentiment_label'] = df['sentiment_compound'].apply(categorize_sentiment)

#     print("\n--- Data Preview with Cleaned Text and Sentiment ---")
#     display(df[[TEXT_COLUMN_NAME, 'cleaned_text', 'sentiment_label']].head())

#     # --- 4. Plot Sentiment Distribution and Save ---
#     plt.figure(figsize=(6, 4))
#     df['sentiment_label'].value_counts().plot(kind='bar', color=['green', 'red', 'grey'])
#     plt.title('Sentiment Distribution')
#     plt.ylabel('Number of Responses')
#     plt.xticks(rotation=0)
#     plt.savefig(os.path.join(OUTPUT_FOLDER_PATH, 'sentiment_distribution.png'), bbox_inches='tight')
#     plt.show()

# except FileNotFoundError:
#     print("="*80)
#     print(f"❌ FATAL ERROR: File not found at the specified path.")
#     print(f"   Your specified path: '{FILE_PATH}'")
#     print("   Please check the 'FILE_PATH' variable in your configuration cell and try again.")
#     print("="*80)
#     # Raising the error stops the notebook execution
#     raise

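# Cell 2 (Revised, v2.1): Data Loading, Cleaning (with Translation and Spelling Correction), and Sentiment Analysis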

In [None]:
# --- 1. Load Data Safely ---
# Only the file read is guarded. Later steps can raise FileNotFoundError too
# (for example saving a chart into a missing output folder) and must not be
# reported as a problem with the input path.
try:
    df = pd.read_excel(FILE_PATH)
except FileNotFoundError:
    print("="*80)
    print(f"❌ FATAL ERROR: File not found at the specified path.")
    print(f"    Your specified path: '{FILE_PATH}'")
    print("    Please check the 'FILE_PATH' variable in your configuration cell and try again.")
    print("="*80)
    # Raising the error stops the notebook execution
    raise

print(f"✅ Successfully loaded {len(df)} rows from '{FILE_PATH}'.")

# --- 2. Text Cleaning (UPGRADED) ---
stop_words = set(stopwords.words('english')).union(CUSTOM_STOP_WORDS)
lemmatizer = WordNetLemmatizer()

# Newer TextBlob releases no longer provide detect_language()/translate(), so
# translation is treated as an optional, best-effort step.
_translation_available = hasattr(TextBlob, 'detect_language') and hasattr(TextBlob, 'translate')
_translation_stats = {'failed': 0}
_punctuation_table = str.maketrans('', '', string.punctuation)


def _translate_to_english(text):
    """Return *text* translated to English, or *text* unchanged if that is not possible."""
    if not _translation_available:
        return text
    try:
        blob = TextBlob(text)
        if blob.detect_language() != 'en':
            return str(blob.translate(to='en'))
    except Exception:
        # Deliberately broad: the legacy helpers call an online service and can
        # fail for very short strings, network errors or an unchanged
        # translation. One bad verbatim must not abort the whole run; failures
        # are counted and reported after the apply.
        _translation_stats['failed'] += 1
    return text


def clean_text(text):
    """Normalise one verbatim into a space-separated string of lemmatised tokens.

    Non-string or blank input yields "". Otherwise the text is translated to
    English when possible, spell-corrected, lower-cased, stripped of digits and
    punctuation, tokenised, filtered against ``stop_words`` (tokens of one or
    two characters are dropped as well) and lemmatised.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    # STEP 1: Translate non-English text to English (best effort)
    text = _translate_to_english(text)

    # STEP 2: Correct spelling
    # Note: This step can be slow on large datasets
    text = str(TextBlob(text).correct())

    # --- Original cleaning steps ---
    text = text.lower()
    text = re.sub(r'\d', '', text)
    # Newlines become spaces (not nothing) so the last word of one line does
    # not fuse with the first word of the next.
    text = text.replace('\n', ' ')
    text = text.translate(_punctuation_table)
    tokens = word_tokenize(text.strip())
    cleaned_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(cleaned_tokens)


df.dropna(subset=[TEXT_COLUMN_NAME], inplace=True)
# progress_apply shows a progress bar, which helps because cleaning is slow
df['cleaned_text'] = df[TEXT_COLUMN_NAME].progress_apply(clean_text)

if not _translation_available:
    print("⚠️ Translation skipped: this TextBlob version has no language detection/translation support.")
elif _translation_stats['failed']:
    print(f"⚠️ Translation failed for {_translation_stats['failed']} verbatim(s); their original text was used.")

# --- 3. Sentiment Analysis ---
# NOTE(review): VADER scores the original (untranslated) text - confirm this is intended.
sia = SentimentIntensityAnalyzer()
# str(x) guards against non-text cells (e.g. a comment that is just a number).
df['sentiment_compound'] = df[TEXT_COLUMN_NAME].apply(lambda x: sia.polarity_scores(str(x))['compound'])


def categorize_sentiment(compound):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'."""
    if compound >= 0.05: return 'Positive'
    if compound <= -0.05: return 'Negative'
    return 'Neutral'


df['sentiment_label'] = df['sentiment_compound'].apply(categorize_sentiment)

print("\n--- Data Preview with Cleaned Text and Sentiment ---")
display(df[[TEXT_COLUMN_NAME, 'cleaned_text', 'sentiment_label']].head())

# --- 4. Plot Sentiment Distribution and Save ---
# value_counts() orders bars by frequency, so the colours are tied to the label
# explicitly; otherwise "Negative" is drawn green whenever it is the largest group.
_sentiment_colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'grey'}
_sentiment_counts = df['sentiment_label'].value_counts().reindex(list(_sentiment_colors), fill_value=0)
plt.figure(figsize=(6, 4))
_sentiment_counts.plot(kind='bar', color=list(_sentiment_colors.values()))
plt.title('Sentiment Distribution')
plt.ylabel('Number of Responses')
plt.xticks(rotation=0)
plt.savefig(os.path.join(OUTPUT_FOLDER_PATH, 'sentiment_distribution.png'), bbox_inches='tight')
plt.show()

# Cell 3: Topic Evaluation (Silhouette & Coherence)

In [None]:
# ==============================================================================
#  AUTOMATED TOPIC MODEL EVALUATION (Corrected and Complete)
# ==============================================================================
from sklearn.metrics import silhouette_score
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
import numpy as np

# --- 1. Prepare Data for Evaluation ---
# The same cleaned verbatims are prepared twice: as a scikit-learn
# document-term matrix (silhouette score) and as a gensim dictionary/corpus
# (coherence score).
eval_vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')
dtm_eval = eval_vectorizer.fit_transform(df['cleaned_text'].dropna())
texts_for_gensim = [word_tokenize(text) for text in df['cleaned_text'].dropna()]
dictionary = Dictionary(texts_for_gensim)
corpus = [dictionary.doc2bow(text) for text in texts_for_gensim]

# --- 2. Define a Range of Topics to Test ---
min_topics = 2
max_topics = 11  # exclusive upper bound: k = 2..10 is evaluated
topic_range = range(min_topics, max_topics)

# --- 3. Calculate Scores for Each Number of Topics ---
silhouette_scores = []
coherence_scores = []
print("Evaluating optimal number of topics (k). This may take several minutes...")
for k in topic_range:
    # Silhouette Score: each document is labelled with its dominant topic.
    lda_sklearn = LatentDirichletAllocation(n_components=k, random_state=42)
    lda_sklearn.fit(dtm_eval)
    # Computed once per k; the labels are needed for both the check and the score.
    dominant_topics = lda_sklearn.transform(dtm_eval).argmax(axis=1)
    if len(np.unique(dominant_topics)) > 1:
        score = silhouette_score(dtm_eval, dominant_topics)
        silhouette_scores.append(score)
    else:
        # Silhouette is undefined for a single cluster; -1 marks the worst case.
        silhouette_scores.append(-1)

    # Coherence Score
    lda_gensim = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=42)
    coherence_model = CoherenceModel(model=lda_gensim, texts=texts_for_gensim, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    coherence_scores.append(coherence)
    print(f"  - Processed k={k} topics...")

# --- 4. Find the Optimal Number of Topics ---
# Both metrics are min-max normalised to [0, 1] and summed; the k with the
# highest total wins. A metric with no spread contributes zeros instead of
# triggering a division by zero.
s_range = np.max(silhouette_scores) - np.min(silhouette_scores)
c_range = np.max(coherence_scores) - np.min(coherence_scores)
if s_range > 0:
    norm_silhouette = (np.asarray(silhouette_scores) - np.min(silhouette_scores)) / s_range
else:
    norm_silhouette = np.zeros(len(silhouette_scores))
if c_range > 0:
    norm_coherence = (np.asarray(coherence_scores) - np.min(coherence_scores)) / c_range
else:
    norm_coherence = np.zeros(len(coherence_scores))
combined_score = norm_silhouette + norm_coherence
best_k_index = np.argmax(combined_score)
recommended_k = topic_range[best_k_index]

# --- 5. Plot and Save the Results ---
# One panel per metric, drawn left-to-right; both mark the recommended k with
# the same dashed vertical line.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
for _axis, _scores, _line_colour, _heading in (
    (ax1, silhouette_scores, 'b', 'Silhouette Score vs. Number of Topics'),
    (ax2, coherence_scores, 'r', 'Topic Coherence (C_v) vs. Number of Topics'),
):
    _axis.plot(topic_range, _scores, marker='o', color=_line_colour)
    _axis.set_title(_heading)
    _axis.axvline(x=recommended_k, color='grey', linestyle='--', label=f'Recommended k={recommended_k}')
    _axis.legend()
plt.tight_layout()
_chart_path = os.path.join(OUTPUT_FOLDER_PATH, 'topic_evaluation_charts.png')
plt.savefig(_chart_path, bbox_inches='tight')
plt.show()

# --- 6. Store Results and Update Variable ---
# One row per evaluated k; this table is reused by the report cell.
evaluation_results = dict(zip(
    ('Num_Topics (k)', 'Silhouette_Score', 'Coherence_Score'),
    (list(topic_range), silhouette_scores, coherence_scores),
))
df_topic_evaluation = pd.DataFrame(evaluation_results)

# --- 7. Print Summary and Update TOPIC_MODEL_TOPICS Variable ---
print("\n--- Evaluation Results Table ---")
display(df_topic_evaluation.round(3))

print("\n--- Automated Recommendation ---")
print(f"Best Silhouette Score at k = {topic_range[np.argmax(silhouette_scores)]}")
print(f"Best Coherence Score at k = {topic_range[np.argmax(coherence_scores)]}")
print(30 * "-")
print(f"🏆 Recommended number of topics (best combined score): {recommended_k}")
print(30 * "-")

# The LDA cell reads its number of topics from this variable.
TOPIC_MODEL_TOPICS = recommended_k
print(f"✅ The 'TOPIC_MODEL_TOPICS' variable has been automatically set to {TOPIC_MODEL_TOPICS}.")

# Cell 4: Exploratory Analysis (Frequency, N-grams, Word Clouds)
**This cell prepares frequency tables and word cloud images for the final report.**

In [None]:
# --- 1. Prepare Overall Text and Tokens ---
# Every cleaned verbatim is concatenated into one string and tokenised once;
# all frequency tables below are derived from this single token stream.
all_cleaned_text = " ".join(df['cleaned_text'])
all_tokens = word_tokenize(all_cleaned_text)

# --- 2. Create and Store Frequency Tables ---
fdist = FreqDist(all_tokens)
_top_word_rows = fdist.most_common(20)
df_top_words = pd.DataFrame(_top_word_rows, columns=['Word', 'Frequency'])

# n-grams are tuples of tokens; they are joined into one readable string per row.
bigram_fdist = FreqDist(list(bigrams(all_tokens)))
_top_bigram_rows = [(' '.join(gram), freq) for gram, freq in bigram_fdist.most_common(10)]
df_top_bigrams = pd.DataFrame(_top_bigram_rows, columns=['Bigram', 'Frequency'])

trigram_fdist = FreqDist(list(trigrams(all_tokens)))
_top_trigram_rows = [(' '.join(gram), freq) for gram, freq in trigram_fdist.most_common(10)]
df_top_trigrams = pd.DataFrame(_top_trigram_rows, columns=['Trigram', 'Frequency'])

for _heading, _table in (
    ("--- Top 20 Most Common Words ---", df_top_words),
    ("--- Top 10 Most Common Bigrams ---", df_top_bigrams),
    ("--- Top 10 Most Common Trigrams ---", df_top_trigrams),
):
    print(_heading)
    display(_table)

# --- 3. Generate and Save Word Clouds ---
def generate_and_save_wordcloud(text, title, filename):
    """Render a word cloud for *text*, save it under OUTPUT_FOLDER_PATH and show it.

    text     -- space-separated words to visualise.
    title    -- chart title (also used in the skip message).
    filename -- image file name, joined onto OUTPUT_FOLDER_PATH.

    Blank or whitespace-only text prints a notice and produces no image.
    """
    has_words = bool(text.strip())
    if has_words:
        cloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis')
        cloud_image = cloud.generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        target_path = os.path.join(OUTPUT_FOLDER_PATH, filename)
        plt.savefig(target_path, bbox_inches='tight')
        plt.show()
    else:
        print(f"Skipping '{title}' word cloud: No text available.")

# One cloud for all feedback, then one per polar sentiment.
generate_and_save_wordcloud(all_cleaned_text, 'Word Cloud (All Feedback)', 'wordcloud_all.png')
for _label, _cloud_title, _cloud_file in (
    ('Positive', 'Word Cloud (Positive)', 'wordcloud_positive.png'),
    ('Negative', 'Word Cloud (Negative)', 'wordcloud_negative.png'),
):
    _sentiment_text = " ".join(df.loc[df['sentiment_label'] == _label, 'cleaned_text'])
    generate_and_save_wordcloud(_sentiment_text, _cloud_title, _cloud_file)

# Cell 5: Thematic Analysis (Topic Modeling and Zero-Shot Categorization)
**This cell performs the main "what are they talking about?" analyses.**

In [None]:
# --- 1. Topic Modeling (LDA) ---
print("\n--- Discovering Latent Topics (LDA) ---")
vectorizer = CountVectorizer(max_df=0.9, min_df=3, stop_words='english')
dtm = vectorizer.fit_transform(df['cleaned_text'].dropna())
n_documents, n_terms = dtm.shape
if n_documents <= 1 or n_terms <= 1:
    print("Not enough data to perform topic modeling.")
    df_topics = pd.DataFrame() # Create empty df if it fails
else:
    # The number of topics comes from the evaluation cell.
    lda = LatentDirichletAllocation(n_components=TOPIC_MODEL_TOPICS, random_state=42)
    lda.fit(dtm)
    feature_names = vectorizer.get_feature_names_out()
    # For each topic, list its ten highest-weighted terms, strongest first.
    topic_results = [
        [
            f"Topic #{topic_number}",
            ", ".join([feature_names[term_id] for term_id in term_weights.argsort()[::-1][:10]]),
        ]
        for topic_number, term_weights in enumerate(lda.components_, start=1)
    ]
    df_topics = pd.DataFrame(topic_results, columns=['Discovered Topic', 'Top Words'])
    display(df_topics)

# --- 2. Zero-Shot Root Cause Categorization ---
# Builds the Hugging Face zero-shot pipeline once; the helper functions below
# call this module-level `classifier` for every verbatim.
print("\n--- Loading Zero-Shot Classification model ---")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def get_multi_label_predictions(text, labels, threshold):
    """Return the candidate labels whose zero-shot score reaches *threshold*.

    text      -- the verbatim to classify.
    labels    -- candidate label strings passed to the classifier.
    threshold -- minimum score (inclusive) for a label to be returned.

    Labels are scored independently (multi_label=True), so any number of them
    may be returned. An empty list is returned without calling the model when
    *text* is not a string, is blank, or when *labels* is empty; the pipeline
    rejects an empty label list, and blank text carries nothing to classify.
    """
    if not isinstance(text, str) or not text.strip() or not labels:
        return []
    results = classifier(text, candidate_labels=labels, multi_label=True)
    return [
        label
        for label, score in zip(results['labels'], results['scores'])
        if score >= threshold
    ]

def extract_key_phrases(text):
    """Return up to three noun phrases from *text* as one comma-separated string.

    Empty or non-string input yields "". The phrases are joined with ", "
    rather than "|": the result is embedded in the Sub-Category value, which is
    itself "|"-delimited and later split on "|", so a pipe here would break one
    suggestion into several bogus sub-categories.
    """
    if not text or not isinstance(text, str):
        return ""
    return ", ".join(str(phrase) for phrase in TextBlob(text).noun_phrases[:3])

def categorize_row(row):
    """Classify one DataFrame row and return ``(categories, sub_categories)``.

    Both elements are "|"-joined strings. The verbatim is first scored against
    the top-level CATEGORIES keys; sub-categories are scored only for the
    categories that matched. If categories matched but no sub-category did, a
    "SUGGESTION: ..." entry built from the text's key phrases is used instead.
    With no matching category the result is ``('Uncategorized', "")``.
    """
    text = row[TEXT_COLUMN_NAME]
    top_level_labels = list(CATEGORIES.keys())
    categories_found = get_multi_label_predictions(text, top_level_labels, CLASSIFICATION_THRESHOLD)
    if not categories_found:
        return 'Uncategorized', ""

    subcategories_found = []
    for category in categories_found:
        child_labels = CATEGORIES.get(category, [])
        subcategories_found += get_multi_label_predictions(text, child_labels, CLASSIFICATION_THRESHOLD)

    if not subcategories_found:
        key_phrases = extract_key_phrases(text)
        if key_phrases:
            subcategories_found.append(f"SUGGESTION: {key_phrases}")

    sub_category_value = "|".join(subcategories_found) if subcategories_found else ""
    return "|".join(categories_found), sub_category_value

# Run the classifier row by row (with a progress bar) and spread the returned
# (categories, sub_categories) pair over two new columns.
print(f"\n--- Starting Zero-Shot categorization with a threshold of {CLASSIFICATION_THRESHOLD:.2f} ---")
df[['Category', 'Sub-Category']] = df.progress_apply(categorize_row, axis=1, result_type='expand')

print("\n--- Categorization Complete ---")

# Cell 6: Deep-Dive Categorization Analysis
**This cell is dedicated to analyzing the results of our categorization, creating the tables and charts needed for the executive summary.**

In [None]:
# ==============================================================================
# 6. DEEP-DIVE CATEGORIZATION ANALYSIS (REVISED)
# ==============================================================================

# --- 1. Create Exploded DataFrames (for re-use) ---
# "Category" and "Sub-Category" hold "|"-joined labels. Each frame below has
# one row per (verbatim, label) pair so labels can be counted individually.
df_exploded_cat = df.copy()
df_exploded_cat['Category'] = df_exploded_cat['Category'].str.split('|')
df_exploded_cat = df_exploded_cat.explode('Category')

df_exploded_subcat = (
    df.assign(**{'Sub-Category': df['Sub-Category'].str.split('|')})
    .explode('Sub-Category')
)

# Discard rows without a usable sub-category: first nulls, then blank strings.
df_exploded_subcat = df_exploded_subcat.dropna(subset=['Sub-Category'])
_has_subcategory = df_exploded_subcat['Sub-Category'].str.strip() != ''
df_exploded_subcat = df_exploded_subcat[_has_subcategory]

# --- 2. Categorization Summary ---
# Headline numbers: how many verbatims received at least one category.
total_rows = len(df)
uncategorized_count = int((df['Category'] == 'Uncategorized').sum())
categorized_count = total_rows - uncategorized_count
if total_rows > 0:
    categorization_rate = (categorized_count / total_rows) * 100
else:
    # No rows at all: report 0 rather than dividing by zero.
    categorization_rate = 0
print("\n--- Categorization Summary ---")
summary_metrics = {
    'Metric': ['Total Verbatims', 'Categorized', 'Uncategorized', 'Categorization Rate'],
    'Value': [
        total_rows,
        categorized_count,
        uncategorized_count,
        f"{categorization_rate:.2f}%",
    ],
}
df_summary_metrics = pd.DataFrame(summary_metrics)
display(df_summary_metrics)

# --- 3. Sentiment Breakdown Tables ---
# Label-by-sentiment counts, ordered by row total. The stored tables keep the
# helper "Total" column; it is only hidden in the on-screen preview.
print("\n--- Sentiment Breakdown by Category ---")
df_sentiment_by_cat = pd.crosstab(df_exploded_cat['Category'], df_exploded_cat['sentiment_label'])
df_sentiment_by_cat = df_sentiment_by_cat.assign(
    Total=df_sentiment_by_cat.sum(axis=1)
).sort_values(by='Total', ascending=False)
display(df_sentiment_by_cat.drop('Total', axis=1))

print("\n--- Sentiment Breakdown by Sub-Category ---")
df_sentiment_by_subcat = pd.crosstab(df_exploded_subcat['Sub-Category'], df_exploded_subcat['sentiment_label'])
df_sentiment_by_subcat = df_sentiment_by_subcat.assign(
    Total=df_sentiment_by_subcat.sum(axis=1)
).sort_values(by='Total', ascending=False)
display(df_sentiment_by_subcat.drop('Total', axis=1))

# --- 4. Top Keywords for "Uncategorized" Verbatims ---
# The 20 most frequent cleaned tokens among verbatims no category matched; the
# first 10 are previewed. An empty, correctly-shaped table is kept when there
# is nothing to count.
print("\n--- Top Keywords in Uncategorized Verbatims ---")
uncategorized_text = " ".join(df[df['Category'] == 'Uncategorized']['cleaned_text'])
if not uncategorized_text.strip():
    print("No keywords to display for uncategorized verbatims.")
    df_uncategorized_keywords = pd.DataFrame(columns=['Keyword', 'Frequency'])
else:
    uncategorized_fdist = FreqDist(word_tokenize(uncategorized_text))
    _keyword_rows = uncategorized_fdist.most_common(20)
    df_uncategorized_keywords = pd.DataFrame(_keyword_rows, columns=['Keyword', 'Frequency'])
    display(df_uncategorized_keywords.head(10))

# --- 5. Category and Sub-Category Frequency Analysis & Visualization ---
def plot_and_save_top_n(series, title, filename, n=15):
    """Draw the first *n* entries of *series* as a horizontal bar chart.

    series   -- counts indexed by label (the first n rows are plotted).
    title    -- chart title, also used in the skip message.
    filename -- image file name, joined onto OUTPUT_FOLDER_PATH.
    n        -- number of leading entries to plot.

    The chart is saved and then shown. An empty series prints a notice and
    draws nothing.
    """
    if not series.empty:
        plt.figure(figsize=(12, 8))
        # Ascending order puts the largest bar at the top of a barh chart.
        leading_entries = series.head(n).sort_values(ascending=True)
        leading_entries.plot(kind='barh', color='skyblue')
        plt.title(title)
        plt.xlabel('Count')
        plt.subplots_adjust(left=0.4)
        destination = os.path.join(OUTPUT_FOLDER_PATH, filename)
        plt.savefig(destination, bbox_inches='tight')
        plt.show()
        return
    print(f"Skipping plot '{title}': No data.")

# --- Calculate Overall Frequencies ---
print("\n--- Calculating and Plotting Overall Frequencies ---")
# These two count tables are reused by the report cell, so "Uncategorized" is
# kept in the data and only removed from the category charts.
df_cat_counts = df_exploded_cat['Category'].value_counts()
df_subcat_counts = df_exploded_subcat['Sub-Category'].value_counts()

plot_and_save_top_n(df_cat_counts.drop('Uncategorized', errors='ignore'), 'Overall Top Categories', 'freq_cat_overall.png')
plot_and_save_top_n(df_subcat_counts, 'Overall Top Sub-Categories', 'freq_subcat_overall.png')

# --- Calculate Frequencies Split by Sentiment ---
print("\n--- Calculating and Plotting Frequencies by Sentiment ---")
for sentiment in ['Positive', 'Negative', 'Neutral']:
    df_sentiment_exploded_cat = df_exploded_cat[df_exploded_cat['sentiment_label'] == sentiment]
    if not df_sentiment_exploded_cat.empty:
        # "Uncategorized" is dropped here as well, so the per-sentiment charts
        # are comparable with the overall chart instead of being dominated by
        # the catch-all bucket.
        cat_counts = df_sentiment_exploded_cat['Category'].value_counts().drop('Uncategorized', errors='ignore')
        plot_and_save_top_n(cat_counts, f'Top Categories ({sentiment} Sentiment)', f'freq_cat_{sentiment.lower()}.png')

    df_sentiment_exploded_subcat = df_exploded_subcat[df_exploded_subcat['sentiment_label'] == sentiment]
    if not df_sentiment_exploded_subcat.empty:
        subcat_counts = df_sentiment_exploded_subcat['Sub-Category'].value_counts()
        plot_and_save_top_n(subcat_counts, f'Top Sub-Categories ({sentiment} Sentiment)', f'freq_subcat_{sentiment.lower()}.png')

# Cell 7: Similarity Analysis (Jaccard & Semantic)
**This cell performs the deeper analysis on vocabulary overlap and semantic relationships.**

In [None]:
# --- 1. Jaccard Similarity Between Sentiments ---
def get_top_ngrams(tokens, n, N_top):
    """Return the *N_top* most frequent n-grams of *tokens*, most frequent first.

    For n == 1 the items are the tokens themselves; for larger n they are
    tuples of n consecutive tokens. Frequencies are not returned.
    """
    if n == 1:
        candidates = tokens
    else:
        candidates = list(nltk.ngrams(tokens, n))
    ranked = FreqDist(candidates).most_common(N_top)
    return [gram for gram, _count in ranked]

def jaccard_similarity(list1, list2):
    """Return |A ∩ B| / |A ∪ B| for the distinct items of the two sequences.

    Duplicates are ignored. Two empty inputs give 0.0.
    """
    first, second = set(list1), set(list2)
    combined = first | second
    if not combined:
        return 0.0
    shared = first & second
    return len(shared) / len(combined)

# Compare how much the most frequent vocabulary overlaps between sentiments.
sentiments = ['Positive', 'Negative', 'Neutral']
tokens_by_sentiment = {
    s: word_tokenize(" ".join(df.loc[df['sentiment_label'] == s, 'cleaned_text']))
    for s in sentiments
}
comparisons = [("Positive", "Negative"), ("Positive", "Neutral"), ("Negative", "Neutral")]

# One row per sentiment pair, one column per n-gram size.
results = {"Comparison": [f"{s1} vs. {s2}" for s1, s2 in comparisons]}
for _column, _ngram_size, _top_count in (
    ("Words (Top 20)", 1, 20),
    ("Bigrams (Top 10)", 2, 10),
    ("Trigrams (Top 10)", 3, 10),
):
    results[_column] = [
        jaccard_similarity(
            get_top_ngrams(tokens_by_sentiment[s1], _ngram_size, _top_count),
            get_top_ngrams(tokens_by_sentiment[s2], _ngram_size, _top_count),
        )
        for s1, s2 in comparisons
    ]
similarity_df = pd.DataFrame(results)

# --- 2. Inter-Item Semantic & Jaccard Similarity ---
nlp = spacy.load("en_core_web_md")
top_words_overall = get_top_ngrams(all_tokens, 1, 20) # Top 20 Words
top_bigrams_overall = get_top_ngrams(all_tokens, 2, 10) # Top 10 Bigrams
top_trigrams_overall = get_top_ngrams(all_tokens, 3, 10) # Top 10 Trigrams

# Readable axis labels for the n-gram tuples.
_bigram_labels = [' '.join(g) for g in top_bigrams_overall]
_trigram_labels = [' '.join(g) for g in top_trigrams_overall]

# Word vs Word (Semantic Similarity)
# Each word goes through the spaCy pipeline once and the parsed Doc is reused
# for every pair, instead of re-parsing both words for each matrix cell.
_word_docs = [nlp(word) for word in top_words_overall]
matrix_words = np.array([[doc_a.similarity(doc_b) for doc_b in _word_docs] for doc_a in _word_docs])
similarity_df_words = pd.DataFrame(matrix_words, index=top_words_overall, columns=top_words_overall)

# Words vs Bigrams (Jaccard Similarity)
matrix_words_vs_bigrams = np.array([[jaccard_similarity([word], list(bigram)) for bigram in top_bigrams_overall] for word in top_words_overall])
similarity_df_words_vs_bigrams = pd.DataFrame(matrix_words_vs_bigrams, index=top_words_overall, columns=_bigram_labels)

# Words vs Trigrams (Jaccard Similarity)
matrix_words_vs_trigrams = np.array([[jaccard_similarity([word], list(trigram)) for trigram in top_trigrams_overall] for word in top_words_overall])
similarity_df_words_vs_trigrams = pd.DataFrame(matrix_words_vs_trigrams, index=top_words_overall, columns=_trigram_labels)

# Bigram vs Bigram (Jaccard Similarity)
matrix_bg = np.array([[jaccard_similarity(list(g1), list(g2)) for g2 in top_bigrams_overall] for g1 in top_bigrams_overall])
similarity_df_bigrams = pd.DataFrame(matrix_bg, index=_bigram_labels, columns=_bigram_labels)

# Trigram vs Trigram (Jaccard Similarity)
matrix_tg = np.array([[jaccard_similarity(list(g1), list(g2)) for g2 in top_trigrams_overall] for g1 in top_trigrams_overall])
similarity_df_trigrams = pd.DataFrame(matrix_tg, index=_trigram_labels, columns=_trigram_labels)


# --- 3. Generate and Save Heatmap Images ---
def create_and_save_heatmap(df_plot, title, filename, annot=False, cmap='viridis', figsize=(12, 10)):
    """Draw *df_plot* as a seaborn heatmap, save it under OUTPUT_FOLDER_PATH and show it.

    df_plot  -- matrix-like DataFrame to visualise.
    title    -- chart title.
    filename -- image file name, joined onto OUTPUT_FOLDER_PATH.
    annot    -- whether each cell is annotated with its value (two decimals).
    cmap     -- matplotlib colour map name.
    figsize  -- figure size passed to plt.figure.
    """
    destination = os.path.join(OUTPUT_FOLDER_PATH, filename)
    heatmap_options = {'annot': annot, 'cmap': cmap, 'fmt': ".2f"}
    plt.figure(figsize=figsize)
    sns.heatmap(df_plot, **heatmap_options)
    plt.title(title, fontsize=16)
    plt.savefig(destination, bbox_inches='tight')
    plt.show()

# Generate and save the heatmaps (one image per similarity matrix).
create_and_save_heatmap(
    df_plot=similarity_df_words,
    title='Semantic Similarity of Top 20 Words',
    filename='heatmap_words.png',
)
create_and_save_heatmap(
    df_plot=similarity_df_bigrams,
    title='Jaccard Similarity of Top 10 Bigrams',
    filename='heatmap_bigrams.png',
    annot=True,
    cmap='coolwarm',
)

# Trigrams vs Trigrams
create_and_save_heatmap(
    df_plot=similarity_df_trigrams,
    title='Jaccard Similarity of Top 10 Trigrams',
    filename='heatmap_trigrams.png',
    annot=True,
    cmap='coolwarm',
)

# Words vs Bigrams (a taller figure, since it has 20 rows)
create_and_save_heatmap(
    df_plot=similarity_df_words_vs_bigrams,
    title='Jaccard Similarity: Top Words vs. Top Bigrams',
    filename='heatmap_words_vs_bigrams.png',
    annot=True,
    cmap='magma',
    figsize=(10, 12),
)

# Words vs Trigrams
create_and_save_heatmap(
    df_plot=similarity_df_words_vs_trigrams,
    title='Jaccard Similarity: Top Words vs. Top Trigrams',
    filename='heatmap_words_vs_trigrams.png',
    annot=True,
    cmap='magma',
    figsize=(10, 12),
)

# Cell 8: Final Report Generation
**This final cell gathers every DataFrame and image and compiles them into a single, multi-sheet Excel report.**

In [None]:
# ==============================================================================
# 8. FINAL REPORT GENERATION (FINAL, ROBUST VERSION)
# ==============================================================================
# Make sure to have this import for adding images
from openpyxl.drawing.image import Image
import pandas as pd
import os
from datetime import datetime

# --- 0. Helper Function to Add Images ---
# This helper function simplifies adding images by checking if the file exists first.
def add_image_if_exists(worksheet, image_filename, cell_anchor, folder_path):
    """Embed an image file in a worksheet, or warn if the file is missing.

    Args:
        worksheet: Worksheet object exposing ``add_image``.
        image_filename: File name of the image inside ``folder_path``.
        cell_anchor: Cell reference (e.g. ``'E2'``) for the image's anchor.
        folder_path: Directory that is expected to contain the image.
    """
    image_path = os.path.join(folder_path, image_filename)

    # Guard clause: a missing chart is reported and skipped, never fatal.
    if not os.path.exists(image_path):
        print(f"Warning: Image file not found at {image_path}, skipping.")
        return

    worksheet.add_image(Image(image_path), cell_anchor)

# --- 1. Generate Versioned Filename ---
# File names follow <date>_<program>_<kpis>_<lobs>_Verbatim_Analysis_vMM.mm.xlsx.
# The minor version is bumped until the name is unused in the output folder,
# so an earlier report is never overwritten.
current_date = f"{datetime.now():%Y-%m-%d}"
base_filename = f"{current_date}_{PROGRAM_NAME}_{KPIS_IN_SCOPE}_{LOBS_IN_SCOPE}_Verbatim_Analysis"

minor_version = -1
full_path = None
while full_path is None or os.path.exists(full_path):
    minor_version += 1
    version_str = f"v{MAJOR_VERSION:02d}.{minor_version:02d}"
    output_filename = f"{base_filename}_{version_str}.xlsx"
    full_path = os.path.join(OUTPUT_FOLDER_PATH, output_filename)

# --- 2. Use ExcelWriter to save all results ---
def _write_stacked_tables(writer, sheet_name, sections):
    """Write titled tables one below another on a single worksheet.

    Args:
        writer: Open ``pd.ExcelWriter`` receiving the output.
        sheet_name: Name of the worksheet to write to.
        sections: Sequence of ``(title, frame, write_index)`` tuples in
            top-to-bottom order; ``write_index`` is forwarded to
            ``DataFrame.to_excel(index=...)``.

    Returns:
        ``(worksheet, title_rows, next_title_row)`` where ``title_rows[i]`` is
        the 1-based row of the i-th title and ``next_title_row`` is the row a
        further section's title would occupy.
    """
    title_rows = []
    next_title_row = 1
    for _, frame, write_index in sections:
        title_rows.append(next_title_row)
        # ``startrow`` is 0-based while worksheet rows are 1-based, so passing
        # the title's row number puts the table header directly below the title.
        frame.to_excel(writer, sheet_name=sheet_name, index=write_index, startrow=next_title_row)
        # Title row + header row + data rows + one blank spacer row.
        # NOTE(review): assumes a single header row (no MultiIndex columns).
        next_title_row += len(frame) + 3

    # Titles go in only after every table is written, because the worksheet
    # is looked up from ``writer.sheets`` once ``to_excel`` has created it.
    worksheet = writer.sheets[sheet_name]
    for (title, _, _), title_row in zip(sections, title_rows):
        worksheet.cell(row=title_row, column=1, value=title)
    return worksheet, title_rows, next_title_row


with pd.ExcelWriter(full_path, engine='openpyxl') as writer:
    print(f"\n--- 🚀 Writing to Excel file: {output_filename} ---")

    # --- Sheet 1: Executive Summary ---
    print("Writing Sheet: Executive_Summary")
    ws_summary, summary_rows, _ = _write_stacked_tables(writer, 'Executive_Summary', [
        ("Categorization Summary", df_summary_metrics, False),
        ("Topic Model Evaluation (Silhouette & Coherence)", df_topic_evaluation.round(3), False),
        ("Overall Category Counts", df_cat_counts.to_frame(name='Count'), True),
        ("Overall Sub-Category Counts", df_subcat_counts.to_frame(name='Count'), True),
    ])
    # Charts sit to the right of the tables (column E), each aligned with the
    # header row of the table it illustrates (one row below that table's title).
    add_image_if_exists(ws_summary, 'topic_evaluation_charts.png', 'E2', OUTPUT_FOLDER_PATH)
    add_image_if_exists(ws_summary, 'freq_cat_overall.png', f'E{summary_rows[2] + 1}', OUTPUT_FOLDER_PATH)
    add_image_if_exists(ws_summary, 'freq_subcat_overall.png', f'E{summary_rows[3] + 1}', OUTPUT_FOLDER_PATH)

    # --- Sheet 2: Categorization Results (Full Data) ---
    print("Writing Sheet: Categorization_Results")
    df.to_excel(writer, sheet_name='Categorization_Results', index=False)

    # --- Sheet 3: Sentiment Analysis ---
    print("Writing Sheet: Sentiment_Analysis")
    ws_sentiment, _, next_row = _write_stacked_tables(writer, 'Sentiment_Analysis', [
        ("Sentiment Breakdown by Category", df_sentiment_by_cat, True),
        ("Sentiment Breakdown by Sub-Category", df_sentiment_by_subcat, True),
    ])
    add_image_if_exists(ws_sentiment, 'sentiment_distribution.png', 'G2', OUTPUT_FOLDER_PATH)

    # Per-sentiment chart pairs start below the last table: category chart in
    # column A, sub-category chart in column K, 40 rows reserved per pair.
    current_row = next_row + 1
    sentiment_charts = [
        ('Positive Sentiment', 'freq_cat_positive.png', 'freq_subcat_positive.png'),
        ('Negative Sentiment', 'freq_cat_negative.png', 'freq_subcat_negative.png'),
        ('Neutral Sentiment', 'freq_cat_neutral.png', 'freq_subcat_neutral.png'),
    ]
    for title, cat_chart, subcat_chart in sentiment_charts:
        ws_sentiment.cell(row=current_row, column=1, value=title)
        add_image_if_exists(ws_sentiment, cat_chart, f'A{current_row + 1}', OUTPUT_FOLDER_PATH)
        add_image_if_exists(ws_sentiment, subcat_chart, f'K{current_row + 1}', OUTPUT_FOLDER_PATH)
        current_row += 40

    # --- Sheet 4: Exploratory Analysis (N-Grams & Word Clouds) ---
    print("Writing Sheet: Exploratory_Analysis")
    ws_exploratory, _, _ = _write_stacked_tables(writer, 'Exploratory_Analysis', [
        ("Top 20 Words", df_top_words, False),
        ("Top 10 Bigrams", df_top_bigrams, False),
        ("Top 10 Trigrams", df_top_trigrams, False),
    ])
    # Word clouds use fixed anchors in column E, independent of table lengths.
    add_image_if_exists(ws_exploratory, 'wordcloud_all.png', 'E2', OUTPUT_FOLDER_PATH)
    add_image_if_exists(ws_exploratory, 'wordcloud_positive.png', 'E25', OUTPUT_FOLDER_PATH)
    add_image_if_exists(ws_exploratory, 'wordcloud_negative.png', 'E50', OUTPUT_FOLDER_PATH)

    # --- Sheet 5: Topic Modeling & Uncategorized ---
    print("Writing Sheet: Topic_Modeling_Deep_Dive")
    _write_stacked_tables(writer, 'Topic_Modeling_Deep_Dive', [
        ("Discovered Topics (LDA)", df_topics, False),
        ("Top Keywords in Uncategorized Verbatims", df_uncategorized_keywords, False),
    ])

    # --- Sheet 6: Similarity Analysis (REVISED) ---
    print("Writing Sheet: Similarity_Analysis")
    ws_similarity, _, next_row = _write_stacked_tables(writer, 'Similarity_Analysis', [
        ("Jaccard Similarity Between Sentiments", similarity_df, False),
    ])

    # Heatmaps are laid out as a grid below the table: up to two per grid row
    # (columns A and O), each with its title one row above the image, and
    # 55 worksheet rows reserved per grid row.
    current_row = next_row + 1
    heatmap_grid = [
        [("Semantic Similarity of Top 20 Words", 'heatmap_words.png'),
         ("Jaccard Similarity of Top 10 Bigrams", 'heatmap_bigrams.png')],
        [("Jaccard Similarity of Top 10 Trigrams", 'heatmap_trigrams.png'),
         ("Jaccard Similarity: Top Words vs. Top Bigrams", 'heatmap_words_vs_bigrams.png')],
        [("Jaccard Similarity: Top Words vs. Top Trigrams", 'heatmap_words_vs_trigrams.png')],
    ]
    grid_columns = [(1, 'A'), (15, 'O')]  # (column number, column letter)
    for grid_row in heatmap_grid:
        for (col_number, col_letter), (title, image_file) in zip(grid_columns, grid_row):
            ws_similarity.cell(row=current_row, column=col_number, value=title)
            add_image_if_exists(ws_similarity, image_file, f'{col_letter}{current_row + 1}', OUTPUT_FOLDER_PATH)
        current_row += 55


print("\n✅ All analysis results and plots have been saved to a multi-sheet file:")
print(f"   {full_path}")

# Cell 9: Clear the `/kaggle/working/` output directory

In [None]:
# # Select the lines below and press Ctrl + / to toggle commenting on them

# import os
# import shutil

# # This is the directory you want to clear
# output_dir = '/kaggle/working/'

# # Loop through everything in the directory
# for filename in os.listdir(output_dir):
#     file_path = os.path.join(output_dir, filename)
#     try:
#         # If it's a file or link, delete it
#         if os.path.isfile(file_path) or os.path.islink(file_path):
#             os.unlink(file_path)
#         # If it's a directory, delete it and all its contents
#         elif os.path.isdir(file_path):
#             shutil.rmtree(file_path)
#         print(f"Deleted: {filename}")
#     except Exception as e:
#         print(f'Failed to delete {file_path}. Reason: {e}')

# print("\n✅ Output directory has been cleared.")