In [None]:
# Cell 1: Setup and Configuration
# --- 1. Install Libraries (Reliable Method) ---
import sys
# !{sys.executable} -m pip install pandas openpyxl matplotlib wordcloud seaborn
# !{sys.executable} -m pip install --upgrade nltk scikit-learn spacy
# !{sys.executable} -m pip install --upgrade Pillow==9.5.0

# --- 2. Import Libraries ---
import os
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NLP, visualisation and topic-modelling imports used by the cells below
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import FreqDist, bigrams, trigrams
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# --- 3. Download NLP Models ---
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('vader_lexicon', quiet=True) # Correct
nltk.download('omw-1.4', quiet=True)
!{sys.executable} -m spacy download en_core_web_md

# --- 4. 🔴 USER CONFIGURATION 🔴 ---
# Everything a user is expected to edit before running the notebook lives here.

# Path of the Excel workbook that the data-loading cell reads with pd.read_excel.
EXCEL_FILE_PATH = r'///.xlsx'
# Name of the column holding the free-text feedback to analyse.
TEXT_COLUMN_NAME = 'Verbatim'
# Extra words removed during cleaning, on top of NLTK's English stop-word list.
CUSTOM_STOP_WORDS = {'app', 'product', 'service', 'company'}
# Number of topics (n_components) requested from the LDA topic model.
TOPIC_MODEL_TOPICS = 3
# Folder that receives the result workbook and the heatmap PNG files;
# it is created by the export cell if it does not exist.
output_folder_path = r'///Analysis_Results'
# The three labels below are only used to build the output file name.
program_name = "Program"
kpis_in_scope = "KPI"
lobs_in_scope = "LOB"

print("✅ Setup complete. All libraries and models are ready.")

In [None]:
# Cell 2: Data Loading and Preprocessing
# --- 1. Load Data ---
# Read the feedback workbook. When the file is missing, fall back to a tiny
# built-in sample so the rest of the notebook can still be demonstrated.
try:
    df = pd.read_excel(EXCEL_FILE_PATH)
    print(f"Successfully loaded {len(df)} rows from '{EXCEL_FILE_PATH}'.")
except FileNotFoundError:
    print("⚠️ Warning: File not found. Loading dummy data for demonstration.")
    df = pd.DataFrame({
        TEXT_COLUMN_NAME: [
            "The customer service was excellent! Very helpful and friendly.",
            "I'm very unhappy with the new update. It's slow and buggy.",
            "It's okay, but the price is too high for what you get.",
            "Love the new design! The user interface is so much better.",
            "The app crashes all the time. Please fix this bug.",
        ]
    })

# Fail early with an actionable message when the configured column is absent,
# instead of letting dropna() raise a bare KeyError.
if TEXT_COLUMN_NAME not in df.columns:
    raise KeyError(
        f"Column '{TEXT_COLUMN_NAME}' not found in '{EXCEL_FILE_PATH}'. "
        f"Available columns: {list(df.columns)}"
    )

# Rows without any feedback text cannot be analysed. Re-assign rather than
# mutate in place so re-running the cell always starts from the loaded frame.
df = df.dropna(subset=[TEXT_COLUMN_NAME])

# --- 2. Text Cleaning ---
# Stop-word set = NLTK's English list plus the user-configured extras.
english_stop_words = set(stopwords.words('english'))
stop_words = english_stop_words.union(CUSTOM_STOP_WORDS)
# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

# Translation table that strips ASCII punctuation; built once instead of per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Normalise one piece of feedback into a space-separated string of lemmas.

    Steps: lowercase, turn line breaks into spaces, drop digits and ASCII
    punctuation, tokenize, then keep the lemma of every token that is longer
    than two characters and not a stop word.

    Args:
        text: Raw cell value. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Replace newlines with a space (not nothing) so that words on adjacent
    # lines are not glued together, e.g. "slow\nbuggy" must not become "slowbuggy".
    text = text.replace('\n', ' ')
    text = re.sub(r'\d', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(text.strip())
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]
    return " ".join(cleaned_tokens)

df['cleaned_text'] = df[TEXT_COLUMN_NAME].apply(clean_text)

# --- 3. Sentiment Analysis ---
# VADER is run on the raw text (not the cleaned text) so punctuation and
# capitalisation are still available to the scorer.
sia = SentimentIntensityAnalyzer()
# clean_text tolerates non-string cells (e.g. a numeric cell in the workbook);
# coerce to str here as well so a single such cell does not abort the scoring.
df['sentiment_compound'] = df[TEXT_COLUMN_NAME].apply(
    lambda value: sia.polarity_scores(str(value))['compound']
)

def categorize_sentiment(compound):
    """Map a compound sentiment score to a coarse label.

    Scores of 0.05 and above are 'Positive', scores of -0.05 and below are
    'Negative', and everything else is 'Neutral'.
    """
    if compound >= 0.05:
        label = 'Positive'
    elif compound <= -0.05:
        label = 'Negative'
    else:
        label = 'Neutral'
    return label

df['sentiment_label'] = df['sentiment_compound'].apply(categorize_sentiment)

print("\n--- Data Preview with Cleaned Text and Sentiment ---")
display(df[[TEXT_COLUMN_NAME, 'cleaned_text', 'sentiment_label']].head())

# --- 4. Plot Sentiment Distribution ---
# value_counts() orders labels by frequency, so colours must be looked up per
# label; a fixed positional colour list would paint e.g. 'Negative' green
# whenever it happens to be the most common label.
SENTIMENT_COLORS = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'grey'}
sentiment_counts = df['sentiment_label'].value_counts()

plt.figure(figsize=(6, 4))
sentiment_counts.plot(
    kind='bar',
    color=[SENTIMENT_COLORS[label] for label in sentiment_counts.index],
)
plt.title('Sentiment Distribution')
plt.ylabel('Number of Responses')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Cell 3: Overall Frequency Analysis & Word Clouds
# --- 1. Prepare Overall Text and Tokens ---
# Concatenate every cleaned response into one string and tokenize it again.
# `all_cleaned_text` feeds the overall word cloud below; `all_tokens` is reused
# by the overall similarity cell later in the notebook.
all_cleaned_text = " ".join(df['cleaned_text'])
all_tokens = word_tokenize(all_cleaned_text)

# --- 2. Frequency Distribution ---
# Count token occurrences, then print and plot the 20 most frequent ones.
fdist = FreqDist(all_tokens)
print("--- Top 20 Most Common Words ---")
print(fdist.most_common(20))
fdist.plot(20, title='Top 20 Most Common Words')
plt.show()

# --- 3. Word Cloud Visualization ---
def generate_wordcloud(text, title):
    """Render a word cloud for ``text`` under the given ``title``.

    When ``text`` is empty or whitespace-only, nothing is drawn and a short
    notice is printed instead.
    """
    has_content = bool(text.strip())
    if has_content:
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis',
        ).generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.show()
    else:
        print(f"Skipping '{title}' word cloud: No text available.")

# Word clouds: one for all feedback, then one per polarised sentiment label.
generate_wordcloud(all_cleaned_text, 'Word Cloud (All Feedback)')
for sentiment_name, cloud_title in (
    ('Positive', 'Word Cloud (Positive Feedback)'),
    ('Negative', 'Word Cloud (Negative Feedback)'),
):
    sentiment_text = " ".join(df.loc[df['sentiment_label'] == sentiment_name, 'cleaned_text'])
    generate_wordcloud(sentiment_text, cloud_title)

In [None]:
# Cell 4: Topic Modeling (LDA)
# --- Topic Modeling using LDA ---
print("\n--- Discovering Latent Topics ---")
N_TOP_WORDS_PER_TOPIC = 10  # how many of the highest-weighted words to report per topic

# Bag-of-words matrix: ignore terms in more than 90% of documents or in fewer than 2.
vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
try:
    dtm = vectorizer.fit_transform(df['cleaned_text'].dropna())
except ValueError as err:
    # CountVectorizer raises ValueError when no term survives the min_df/max_df
    # pruning (typical for very small datasets). Without this handler the
    # "not enough data" branch below would never be reached.
    print(f"Could not build a vocabulary for topic modeling: {err}")
    dtm = None

if dtm is not None and dtm.shape[0] > 1:
    lda = LatentDirichletAllocation(n_components=TOPIC_MODEL_TOPICS, random_state=42)
    lda.fit(dtm)

    # Extract the top words of each topic once and reuse them for both outputs.
    feature_names = vectorizer.get_feature_names_out()
    topic_word_lists = []
    for topic_idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[::-1][:N_TOP_WORDS_PER_TOPIC]
        topic_word_lists.append((f"Topic #{topic_idx + 1}", [feature_names[i] for i in top_indices]))

    # Create a new DataFrame to hold the results (picked up by the export cell).
    topic_results = [[topic_label, ", ".join(words)] for topic_label, words in topic_word_lists]
    df_topics = pd.DataFrame(topic_results, columns=['Topic', 'Top_Words'])
    print("Topic Modeling Results:")
    display(df_topics)

    # Plain-text listing of the same topics. A dedicated loop variable is used
    # so the module-level name `top_words` is not bound to a string here.
    for topic_label, words in topic_word_lists:
        print(f"{topic_label}: {' '.join(words)}")
else:
    print("Not enough data to perform topic modeling.")

In [None]:
# Cell 5: Jaccard Similarity (Between Sentiments)
# --- 1. Jaccard Similarity Analysis ---
# How many of the most frequent unigrams / bigrams / trigrams are kept per
# sentiment label before the label vocabularies are compared.
N_WORDS, N_BIGRAMS, N_TRIGRAMS = 20, 10, 10

def get_top_ngrams(tokens, n, N_top):
    """Return the ``N_top`` most frequent n-grams of ``tokens``, most frequent first.

    Args:
        tokens: Iterable of tokens.
        n: Size of the n-gram. ``1`` returns the tokens themselves; any larger
            value returns tuples of ``n`` consecutive tokens.
        N_top: Maximum number of n-grams to return.

    Returns:
        A list of tokens (``n == 1``) or token tuples (``n > 1``).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    tokens = list(tokens)
    if n == 1:
        ngrams_list = tokens
    else:
        # Sliding window of width n: zip the sequence with its shifted copies.
        ngrams_list = list(zip(*(tokens[offset:] for offset in range(n))))
    return [item for item, _ in FreqDist(ngrams_list).most_common(N_top)]

sentiments = ['Positive', 'Negative', 'Neutral']

# Tokens of all cleaned responses, grouped by sentiment label.
tokens_by_sentiment = {
    label: word_tokenize(" ".join(df.loc[df['sentiment_label'] == label, 'cleaned_text']))
    for label in sentiments
}

# For each n-gram size, the most frequent n-grams per sentiment label.
top_words, top_bigrams, top_trigrams = (
    {label: get_top_ngrams(tokens_by_sentiment[label], size, limit) for label in sentiments}
    for size, limit in ((1, N_WORDS), (2, N_BIGRAMS), (3, N_TRIGRAMS))
)

def jaccard_similarity(list1, list2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of the two collections.

    Both arguments are converted to sets first, so duplicates are ignored.
    Two empty collections give 0.0.
    """
    first, second = set(list1), set(list2)
    combined = first | second
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

# --- 2. Display Results ---
# One row per pair of sentiment labels, one column per n-gram size.
comparisons = [("Positive", "Negative"), ("Positive", "Neutral"), ("Negative", "Neutral")]
results = {
    "Comparison": [f"{s1} vs. {s2}" for s1, s2 in comparisons],
    f"Words (Top {N_WORDS})": [jaccard_similarity(top_words[s1], top_words[s2]) for s1, s2 in comparisons],
    f"Bigrams (Top {N_BIGRAMS})": [jaccard_similarity(top_bigrams[s1], top_bigrams[s2]) for s1, s2 in comparisons],
    f"Trigrams (Top {N_TRIGRAMS})": [jaccard_similarity(top_trigrams[s1], top_trigrams[s2]) for s1, s2 in comparisons]
}

# Keep the table under the name the export cell looks for (`similarity_df`);
# without this binding the 'Jaccard_Similarity' sheet is never written.
# The stored frame keeps full precision; only the on-screen view is rounded.
similarity_df = pd.DataFrame(results)

print("\n--- Vocabulary Similarity Between Sentiments ---")
display(similarity_df.round(3))

In [None]:
# --- 1. Load spaCy Model and Define Jaccard Function ---
# `nlp` stays None when the model is not installed; the similarity section
# below checks it before doing any work.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    nlp = None
    print("spaCy model not found. Please run Cell 1 to download.")
else:
    print("\n--- Calculating Inter-Item Similarity ---")

# NOTE(review): this re-defines the `jaccard_similarity` already defined in the
# sentiment-comparison cell with identical behaviour; one of the two can go.
def jaccard_similarity(list1, list2):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of the two collections.

    Both arguments are converted to sets first, so duplicates are ignored.
    Two empty collections give 0.0.
    """
    first, second = set(list1), set(list2)
    combined = first | second
    if not combined:
        return 0.0
    return len(first & second) / len(combined)

if nlp is not None:
    # --- 2. Semantic Similarity for Words ---
    top_words_overall = get_top_ngrams(all_tokens, 1, 20)

    print(f"\n--- Top {len(top_words_overall)} Overall Words ---")
    print(top_words_overall)

    # Run the spaCy pipeline once per word instead of once per matrix cell;
    # the pairwise similarities are then computed on the cached Doc objects.
    word_docs = [nlp(word) for word in top_words_overall]
    matrix = np.array([[doc_a.similarity(doc_b) for doc_b in word_docs] for doc_a in word_docs])
    similarity_df_words = pd.DataFrame(matrix, index=top_words_overall, columns=top_words_overall)

    # Titles report the actual number of items, which can be below the
    # requested maximum on small datasets.
    plt.figure(figsize=(12, 10))
    sns.heatmap(similarity_df_words, cmap='viridis')
    plt.title(f'Semantic Similarity Matrix of Top {len(top_words_overall)} Words', fontsize=16)
    plt.show()

    # --- 3. Jaccard Similarity for N-Grams ---
    def create_ngram_df(ngrams):
        """Pairwise Jaccard similarity between n-grams, each treated as a set of its words."""
        matrix = np.array([[jaccard_similarity(list(g1), list(g2)) for g2 in ngrams] for g1 in ngrams])
        labels = [' '.join(gram) for gram in ngrams]
        return pd.DataFrame(matrix, index=labels, columns=labels)

    top_bigrams_overall = get_top_ngrams(all_tokens, 2, 10)
    top_trigrams_overall = get_top_ngrams(all_tokens, 3, 10)

    print(f"\n--- Top {len(top_bigrams_overall)} Overall Bigrams ---")
    print([' '.join(bigram) for bigram in top_bigrams_overall])

    print(f"\n--- Top {len(top_trigrams_overall)} Overall Trigrams ---")
    print([' '.join(trigram) for trigram in top_trigrams_overall])

    similarity_df_bigrams = create_ngram_df(top_bigrams_overall)
    similarity_df_trigrams = create_ngram_df(top_trigrams_overall)

    # Plot the bigram heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_df_bigrams, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Jaccard Similarity Matrix of Top {len(top_bigrams_overall)} Bigrams', fontsize=16)
    plt.show()

    # Plot the trigram heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(similarity_df_trigrams, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Jaccard Similarity Matrix of Top {len(top_trigrams_overall)} Trigrams', fontsize=16)
    plt.show()

    # Only claim success when the matrices were actually built.
    print("\n✅ Similarity matrices created and assigned to variables.")
else:
    print("\n⚠️ Similarity matrices were not created because the spaCy model is unavailable.")

In [None]:
import os
from datetime import datetime
from openpyxl.drawing.image import Image # Required for adding images

# --- 1. Create the output folder if it doesn't exist ---
os.makedirs(output_folder_path, exist_ok=True)

# --- 2. Construct the dynamic filename ---
# <date>_<program>_<KPIs>_<LOBs>_Verbatim_Analysis_Results.xlsx inside the output folder.
current_date = datetime.now().strftime('%Y-%m-%d')
dynamic_filename = f"{current_date}_{program_name}_{kpis_in_scope}_{lobs_in_scope}_Verbatim_Analysis_Results.xlsx"
full_path_filename = os.path.join(output_folder_path, dynamic_filename)

# --- 3. Save Heatmap Plots as Image Files ---
# Target paths of the three heatmap images, all inside the output folder.
path_img_words, path_img_bigrams, path_img_trigrams = (
    os.path.join(output_folder_path, image_name)
    for image_name in ('word_similarity.png', 'bigram_similarity.png', 'trigram_similarity.png')
)

# --- 4-6. Save the word / bigram / trigram similarity heatmaps as PNG files ---
def _save_heatmap(matrix_name, image_path, title, figsize, **heatmap_kwargs):
    """Render the similarity matrix stored under ``matrix_name`` and save it as an image.

    The matrix is looked up by name in the notebook namespace because it only
    exists when the similarity cell ran successfully. When it is missing, the
    plot is skipped and any image left at ``image_path`` by an earlier run is
    removed, so the export step cannot embed a stale picture.

    Args:
        matrix_name: Name of the global variable holding the DataFrame.
        image_path: Destination path of the PNG file.
        title: Figure title.
        figsize: ``(width, height)`` of the figure in inches.
        **heatmap_kwargs: Passed through to ``sns.heatmap``.

    Returns:
        True when the image was written, False when the plot was skipped.
    """
    matrix = globals().get(matrix_name)
    if matrix is None:
        print(f"Skipping '{title}': '{matrix_name}' has not been created.")
        if os.path.exists(image_path):
            os.remove(image_path)
        return False

    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(matrix, ax=ax, **heatmap_kwargs)
    ax.set_title(title, fontsize=16)
    fig.savefig(image_path, bbox_inches='tight')
    plt.close(fig)  # the figure is only needed on disk, not in the notebook output
    return True

_save_heatmap('similarity_df_words', path_img_words,
              'Semantic Similarity Matrix of Top 20 Words', (12, 10),
              cmap='viridis')
_save_heatmap('similarity_df_bigrams', path_img_bigrams,
              'Jaccard Similarity Matrix of Top 10 Bigrams', (10, 8),
              annot=True, cmap='coolwarm', fmt=".2f")
_save_heatmap('similarity_df_trigrams', path_img_trigrams,
              'Jaccard Similarity Matrix of Top 10 Trigrams', (10, 8),
              annot=True, cmap='coolwarm', fmt=".2f")

# --- 7. Use ExcelWriter to save all data and embed images ---
with pd.ExcelWriter(full_path_filename, engine='openpyxl') as writer:
    # The per-response sentiment table is always available.
    df[[TEXT_COLUMN_NAME, 'cleaned_text', 'sentiment_label']].to_excel(writer, sheet_name='Sentiment_Analysis', index=False)

    # Optional results: (variable name, sheet name, write index?). Each sheet is
    # written only when the cell that produces the variable has been run.
    # The matrices keep their index because it carries the row labels.
    optional_sheets = [
        ('similarity_df', 'Jaccard_Similarity', False),
        ('df_topics', 'Topic_Modeling_Results', False),
        ('similarity_df_words', 'Word_Similarity_Matrix', True),
        ('similarity_df_bigrams', 'Bigram_Similarity_Matrix', True),
        ('similarity_df_trigrams', 'Trigram_Similarity_Matrix', True),
    ]
    for variable_name, sheet_name, keep_index in optional_sheets:
        if variable_name in globals():
            globals()[variable_name].to_excel(writer, sheet_name=sheet_name, index=keep_index)

    # --- Embed Images into the Excel Sheets ---
    workbook = writer.book

    # (sheet name, image path, anchor cell). An image is embedded only when both
    # its sheet and its file exist; checking the file alone would raise KeyError
    # for a PNG left over from an earlier run whose sheet was not written this time.
    embedded_images = [
        ('Word_Similarity_Matrix', path_img_words, 'W2'),
        ('Bigram_Similarity_Matrix', path_img_bigrams, 'O2'),
        ('Trigram_Similarity_Matrix', path_img_trigrams, 'O2'),
    ]
    for sheet_name, image_path, anchor_cell in embedded_images:
        if sheet_name in workbook.sheetnames and os.path.exists(image_path):
            workbook[sheet_name].add_image(Image(image_path), anchor_cell)

print("✅ All analysis results and plots have been saved to a multi-sheet file:")
print(full_path_filename)