In [1]:
# Fix imports when running from notebooks/ folder
import sys
from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory, i.e. this cell expects the kernel to be started in notebooks/.
project_root = Path.cwd().parent 
# Prepend the root only once so re-running the cell does not stack duplicate
# entries; the later `src.*` and `scripts.*` imports rely on this entry.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# NOTE: this message is printed even when the path was already present.
print(f"Added to path: {project_root}")

%load_ext autoreload
%autoreload 2

Added to path: /Users/elshaday/DEV/10Academy/customer-experience-analytics-week2


In [2]:
from src.data import DataManager
from tabulate import tabulate
from src.analysis import KeywordExtractor, ThemeClusterer
from tabulate import tabulate
import pandas as pd
from scripts.constants import THEMES

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elshaday/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# DataManager (from src.data) handles loading here and saving at the end of
# the notebook.
dm = DataManager()
# The loaded object is used as a pandas DataFrame below; later cells read its
# "clean_review" and "bank" columns.
reviews_df = dm.load_data()

In [4]:
# Quick sanity check of the loaded data: render the first rows (head()) as a
# psql-style text table, using the DataFrame's column names as headers.
print("Showing 5 records as an overview:")
print(tabulate(reviews_df.head(), headers="keys", tablefmt="psql"))

Showing 5 records as an overview:
+----+--------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+---------------------+--------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------+------------------+------------------+
|    | review_id                            | review                                                                                               

In [5]:
# 1. Extract Keywords
# TfidfVectorizer with unigrams and bigrams (n=1–2)
# spaCy to extract nouns/verbs if you want cleaner thematic clusters

# 2. Group keywords into themes
# 3-5 themes per bank
# Look at top TF-IDF keywords, Cluster similar keywords, Assign each cluster a theme label

# 3. Assign Theme to Each Review
# Check if any keyword from a theme appears
# Add a theme column

# 4. Group by bank and theme
# Count of each theme per bank
# Average sentiment per theme
# Top keywords per theme
# Correlation between rating and sentiment

In [6]:
# Notebook-level helpers shared by analyze() below.
extractor = KeywordExtractor()
# n_clusters=5 presumably corresponds to the upper end of the "3-5 themes per
# bank" target in the plan above -- TODO confirm.
clusterer = ThemeClusterer(n_clusters=5)


def analyze(
    df: pd.DataFrame,
    review_col: str,
    bank_col: str,
    keyword_extractor=None,
    theme_clusterer=None,
) -> dict:
    """Derive keyword themes separately for every bank in ``df``.

    For each distinct, non-missing value in ``bank_col`` the reviews of that
    bank are passed to the keyword extractor, the resulting keywords are
    clustered, and the clusters are turned into labelled themes.

    Args:
        df: Reviews table containing ``review_col`` and ``bank_col``.
        review_col: Name of the column holding the review text.
        bank_col: Name of the column identifying the bank.
        keyword_extractor: Object exposing ``extract_keywords_tfidf(reviews)``
            that yields ``(keyword, score)`` pairs. Defaults to the
            notebook-level ``extractor``.
        theme_clusterer: Object exposing ``cluster(keywords)`` and
            ``assign_labels(clusters)``. Defaults to the notebook-level
            ``clusterer``.

    Returns:
        A dict mapping each bank to whatever ``assign_labels`` returns for it
        (used downstream as a ``theme -> keywords`` mapping). A bank with no
        non-missing reviews maps to an empty dict.
    """
    # Fall back to the notebook-level instances so existing three-argument
    # calls keep working unchanged.
    if keyword_extractor is None:
        keyword_extractor = extractor
    if theme_clusterer is None:
        theme_clusterer = clusterer

    bank_themes = {}
    # A missing bank value can never satisfy the equality filter below, so it
    # would only ever yield an empty review set; skip it up front.
    for bank in df[bank_col].dropna().unique():
        # Drop missing reviews *before* casting to str: otherwise NaN/None is
        # converted to the literal text "nan"/"None" and pollutes the keywords.
        bank_reviews = (
            df.loc[df[bank_col] == bank, review_col].dropna().astype(str)
        )
        if bank_reviews.empty:
            # Nothing to extract keywords from; record the bank with no themes.
            bank_themes[bank] = {}
            continue

        # Extract keywords (scores are not needed for clustering).
        kw_with_scores = keyword_extractor.extract_keywords_tfidf(bank_reviews)
        keywords = [kw for kw, _ in kw_with_scores]

        # Cluster keywords into themes and attach a label to each cluster.
        clusters = theme_clusterer.cluster(keywords)
        bank_themes[bank] = theme_clusterer.assign_labels(clusters)

    return bank_themes


# Run per-bank theme analysis
# Run the per-bank theme analysis on the cleaned review text, grouping rows by
# the "bank" column.
bank_themes = analyze(
    reviews_df, review_col="clean_review", bank_col="bank"
)

# Print one grid table per bank: one row per theme, with that theme's keywords
# joined into a single comma-separated cell.
for bank, themes in bank_themes.items():
    print(f"\n=== {bank} ===")
    rows = [[theme, ", ".join(words)] for theme, words in themes.items()]
    print(tabulate(rows, headers=["Theme", "Keywords"], tablefmt="grid"))


=== boa ===
+-------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Theme                               | Keywords                                                                                                                                                           |
| App Performance & Reliability       | please fix, please try, sometimes work, never work, working please, work well, work device, please check, work version, please working, work please, please invest |
+-------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Customer Support & Feature Requests | good job, good service, great service, customer service, overall good, service please                                          

In [8]:
# Progress marker for the per-review theme assignment step.
print("Apply theme per review")
def assign_theme_to_review(review_text: str, theme_dict: dict) -> str:
    """
    Pick the theme label for one review.

    Themes are tried in the iteration order of ``theme_dict``; the first theme
    that has at least one keyword occurring in the review wins. Matching is
    case-insensitive substring containment (not whole-word matching).

    Args:
        review_text: The review text. Any non-string value (e.g. NaN/None)
            is treated as unmatched.
        theme_dict: Mapping of theme label -> iterable of keyword strings.

    Returns:
        The matching theme label, or 'Other' when nothing matches.
    """
    fallback = "Other"
    if not isinstance(review_text, str):
        return fallback

    haystack = review_text.lower()

    # Lazy generator: evaluation stops at the first theme with a keyword hit,
    # so earlier themes take precedence over later ones.
    matching_labels = (
        label
        for label, terms in theme_dict.items()
        if any(term.lower() in haystack for term in terms)
    )
    return next(matching_labels, fallback)


# Tag every review with the first theme in THEMES whose keywords occur in its
# cleaned text ("Other" when none match). This adds/overwrites the "theme"
# column on reviews_df in place; the save step at the end persists it.
reviews_df["theme"] = reviews_df["clean_review"].apply(lambda txt: assign_theme_to_review(txt, THEMES))

# The header labels must follow the order of the selected columns
# (theme first, then the review text), otherwise the table is mislabelled.
print(tabulate(reviews_df[["theme", "clean_review"]].head(), headers=["Theme", "Clean Review"], tablefmt="grid"))

+----+-------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    | Clean Review                        | Theme                                                                                                                                                                                                                                          |
|  0 | Other                               | ok                                                                                                                                                                                                                                             |
+----+-------------------------------------+--------------------------------------------------------------------------------------------------

In [10]:
# Persist the reviews, now including the "theme" column, via the DataManager.
print("Save to csv")

# The captured run wrote to ../data/processed/cleaned_reviews_data.csv;
# processed=True presumably selects that processed-data location -- TODO confirm
# against DataManager.save_to_csv.
dm.save_to_csv(df=reviews_df, processed=True)


Save to csv
Saved data to ../data/processed/cleaned_reviews_data.csv
