In [9]:
# ==============================
# Keyword Extraction Notebook
# ==============================

# ------------------------------
# 1️⃣ Setup & Imports
# ------------------------------
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt_tab')

# ------------------------------
# 2️⃣ Add src folder to Python path
# ------------------------------
src_path = os.path.join(os.getcwd(), "..", "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# ------------------------------
# 3️⃣ Import functions from modular .py files
# ------------------------------
from analysis.keyword_extraction.frequency import extract_frequency_keywords_by_bank
from analysis.keyword_extraction.tfidf import extract_tfidf_keywords_by_bank
from analysis.keyword_extraction.topic_modelling import lda_topics_by_bank
from analysis.keyword_extraction.pos_tagging import extract_noun_phrases_by_bank

# ------------------------------
# 4️⃣ Load Processed Reviews
# ------------------------------
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_csv = os.path.join(project_root, "data", "processed", "reviews_processed.csv")

df = pd.read_csv(processed_csv)
print(f"Loaded {len(df)} reviews")
df.head()

# ------------------------------
# 5️⃣ Frequency-Based Keywords
# ------------------------------
top_n = 20
freq_keywords_per_bank = extract_frequency_keywords_by_bank(df, top_n=top_n)

for bank, keywords in freq_keywords_per_bank.items():
    print(f"\nTop {top_n} frequency keywords for {bank}:")
    print(keywords)

# ------------------------------
# 6️⃣ TF-IDF Keywords
# ------------------------------
tfidf_keywords_per_bank = extract_tfidf_keywords_by_bank(df, top_n=top_n)

for bank, keywords in tfidf_keywords_per_bank.items():
    print(f"\nTop {top_n} TF-IDF keywords for {bank}:")
    print(keywords)

# ------------------------------
# 7️⃣ LDA Topic Modeling
# ------------------------------
n_topics = 5
n_top_words = 10

lda_topics_per_bank = lda_topics_by_bank(df, n_topics=n_topics, n_top_words=n_top_words)

for bank, topics in lda_topics_per_bank.items():
    print(f"\nLDA Topics for {bank}:")
    for topic, words in topics.items():
        print(f"{topic}: {words}")

# ------------------------------
# 8️⃣ POS Tagging / Noun Phrase Extraction
# ------------------------------
noun_phrases_per_bank = extract_noun_phrases_by_bank(df, top_n=top_n)

for bank, phrases in noun_phrases_per_bank.items():
    print(f"\nTop {top_n} noun phrases for {bank}:")
    print(phrases)

# ------------------------------
# 9️⃣ Optional: Save Results per Bank
# ------------------------------
output_dir = os.path.join(project_root, "data", "processed")
os.makedirs(output_dir, exist_ok=True)

# Save frequency keywords
for bank, keywords in freq_keywords_per_bank.items():
    pd.DataFrame(keywords, columns=['keyword','frequency']).to_csv(
        os.path.join(output_dir, f"{bank}_frequency_keywords.csv"), index=False
    )

# Save TF-IDF keywords
for bank, keywords in tfidf_keywords_per_bank.items():
    pd.DataFrame(keywords, columns=['keyword','tfidf_score']).to_csv(
        os.path.join(output_dir, f"{bank}_tfidf_keywords.csv"), index=False
    )

# Save noun phrases
for bank, phrases in noun_phrases_per_bank.items():
    pd.DataFrame(phrases, columns=['noun_phrase','count']).to_csv(
        os.path.join(output_dir, f"{bank}_noun_phrases.csv"), index=False
    )

print("All keyword extraction results saved in data/processed/")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\derej\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Loaded 1646 reviews

Top 20 frequency keywords for Bank of Abyssinia:
[('app', 412), ('bank', 102), ('please', 85), ('mobile', 82), ('banking', 80), ('work', 75), ('worst', 63), ('use', 62), ('update', 61), ('working', 60), ('boa', 56), ('time', 53), ('fix', 53), ('ca', 53), ('even', 52), ('good', 50), ('developer', 44), ('like', 43), ('one', 42), ('ever', 41)]

Top 20 frequency keywords for Commercial Bank of Ethiopia:
[('app', 349), ('update', 92), ('good', 90), ('use', 83), ('bank', 81), ('cbe', 79), ('best', 60), ('application', 57), ('money', 56), ('please', 55), ('developer', 54), ('banking', 53), ('time', 47), ('like', 46), ('easy', 43), ('screenshot', 43), ('ca', 42), ('mobile', 42), ('account', 39), ('service', 39)]

Top 20 frequency keywords for Dashen Bank:
[('app', 307), ('dashen', 129), ('bank', 112), ('banking', 100), ('super', 89), ('one', 76), ('use', 66), ('easy', 51), ('fast', 48), ('best', 47), ('features', 43), ('good', 43), ('like', 36), ('mobile', 34), ('step', 33