In [1]:
 #notebooks/02_sentiment_thematic_analysis.ipynb

import os
import sys
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re # For regex in text cleaning

In [2]:

# --- Project Setup: Ensure src module is discoverable ---
def find_project_root(current_path):
    path = current_path
    while path != os.path.dirname(path):
        if (os.path.isdir(os.path.join(path, 'src')) and
            os.path.isdir(os.path.join(path, 'data')) and
            os.path.isdir(os.path.join(path, 'notebooks'))):
            return path
        path = os.path.dirname(path)
    return current_path

current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")
else:
    print(f"'{project_root}' already in sys.path.")

# Import configuration variables
from src.config import CLEAN_REVIEWS_CSV, PROCESSED_DATA_DIR

# Define output file for Task 2 results
SENTIMENT_THEMES_CSV = os.path.join(PROCESSED_DATA_DIR, 'reviews_with_sentiment_themes.csv')
# Ensure the output directory exists

Added 'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics' to sys.path for module imports.
Project structure setup complete and config.py created/updated.
Base Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics
Raw Data Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics\data\raw
Processed Data Directory: c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics\data\processed
App IDs to scrape: {'Commercial Bank of Ethiopia': 'com.combanketh.mobilebanking', 'Bank of Abyssinia': 'com.boa.boaMobileBanking', 'Dashen Bank': 'com.dashen.dashensuperapp'}


In [3]:

# --- Step 1: Load Cleaned Data ---
print("\n--- Loading Cleaned Review Data ---")
try:
    df = pd.read_csv(CLEAN_REVIEWS_CSV)
    print(f"Cleaned data loaded successfully from {CLEAN_REVIEWS_CSV}.")
    print(f"Initial DataFrame shape: {df.shape}")
    print("\nFirst 5 rows of loaded data:")
    print(df.head())
except FileNotFoundError:
    print(f"CRITICAL ERROR: Cleaned data file not found at {CLEAN_REVIEWS_CSV}. Please run Task 1 first.")
    sys.exit("Exiting: Cleaned data not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load cleaned data: {e}")
    sys.exit("Exiting: Cleaned data loading failed.")

if df.empty:
    print("WARNING: Loaded DataFrame is empty. Skipping sentiment and thematic analysis.")
    sys.exit("Exiting: Empty DataFrame.")


--- Loading Cleaned Review Data ---
Cleaned data loaded successfully from c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics\data\processed\clean_play_store_reviews.csv.
Initial DataFrame shape: (8989, 6)

First 5 rows of loaded data:
                              review_id  \
0  a7d1c799-ba53-4a0a-a8d6-c5400a009825   
1  64ed5562-1758-4eb8-9291-8b6edc394118   
2  d0c05687-ddd4-43fb-95a9-08f6358d80a2   
3  811bf820-3529-433a-9b6d-e624fa23a16a   
4  be2cb2ac-bbe0-4175-81c4-9f6c86afdaaa   

                                         review_text  rating        date  \
0  A great app. It's like carrying a bank in your...       4  2025-06-07   
1                      More than garrantty bank EBC.       4  2025-06-07   
2  really am happy to this app it is Siple to use...       5  2025-06-07   
3  I liked this app. But the User interface is ve...       2  2025-06-07   
4  "Why don’t your ATMs support account-to-accoun...       4  2025-06-06   

             

In [4]:

# --- Step 2: NLP Preprocessing for Sentiment and Thematic Analysis ---
print("\n--- Performing NLP Preprocessing (Tokenization, Stopword Removal) ---")

# Download NLTK data
# These commands will download the necessary data if not already present.
# It's more robust to call nltk.download() directly without complex try-except for DownloadError
print("Downloading NLTK data (stopwords, punkt, vader_lexicon)...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('vader_lexicon', quiet=True)
print("NLTK data download complete.")

# Initialize NLTK Stopwords and VADER Sentiment Analyzer
stop_words = set(stopwords.words('english'))
analyzer = SentimentIntensityAnalyzer()

# Function for basic text cleaning (for TF-IDF and VADER)
def clean_text_nlp(text):
    text = str(text).lower() # Convert to string and lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'\@w+|\#', '', text) # Remove @mentions and hashtags
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    return text

# Apply NLP cleaning
df['cleaned_review_text'] = df['review_text'].apply(clean_text_nlp)

# Check for reviews that might have become empty after cleaning
empty_cleaned_reviews = df[df['cleaned_review_text'].str.strip() == '']
if not empty_cleaned_reviews.empty:
    print(f"WARNING: {len(empty_cleaned_reviews)} reviews became empty after NLP cleaning. They might affect analysis.")
    # Option: You could fill these with a placeholder or drop them if they are too many.
    # For now, we'll keep them but be aware.

print("\nFirst 5 rows of data with cleaned_review_text:")
print(df[['review_text', 'cleaned_review_text']].head())


--- Performing NLP Preprocessing (Tokenization, Stopword Removal) ---
Downloading NLTK data (stopwords, punkt, vader_lexicon)...
NLTK data download complete.

First 5 rows of data with cleaned_review_text:
                                         review_text  \
0  A great app. It's like carrying a bank in your...   
1                      More than garrantty bank EBC.   
2  really am happy to this app it is Siple to use...   
3  I liked this app. But the User interface is ve...   
4  "Why don’t your ATMs support account-to-accoun...   

                                 cleaned_review_text  
0  a great app its like carrying a bank in your p...  
1                       more than garrantty bank ebc  
2  really am happy to this app it is siple to use...  
3  i liked this app but the user interface is ver...  
4  why dont your atms support accounttoaccount tr...  


# Sentiment Analysis using Vendor

In [5]:
# --- Step 3: Sentiment Analysis using VADER ---
print("\n--- Performing Sentiment Analysis using VADER ---")

# Function to get VADER sentiment
def get_vader_sentiment(text):
    if pd.isna(text) or text.strip() == '':
        return {'sentiment_label': 'neutral', 'sentiment_score': 0.0}
    vs = analyzer.polarity_scores(text)
    compound_score = vs['compound']

    if compound_score >= 0.05:
        label = 'positive'
    elif compound_score <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return {'sentiment_label': label, 'sentiment_score': compound_score}

# Apply VADER sentiment analysis
# Use 'cleaned_review_text' as input for sentiment analysis
sentiment_results = df['cleaned_review_text'].apply(get_vader_sentiment)
df = pd.concat([df, sentiment_results.apply(pd.Series)], axis=1)

print("\nSentiment Analysis Complete. First 5 rows with sentiment:")
print(df[['review_text', 'cleaned_review_text', 'sentiment_label', 'sentiment_score']].head())

# Aggregate sentiment by bank and rating (as per prompt)
print("\nAggregated Sentiment (Mean Compound Score by Bank and Rating):")
print(df.groupby(['bank_name', 'rating'])['sentiment_score'].mean().unstack())

# You can also aggregate by sentiment label
print("\nSentiment Distribution by Bank:")
print(df.groupby('bank_name')['sentiment_label'].value_counts(normalize=True).unstack())


--- Performing Sentiment Analysis using VADER ---

Sentiment Analysis Complete. First 5 rows with sentiment:
                                         review_text  \
0  A great app. It's like carrying a bank in your...   
1                      More than garrantty bank EBC.   
2  really am happy to this app it is Siple to use...   
3  I liked this app. But the User interface is ve...   
4  "Why don’t your ATMs support account-to-accoun...   

                                 cleaned_review_text sentiment_label  \
0  a great app its like carrying a bank in your p...        positive   
1                       more than garrantty bank ebc         neutral   
2  really am happy to this app it is siple to use...        positive   
3  i liked this app but the user interface is ver...        negative   
4  why dont your atms support accounttoaccount tr...        positive   

   sentiment_score  
0           0.7650  
1           0.0000  
2           0.6096  
3          -0.2980  
4           0.0

# Thematic Execution

In [6]:
# --- Step 4: Thematic Analysis (Keyword Extraction & Rule-Based Theming) ---
print("\n--- Performing Thematic Analysis ---")

# Initialize TF-IDF Vectorizer (after stopwords are handled in clean_text_nlp if you added it)
# Make sure to handle empty strings for TfidfVectorizer
tfidf_texts = df[df['cleaned_review_text'].str.strip() != '']['cleaned_review_text']

if not tfidf_texts.empty:
    tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1,2)) # Consider unigrams and bigrams
    tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_texts)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    print(f"Top 10 TF-IDF keywords (overall):")
    # Get overall top keywords
    sums_tfidf = tfidf_matrix.sum(axis=0)
    features_scores = []
    for col, term in enumerate(feature_names):
        features_scores.append((term, sums_tfidf[0, col]))
    sorted_features = sorted(features_scores, key=lambda x: x[1], reverse=True)
    for i, (term, score) in enumerate(sorted_features[:10]):
        print(f"{term}: {score:.2f}")
else:
    print("No non-empty texts for TF-IDF vectorization.")

# --- Step 5: Thematic Analysis using Rule-Based Classification ---

# Define themes and associated keywords for simple rule-based classification
themes_keywords = {
    'Account Access Issues': ['login', 'account', 'access', 'otp', 'user id', 'password', 'face id'],
    'Transaction & Performance': ['transfer', 'send', 'money', 'slow', 'fast', 'transaction', 'payment', 'loading'],
    'UI & User Experience': ['interface', 'design', 'user friendly', 'easy', 'layout', 'ui', 'ux'],
    'Bugs & Reliability': ['bug', 'crash', 'error', 'issue', 'problem', 'fix', 'stable', 'glitch'],
    'Customer Support & Features': ['support', 'customer service', 'help', 'response', 'feature', 'request', 'update', 'fingerprint']
}

def assign_themes(review_text):
    assigned = []
    text_lower = str(review_text).lower()
    for theme, keywords in themes_keywords.items():
        if any(keyword in text_lower for keyword in keywords):
            assigned.append(theme)
    return assigned if assigned else ['Miscellaneous'] # Assign 'Miscellaneous' if no theme found

print("\nAssigning themes to reviews...")
df['identified_themes'] = df['cleaned_review_text'].apply(assign_themes)

# Convert list of themes to a string for easier viewing
df['identified_themes_str'] = df['identified_themes'].apply(lambda x: ', '.join(x))


print("\nThematic Analysis Complete. First 5 rows with identified themes:")
print(df[['review_text', 'identified_themes_str']].head())

print("\nTheme distribution by bank:")
# To get a count of reviews per theme per bank, you'll need to expand the lists
themes_exploded = df.explode('identified_themes')
print(themes_exploded.groupby(['bank_name', 'identified_themes']).size().unstack(fill_value=0))

# --- Save Results for Task 3 ---
print(f"\nSaving processed reviews with sentiment and themes to: {SENTIMENT_THEMES_CSV}")
# Drop the original 'cleaned_review_text' and 'identified_themes' list column if you want a cleaner CSV
df_to_save = df.drop(columns=['cleaned_review_text', 'identified_themes'])
df_to_save.to_csv(SENTIMENT_THEMES_CSV, index=False)

print("\n--Sentiment & Thematic Analysis Complete ")


--- Performing Thematic Analysis ---
Top 10 TF-IDF keywords (overall):
good: 964.16
app: 581.30
it: 394.61
nice: 387.56
best: 375.28
the: 340.16
is: 292.30
to: 283.37
very: 278.37
and: 264.62

Assigning themes to reviews...

Thematic Analysis Complete. First 5 rows with identified themes:
                                         review_text  \
0  A great app. It's like carrying a bank in your...   
1                      More than garrantty bank EBC.   
2  really am happy to this app it is Siple to use...   
3  I liked this app. But the User interface is ve...   
4  "Why don’t your ATMs support account-to-accoun...   

                               identified_themes_str  
0                                      Miscellaneous  
1                                      Miscellaneous  
2                                      Miscellaneous  
3                               UI & User Experience  
4  Account Access Issues, Transaction & Performan...  

Theme distribution by bank:
identified_th