In [1]:
 #notebooks/02_sentiment_thematic_analysis.ipynb

import os
import sys
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re # For regex in text cleaning

In [3]:


# --- Project Setup: Ensure src module is discoverable ---
def find_project_root(current_path):
    """Walk upward from ``current_path`` looking for the project root.

    A directory is accepted as the root when it directly contains all of
    the ``src``, ``data`` and ``notebooks`` sub-directories.

    Args:
        current_path: Directory from which the upward search starts.

    Returns:
        The first directory (starting with ``current_path`` itself) that
        contains the three marker directories. If none is found,
        ``current_path`` is returned unchanged. The top-most directory
        (the one that is its own parent) is never examined.
    """
    marker_dirs = ('src', 'data', 'notebooks')
    candidate = current_path
    while True:
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Ran out of ancestors without a match: fall back to the start.
            return current_path
        if all(os.path.isdir(os.path.join(candidate, name)) for name in marker_dirs):
            return candidate
        candidate = parent

# Resolve the project root relative to wherever the kernel was started,
# then make it importable so that `src.*` modules can be loaded.
current_working_dir = os.getcwd()
project_root = find_project_root(current_working_dir)

root_is_importable = project_root in sys.path
if root_is_importable:
    print(f"'{project_root}' already in sys.path.")
else:
    sys.path.append(project_root)
    print(f"Added '{project_root}' to sys.path for module imports.")

# Import configuration variables
from src.config import CLEAN_REVIEWS_CSV, PROCESSED_DATA_DIR

# Define output file for Task 2 results
SENTIMENT_THEMES_CSV = os.path.join(PROCESSED_DATA_DIR, 'reviews_with_sentiment_themes.csv')
# Ensure the output directory exists, so that writing SENTIMENT_THEMES_CSV at
# the end of the notebook cannot fail just because the folder is missing.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

'c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics' already in sys.path.


In [4]:


# --- Step 1: Load Cleaned Data ---
print("\n--- Loading Cleaned Review Data ---")

# Columns that later cells of this notebook index directly.
REQUIRED_COLUMNS = {'review_id', 'review_text', 'rating', 'date', 'bank_name', 'source'}

# Only the read itself is guarded. Keeping the preview prints outside the
# `try` body means a display problem is never misreported as a load failure.
try:
    df = pd.read_csv(CLEAN_REVIEWS_CSV)
except FileNotFoundError:
    print(f"CRITICAL ERROR: Cleaned data file not found at {CLEAN_REVIEWS_CSV}. Please run Task 1 first.")
    sys.exit("Exiting: Cleaned data not found.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load cleaned data: {e}")
    sys.exit("Exiting: Cleaned data loading failed.")
else:
    print(f"Cleaned data loaded successfully from {CLEAN_REVIEWS_CSV}.")
    print(f"Initial DataFrame shape: {df.shape}")
    print("\nFirst 5 rows of loaded data:")
    print(df.head())

# Fail early, with a readable message, instead of hitting a KeyError in a
# later cell when an expected column is absent.
missing_columns = sorted(REQUIRED_COLUMNS - set(df.columns))
if missing_columns:
    print(f"CRITICAL ERROR: Cleaned data is missing required columns: {missing_columns}")
    sys.exit("Exiting: Cleaned data is missing required columns.")

if df.empty:
    print("WARNING: Loaded DataFrame is empty. Skipping sentiment and thematic analysis.")
    sys.exit("Exiting: Empty DataFrame.")


--- Loading Cleaned Review Data ---
Cleaned data loaded successfully from c:\Users\hp\OneDrive\Desktop\kaim-ai\KAIM-2\fintech-app-customer-experience-analytics\data\processed\clean_play_store_reviews.csv.
Initial DataFrame shape: (8989, 6)

First 5 rows of loaded data:
                              review_id  \
0  a7d1c799-ba53-4a0a-a8d6-c5400a009825   
1  64ed5562-1758-4eb8-9291-8b6edc394118   
2  d0c05687-ddd4-43fb-95a9-08f6358d80a2   
3  811bf820-3529-433a-9b6d-e624fa23a16a   
4  be2cb2ac-bbe0-4175-81c4-9f6c86afdaaa   

                                         review_text  rating        date  \
0  A great app. It's like carrying a bank in your...       4  2025-06-07   
1                      More than garrantty bank EBC.       4  2025-06-07   
2  really am happy to this app it is Siple to use...       5  2025-06-07   
3  I liked this app. But the User interface is ve...       2  2025-06-07   
4  "Why donâ€™t your ATMs support account-to-accoun...       4  2025-06-06   

           

In [None]:

# --- Step 2: NLP Preprocessing for Sentiment and Thematic Analysis ---
print("\n--- Performing NLP Preprocessing (Tokenization, Stopword Removal) ---")

# Download NLTK data
# nltk.download() fetches a resource only if it is not already up to date and
# reports success/failure through its boolean return value. That value is
# checked here so a failed download is not silently announced as "complete".
NLTK_RESOURCES = ('stopwords', 'punkt', 'vader_lexicon')
print("Downloading NLTK data (stopwords, punkt, vader_lexicon)...")
failed_downloads = [
    resource for resource in NLTK_RESOURCES
    if not nltk.download(resource, quiet=True)
]
if failed_downloads:
    print(f"WARNING: Could not download NLTK resources: {failed_downloads}. "
          "Locally installed copies will be used if present.")
else:
    print("NLTK data download complete.")

# Initialize NLTK Stopwords and VADER Sentiment Analyzer
# (both raise LookupError if the corresponding resource is unavailable locally)
stop_words = set(stopwords.words('english'))
analyzer = SentimentIntensityAnalyzer()

# Function for basic text cleaning (for TF-IDF and VADER)
# NOTE(review): this notebook feeds the cleaned text to VADER as well. VADER is
# documented to use capitalisation and punctuation as intensity cues, so
# scoring the raw review text may be preferable. Confirm before changing.
def clean_text_nlp(text):
    """Normalise a raw review into lowercase, punctuation-free text.

    Steps, in order: lowercase, drop URLs, drop @mentions and the '#'
    symbol (the hashtag word itself is kept), drop remaining punctuation,
    drop digits, collapse runs of whitespace.

    Args:
        text: The raw review. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``''`` when the input is missing.
    """
    # Missing values would otherwise become the literal words "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # URLs ("http\S+" also covers https)
    text = re.sub(r'@\w+|#', '', text)          # @mentions, and the '#' of hashtags
    text = re.sub(r'[^\w\s]', '', text)         # remaining punctuation
    text = re.sub(r'\d+', '', text)             # numbers
    return re.sub(r'\s+', ' ', text).strip()    # collapse whitespace

# Store the normalised text next to the raw review
df['cleaned_review_text'] = df['review_text'].apply(clean_text_nlp)

# Some reviews (e.g. emoji-only or digits-only) can end up blank after
# cleaning. They are kept, but the count is surfaced so it is not a surprise.
is_blank_after_cleaning = df['cleaned_review_text'].str.strip() == ''
empty_cleaned_reviews = df[is_blank_after_cleaning]
if len(empty_cleaned_reviews) > 0:
    print(f"WARNING: {len(empty_cleaned_reviews)} reviews became empty after NLP cleaning. They might affect analysis.")
    # These rows could instead be dropped or given a placeholder if they
    # turn out to be numerous; for now they stay in the frame.

preview_columns = ['review_text', 'cleaned_review_text']
print("\nFirst 5 rows of data with cleaned_review_text:")
print(df[preview_columns].head())

In [None]:




# --- Step 3: Sentiment Analysis using VADER ---
print("\n--- Performing Sentiment Analysis (VADER) ---")

def get_vader_sentiment(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The review text to score.

    Returns:
        The dict produced by ``analyzer.polarity_scores(text)``. When
        ``text`` is not a string, or is empty/whitespace-only, a fixed
        neutral dict (``neu`` = 1.0, everything else 0.0) is returned.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if has_content:
        return analyzer.polarity_scores(text)
    # Nothing to score: report a fully neutral result.
    return {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

# Score every cleaned review; each cell of 'sentiment_scores' holds the
# complete score dict returned by get_vader_sentiment.
df['sentiment_scores'] = df['cleaned_review_text'].apply(get_vader_sentiment)

# Fan the dict out into one numeric column per component:
# sentiment_neg, sentiment_neu, sentiment_pos, sentiment_compound.
for component in ('neg', 'neu', 'pos', 'compound'):
    df[f'sentiment_{component}'] = df['sentiment_scores'].apply(
        lambda scores, key=component: scores[key]
    )

# Map a compound score onto one of three labels
def get_sentiment_label(compound_score):
    """Classify a compound sentiment score.

    Args:
        compound_score: Numeric compound score.

    Returns:
        ``'positive'`` when the score is at least 0.05, ``'negative'`` when
        it is at most -0.05, and ``'neutral'`` for anything else.
    """
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'

df['sentiment_label'] = df['sentiment_compound'].apply(get_sentiment_label)

sentiment_preview_columns = ['review_text', 'sentiment_compound', 'sentiment_label']
print("\nSentiment analysis completed. First 5 rows with sentiment scores and labels:")
print(df[sentiment_preview_columns].head())
print("\nSentiment label distribution:")
print(df['sentiment_label'].value_counts())

# KPI Check: sentiment scores for 90%+ of reviews.
# Coverage is the percentage of rows whose compound score is non-null.
scored_review_count = df['sentiment_compound'].notnull().sum()
sentiment_coverage = (scored_review_count / len(df)) * 100
print(f"\nSentiment coverage: {sentiment_coverage:.2f}%")
kpi_message = (
    "KPI Met: Sentiment scores for 90%+ reviews."
    if sentiment_coverage >= 90
    else "KPI Warning: Sentiment coverage is below 90%."
)
print(kpi_message)


# --- Step 4: Thematic Analysis (Keyword Extraction using TF-IDF) ---
print("\n--- Performing Thematic Analysis (Keyword Extraction) ---")

# Number of keywords kept (and printed) for each bank.
TOP_N_KEYWORDS = 20

# Combine reviews by bank for TF-IDF: the corpus has ONE document per bank.
bank_reviews_combined = (
    df.groupby('bank_name')['cleaned_review_text']
      .apply(lambda texts: ' '.join(texts))
      .reset_index()
)

# Initialize TF-IDF Vectorizer
# min_df / max_df are thresholds on the number of *documents*, and here a
# document is a whole bank, not a single review. With only a handful of banks,
# review-scale settings such as min_df=5 / max_df=0.85 cannot be satisfied and
# make fit_transform raise "max_df corresponds to < documents than min_df".
# The thresholds are therefore left fully open; the IDF weighting still
# down-weights terms shared by every bank.
# ngram_range=(1, 2) yields single words and two-word phrases.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=1,
    max_df=1.0,
    ngram_range=(1, 2),
)

# Row i of the matrix corresponds to row i of bank_reviews_combined.
tfidf_matrix = tfidf_vectorizer.fit_transform(bank_reviews_combined['cleaned_review_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"\nTop {TOP_N_KEYWORDS} TF-IDF keywords (Unigrams & Bigrams) per bank:")
bank_keywords = {}
for row_position, bank_name in enumerate(bank_reviews_combined['bank_name']):
    # Dense 1-D view of this bank's TF-IDF row, labelled by term.
    bank_scores = pd.Series(
        tfidf_matrix[row_position].toarray().ravel(),
        index=feature_names,
    )
    # Ignore terms that never occur for this bank, then keep the strongest.
    bank_scores = bank_scores[bank_scores > 0]
    bank_keywords[bank_name] = bank_scores.nlargest(TOP_N_KEYWORDS).index.tolist()

    print(f"\n--- {bank_name} ---")
    print(bank_keywords[bank_name])

# --- Step 5: Manual/Rule-Based Thematic Clustering ---
# A simple rule-based approach: a review gets a theme when it contains a word
# that starts with one of that theme's keywords. The keyword lists are
# examples and should be refined against the TF-IDF keywords printed above.
THEME_KEYWORDS = {
    'Account Access Issues': ['login', 'account', 'password', 'face id', 'fingerprint'],
    'Transaction Performance': ['transfer', 'send', 'receive', 'transaction', 'slow', 'fast', 'delay'],
    'User Interface & Experience': ['ui', 'interface', 'design', 'easy', 'user friendly', 'layout', 'bug', 'crashes', 'update'],
    'Customer Support': ['support', 'customer service', 'help', 'call', 'response'],
    'Feature Requests': ['budgeting', 'new feature', 'qr', 'international', 'bill pay', 'online banking']
}

# One compiled pattern per theme. The leading \b anchors each keyword to the
# start of a word, so 'ui' no longer fires on "quick" nor 'call' on
# "basically", while inflected forms ("transactions", "updated") still match.
_THEME_PATTERNS = {
    theme: re.compile(r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')')
    for theme, keywords in THEME_KEYWORDS.items()
}

def assign_theme(review_text, bank_name=None):
    """Assign rule-based themes to a single review.

    Args:
        review_text: Review text; converted with ``str`` and lowercased
            before matching.
        bank_name: Accepted for interface compatibility; not used.

    Returns:
        The matching theme names joined by ``', '`` in the order they are
        declared in ``THEME_KEYWORDS``, or ``'Other/General Feedback'``
        when no keyword matches.
    """
    text = str(review_text).lower()  # keywords are lowercase
    identified_themes = [
        theme for theme, pattern in _THEME_PATTERNS.items()
        if pattern.search(text)
    ]
    if not identified_themes:
        return 'Other/General Feedback'  # Default theme if no specific keywords match
    return ', '.join(identified_themes)  # Join multiple themes if applicable

print("\n--- Assigning Themes to Reviews ---")
# Pass the real bank name as the second argument (the review text used to be
# passed twice).
df['identified_themes'] = [
    assign_theme(review, bank)
    for review, bank in zip(df['cleaned_review_text'], df['bank_name'])
]

print("\nThemes assigned. First 5 rows with identified_themes:")
print(df[['review_text', 'identified_themes']].head())

# KPI Check: 3+ themes per bank with examples (examples will be seen in your manual review of themes)
print("\nTheme distribution by bank:")
theme_counts_by_bank = df.groupby('bank_name')['identified_themes'].value_counts().unstack(fill_value=0)
print(theme_counts_by_bank)

# 'identified_themes' holds comma-joined combinations such as "A, B". Counting
# its unique values would count combinations, so each value is split into
# individual themes first. The catch-all bucket is not a real theme and is
# left out of the KPI count.
FALLBACK_THEME = 'Other/General Feedback'
themes_by_bank = (
    df[['bank_name', 'identified_themes']]
      .assign(theme=lambda frame: frame['identified_themes'].str.split(', '))
      .explode('theme')
      .groupby('bank_name', sort=False)['theme']
      .unique()
)

for bank, bank_themes in themes_by_bank.items():
    num_themes = len(set(bank_themes) - {FALLBACK_THEME})
    if num_themes >= 3:
        print(f"KPI Met for {bank}: {num_themes} themes identified.")
    else:
        print(f"KPI Warning for {bank}: Only {num_themes} themes identified.")


# --- Step 6: Aggregate Sentiment by Bank and Rating ---
print("\n--- Aggregating Sentiment by Bank and Rating ---")

# Mean compound score per (bank, rating), pivoted so ratings become columns.
# Combinations with no reviews are left as NaN: filling a *mean* with 0 would
# make "no data" indistinguishable from a genuinely neutral score of 0.0.
sentiment_by_bank_rating = df.groupby(['bank_name', 'rating'])['sentiment_compound'].mean().unstack()
print("\nAverage Compound Sentiment Score by Bank and Rating:")
print(sentiment_by_bank_rating)

# Mean compound score per bank across all ratings.
overall_sentiment_by_bank = df.groupby('bank_name')['sentiment_compound'].mean().reset_index()
print("\nOverall Average Compound Sentiment Score by Bank:")
print(overall_sentiment_by_bank)

# Number of reviews per (bank, label). Here 0 is the correct fill value,
# because a missing combination really does mean zero reviews.
sentiment_label_counts = df.groupby(['bank_name', 'sentiment_label']).size().unstack(fill_value=0)
print("\nSentiment Label Counts by Bank:")
print(sentiment_label_counts)


# --- Step 7: Save Results ---
print(f"\n--- Saving Processed Reviews with Sentiment and Themes to: {SENTIMENT_THEMES_CSV} ---")

# Columns written to disk. The deliverable's "sentiment_score" is exported
# under the name 'sentiment_compound'.
output_columns = [
    'review_id',
    'review_text',
    'sentiment_label',
    'sentiment_compound',
    'identified_themes',
    'bank_name',
    'rating',
    'date',
    'source',
]
final_output_df = df[output_columns]

# index=False keeps the DataFrame index out of the CSV.
final_output_df.to_csv(SENTIMENT_THEMES_CSV, index=False)
print("File saved successfully.")

print("\n--- Task 2: Sentiment and Thematic Analysis Complete ---")
print("Remember to commit your work to your 'task-2' branch!")
