1. Setup & Imports

In [1]:
!pip install spacy transformers tqdm



In [2]:
import pandas as pd
import spacy
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np
from tqdm import tqdm

# Initialize NLP components
# Both objects are created once at module level and shared by the functions below.
# spaCy English pipeline: called by preprocess_text and extract_keywords_spacy.
nlp = spacy.load("en_core_web_sm")
# Hugging Face sentiment pipeline: analyze_sentiment reads the 'label' and
# 'score' fields of the first result it returns.
sentiment_analyzer = pipeline("sentiment-analysis",
                             model="distilbert-base-uncased-finetuned-sst-2-english")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


2. Data Loading & Preprocessing

In [25]:
def load_data(filepath):
    """Load review data from a CSV file and drop rows without usable review text.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.
            The file must contain a ``review`` column.

    Returns:
        A DataFrame with the CSV's columns in which every ``review`` value is
        a whitespace-stripped, non-empty string. Surviving rows keep their
        original index labels.

    Raises:
        KeyError: If the CSV has no ``review`` column.
    """
    df = pd.read_csv(filepath)

    if 'review' not in df.columns:
        raise KeyError("'review' column not found in " + str(filepath))

    # Drop missing reviews first so the string operations never see NaN.
    df = df.dropna(subset=['review']).copy()

    # astype(str) keeps a column that was parsed as numbers from breaking .str
    df['review'] = df['review'].astype(str).str.strip()

    # Whitespace-only reviews become '' after stripping (not NaN), so dropna
    # alone would let them through to the downstream models.
    df = df[df['review'] != '']

    return df

def preprocess_text(text):
    """Lower-case ``text``, run it through spaCy and return the kept lemmas.

    Stop words, punctuation and whitespace tokens are discarded; the lemmas
    of the remaining tokens are joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

3. Sentiment Analysis


In [26]:
def analyze_sentiment(text):
    """Classify one review with the module-level ``sentiment_analyzer`` pipeline.

    Args:
        text: Review text to classify.

    Returns:
        A ``(label, score)`` tuple. ``score`` is the score reported by the
        pipeline for its first result. ``label`` is the pipeline's label,
        replaced by ``'neutral'`` when the score lies strictly between 0.45
        and 0.55.
    """
    # truncation=True asks the tokenizer to clip over-long reviews to the
    # model's maximum input length instead of letting the call fail on them.
    result = sentiment_analyzer(text, truncation=True)[0]
    label, score = result['label'], result['score']

    # Scores close to 0.5 are treated as "no clear sentiment".
    if 0.45 < score < 0.55:
        return 'neutral', score
    return label, score

def apply_sentiment_analysis(df):
    """Add ``sentiment_label`` and ``sentiment_score`` columns to ``df``.

    Every value of ``df['review']`` is passed to ``analyze_sentiment`` under
    a tqdm progress bar. The first item of each result fills
    ``sentiment_label`` and the second fills ``sentiment_score``. ``df`` is
    modified in place and also returned.
    """
    tqdm.pandas(desc="Analyzing sentiment")
    label_score_pairs = df['review'].progress_apply(analyze_sentiment)

    # Each element is a (label, score) pair; split it into the two columns.
    for position, column in enumerate(('sentiment_label', 'sentiment_score')):
        df[column] = label_score_pairs.apply(lambda pair, i=position: pair[i])
    return df

4. Keyword Extraction

In [27]:
def extract_keywords(text, ngram_range=(1,2), top_n=5):
    """Return the highest-scoring TF-IDF terms of a single text.

    The vectorizer is fitted on ``text`` alone, so the ranking reflects term
    weights inside this one document only.

    Args:
        text: Document to extract keywords from.
        ngram_range: ``(min_n, max_n)`` n-gram sizes handed to
            ``TfidfVectorizer``.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        A list of at most ``top_n`` terms, highest score first. An empty list
        is returned when ``text`` is not a string, is blank, or contains no
        term that survives tokenisation and stop-word removal.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    vectorizer = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the text
        # holds only stop words or otherwise yields no terms.
        return []

    feature_names = np.array(vectorizer.get_feature_names_out())
    # One document -> one row of scores; flatten it before ranking.
    scores = tfidf_matrix.toarray().ravel()
    ranked = np.argsort(scores)[::-1]

    return list(feature_names[ranked][:top_n])

def extract_keywords_spacy(text):
    """Extract up to ten keywords from ``text`` with the spaCy pipeline.

    Candidates are collected in this order:
      1. lower-cased noun chunks of at most three whitespace-separated words;
      2. lower-cased lemmas of verbs and adjectives that are not stop words.

    Args:
        text: Text to analyse with the module-level ``nlp`` pipeline.

    Returns:
        A list of at most ten unique keywords in first-seen order, so the
        same text always yields the same keywords.
    """
    doc = nlp(text)
    candidates = []

    # Short noun phrases
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) <= 3:  # Limit to 3-word phrases
            candidates.append(chunk.text.lower())

    # Content verbs and adjectives
    for token in doc:
        if token.pos_ in ('VERB', 'ADJ') and not token.is_stop:
            candidates.append(token.lemma_.lower())

    # dict.fromkeys de-duplicates while keeping insertion order. Going through
    # set() made the order, and therefore which ten keywords survive the
    # slice, depend on string hashing and vary between runs.
    return list(dict.fromkeys(candidates))[:10]

5. Theme Clustering

In [28]:
def cluster_keywords_into_themes(keywords_list, max_themes=5):
    """Group keywords into broader themes using a fixed theme vocabulary.

    A keyword belongs to the first theme (in declaration order) that has a
    theme word satisfying either rule:
      * the theme word starts at a word boundary inside the keyword
        ("fee" matches "hidden fees" but not "coffee");
      * the keyword occurs as whole word(s) inside the theme phrase
        ("transaction" matches "slow transaction"; "i" does not match "login").
    Matching ignores case and surrounding whitespace. Blank keywords and
    keywords without a match are mapped to ``'Other'``.

    Args:
        keywords_list: Iterable of keyword strings. Repeated keywords each
            add to their theme's count.
        max_themes: Maximum number of themes to return (default 5).

    Returns:
        ``(top_themes, keyword_theme_mapping)``: theme names ordered by
        descending keyword count (ties keep first-seen order), and a dict
        mapping every input keyword to its theme name or ``'Other'``.
    """
    import re  # local import keeps this cell runnable on its own

    theme_keywords = {
        'Account Access Issues': ['login', 'password', 'account locked', 'authentication', 'access'],
        'Transaction Performance': ['transfer', 'slow transaction', 'payment', 'delay', 'processing time'],
        'User Interface': ['app', 'interface', 'design', 'navigation', 'user experience'],
        'Customer Support': ['support', 'service', 'representative', 'response time', 'help'],
        'Fees & Charges': ['fee', 'charge', 'cost', 'pricing', 'withdrawal fee'],
        'Security': ['security', 'fraud', 'hack', 'protection', 'secure'],
        'Features': ['feature', 'functionality', 'missing', 'update', 'version']
    }

    def _theme_for(keyword):
        """Return the first theme matching ``keyword``, or None."""
        text = str(keyword).strip().lower()
        if not text:
            # A blank keyword is a substring of everything; never match it.
            return None
        whole_keyword = re.compile(r'\b' + re.escape(text) + r'\b')
        for theme, theme_words in theme_keywords.items():
            for theme_word in theme_words:
                starts_a_word = re.search(r'\b' + re.escape(theme_word), text)
                if starts_a_word or whole_keyword.search(theme_word):
                    return theme
        return None

    # Map keywords to themes
    theme_counts = defaultdict(int)
    keyword_theme_mapping = {}

    for keyword in keywords_list:
        theme = _theme_for(keyword)
        if theme is None:
            keyword_theme_mapping[keyword] = 'Other'
        else:
            theme_counts[theme] += 1
            keyword_theme_mapping[keyword] = theme

    # sorted() is stable, so equally frequent themes keep first-seen order.
    sorted_themes = sorted(theme_counts.items(), key=lambda item: item[1], reverse=True)
    top_themes = [theme for theme, _ in sorted_themes[:max_themes]]

    return top_themes, keyword_theme_mapping

6. Main Pipeline


In [29]:
def analyze_bank_reviews(input_path, output_path):
    """Run the full review-analysis pipeline for one CSV file.

    Steps: load the reviews, score sentiment, extract spaCy keywords, map the
    keywords to themes, and write the result to ``output_path``.

    Args:
        input_path: CSV file with the reviews (read by ``load_data``).
        output_path: Destination CSV for the analysed reviews.

    Returns:
        A DataFrame with the loaded columns plus ``sentiment_label``,
        ``sentiment_score``, ``keywords`` and ``theme``. A review appears
        once per theme assigned to it; a review with no theme appears once
        with a missing ``theme``.
    """
    # 1. Load data
    print("Loading data...")
    df = load_data(input_path)

    # 2. Sentiment analysis
    print("Performing sentiment analysis...")
    df = apply_sentiment_analysis(df)

    # 3. Keyword extraction
    print("Extracting keywords...")
    tqdm.pandas(desc="Extracting keywords")
    df['keywords'] = df['review'].progress_apply(extract_keywords_spacy)

    # 4. Theme clustering
    print("Clustering themes...")
    tqdm.pandas(desc="Identifying themes")
    themes_per_review = df['keywords'].progress_apply(
        lambda keywords: cluster_keywords_into_themes(keywords)[0]
    )

    # One output row per (review, theme). explode() turns an empty theme list
    # into a single row with a missing theme, which is what the former left
    # merge on reviewId produced. Unlike that merge, it still works when no
    # review matched any theme and does not multiply rows when reviewId
    # values repeat or are missing.
    result_df = (
        df.assign(theme=themes_per_review)
          .explode('theme')
          .reset_index(drop=True)
    )

    # 5. Save results
    print(f"Saving results to {output_path}")
    result_df.to_csv(output_path, index=False)

    return result_df

In [30]:
def aggregate_sentiment_by_bank_and_rating(df):
    """Summarise sentiment scores per bank, rating and sentiment label.

    Returns one row per (``bank_name``, ``rating``, ``sentiment_label``)
    group with the mean ``sentiment_score`` in ``avg_sentiment_score`` and
    the number of scores in ``review_count``.
    """
    group_keys = ['bank_name', 'rating', 'sentiment_label']
    scores_by_group = df.groupby(group_keys)['sentiment_score']
    summary = scores_by_group.agg(avg_sentiment_score='mean', review_count='count')
    return summary.reset_index()

def get_theme_summary(df):
    """Count rows per (``bank_name``, ``theme``) pair.

    The result has the columns ``bank_name``, ``theme`` and ``count``, and is
    ordered by bank name ascending and then by count descending.
    """
    pair_counts = df.groupby(['bank_name', 'theme']).size()
    counts_table = pair_counts.rename('count').reset_index()
    return counts_table.sort_values(by=['bank_name', 'count'], ascending=[True, False])

In [33]:
# Example usage
# Runs the full pipeline for each bank's review file and writes three CSVs per
# bank: the analysed reviews, a sentiment summary and a theme summary.
arr = ["reviews_BOA.csv", "reviews_CBE.csv", "reviews_Dashen.csv"]

for input_file in arr:
    # "reviews_BOA.csv" -> "BOA". Deriving the tag from the file name means a
    # new entry in `arr` gets its own output names without another if-branch,
    # and the summary files no longer end in ".csv.csv".
    bank_tag = input_file.rsplit(".", 1)[0].split("_", 1)[-1]
    output_file = f"analyzed_{bank_tag}.csv"

    # Run analysis
    analyzed_data = analyze_bank_reviews(input_file, output_file)

    # Generate summary reports
    print("Proceccing sentiment")
    # analyzed_data has one row per (review, theme). Collapse it back to one
    # row per review so a review with several themes is not counted several
    # times in the sentiment averages and counts.
    reviews_once = analyzed_data.drop_duplicates(subset="reviewId")
    sentiment_summary = aggregate_sentiment_by_bank_and_rating(reviews_once)

    print("Proceccing theme")
    theme_summary = get_theme_summary(analyzed_data)

    # Save summaries
    sentiment_summary.to_csv(f"sentiment_summary_{bank_tag}.csv", index=False)
    theme_summary.to_csv(f"theme_summary_{bank_tag}.csv", index=False)

    print("Analysis complete!")

Loading data...
Performing sentiment analysis...


Analyzing sentiment: 100%|██████████| 400/400 [00:22<00:00, 18.03it/s]


Extracting keywords...


Extracting keywords: 100%|██████████| 400/400 [00:02<00:00, 140.52it/s]


Clustering themes...


Identifying themes: 100%|██████████| 400/400 [00:00<00:00, 12371.94it/s]


Saving results to analyzed_BOA.csv
Proceccing sentiment
Proceccing theme
Analysis complete!
Loading data...
Performing sentiment analysis...


Analyzing sentiment: 100%|██████████| 400/400 [00:21<00:00, 18.19it/s]


Extracting keywords...


Extracting keywords: 100%|██████████| 400/400 [00:02<00:00, 142.08it/s]


Clustering themes...


Identifying themes: 100%|██████████| 400/400 [00:00<00:00, 8550.86it/s]


Saving results to analyzed_CBE.csv
Proceccing sentiment
Proceccing theme
Analysis complete!
Loading data...
Performing sentiment analysis...


Analyzing sentiment: 100%|██████████| 400/400 [00:21<00:00, 18.39it/s]


Extracting keywords...


Extracting keywords: 100%|██████████| 400/400 [00:02<00:00, 141.01it/s]


Clustering themes...


Identifying themes: 100%|██████████| 400/400 [00:00<00:00, 12930.32it/s]

Saving results to analyzed_Dashen.csv
Proceccing sentiment
Proceccing theme
Analysis complete!



