In [3]:
# %pip install spacy

In [7]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import json

# Predefined theme categories
THEME_CATEGORIES = {
    'Account Access': ['login', 'password', 'authentic', 'biometric', 'access'],
    'Transaction': ['transfer', 'send', 'receive', 'payment', 'deposit'],
    'UI/UX': ['interface', 'design', 'layout', 'button', 'navigation'],
    'Performance': ['slow', 'crash', 'freeze', 'lag', 'responsive'],
    'Customer Support': ['support', 'service', 'representative', 'help', 'response']
}

def preprocess_text(text, nlp):
    """Clean and lemmatize text with spaCy"""
    doc = nlp(text.lower())
    return " ".join([
        token.lemma_ for token in doc 
        if not token.is_stop and token.is_alpha
    ])

def extract_keywords(df, nlp):
    """Extract significant keywords using TF-IDF"""
    # Preprocess all reviews
    df['processed_text'] = df['review'].apply(lambda x: preprocess_text(x, nlp))
    
    # TF-IDF Vectorizer
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=100)
    tfidf_matrix = tfidf.fit_transform(df['processed_text'])
    
    # Get top keywords per bank
    keywords_by_bank = {}
    for bank in df['bank'].unique():
        bank_indices = df[df['bank'] == bank].index
        bank_tfidf = tfidf_matrix[bank_indices]
        sums = bank_tfidf.sum(axis=0)
        keywords = [
            (word, sums[0, idx]) 
            for word, idx in tfidf.vocabulary_.items()
        ]
        keywords.sort(key=lambda x: x[1], reverse=True)
        keywords_by_bank[bank] = [w[0] for w in keywords[:20]]
    
    return keywords_by_bank

def map_to_themes(keywords_by_bank):
    """Map keywords to predefined themes"""
    theme_results = {}
    
    for bank, keywords in keywords_by_bank.items():
        bank_themes = defaultdict(list)
        
        for keyword in keywords:
            for theme, trigger_words in THEME_CATEGORIES.items():
                if any(trigger in keyword for trigger in trigger_words):
                    bank_themes[theme].append(keyword)
        
        theme_results[bank] = dict(bank_themes)
    
    return theme_results

if __name__ == "__main__":
    # Load data
    df = pd.read_csv('clean_reviews.csv')
    
    # Load NLP model
    nlp = ''
    # nlp = spacy.load("en_core_web_sm")
    
    # Extract keywords
    keywords_by_bank = extract_keywords(df, nlp)
    
    # Map to themes
    themes_by_bank = map_to_themes(keywords_by_bank)
    
    # Save results
    with open('../data/themes.json', 'w') as f:
        json.dump(themes_by_bank, f, indent=2)
    
    print("Thematic Analysis Results:")
    for bank, themes in themes_by_bank.items():
        print(f"\n{bank}:")
        for theme, keywords in themes.items():
            print(f"- {theme}: {', '.join(keywords[:3])}...")

TypeError: 'str' object is not callable