# Task 4: Insights and Visualization

This notebook focuses on deriving actionable insights from the analyzed Google Play Store reviews (Task 2 output) and creating visualizations to communicate these findings.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import os

# Configure plots
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 7)

## 1. Load Analyzed Data

In [None]:
# Load the Task 2 output (reviews annotated with sentiment and, optionally, themes).
# Update this path if your file is named or located differently.
# `os` and `pd` come from the import cell at the top of the notebook.
ANALYZED_DATA_PATH = '../data/reviews_with_sentiment_themes.csv'

if os.path.exists(ANALYZED_DATA_PATH):
    df_analyzed = pd.read_csv(ANALYZED_DATA_PATH)
    print(f"Successfully loaded {len(df_analyzed)} reviews from {ANALYZED_DATA_PATH}")
    # A CSV stores dates as plain strings, so convert them back to datetime.
    # The original check looked for 'review_date' while the placeholder frame
    # below names the column 'date'; parse whichever of the two is present.
    for date_col in ('review_date', 'date'):
        if date_col in df_analyzed.columns:
            parsed_dates = pd.to_datetime(df_analyzed[date_col], errors='coerce')
            # Count values that were present but could not be parsed, so that
            # coercion to NaT is reported instead of happening silently.
            n_unparsed = int(parsed_dates.isna().sum() - df_analyzed[date_col].isna().sum())
            if n_unparsed:
                print(f"Warning: {n_unparsed} value(s) in '{date_col}' could not be parsed as dates and were set to NaT.")
            df_analyzed[date_col] = parsed_dates
else:
    print(f"Error: Analyzed data file not found at {ANALYZED_DATA_PATH}.")
    print("Please ensure Task 2 was completed and the output CSV is correctly named and placed.")

    # Create a placeholder DataFrame if file not found, to allow notebook to run partially
    # You'll need to replace this with actual data loading logic for full functionality.
    df_analyzed = pd.DataFrame({
        'review': ['Great app!', 'Terrible experience.', 'Okay, but needs improvement.', 'Love the new features!', 'Crashes too often.'],
        'rating': [5, 1, 3, 4, 2],
        'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
        'bank': ['Bank A', 'Bank B', 'Bank A', 'Bank C', 'Bank B'],
        'source': ['Google Play Store'] * 5,
        'sentiment_label': ['POSITIVE', 'NEGATIVE', 'NEUTRAL', 'POSITIVE', 'NEGATIVE'], # Example, replace NEUTRAL if not used
        'sentiment_score': [0.99, 0.98, 0.70, 0.95, 0.90],
        'themes': ['UI', 'Performance', 'Features', 'UI', 'Performance'] # Example themes
    })
    print("Using placeholder data as an example.")

# Preview the frame. Jupyter only renders the value of a cell's LAST top-level
# expression, so this must sit here rather than inside the `if` branch.
df_analyzed.head()

## 2. Identify Drivers and Pain Points

In [None]:
# Example: Identify top positive/negative reviews or keywords within them
# This section will require more specific logic based on how themes were identified in Task 2
# or by analyzing reviews with high/low sentiment scores and ratings.


def _review_preview(text, limit=200):
    """Return `text` as a string of at most `limit` characters.

    Missing values (None/NaN) become an empty string and non-string values are
    converted with `str`, so slicing never fails. '...' is appended only when
    the text was actually cut.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return text if len(text) <= limit else text[:limit] + "..."


def _print_top_reviews(reviews, heading, top_n=5):
    """Print `heading` followed by the first `top_n` rows of `reviews`.

    `reviews` is expected to be sorted already and to contain the columns
    'bank', 'rating', 'sentiment_label', 'sentiment_score' and 'review'.
    """
    print(f"\n--- {heading} ---")
    for _, row in reviews.head(top_n).iterrows():
        print(f"Bank: {row['bank']}, Rating: {row['rating']}, Sentiment: {row['sentiment_label']} ({row['sentiment_score']:.2f})\nReview: {_review_preview(row['review'])}\n---")


# Every column read below must exist; otherwise the lookups raise KeyError.
_missing_columns = [
    col for col in ('sentiment_label', 'sentiment_score', 'rating', 'bank', 'review')
    if col not in df_analyzed.columns
]

if 'sentiment_label' in _missing_columns:
    print("Sentiment labels not found. Skipping driver/pain point identification based on sentiment.")
elif _missing_columns:
    print(f"Required column(s) {_missing_columns} not found. Skipping driver/pain point identification based on sentiment.")
else:
    positive_reviews = df_analyzed[(df_analyzed['rating'] >= 4) & (df_analyzed['sentiment_label'] == 'POSITIVE')].sort_values(by='sentiment_score', ascending=False)
    _print_top_reviews(positive_reviews, "Top 5 Positive Reviews (High Rating & Positive Sentiment)")

    # score is confidence, so still sort high for strong negative
    negative_reviews = df_analyzed[(df_analyzed['rating'] <= 2) & (df_analyzed['sentiment_label'] == 'NEGATIVE')].sort_values(by='sentiment_score', ascending=False)
    _print_top_reviews(negative_reviews, "Top 5 Negative Reviews (Low Rating & Negative Sentiment)")

# TODO: identify drivers/pain points based on themes if a 'themes' column exists.
# Untested sketch:
# if 'themes' in df_analyzed.columns:
#     theme_counts = df_analyzed.groupby(['bank', 'themes', 'sentiment_label']).size().unstack(fill_value=0)
#     print("\nTheme distribution by sentiment:\n")
#     print(theme_counts)

## 3. Compare Banks

In [None]:
# Compare banks on mean rating and on the share of each sentiment label.
# Each metric is guarded separately so that a missing column skips only that
# metric (with a message) instead of raising KeyError or passing silently.
if 'bank' in df_analyzed.columns:
    if 'rating' in df_analyzed.columns:
        print("\n--- Average Rating per Bank ---")
        avg_rating_bank = df_analyzed.groupby('bank')['rating'].mean().sort_values(ascending=False)
        print(avg_rating_bank)
    else:
        print("Rating data not found. Skipping average rating per bank.")

    if 'sentiment_label' in df_analyzed.columns:
        print("\n--- Sentiment Distribution per Bank ---")
        # Rows: bank; columns: sentiment label; values: percentage of that
        # bank's reviews (value_counts(normalize=True) scaled by 100).
        sentiment_dist_bank = df_analyzed.groupby('bank')['sentiment_label'].value_counts(normalize=True).mul(100).unstack(fill_value=0)
        print(sentiment_dist_bank)
    else:
        print("Sentiment labels not found. Skipping sentiment distribution per bank.")
else:
    print("Bank or sentiment information not found for comparison.")

## 4. Suggest Improvements (Placeholder)

In [None]:
# Placeholder recommendations. Replace the entries below by hand once the
# pain points and drivers have been identified. Examples of the intended form:
# 1. If 'login issue' is a common negative theme for Bank X, suggest 'Improve login reliability for Bank X'.
# 2. If 'good UI' is a common positive theme for Bank Y, suggest 'Maintain and enhance UI for Bank Y, consider for other banks'.
suggestion_lines = (
    "1. Suggestion 1: [Based on findings]",
    "2. Suggestion 2: [Based on findings]",
)

print("\n--- Suggested Improvements (Based on Analysis) ---")
for suggestion in suggestion_lines:
    print(suggestion)

## 5. Visualizations

### 5.1. Rating Distribution

In [None]:
# Bar chart of review counts per star rating, split by bank when available.
if 'rating' in df_analyzed.columns:
    has_bank = 'bank' in df_analyzed.columns
    # seaborn (0.13+) deprecates passing `palette` without `hue`, so the two
    # are only supplied together; without a bank column the default color is used.
    hue_kwargs = {'hue': 'bank', 'palette': 'viridis'} if has_bank else {}

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df_analyzed, x='rating', ax=ax, **hue_kwargs)
    # Only claim a per-bank breakdown in the title when one is actually drawn.
    ax.set_title('Distribution of Ratings by Bank' if has_bank else 'Distribution of Ratings')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Number of Reviews')
    plt.show()
else:
    print("Rating data not found for visualization.")

### 5.2. Sentiment Trends

In [None]:
# Bar chart of review counts per sentiment label, split by bank when available.
if 'sentiment_label' in df_analyzed.columns:
    has_bank = 'bank' in df_analyzed.columns
    # seaborn (0.13+) deprecates passing `palette` without `hue`, so the two
    # are only supplied together; without a bank column the default color is used.
    hue_kwargs = {'hue': 'bank', 'palette': 'coolwarm'} if has_bank else {}

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df_analyzed, x='sentiment_label', ax=ax, **hue_kwargs)
    # Only claim a per-bank breakdown in the title when one is actually drawn.
    ax.set_title('Distribution of Sentiment by Bank' if has_bank else 'Distribution of Sentiment')
    ax.set_xlabel('Sentiment Label')
    ax.set_ylabel('Number of Reviews')
    plt.show()

    # TODO: sentiment over time (if 'review_date' is available and meaningful).
    # Untested sketch -- adjust the logic before enabling:
    # if 'review_date' in df_analyzed.columns:
    #     df_analyzed.set_index('review_date', inplace=True)
    #     # Example: Resample by month and count sentiments (requires sentiment_label to be numeric or one-hot encoded)
    #     # sentiment_over_time = df_analyzed.groupby('bank')['sentiment_label'].resample('M').count().unstack(level=0, fill_value=0) # Adjust logic
    #     # sentiment_over_time.plot(kind='line', figsize=(14,7))
    #     # plt.title('Sentiment Trends Over Time by Bank')
    #     # plt.ylabel('Number of Reviews')
    #     # plt.show()
    #     # df_analyzed.reset_index(inplace=True) # Reset index if changed
else:
    print("Sentiment data not found for visualization.")

### 5.3. Keyword Clouds

In [None]:
def _show_wordcloud(text, title, empty_message, **wordcloud_kwargs):
    """Render a word cloud for `text` under `title`.

    Prints `empty_message` and returns None when `text` is blank or when
    WordCloud finds no usable words (it raises ValueError in that case, e.g.
    when every word is a stopword). Otherwise shows the figure and returns
    the generated WordCloud object. Extra keyword arguments are forwarded to
    the WordCloud constructor.
    """
    if not text.strip():
        print(empty_message)
        return None
    try:
        cloud = WordCloud(width=800, height=400, **wordcloud_kwargs).generate(text)
    except ValueError:
        print(empty_message)
        return None

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    plt.show()
    return cloud


if 'review' in df_analyzed.columns and 'sentiment_label' in df_analyzed.columns:
    stopwords = set(STOPWORDS)
    # Add custom stopwords if needed: stopwords.update(['app', 'bank', 'review', 'etc'])

    # astype(str) keeps the join from failing on non-string review values
    # (for example a review that was parsed as a number).
    positive_text = ' '.join(df_analyzed.loc[df_analyzed['sentiment_label'] == 'POSITIVE', 'review'].dropna().astype(str))
    negative_text = ' '.join(df_analyzed.loc[df_analyzed['sentiment_label'] == 'NEGATIVE', 'review'].dropna().astype(str))

    wordcloud_positive = _show_wordcloud(
        positive_text,
        'Word Cloud for Positive Reviews',
        "No positive reviews found to generate word cloud.",
        background_color='white',
        stopwords=stopwords,
    )
    wordcloud_negative = _show_wordcloud(
        negative_text,
        'Word Cloud for Negative Reviews',
        "No negative reviews found to generate word cloud.",
        background_color='black',
        colormap='Reds',
        stopwords=stopwords,
    )
else:
    print("Review text or sentiment labels not found for word cloud generation.")

## 6. Ethical Considerations

In [None]:
# Caveats to keep in mind when interpreting the review-based findings.
# Each entry is printed on its own line beneath the section banner.
ethical_notes = (
    "1. **Selection Bias**: Reviews on app stores might not represent all users. Users with very strong opinions (positive or negative) are more likely to leave reviews, potentially skewing the overall sentiment.",
    "2. **Demographic Bias**: The demographics of users leaving reviews might not match the bank's overall customer base.",
    "3. **Sentiment Model Limitations**: The sentiment model (DistilBERT) is trained on general text and might misinterpret domain-specific jargon or sarcasm prevalent in financial app reviews.",
    "4. **Small Sample Size for Certain Segments**: If a particular bank or rating category has few reviews, insights drawn from it might not be reliable.",
    "5. **Privacy**: While reviews are public, ensure aggregated insights do not inadvertently reveal personally identifiable information if combined with other data.",
)

print("\n--- Ethical Considerations ---")
for note in ethical_notes:
    print(note)

## 7. KPIs Check (Placeholder)

In [None]:
# KPI: 2+ drivers/pain points with evidence. (To be verified after analysis)
# KPI: Clear, labeled visualizations. (To be verified after generating plots)
# KPI: Practical recommendations. (To be verified after formulating suggestions)