# Import Libraries

In [None]:
# Import libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Add parent directory to path to import from src
sys.path.append('..')
from src.indobert_model import IndoBERTSentimentAnalyzer
from src.utils import load_data, save_results, setup_device, plot_confusion_matrix
from src.visualization import (plot_sentiment_time_series, plot_sentiment_distribution, 
                              create_sentiment_wordcloud, visualize_embeddings, 
                              create_interactive_dashboard)

# Set plotting style: apply Matplotlib's bundled seaborn-like white-grid
# style sheet to every figure created after this point.
plt.style.use('seaborn-v0_8-whitegrid')

# Load Data

In [None]:
# Read the preprocessed press releases into the notebook-wide `data` table
data_path = '../data/Press_Release.xlsx'
data = load_data(data_path)

# Summarise what was loaded: table shape, date coverage, and row count
print(f"Dataset shape: {data.shape}")
release_dates = data['Tanggal']
print(f"Time period: {release_dates.min()} to {release_dates.max()}")
print(f"Number of press releases: {len(data)}")

# Preview the first rows (kept as the last expression so the notebook renders it)
data.head()

# Initialize Sentiment Analyzer

In [None]:
# Ask the project helper which compute device is available
# NOTE(review): `device` is not passed to the analyzer below - confirm the
# analyzer selects its own device.
device = setup_device()

# Build the sentiment analyzer, preferring a fine-tuned checkpoint on disk
# and falling back to the analyzer's default model when none is found
model_path = '../models/fine-tuned-indobert/best_model'
if not os.path.exists(model_path):
    print("Using base IndoBERT model (not fine-tuned)")
    analyzer = IndoBERTSentimentAnalyzer()
else:
    print(f"Using fine-tuned model from {model_path}")
    analyzer = IndoBERTSentimentAnalyzer(model_path=model_path)

# The sentiment lexicon is optional: load it only when the file exists
lexicon_path = '../results/lexicon/sentiment_lexicon.xlsx'
lexicon_available = os.path.exists(lexicon_path)
if lexicon_available:
    print(f"Loading sentiment lexicon from {lexicon_path}")
    analyzer.load_lexicon(lexicon_path)

# Analyze Single Document

In [None]:
# Analyze one document and print a readable breakdown of the result
def analyze_document(text):
    """Run the notebook-level ``analyzer`` on ``text`` and print the outcome.

    The analyzer is called with both ``use_bert=True`` and
    ``use_lexicon=True``. The overall label, score and method are always
    printed; the per-label BERT probabilities and the lexicon word counts
    are printed only when the result contains ``'bert_result'`` /
    ``'lexicon_result'``.

    Args:
        text: The document text to analyze.

    Returns:
        The result mapping returned by ``analyzer.analyze_sentiment``.
    """
    outcome = analyzer.analyze_sentiment(text, use_bert=True, use_lexicon=True)

    print(f"Sentiment: {outcome['label']} (Score: {outcome['score']:.3f})")
    print(f"Analysis method: {outcome['method']}")

    if 'bert_result' in outcome:
        print("\nBERT Analysis:")
        for label, prob in outcome['bert_result']['probabilities'].items():
            print(f"  {label}: {prob:.3f}")

    if 'lexicon_result' in outcome:
        print("\nLexicon Analysis:")
        word_counts = outcome['lexicon_result']['counts']
        # English heading paired with the key used in the counts mapping
        for heading, key in (("Positive", 'positif'),
                             ("Negative", 'negatif'),
                             ("Neutral", 'netral')):
            print(f"  {heading} words: {word_counts[key]}")

    return outcome

# Run the analysis on one example row
example_idx = 0  # Change this to analyze different documents
example_row = data.iloc[example_idx]
example_text = example_row['Isi']

print(f"Document title: {example_row['Judul']}")
print(f"Date: {example_row['Tanggal']}")
print("\nAnalysis:")
analysis_result = analyze_document(example_text)

# Batch Analysis of All Documents

In [1]:
# Score every document unless both result columns are already present
already_scored = 'Sentiment' in data.columns and 'Sentiment_Score' in data.columns
if already_scored:
    print("Using existing sentiment analysis results")
else:
    print("Performing sentiment analysis on all documents...")

    # Run the analyzer over the full text column in small batches
    results = analyzer.batch_analyze(data['Isi'], batch_size=4)

    # Copy the label and score of each result into the table
    data['Sentiment'] = [item['label'] for item in results]
    data['Sentiment_Score'] = [item['score'] for item in results]

    # Persist the enriched table
    save_results(data, '../results/sentiment_results.xlsx')
    print("Results saved to ../results/sentiment_results.xlsx")

# Tabulate how many documents fall into each sentiment category
sentiment_counts = data['Sentiment'].value_counts()
print("\nSentiment distribution:")
print(sentiment_counts)

# Bar chart of the same category counts
plt.figure(figsize=(10, 6))
sns.countplot(data=data, x='Sentiment', palette='viridis')
plt.title('Distribution of Sentiment Categories')
plt.ylabel('Count')
plt.grid(axis='y', alpha=0.3)
plt.show()

NameError: name 'data' is not defined

# Temporal Analysis

# Monthly sentiment trends
print("Analyzing sentiment trends over time...")

# Parse the dates and derive a calendar-month bucket for each document
data['Tanggal'] = pd.to_datetime(data['Tanggal'])
data['Month'] = data['Tanggal'].dt.to_period('M')

# Mean score and document count per month; the month periods are converted
# to timestamps for plotting
monthly_sentiment = (
    data.groupby('Month')['Sentiment_Score']
    .agg(['mean', 'count'])
    .reset_index()
)
monthly_sentiment['Month'] = monthly_sentiment['Month'].dt.to_timestamp()

# Line chart of the monthly mean, with a dashed reference line at zero
plt.figure(figsize=(12, 6))
plt.plot(monthly_sentiment['Month'], monthly_sentiment['mean'], 'o-', color='blue')
plt.axhline(y=0, color='gray', linestyle='--', alpha=0.7)
plt.xlabel('Date')
plt.ylabel('Average Sentiment Score')
plt.title('Sentiment Trend Over Time')
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Documents per (month, sentiment) pair, reshaped to long form:
# one row per month/category with its count
sentiment_by_month = (
    pd.crosstab(data['Month'], data['Sentiment'])
    .reset_index()
    .melt(id_vars=['Month'], var_name='Sentiment', value_name='Count')
)
sentiment_by_month['Month'] = sentiment_by_month['Month'].dt.to_timestamp()

# Grouped bar chart: one bar per sentiment category within each month
plt.figure(figsize=(12, 6))
sns.barplot(data=sentiment_by_month, x='Month', y='Count', hue='Sentiment', palette='viridis')
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Sentiment Categories Over Time')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')
plt.tight_layout()
plt.show()

# WordCloud Analysis

# Generate wordclouds by sentiment category
from wordcloud import WordCloud

# Function to generate wordcloud for specific sentiment
def generate_sentiment_wordcloud(sentiment):
    """Draw a word cloud from every document labelled ``sentiment``.

    Filters the notebook-level ``data`` table on its ``Sentiment`` column,
    concatenates the matching ``Isi`` texts and renders them with
    ``WordCloud``.

    Args:
        sentiment: Label to filter on (the notebook uses ``'Positif'``,
            ``'Netral'`` and ``'Negatif'``).

    Returns:
        The generated ``WordCloud`` object, or ``None`` when there are no
        matching documents or the matching documents contain no text.
    """
    # Filter data by sentiment
    filtered_data = data[data['Sentiment'] == sentiment]

    if filtered_data.empty:
        print(f"No data for sentiment: {sentiment}")
        return None

    # Combine all text; cast to str so a stray non-text cell (e.g. a number
    # read from the spreadsheet) cannot make str.join raise TypeError
    text = " ".join(filtered_data['Isi'].fillna("").astype(str))

    # WordCloud.generate raises ValueError when it finds no words, so stop
    # early if every matching document is blank
    if not text.strip():
        print(f"No text available for sentiment: {sentiment}")
        return None

    # Create wordcloud
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white',
                          max_words=200,
                          contour_width=1,
                          contour_color='steelblue').generate(text)

    # Display
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(f"WordCloud for {sentiment} Sentiment")
    plt.tight_layout(pad=0)
    plt.show()

    return wordcloud

# Generate wordcloud for each sentiment
for sentiment in ['Positif', 'Netral', 'Negatif']:
    generate_sentiment_wordcloud(sentiment)

# In-depth Analysis of Specific Documents

In [None]:
# Paragraph-level sentiment breakdown for a single document
def analyze_paragraphs(document_idx):
    """Print the sentiment of each paragraph of one document.

    Looks the document up by position in the notebook-level ``data`` table,
    prints its title, date and stored overall sentiment, then prints the
    label, score and first 100 characters of every paragraph returned by
    ``analyzer.analyze_paragraphs``.

    Args:
        document_idx: Positional row index into ``data``.

    Returns:
        The result of ``analyzer.analyze_paragraphs``, or ``None`` (after
        printing an error) when ``document_idx`` is past the end of the table.
    """
    if document_idx >= len(data):
        print(f"Error: Document index {document_idx} out of range")
        return

    # Header: document metadata and its stored overall sentiment
    record = data.iloc[document_idx]
    print(f"Analyzing document: {record['Judul']}")
    print(f"Date: {record['Tanggal']}")
    print(f"Overall sentiment: {record['Sentiment']} (Score: {record['Sentiment_Score']:.3f})")
    print("\nParagraph analysis:")

    # Per-paragraph results from the analyzer
    result = analyzer.analyze_paragraphs(record['Isi'])

    for number, para in enumerate(result['paragraphs'], start=1):
        print(f"\nParagraph {number}:")
        para_sentiment = para['sentiment']
        print(f"Sentiment: {para_sentiment['label']} (Score: {para_sentiment['score']:.3f})")
        print(f"Text: {para['text'][:100]}...")

    return result

# Run the paragraph breakdown on one document
document_idx = 0  # Change this to analyze different documents
paragraph_analysis = analyze_paragraphs(document_idx)

# Comparative Analysis

In [None]:
# Compare sentiment across different time periods
def _sentiment_stats(period):
    """Return summary sentiment statistics for one slice of the data table.

    Args:
        period: DataFrame slice with ``Sentiment_Score`` and ``Sentiment``
            columns.

    Returns:
        dict with the ``mean``, ``median`` and ``std`` of the scores and the
        percentage of rows labelled ``'Positif'`` / ``'Netral'`` /
        ``'Negatif'`` under ``positive`` / ``neutral`` / ``negative``.
        Every value is NaN for an empty slice.
    """
    scores = period['Sentiment_Score']
    labels = period['Sentiment']

    def share(label):
        # The mean of a boolean mask is the matching fraction. On an empty
        # slice it is NaN, which avoids dividing by a zero row count.
        return (labels == label).mean() * 100

    return {
        'mean': scores.mean(),
        'median': scores.median(),
        'std': scores.std(),
        'positive': share('Positif'),
        'neutral': share('Netral'),
        'negative': share('Negatif'),
    }


def compare_time_periods(start_date1, end_date1, start_date2, end_date2, label1="Period 1", label2="Period 2"):
    """Print and plot a side-by-side sentiment comparison of two date ranges.

    Both ranges are inclusive and are applied to the ``Tanggal`` column of
    the notebook-level ``data`` table.
    NOTE(review): the comparisons assume ``Tanggal`` already holds datetimes
    (the temporal-analysis cell converts it) - confirm that cell has run.

    Args:
        start_date1, end_date1: Bounds of the first period.
        start_date2, end_date2: Bounds of the second period.
        label1, label2: Display names used in the printout and legends.

    Returns:
        None. Output is the printed table and a two-panel figure.
    """
    # Select the rows that fall inside each period
    period1 = data[(data['Tanggal'] >= start_date1) & (data['Tanggal'] <= end_date1)]
    period2 = data[(data['Tanggal'] >= start_date2) & (data['Tanggal'] <= end_date2)]

    print(f"{label1}: {len(period1)} documents")
    print(f"{label2}: {len(period2)} documents")

    # Calculate sentiment statistics with the shared helper
    stats1 = _sentiment_stats(period1)
    stats2 = _sentiment_stats(period2)

    # Display statistics
    print("\nSentiment Statistics:")
    print(f"                {label1}      {label2}")
    print(f"Mean Score:     {stats1['mean']:.3f}       {stats2['mean']:.3f}")
    print(f"Median Score:   {stats1['median']:.3f}       {stats2['median']:.3f}")
    print(f"Std Dev:        {stats1['std']:.3f}       {stats2['std']:.3f}")
    print(f"% Positive:     {stats1['positive']:.1f}%      {stats2['positive']:.1f}%")
    print(f"% Neutral:      {stats1['neutral']:.1f}%      {stats2['neutral']:.1f}%")
    print(f"% Negative:     {stats1['negative']:.1f}%      {stats2['negative']:.1f}%")

    # Plot comparison
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: overlaid score histograms with a dashed line at zero
    sns.histplot(period1['Sentiment_Score'], ax=ax1, label=label1, alpha=0.5, kde=True)
    sns.histplot(period2['Sentiment_Score'], ax=ax1, label=label2, alpha=0.5, kde=True)
    ax1.axvline(x=0, color='gray', linestyle='--', alpha=0.7)
    ax1.set_xlabel('Sentiment Score')
    ax1.set_ylabel('Count')
    ax1.set_title('Sentiment Score Distribution')
    ax1.legend()

    # Right panel: category percentages as side-by-side bars
    categories = ['Positive', 'Neutral', 'Negative']
    period1_pcts = [stats1['positive'], stats1['neutral'], stats1['negative']]
    period2_pcts = [stats2['positive'], stats2['neutral'], stats2['negative']]

    x = np.arange(len(categories))
    width = 0.35

    ax2.bar(x - width/2, period1_pcts, width, label=label1)
    ax2.bar(x + width/2, period2_pcts, width, label=label2)

    ax2.set_xlabel('Sentiment Category')
    ax2.set_ylabel('Percentage')
    ax2.set_title('Sentiment Category Distribution')
    ax2.set_xticks(x)
    ax2.set_xticklabels(categories)
    ax2.legend()

    plt.tight_layout()
    plt.show()

# Example: compare calendar year 2020 with calendar year 2021
compare_time_periods(
    pd.to_datetime('2020-01-01'), pd.to_datetime('2020-12-31'),
    pd.to_datetime('2021-01-01'), pd.to_datetime('2021-12-31'),
    "2020", "2021"
)

# Feature Correlation Analysis

In [None]:
# Analyze correlation between sentiment and other features
# First, derive simple length features from each document's text.
def _count_words(text):
    """Return the number of whitespace-separated tokens (0 for non-text)."""
    return len(text.split()) if isinstance(text, str) else 0


def _count_sentences(text):
    """Return the number of non-blank '.'-separated fragments (0 for non-text).

    Blank fragments are skipped so the empty piece after a final period is
    not counted as a sentence and an empty string counts as zero sentences.
    This is still only an approximation: every '.' is treated as a sentence
    boundary.
    """
    if not isinstance(text, str):
        return 0
    return sum(1 for fragment in text.split('.') if fragment.strip())


def _mean_word_length(text):
    """Return the mean token length (0 for non-text or text without tokens)."""
    if not isinstance(text, str):
        return 0
    words = text.split()
    return np.mean([len(word) for word in words]) if words else 0


data['Word_Count'] = data['Isi'].apply(_count_words)
data['Sentence_Count'] = data['Isi'].apply(_count_sentences)
data['Avg_Word_Length'] = data['Isi'].apply(_mean_word_length)

# Pairwise correlation between the sentiment score and the text-length features
corr_features = ['Sentiment_Score', 'Word_Count', 'Sentence_Count', 'Avg_Word_Length']
corr_matrix = data[corr_features].corr()

# Annotated heatmap with the colour scale fixed to [-1, 1]
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f')
plt.title('Correlation Between Features')
plt.tight_layout()
plt.show()

# One scatter panel per text feature, all plotted against the sentiment score
plt.figure(figsize=(15, 5))

scatter_panels = [
    ('Word_Count', 'Sentiment vs. Word Count', 'Word Count'),
    ('Sentence_Count', 'Sentiment vs. Sentence Count', 'Sentence Count'),
    ('Avg_Word_Length', 'Sentiment vs. Avg Word Length', 'Average Word Length'),
]
for position, (feature, panel_title, x_label) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 3, position)
    sns.scatterplot(x=feature, y='Sentiment_Score', data=data, hue='Sentiment', palette='viridis')
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Sentiment Score')

plt.tight_layout()
plt.show()

# Interactive Dashboard

In [None]:
# Create an interactive dashboard using plotly
# Only the imports sit inside the `try`, so an ImportError raised later while
# building the figure is not misreported as "Plotly is not installed".
# This cell reads `monthly_sentiment` and the `Word_Count` column, which the
# temporal-analysis and feature-correlation cells create.
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
except ImportError:
    print("Plotly is not installed. To use the interactive dashboard, install with: pip install plotly")
else:
    # 2x2 grid: trend line, score histogram, word-count scatter, category bars
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("Sentiment Over Time", "Sentiment Distribution",
                        "Sentiment by Word Count", "Sentiment Categories"),
        specs=[[{"type": "scatter"}, {"type": "histogram"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    # Add sentiment over time
    fig.add_trace(
        go.Scatter(
            x=monthly_sentiment['Month'],
            y=monthly_sentiment['mean'],
            mode='lines+markers',
            name='Average Sentiment'
        ),
        row=1, col=1
    )

    # Add a dashed zero line spanning the plotted date range
    fig.add_shape(
        type="line",
        x0=monthly_sentiment['Month'].min(),
        y0=0,
        x1=monthly_sentiment['Month'].max(),
        y1=0,
        line=dict(color="gray", width=1, dash="dash"),
        row=1, col=1
    )

    # Add sentiment distribution
    fig.add_trace(
        go.Histogram(
            x=data['Sentiment_Score'],
            nbinsx=20,
            name='Sentiment Distribution'
        ),
        row=1, col=2
    )

    # Add sentiment by word count, with markers coloured by score
    fig.add_trace(
        go.Scatter(
            x=data['Word_Count'],
            y=data['Sentiment_Score'],
            mode='markers',
            name='Sentiment by Word Count',
            marker=dict(
                size=8,
                color=data['Sentiment_Score'],
                colorscale='Viridis',
                showscale=True
            )
        ),
        row=2, col=1
    )

    # Add sentiment categories
    sentiment_counts = data['Sentiment'].value_counts().reset_index()
    sentiment_counts.columns = ['Sentiment', 'Count']

    # value_counts() orders rows by frequency, so a fixed colour list would
    # tie colours to rank rather than to the category. Look each colour up by
    # label instead; labels outside the three known ones are drawn in gray.
    category_colors = {'Negatif': 'red', 'Netral': 'blue', 'Positif': 'green'}
    bar_colors = [category_colors.get(label, 'gray')
                  for label in sentiment_counts['Sentiment']]

    fig.add_trace(
        go.Bar(
            x=sentiment_counts['Sentiment'],
            y=sentiment_counts['Count'],
            name='Sentiment Categories',
            marker_color=bar_colors
        ),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        title_text="Bank Sentral Press Release Sentiment Dashboard",
        height=800,
        width=1200
    )

    fig.show()

# Export Results and Summary

In [None]:
# Build the text report and the statistics it is based on
def create_summary_report():
    """Summarise the sentiment results held in the notebook-level ``data``.

    Reads the ``Tanggal``, ``Judul``, ``Sentiment`` and ``Sentiment_Score``
    columns of ``data``.

    Returns:
        tuple: ``(report, summary)`` where ``report`` is the formatted
        plain-text report and ``summary`` is a dict with the keys
        ``total_documents``, ``time_period``, ``sentiment_distribution``,
        ``sentiment_stats``, ``most_positive_document`` and
        ``most_negative_document``.
    """
    scores = data['Sentiment_Score']
    labels = data['Sentiment']

    def describe_document(row_label):
        # Title, date and score of the row with the given index label
        return {
            'title': data.loc[row_label, 'Judul'],
            'date': data.loc[row_label, 'Tanggal'],
            'score': data.loc[row_label, 'Sentiment_Score'],
        }

    summary = {
        'total_documents': len(data),
        'time_period': f"{data['Tanggal'].min()} to {data['Tanggal'].max()}",
        'sentiment_distribution': {
            'positive': (labels == 'Positif').sum(),
            'neutral': (labels == 'Netral').sum(),
            'negative': (labels == 'Negatif').sum(),
        },
        'sentiment_stats': {
            'mean': scores.mean(),
            'median': scores.median(),
            'std': scores.std(),
            'min': scores.min(),
            'max': scores.max(),
        },
    }

    # Documents with the highest and the lowest score
    summary['most_positive_document'] = describe_document(scores.idxmax())
    summary['most_negative_document'] = describe_document(scores.idxmin())

    total = summary['total_documents']
    distribution = summary['sentiment_distribution']
    stats = summary['sentiment_stats']

    # Collect the report piece by piece and join once at the end
    sections = [
        f"Sentiment Analysis Report\n{'=' * 50}\n",
        f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        f"Total documents analyzed: {total}\n",
        f"Time period: {summary['time_period']}\n\n",
        "Sentiment Distribution:\n",
    ]
    for heading, key in (("Positive", 'positive'),
                         ("Neutral", 'neutral'),
                         ("Negative", 'negative')):
        sections.append(f"- {heading}: {distribution[key]} ({distribution[key] / total * 100:.1f}%)\n")
    sections.append("\n")

    sections.append("Sentiment Score Statistics:\n")
    for heading, key in (("Mean", 'mean'), ("Median", 'median'),
                         ("Std Dev", 'std'), ("Min", 'min'), ("Max", 'max')):
        sections.append(f"- {heading}: {stats[key]:.3f}\n")
    sections.append("\n")

    for heading, key in (("Most Positive Document", 'most_positive_document'),
                         ("Most Negative Document", 'most_negative_document')):
        document = summary[key]
        sections.append(f"{heading}:\n")
        sections.append(f"- Title: {document['title']}\n")
        sections.append(f"- Date: {document['date']}\n")
        sections.append(f"- Score: {document['score']:.3f}\n\n")

    report = "".join(sections)
    return report, summary

# Generate and display summary report
report_text, summary_data = create_summary_report()
print(report_text)

# Save report to file, creating the results directory if needed.
# The encoding is set explicitly so document titles with non-ASCII characters
# are written the same way on every platform instead of depending on the
# locale's default encoding.
report_path = '../results/sentiment_analysis_report.txt'
os.makedirs(os.path.dirname(report_path), exist_ok=True)
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report_text)
print(f"Report saved to {report_path}")