# Import Libraries

In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Add parent directory to path to import from src
sys.path.append('..')
from src.utils import load_data

# Download NLTK resources if needed.
# Newer NLTK releases load the Punkt sentence tokenizer used by word_tokenize
# from the 'punkt_tab' package instead of the pickled 'punkt' one, so both are
# requested. NOTE(review): on older NLTK versions 'punkt_tab' is presumably
# just reported as unavailable (download() returns False) - confirm.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')

# Load Data

In [None]:
# Read the press-release workbook through the project's loader
data_path = '../data/Press_Release.xlsx'
data = load_data(data_path)

# Summarize the size of the dataset and the dates it spans
print(f"Dataset shape: {data.shape}")
release_dates_raw = data['Tanggal']
print(f"Time period: {release_dates_raw.min()} to {release_dates_raw.max()}")
release_count = data.shape[0]
print(f"Number of press releases: {release_count}")

# Preview the leading rows (last expression of the cell)
data.head()

# Analyze Text Length and Distribution

In [None]:
# Add word count column
def _word_count(cell):
    """Return the number of whitespace-separated words in a string cell, else 0."""
    if not isinstance(cell, str):
        return 0
    return len(cell.split())


data['Word_Count'] = data['Isi'].apply(_word_count)

# Summary statistics of the per-release word counts
word_count_stats = data['Word_Count'].describe()
print("Word Count Statistics:")
print(word_count_stats)

# Histogram of word counts with the mean and median marked as dashed lines
mean_words = word_count_stats['mean']
median_words = word_count_stats['50%']

plt.figure(figsize=(10, 6))
sns.histplot(data['Word_Count'], bins=30, kde=True)
for marker_x, marker_color, marker_label in (
    (mean_words, 'red', f'Mean: {mean_words:.0f}'),
    (median_words, 'green', f'Median: {median_words:.0f}'),
):
    plt.axvline(x=marker_x, color=marker_color, linestyle='--', label=marker_label)
plt.title('Distribution of Word Count in Press Releases')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

# Define and Compare Stopwords Lists

In [None]:
# Get standard Indonesian stopwords from NLTK.
# Only resource-lookup failures are handled here; anything else (including
# KeyboardInterrupt) propagates instead of being silently swallowed.
# NOTE(review): LookupError is presumably raised when the stopwords corpus is
# not installed and OSError when it has no 'indonesian' file - confirm.
try:
    indonesian_stopwords = set(stopwords.words('indonesian'))
except (LookupError, OSError):
    print("Indonesian stopwords not available in NLTK")
    indonesian_stopwords = set()
else:
    print(f"Standard Indonesian stopwords loaded: {len(indonesian_stopwords)} words")

# Define domain-specific stopwords for banking/finance
banking_stopwords = {
    'bank', 'sentral', 'bi', 'indonesia', 'gubernur', 'deputi', 'direktur',
    'rapat', 'dewan', 'the', 'and', 'for', 'that', 'dengan', 'dalam', 'pada',
    'dari', 'yang', 'dan', 'ini', 'itu', 'atau', 'juga', 'untuk', 'oleh', 'di',
    'ke', 'tidak', 'akan', 'telah', 'sebagai', 'atas', 'serta', 'sedangkan',
    'sementara', 'yaitu', 'yakni', 'bahwa', 'menteri', 'kebijakan',
    'perekonomian', 'keuangan', 'moneter', 'fiskal', 'inflasi', 'pertumbuhan',
    'ekonomi', 'rupiah', 'nilai', 'tukar', 'persen', 'bunga', 'suku', 'pasar'
}

# Combine stopwords (set union, so words present in both lists appear once)
combined_stopwords = indonesian_stopwords | banking_stopwords
print(f"Combined stopwords: {len(combined_stopwords)} words")

# Print some examples (sets are unordered, so the 20 shown are arbitrary)
print("\nExamples of standard Indonesian stopwords:")
print(list(indonesian_stopwords)[:20])

print("\nExamples of banking domain stopwords:")
print(list(banking_stopwords)[:20])

# Compare the lists: which words do both contain?
common_words = indonesian_stopwords & banking_stopwords
print(f"\nWords appearing in both lists: {len(common_words)}")
if common_words:
    print(list(common_words))

# Tokenize and Analyze Word Frequency

In [None]:
# Tokenize a text and optionally drop stopwords
def tokenize_and_count_words(text, remove_stopwords=False, stopwords_list=None):
    """Return the lowercased tokens of ``text``, optionally without stopwords.

    Despite the name, this function does not count anything; it returns the
    token list and callers do the counting.

    Args:
        text: Document to tokenize. Anything that is not a ``str`` yields ``[]``.
        remove_stopwords: When truthy, filter tokens against ``stopwords_list``.
        stopwords_list: Collection of tokens to exclude. Filtering is skipped
            when it is ``None`` or empty.

    Returns:
        List of tokens produced by ``word_tokenize`` on the lowercased text.
    """
    if isinstance(text, str):
        lowered_tokens = word_tokenize(text.lower())
        if remove_stopwords and stopwords_list:
            return [tok for tok in lowered_tokens if tok not in stopwords_list]
        return lowered_tokens
    return []

# Get all tokens from the corpus.
# Each document is tokenized once; the stopword-free stream is derived from the
# same token list instead of tokenizing the document a second time.
all_tokens_with_stopwords = []
all_tokens_without_stopwords = []

for text in data['Isi']:
    document_tokens = tokenize_and_count_words(text)  # [] for non-string cells
    all_tokens_with_stopwords.extend(document_tokens)
    all_tokens_without_stopwords.extend(
        token for token in document_tokens if token not in combined_stopwords
    )

# Count word frequencies
word_freq_with_stopwords = Counter(all_tokens_with_stopwords)
word_freq_without_stopwords = Counter(all_tokens_without_stopwords)

total_token_count = len(all_tokens_with_stopwords)
kept_token_count = len(all_tokens_without_stopwords)
# Guard the ratio so an empty corpus reports 0.0% instead of raising ZeroDivisionError
if total_token_count:
    stopword_share = (total_token_count - kept_token_count) / total_token_count * 100
else:
    stopword_share = 0.0

print(f"Total tokens with stopwords: {total_token_count}")
print(f"Total tokens without stopwords: {kept_token_count}")
print(f"Stopwords account for {stopword_share:.1f}% of all tokens")

# Get top words
top_words_with_stopwords = word_freq_with_stopwords.most_common(20)
top_words_without_stopwords = word_freq_without_stopwords.most_common(20)

# Create DataFrames for visualization
top_words_with_df = pd.DataFrame(top_words_with_stopwords, columns=['Word', 'Frequency'])
top_words_without_df = pd.DataFrame(top_words_without_stopwords, columns=['Word', 'Frequency'])

# Visualize Top Words With and Without Stopwords

In [None]:
# Two stacked bar charts: top words with stopwords (upper) and without (lower)
top_word_panels = (
    (1, top_words_with_df, 'Top 20 Words (Including Stopwords)'),
    (2, top_words_without_df, 'Top 20 Words (Excluding Stopwords)'),
)

plt.figure(figsize=(12, 10))
for panel_position, panel_frame, panel_title in top_word_panels:
    # Select the panel_position-th row of a 2x1 grid before drawing into it
    plt.subplot(2, 1, panel_position)
    sns.barplot(x='Frequency', y='Word', data=panel_frame, palette='viridis')
    plt.title(panel_title)
    plt.xlabel('Frequency')

plt.tight_layout()
plt.show()

# Create WordCloud Visualizations

In [None]:
# Create WordClouds with and without stopwords
from wordcloud import WordCloud

# Function to create and display wordcloud
def create_wordcloud(text, title, remove_stopwords=False, stopwords_list=None):
    """Render a word cloud for ``text`` with matplotlib and return it.

    Args:
        text: Corpus as a single string (``.lower()`` is called on it when
            stopwords are removed).
        title: Title shown above the rendered figure.
        remove_stopwords: When truthy, tokens found in ``stopwords_list`` are
            dropped before the cloud is generated.
        stopwords_list: Collection of lowercase tokens to drop. ``None`` is
            treated as "nothing to drop" instead of raising ``TypeError``.

    Returns:
        The generated ``WordCloud`` object.
    """
    if remove_stopwords:
        excluded = stopwords_list or ()
        words = [word for word in word_tokenize(text.lower()) if word not in excluded]
        text = ' '.join(words)

    # stopwords=set() disables WordCloud's own filtering: with the default
    # (None) it falls back to its built-in English stopword list, so even the
    # "with stopwords" cloud would have words silently removed. Stopword
    # handling is done exclusively by the filter above.
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white',
                          max_words=200,
                          stopwords=set(),
                          contour_width=1,
                          contour_color='steelblue').generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()

    return wordcloud

# Concatenate every string document into one space-separated corpus
corpus_text = ' '.join(document for document in data['Isi'] if isinstance(document, str))

# Cloud of the raw corpus
wordcloud_with_stopwords = create_wordcloud(corpus_text, "WordCloud with Stopwords")

# Cloud of the corpus after dropping the combined stopwords list
wordcloud_without_stopwords = create_wordcloud(
    corpus_text,
    "WordCloud without Stopwords",
    True,
    combined_stopwords,
)

# Analyze Stopwords Impact by Document

In [None]:
# Analyze the percentage of stopwords in each document
def _stopword_share(document):
    """Return the percentage of ``document``'s tokens found in combined_stopwords.

    Non-string documents and documents without tokens yield 0.
    """
    if not isinstance(document, str):
        return 0
    doc_tokens = word_tokenize(document.lower())
    if not doc_tokens:
        return 0
    hits = sum(tok in combined_stopwords for tok in doc_tokens)
    return hits / len(doc_tokens) * 100


stopword_percentages = [_stopword_share(document) for document in data['Isi']]

# Add to dataframe
data['Stopword_Percentage'] = stopword_percentages

# Summary statistics of the per-document percentages
stopword_stats = data['Stopword_Percentage'].describe()
print("Stopword Percentage Statistics:")
print(stopword_stats)

# Histogram with the mean and median marked as dashed lines
mean_share = stopword_stats['mean']
median_share = stopword_stats['50%']

plt.figure(figsize=(10, 6))
sns.histplot(data['Stopword_Percentage'], bins=20, kde=True)
for marker_x, marker_color, marker_label in (
    (mean_share, 'red', f'Mean: {mean_share:.1f}%'),
    (median_share, 'green', f'Median: {median_share:.1f}%'),
):
    plt.axvline(x=marker_x, color=marker_color, linestyle='--', label=marker_label)
plt.title('Distribution of Stopword Percentage in Press Releases')
plt.xlabel('Stopword Percentage')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

# Stopwords Over Time Analysis

In [None]:
# Derive calendar columns from the release date (parsed once, used twice)
parsed_release_dates = pd.to_datetime(data['Tanggal'])
data['Year'] = parsed_release_dates.dt.year
data['Month'] = parsed_release_dates.dt.month

# Mean stopword percentage per year, as a two-column frame
yearly_mean = data.groupby('Year')['Stopword_Percentage'].mean()
stopwords_by_year = yearly_mean.reset_index()

# Line chart of the yearly average
plt.figure(figsize=(12, 6))
sns.lineplot(data=stopwords_by_year, x='Year', y='Stopword_Percentage', marker='o')
plt.title('Average Stopword Percentage by Year')
plt.xlabel('Year')
plt.ylabel('Stopword Percentage')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Scatter plot to eyeball whether stopword share varies with document length
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='Word_Count', y='Stopword_Percentage')
plt.title('Stopword Percentage vs. Document Length')
plt.xlabel('Word Count')
plt.ylabel('Stopword Percentage')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Custom Stopwords Analysis

In [None]:
# Analyze which stopwords are most common in the corpus.
# word_freq_with_stopwords is the Counter built over all_tokens_with_stopwords,
# so each stopword's frequency is a dictionary lookup rather than a full scan
# of the token list per stopword. A Counter returns 0 for absent words.
stopword_counts = {
    stopword: word_freq_with_stopwords[stopword]
    for stopword in combined_stopwords
}

# Convert to DataFrame, most frequent first
stopword_freq = pd.DataFrame(list(stopword_counts.items()), columns=['Stopword', 'Frequency'])
stopword_freq = stopword_freq.sort_values('Frequency', ascending=False).reset_index(drop=True)

# Display top stopwords
print("Top 20 most frequent stopwords in the corpus:")
print(stopword_freq.head(20))

# Plot top stopwords
plt.figure(figsize=(12, 8))
sns.barplot(x='Frequency', y='Stopword', data=stopword_freq.head(20), palette='viridis')
plt.title('Top 20 Most Frequent Stopwords')
plt.xlabel('Frequency')
plt.tight_layout()
plt.show()

# Evaluate Impact of Individual Stopwords

In [None]:
# Share of a document's tokens taken up by one stopword
def evaluate_stopword_impact(text, stopword):
    """Return the percentage of tokens in ``text`` that equal ``stopword``.

    Args:
        text: Document to inspect; non-string values yield 0.
        stopword: Token to look for (compared against lowercased tokens).

    Returns:
        ``matches / token_count * 100``, or 0 when the text has no tokens.
    """
    if isinstance(text, str):
        doc_tokens = word_tokenize(text.lower())
        if len(doc_tokens) > 0:
            return doc_tokens.count(stopword) / len(doc_tokens) * 100
    return 0

# Evaluate impact of top stopwords
top_stopwords = stopword_freq.head(30)['Stopword'].tolist()

# Tokenize every document once and keep a per-document token tally. The result
# equals averaging evaluate_stopword_impact(doc, stopword) over all documents,
# but without re-tokenizing the whole corpus for each of the stopwords.
doc_token_tallies = [
    Counter(word_tokenize(document.lower())) if isinstance(document, str) else Counter()
    for document in data['Isi']
]
doc_token_totals = [sum(tally.values()) for tally in doc_token_tallies]

impact_data = []
for stopword in top_stopwords:
    # Per-document share of this stopword; documents without tokens count as 0
    per_document_impact = [
        tally[stopword] / total * 100 if total > 0 else 0
        for tally, total in zip(doc_token_tallies, doc_token_totals)
    ]
    # Calculate average impact across documents
    avg_impact = pd.Series(per_document_impact, dtype=float).mean()
    impact_data.append({'Stopword': stopword, 'Impact': avg_impact})

# Explicit columns keep the frame sortable even when no stopwords were evaluated
impact_df = pd.DataFrame(impact_data, columns=['Stopword', 'Impact'])
impact_df = impact_df.sort_values('Impact', ascending=False).reset_index(drop=True)

print("Impact of removing top stopwords:")
print(impact_df.head(20))

# Plot impact
plt.figure(figsize=(12, 8))
sns.barplot(x='Impact', y='Stopword', data=impact_df.head(20), palette='viridis')
plt.title('Impact of Removing Top Stopwords (% of Text Removed)')
plt.xlabel('Average Impact (%)')
plt.tight_layout()
plt.show()

# Optimize Stopwords List for Banking Domain

In [None]:
# Create an optimized stopwords list
# Keep only stopwords with significant impact. impact_df covers just the 30
# most frequent stopwords, so the candidates come from that subset.
impact_threshold = 0.1  # Minimum impact percentage to include in optimized list
optimized_stopwords = impact_df.loc[
    impact_df['Impact'] >= impact_threshold, 'Stopword'
].tolist()

# Add domain-specific stopwords that might not be frequent but are irrelevant for analysis
domain_specific = [
    'bank', 'sentral', 'bi', 'indonesia', 'gubernur', 'deputi', 'direktur',
    'rapat', 'dewan', 'kebijakan', 'moneter', 'keuangan',
    'pertemuan', 'triwulan', 'kuartal', 'persen', 'tanggal', 'bulan'
]

# Combine lists (set removes words present in both)
final_stopwords = set(optimized_stopwords + domain_specific)
print(f"Final optimized stopwords list: {len(final_stopwords)} words")
print(sorted(final_stopwords))

# Save the optimized stopwords list, one word per line in sorted order
stopwords_path = '../results/optimized_stopwords.txt'
os.makedirs(os.path.dirname(stopwords_path), exist_ok=True)

# Explicit UTF-8 so the file is written identically on every platform instead
# of depending on the locale's default encoding.
with open(stopwords_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in sorted(final_stopwords))

print(f"Optimized stopwords list saved to {stopwords_path}")

# Compare Text Processing With Different Stopwords

In [None]:
# Compare results of different stopwords lists on one example document.
# The first string document is used; a missing/NaN first cell (or an empty
# dataset) falls back to the placeholder instead of failing on .lower().
example_text = next(
    (document for document in data['Isi'] if isinstance(document, str)),
    "Contoh teks tidak tersedia",
)

# Define different stopwords lists to compare
stopwords_lists = {
    'No Stopwords': set(),
    'NLTK Indonesian': indonesian_stopwords,
    'Domain-specific': banking_stopwords,
    'Combined': combined_stopwords,
    'Optimized': final_stopwords
}

print("Example text:")
print(example_text[:500] + "...\n")

# The tokenization does not depend on the stopwords list, so do it once
tokens = word_tokenize(example_text.lower())

print("Text after removing different stopwords lists:")
for name, sw_list in stopwords_lists.items():
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in sw_list]
    filtered_text = ' '.join(filtered_tokens)

    # Calculate reduction; an example without tokens is reported as 0% reduction
    reduction = (1 - len(filtered_tokens) / len(tokens)) * 100 if tokens else 0.0

    print(f"\n{name} ({len(sw_list)} words, {reduction:.1f}% reduction):")
    print(filtered_text[:500] + "...")

# Evaluate and Compare Results

In [None]:
# Evaluate impact of different stopwords lists on corpus.
# all_tokens_with_stopwords already holds the lowercased tokens of every string
# document, so it is reused here instead of re-tokenizing the corpus once per
# stopwords list.
evaluation_results = []
total_tokens = len(all_tokens_with_stopwords)

for name, sw_list in stopwords_lists.items():
    # Tokens that survive removal of this list
    remaining_tokens = sum(1 for token in all_tokens_with_stopwords if token not in sw_list)

    # Calculate metrics; an empty corpus is reported as 0% reduction rather
    # than raising ZeroDivisionError
    reduction = (1 - remaining_tokens / total_tokens) * 100 if total_tokens else 0.0
    evaluation_results.append({
        'Stopwords List': name,
        'List Size': len(sw_list),
        'Total Tokens': total_tokens,
        'Remaining Tokens': remaining_tokens,
        'Reduction (%)': reduction
    })

# Convert to DataFrame
eval_df = pd.DataFrame(evaluation_results)
print("Evaluation of different stopwords lists:")
print(eval_df)

# Plot comparison
plt.figure(figsize=(12, 6))
sns.barplot(x='Stopwords List', y='Reduction (%)', data=eval_df, palette='viridis')
plt.title('Text Reduction by Different Stopwords Lists')
plt.xlabel('Stopwords List')
plt.ylabel('Reduction (%)')
plt.ylim(0, 100)
# Annotate each bar with its value, slightly above the bar top
for i, v in enumerate(eval_df['Reduction (%)']):
    plt.text(i, v + 1, f"{v:.1f}%", ha='center')
plt.tight_layout()
plt.show()

# Generate Final Recommendations

In [None]:
# Create final recommendations.
# The report is assembled once as a list of lines, then printed and written to
# disk, so the console output and the saved file cannot drift apart.

# Identify most effective list based on size vs. reduction.
# A zero-sized list (the 'No Stopwords' baseline) is mapped to NaN so it is
# skipped by idxmax instead of producing a 0/0 division.
efficiency = eval_df['Reduction (%)'] / eval_df['List Size'].replace(0, np.nan)
most_efficient_idx = efficiency.idxmax()
# idxmax returns an index label, so the row is looked up with .loc
most_efficient_row = eval_df.loc[most_efficient_idx]
most_efficient = most_efficient_row['Stopwords List']

section_rule = "-" * 40
report_lines = [
    "Recommendations for Stopwords in Bank Sentral NLP Analysis",
    "=" * 60,
    "",
    "1. Most Effective Stopwords List",
    section_rule,
    f"Most efficient stopwords list: {most_efficient}",
    f"Provides {most_efficient_row['Reduction (%)']:.1f}% text reduction with only {most_efficient_row['List Size']} words",
    "",
    "2. Key Stopwords to Always Include",
    section_rule,
    "The following stopwords have the highest impact and should always be included:",
]
report_lines.extend(
    f"- {row['Stopword']}: {row['Impact']:.2f}% impact"
    for _, row in impact_df.head(10).iterrows()
)
report_lines.extend([
    "",
    "3. Domain-Specific Consideration",
    section_rule,
    "For Bank Sentral communication analysis, consider the context:",
    "- Some common banking terms might be stopwords in general analysis but meaningful for specific tasks",
    "- Terms like 'inflasi', 'suku bunga' might be stopwords for document classification but crucial for sentiment analysis",
    "",
    "4. Final Recommendation",
    section_rule,
    "Use the optimized stopwords list for general preprocessing, but maintain flexibility:",
    f"- {len(final_stopwords)} optimized stopwords provide good balance between reduction and specificity",
    "- For topic modeling: use more aggressive stopwords removal",
    "- For sentiment analysis: use more conservative stopwords removal",
    "- Consider task-specific stopwords lists for different analysis goals",
])
report_text = "\n".join(report_lines)

print(report_text)

# Save recommendations (the results directory is created if it does not exist)
recommendations_path = '../results/stopwords_recommendations.txt'
os.makedirs(os.path.dirname(recommendations_path), exist_ok=True)
with open(recommendations_path, 'w', encoding='utf-8') as f:
    f.write(report_text + "\n")

print(f"\nRecommendations saved to {recommendations_path}")