In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo

# NLP and ML libraries
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from keybert import KeyBERT

# Text processing
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")
pyo.init_notebook_mode(connected=True)

print("All libraries imported successfully!")

# Load the dataset
print("Loading UN General Assembly dataset...")
df = pd.read_csv('un.csv')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

All libraries imported successfully!
Loading UN General Assembly dataset...
Dataset shape: (7507, 4)
Columns: ['session', 'year', 'country', 'text']

First few rows:


Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [36]:
# Part 2: Data Exploration and Cleaning

# Explore the dataset structure
print("Dataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nColumn data types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

# Display basic statistics
print(f"\nYear range: {df['year'].min()} - {df['year'].max()}")
print(f"Number of countries: {df['country'].nunique()}")
print(f"Total speeches: {len(df)}")

# Check text length distribution
df['text_length'] = df['text'].str.len()
print(f"\nText length statistics:")
print(df['text_length'].describe())

# Visualize year and country distribution
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Speeches per Year', 'Top 20 Countries by Speech Count', 
                   'Text Length Distribution', 'Speeches per Year (Line Plot)'),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "histogram"}, {"type": "scatter"}]]
)

# Speeches per year
year_counts = df['year'].value_counts().sort_index()
fig.add_trace(go.Bar(x=year_counts.index, y=year_counts.values, name="Speeches per Year"), row=1, col=1)

# Top countries
country_counts = df['country'].value_counts().head(20)
fig.add_trace(go.Bar(x=country_counts.values, y=country_counts.index, 
                     orientation='h', name="Top Countries"), row=1, col=2)

# Text length distribution
fig.add_trace(go.Histogram(x=df['text_length'], nbinsx=50, name="Text Length"), row=2, col=1)

# Speeches timeline
fig.add_trace(go.Scatter(x=year_counts.index, y=year_counts.values, 
                        mode='lines+markers', name="Timeline"), row=2, col=2)

fig.update_layout(height=800, showlegend=False, title_text="Dataset Overview")
fig.show()

# Filter data for analysis
print("\n" + "="*50)
print("FILTERING DATA FOR ANALYSIS")
print("="*50)

# Filter by year (2010-2020)
df_filtered = df[(df['year'] >= 2010) & (df['year'] <= 2020)].copy()
print(f"After year filtering (2010-2020): {len(df_filtered)} speeches")

# Remove very short speeches (< 500 characters)
df_filtered = df_filtered[df_filtered['text_length'] >= 500].copy()
print(f"After removing short speeches: {len(df_filtered)} speeches")

# Remove null texts
df_filtered = df_filtered.dropna(subset=['text']).copy()
print(f"After removing null texts: {len(df_filtered)} speeches")

# Clean text function
def clean_text(text):
    """Clean and preprocess text for better clustering"""
    if pd.isna(text):
        return ""
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove common diplomatic phrases that appear in most speeches
    diplomatic_phrases = [
        'mr. president', 'mr president', 'madam president', 'distinguished delegates',
        'excellencies', 'ladies and gentlemen', 'thank you mr. president',
        'thank you madam president', 'general assembly', 'united nations'
    ]
    
    for phrase in diplomatic_phrases:
        text = text.replace(phrase, '')
    
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Remove very common words that don't add semantic value
    common_words = ['country', 'countries', 'world', 'international', 'global']
    for word in common_words:
        text = re.sub(r'\b' + word + r'\b', '', text)
    
    return text.strip()

# Apply text cleaning
print("\nCleaning text...")
df_filtered['cleaned_text'] = df_filtered['text'].apply(clean_text)

# Remove speeches that became too short after cleaning
df_filtered = df_filtered[df_filtered['cleaned_text'].str.len() >= 200].copy()
print(f"Final dataset size: {len(df_filtered)} speeches")

# Final statistics
print(f"\nFinal dataset statistics:")
print(f"Years: {df_filtered['year'].min()} - {df_filtered['year'].max()}")
print(f"Countries: {df_filtered['country'].nunique()}")
print(f"Average text length: {df_filtered['cleaned_text'].str.len().mean():.0f} characters")

# Display sample of cleaned data
print(f"\nSample of cleaned data:")
sample_df = df_filtered[['country', 'year', 'cleaned_text']].head(3)
for idx, row in sample_df.iterrows():
    print(f"\n{row['country']} ({row['year']}):")
    print(f"{row['cleaned_text'][:200]}...")

# Save the filtered dataset for later use
df_final = df_filtered.copy()
print(f"\nFiltered dataset ready with {len(df_final)} speeches!")

Dataset Info:
Shape: (7507, 4)
Columns: ['session', 'year', 'country', 'text']

Column data types:
session     int64
year        int64
country    object
text       object
dtype: object

Missing values:
session    0
year       0
country    0
text       0
dtype: int64

Year range: 1970 - 2015
Number of countries: 199
Total speeches: 7507

Text length statistics:
count     7507.000000
mean     17967.281604
std       7860.038463
min       2362.000000
25%      12077.000000
50%      16424.000000
75%      22479.500000
max      72041.000000
Name: text_length, dtype: float64



FILTERING DATA FOR ANALYSIS
After year filtering (2010-2020): 1158 speeches
After removing short speeches: 1158 speeches
After removing null texts: 1158 speeches

Cleaning text...
Final dataset size: 1158 speeches

Final dataset statistics:
Years: 2010 - 2015
Countries: 196
Average text length: 12151 characters

Sample of cleaned data:

SUR (2013):
allow me at the outset, on behalf of the president of the republic of suriname, his excellency mr. desiré delano bouterse, and the people and the government of suriname, to congratulate you, , on your...

KOR (2013):
may i first congratulate you, sir, on your election as president of the at its sixty-eighth session. i am confident that the current session will be a fruitful one under your able leadership. the repu...

GNB (2013):
i would like to begin my statement by congratulating ambassador john william ashe on his election as president of the general assembly at its sixty-eighth session. i would also like to extend our cong...

Filtered 

In [37]:
import numpy as np
import torch 
from sentence_transformers import SentenceTransformer


In [38]:
# Part 3: Generate BERT Embeddings

print("="*50)
print("GENERATING BERT EMBEDDINGS")
print("="*50)

# Initialize the SentenceTransformer model
# Using all-mpnet-base-v2 for better quality semantic representations
print("Loading SentenceTransformer model...")
model = SentenceTransformer('all-mpnet-base-v2')
print("Model loaded successfully!")

# Check if we need to batch the embedding generation
n_speeches = len(df_final)
print(f"Generating embeddings for {n_speeches} speeches...")

# Generate embeddings with progress tracking
# We'll batch process to handle memory efficiently
batch_size = 32
embeddings_list = []

from tqdm import tqdm

# Process in batches
for i in tqdm(range(0, n_speeches, batch_size), desc="Generating embeddings"):
    batch_texts = df_final['cleaned_text'].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts, show_progress_bar=False)
    embeddings_list.extend(batch_embeddings)

# Convert to numpy array
embeddings = np.array(embeddings_list)
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")

# Save embeddings for later use (optional)
np.save('un_speech_embeddings.npy', embeddings)
print("Embeddings saved to 'un_speech_embeddings.npy'")

# Basic statistics about embeddings
print(f"\nEmbedding statistics:")
print(f"Mean: {embeddings.mean():.4f}")
print(f"Std: {embeddings.std():.4f}")
print(f"Min: {embeddings.min():.4f}")
print(f"Max: {embeddings.max():.4f}")

# Compute pairwise similarities for a sample to verify quality
from sklearn.metrics.pairwise import cosine_similarity

print(f"\nVerifying embedding quality with sample similarities...")
sample_indices = np.random.choice(len(embeddings), size=5, replace=False)
sample_embeddings = embeddings[sample_indices]
sample_similarities = cosine_similarity(sample_embeddings)

print("Sample cosine similarities between random speeches:")
for i in range(len(sample_similarities)):
    for j in range(i+1, len(sample_similarities)):
        country_i = df_final.iloc[sample_indices[i]]['country']
        country_j = df_final.iloc[sample_indices[j]]['country']
        year_i = df_final.iloc[sample_indices[i]]['year']
        year_j = df_final.iloc[sample_indices[j]]['year']
        similarity = sample_similarities[i, j]
        print(f"{country_i} ({year_i}) vs {country_j} ({year_j}): {similarity:.3f}")

print(f"\nEmbeddings generated successfully! Ready for dimensionality reduction.")

GENERATING BERT EMBEDDINGS
Loading SentenceTransformer model...
Model loaded successfully!
Generating embeddings for 1158 speeches...


Generating embeddings: 100%|██████████| 37/37 [00:40<00:00,  1.09s/it]


Embeddings shape: (1158, 768)
Embedding dimension: 768
Embeddings saved to 'un_speech_embeddings.npy'

Embedding statistics:
Mean: -0.0003
Std: 0.0361
Min: -0.1849
Max: 0.1898

Verifying embedding quality with sample similarities...
Sample cosine similarities between random speeches:
BFA (2015) vs MAR (2011): 0.549
BFA (2015) vs NLD (2010): 0.561
BFA (2015) vs ZAF (2011): 0.615
BFA (2015) vs JOR (2013): 0.487
MAR (2011) vs NLD (2010): 0.674
MAR (2011) vs ZAF (2011): 0.800
MAR (2011) vs JOR (2013): 0.736
NLD (2010) vs ZAF (2011): 0.676
NLD (2010) vs JOR (2013): 0.503
ZAF (2011) vs JOR (2013): 0.690

Embeddings generated successfully! Ready for dimensionality reduction.


In [39]:
# Part 4: Dimensionality Reduction with UMAP

print("="*50)
print("DIMENSIONALITY REDUCTION WITH UMAP")
print("="*50)

# UMAP for dimensionality reduction
# UMAP is better than PCA for clustering as it preserves local structure
print("Applying UMAP for dimensionality reduction...")

# Configure UMAP parameters
# - n_neighbors: larger values preserve more global structure
# - min_dist: smaller values allow tighter clustering
# - metric: cosine is good for high-dimensional semantic embeddings
umap_reducer = umap.UMAP(
    n_neighbors=15,        # Balance between local and global structure
    min_dist=0.1,          # Allow tight clusters
    n_components=2,        # 2D for visualization
    metric='cosine',       # Good for semantic embeddings
    random_state=42,       # Reproducibility
    verbose=True
)

# Fit and transform the embeddings
print("Fitting UMAP...")
embeddings_2d = umap_reducer.fit_transform(embeddings)
df_final['umap_x'] = embeddings_2d[:, 0]
df_final['umap_y'] = embeddings_2d[:, 1]

print(f"UMAP reduction complete!")
print(f"Original shape: {embeddings.shape}")
print(f"Reduced shape: {embeddings_2d.shape}")

# Also create 3D embeddings for alternative visualization
print("\nCreating 3D UMAP embedding...")
umap_3d = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    metric='cosine',
    random_state=42
)

embeddings_3d = umap_3d.fit_transform(embeddings)
print(f"3D embedding shape: {embeddings_3d.shape}")

# Visualize the UMAP reduction
print("\nVisualizing UMAP results...")

# Create a basic scatter plot colored by year
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('UMAP Colored by Year', 'UMAP Colored by Country (Sample)'),
    specs=[[{"type": "scatter"}, {"type": "scatter"}]]
)

# Plot colored by year
fig.add_trace(
    go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers',
        marker=dict(
            size=4,
            color=df_final['year'].values,
            colorscale='Viridis',
            colorbar=dict(title="Year"),
            opacity=0.7
        ),
        text=[f"{country} ({year})" for country, year in zip(df_final['country'], df_final['year'])],
        hovertemplate='<b>%{text}</b><br>X: %{x:.2f}<br>Y: %{y:.2f}<extra></extra>',
        name="By Year"
    ),
    row=1, col=1
)

# Plot colored by country (sample of top countries)
top_countries = df_final['country'].value_counts().head(10).index
country_colors = {country: i for i, country in enumerate(top_countries)}
colors = [country_colors.get(country, -1) for country in df_final['country']]

fig.add_trace(
    go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers',
        marker=dict(
            size=4,
            color=colors,
            colorscale='Viridis',
            opacity=0.7
        ),
        text=[f"{country} ({year})" for country, year in zip(df_final['country'], df_final['year'])],
        hovertemplate='<b>%{text}</b><br>X: %{x:.2f}<br>Y: %{y:.2f}<extra></extra>',
        name="By Country"
    ),
    row=1, col=2
)

fig.update_layout(
    height=500,
    title_text="UMAP Dimensionality Reduction Results",
    showlegend=False
)

fig.show()

# Save the reduced embeddings for clustering
print(f"\nSaving UMAP embeddings...")
np.save('un_speech_embeddings_2d.npy', embeddings_2d)
np.save('un_speech_embeddings_3d.npy', embeddings_3d)

# Add UMAP coordinates to dataframe for later analysis
df_final['umap_x'] = embeddings_2d[:, 0]
df_final['umap_y'] = embeddings_2d[:, 1]
df_final['umap_z'] = embeddings_3d[:, 2]

print(f"UMAP dimensionality reduction complete!")
print(f"2D coordinates range - X: [{embeddings_2d[:, 0].min():.2f}, {embeddings_2d[:, 0].max():.2f}]")
print(f"2D coordinates range - Y: [{embeddings_2d[:, 1].min():.2f}, {embeddings_2d[:, 1].max():.2f}]")

DIMENSIONALITY REDUCTION WITH UMAP
Applying UMAP for dimensionality reduction...
Fitting UMAP...
UMAP(angular_rp_forest=True, metric='cosine', n_jobs=1, random_state=42, verbose=True)
Fri Jun 13 11:58:40 2025 Construct fuzzy simplicial set
Fri Jun 13 11:58:42 2025 Finding Nearest Neighbors
Fri Jun 13 11:58:42 2025 Finished Nearest Neighbor Search
Fri Jun 13 11:58:42 2025 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Fri Jun 13 11:58:43 2025 Finished embedding
UMAP reduction complete!
Original shape: (1158, 768)
Reduced shape: (1158, 2)

Creating 3D UMAP embedding...


KeyboardInterrupt: 

In [None]:
# Part 5: Clustering with HDBSCAN

print("="*50)
print("CLUSTERING WITH HDBSCAN")
print("="*50)

# HDBSCAN is chosen over K-means because:
# 1. It can find clusters of varying density and size
# 2. It doesn't require specifying the number of clusters
# 3. It can identify noise/outliers
# 4. It works well with UMAP embeddings

print("Applying HDBSCAN clustering...")

# Configure HDBSCAN parameters
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=15,    # Minimum speeches per cluster
    min_samples=5,          # Core point threshold
    cluster_selection_epsilon=0.5,  # Distance threshold
    metric='euclidean',     # Works well with UMAP output
    cluster_selection_method='eom'  # Excess of Mass method
)

# Fit the clusterer
cluster_labels = clusterer.fit_predict(embeddings_2d)

# Add cluster labels to dataframe
df_final['cluster'] = cluster_labels

# Analyze clustering results
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
n_noise = list(cluster_labels).count(-1)

print(f"Clustering complete!")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise}")
print(f"Percentage of points clustered: {((len(cluster_labels) - n_noise) / len(cluster_labels)) * 100:.1f}%")

# Cluster size distribution
cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()
print(f"\nCluster sizes:")
for cluster_id, size in cluster_sizes.items():
    if cluster_id == -1:
        print(f"Noise: {size} speeches")
    else:
        print(f"Cluster {cluster_id}: {size} speeches")

# Calculate clustering quality metrics
if n_clusters > 1:
    # Silhouette score (exclude noise points)
    mask = cluster_labels != -1
    if np.sum(mask) > 0:
        silhouette_avg = silhouette_score(embeddings_2d[mask], cluster_labels[mask])
        print(f"\nSilhouette Score: {silhouette_avg:.3f}")

# Visualize clusters
print(f"\nVisualizing clusters...")

# Create interactive cluster visualization
fig = go.Figure()

# Define colors for clusters
colors = px.colors.qualitative.Set3
if n_clusters > len(colors):
    colors = colors * (n_clusters // len(colors) + 1)

# Add each cluster
for cluster_id in sorted(set(cluster_labels)):
    mask = cluster_labels == cluster_id
    
    if cluster_id == -1:
        # Noise points
        fig.add_trace(go.Scatter(
            x=embeddings_2d[mask, 0],
            y=embeddings_2d[mask, 1],
            mode='markers',
            marker=dict(
                size=4,
                color='lightgray',
                opacity=0.5
            ),
            text=[f"{country} ({year})" for country, year in 
                  zip(df_final[mask]['country'], df_final[mask]['year'])],
            hovertemplate='<b>%{text}</b><br>Cluster: Noise<br>X: %{x:.2f}<br>Y: %{y:.2f}<extra></extra>',
            name='Noise',
            showlegend=True
        ))
    else:
        fig.add_trace(go.Scatter(
            x=embeddings_2d[mask, 0],
            y=embeddings_2d[mask, 1],
            mode='markers',
            marker=dict(
                size=5,
                color=colors[cluster_id % len(colors)],
                opacity=0.8
            ),
            text=[f"{country} ({year})" for country, year in 
                  zip(df_final[mask]['country'], df_final[mask]['year'])],
            hovertemplate=f'<b>%{{text}}</b><br>Cluster: {cluster_id}<br>X: %{{x:.2f}}<br>Y: %{{y:.2f}}<extra></extra>',
            name=f'Cluster {cluster_id}',
            showlegend=True
        ))

fig.update_layout(
    title=f'UN Speech Clusters (HDBSCAN) - {n_clusters} Clusters Found',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    width=900,
    height=600,
    hovermode='closest'
)

fig.show()

# Analyze cluster composition by country and year
print(f"\nAnalyzing cluster composition...")

def analyze_cluster_composition(cluster_id):
    """Analyze the composition of a specific cluster"""
    if cluster_id == -1:
        cluster_data = df_final[df_final['cluster'] == -1]
        print(f"\nNOISE POINTS ({len(cluster_data)} speeches):")
    else:
        cluster_data = df_final[df_final['cluster'] == cluster_id]
        print(f"\nCLUSTER {cluster_id} ({len(cluster_data)} speeches):")
    
    # Top countries in this cluster
    top_countries = cluster_data['country'].value_counts().head(5)
    print(f"Top countries: {dict(top_countries)}")
    
    # Year distribution
    year_dist = cluster_data['year'].value_counts().sort_index()
    print(f"Year range: {year_dist.index.min()} - {year_dist.index.max()}")
    
    # Sample speech excerpt
    sample = cluster_data.sample(1).iloc[0]
    print(f"Sample speech ({sample['country']}, {sample['year']}):")
    print(f"'{sample['cleaned_text'][:200]}...'")

# Analyze each cluster
for cluster_id in sorted(set(cluster_labels)):
    if cluster_id != -1:  # Skip noise for now
        analyze_cluster_composition(cluster_id)

print(f"\nClustering analysis complete! Found {n_clusters} distinct themes in UN speeches.")

CLUSTERING WITH HDBSCAN
Applying HDBSCAN clustering...
Clustering complete!
Number of clusters: 2
Number of noise points: 3
Percentage of points clustered: 99.7%

Cluster sizes:
Noise: 3 speeches
Cluster 0: 28 speeches
Cluster 1: 1127 speeches

Silhouette Score: 0.221

Visualizing clusters...



Analyzing cluster composition...

CLUSTER 0 (28 speeches):
Top countries: {'ISR': 6, 'PSE': 5, 'QAT': 2, 'SAU': 2, 'OMN': 2}
Year range: 2010 - 2015
Sample speech (CZE, 2011):
'allow me to congratulate you, sir, on your election to the very important post of the highest representative of the at its sixty-sixth session. a few days ago new york city and the entire  remembered ...'

CLUSTER 1 (1127 speeches):
Top countries: {'SUR': 6, 'KGZ': 6, 'ITA': 6, 'AZE': 6, 'ALB': 6}
Year range: 2010 - 2015
Sample speech (MAR, 2011):
'it is a great pleasure for me to extend to mr. al-nasser my sincere congratulations on his election as president of the at its sixty-sixth session. his election reflects the esteem enjoyed by the brot...'

Clustering analysis complete! Found 2 distinct themes in UN speeches.


In [None]:
df_final['cluster'] = cluster_labels
df_final.to_csv("df_final_with_clusters.csv", index=False)  # 💾 Save for Part 6


In [None]:
# Enhanced Thematic Classification for UN General Assembly Speeches
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import defaultdict, Counter

print("="*50)
print("ENHANCED THEMATIC CLASSIFICATION SYSTEM")
print("="*50)

# Define comprehensive theme categories with keywords and patterns
THEME_DEFINITIONS = {
    'Security & Conflict': {
        'keywords': [
            'security', 'conflict', 'war', 'peace', 'terrorism', 'violence', 'military',
            'armed', 'battlefield', 'ceasefire', 'peacekeeping', 'troops', 'insurgency',
            'civil war', 'aggression', 'hostile', 'threat', 'defense', 'combat',
            'counterterrorism', 'extremism', 'insurgent', 'militant', 'warfare'
        ],
        'phrases': [
            'international security', 'regional conflict', 'peace process',
            'security council', 'armed conflict', 'peacekeeping operations',
            'counter terrorism', 'regional security', 'conflict resolution'
        ]
    },
    
    'Climate & Environment': {
        'keywords': [
            'climate', 'environment', 'warming', 'emissions', 'carbon', 'greenhouse',
            'sustainability', 'renewable', 'pollution', 'biodiversity', 'ecosystem',
            'deforestation', 'conservation', 'green', 'clean energy', 'fossil fuel',
            'paris agreement', 'kyoto protocol', 'environmental protection'
        ],
        'phrases': [
            'climate change', 'global warming', 'environmental protection',
            'sustainable development', 'renewable energy', 'carbon emissions',
            'environmental degradation', 'climate action', 'green technology'
        ]
    },
    
    'Economic Development': {
        'keywords': [
            'economic', 'development', 'trade', 'investment', 'growth', 'poverty',
            'finance', 'market', 'economy', 'business', 'commerce', 'industry',
            'employment', 'jobs', 'fiscal', 'monetary', 'gdp', 'inflation',
            'recession', 'recovery', 'prosperity', 'wealth', 'income'
        ],
        'phrases': [
            'economic development', 'poverty reduction', 'economic growth',
            'international trade', 'foreign investment', 'market access',
            'economic cooperation', 'financial stability', 'sustainable development'
        ]
    },
    
    'Human Rights & Democracy': {
        'keywords': [
            'human rights', 'democracy', 'freedom', 'justice', 'equality', 'liberty',
            'civil rights', 'political rights', 'discrimination', 'oppression',
            'authoritarian', 'democratic', 'election', 'voting', 'citizen',
            'constitution', 'rule of law', 'judicial', 'legal system'
        ],
        'phrases': [
            'human rights', 'democratic governance', 'rule of law',
            'civil liberties', 'political freedom', 'social justice',
            'equal rights', 'democratic institutions', 'constitutional reform'
        ]
    },
    
    'Health & Pandemic': {
        'keywords': [
            'health', 'pandemic', 'disease', 'medical', 'healthcare', 'epidemic',
            'virus', 'infection', 'vaccine', 'treatment', 'medicine', 'hospital',
            'public health', 'mortality', 'illness', 'outbreak', 'contagious',
            'immunization', 'WHO', 'world health organization'
        ],
        'phrases': [
            'public health', 'global health', 'health systems', 'pandemic response',
            'health security', 'medical assistance', 'health infrastructure',
            'epidemic control', 'health emergency'
        ]
    },
    
    'Nuclear & Disarmament': {
        'keywords': [
            'nuclear', 'disarmament', 'weapons', 'arms', 'proliferation',
            'non-proliferation', 'atomic', 'missile', 'warhead', 'uranium',
            'plutonium', 'nuclear power', 'nuclear energy', 'IAEA',
            'nuclear weapons', 'arms control', 'weapon systems'
        ],
        'phrases': [
            'nuclear disarmament', 'arms control', 'nuclear non-proliferation',
            'nuclear weapons', 'disarmament treaty', 'arms reduction',
            'nuclear security', 'weapon systems', 'nuclear technology'
        ]
    },
    
    'Infrastructure & Technology': {
        'keywords': [
            'infrastructure', 'technology', 'digital', 'internet', 'communication',
            'transport', 'roads', 'bridges', 'railways', 'airports', 'ports',
            'telecommunications', 'broadband', 'connectivity', 'innovation',
            'research', 'development', 'science', 'technical cooperation'
        ],
        'phrases': [
            'infrastructure development', 'digital divide', 'technology transfer',
            'scientific cooperation', 'innovation systems', 'digital transformation',
            'transport infrastructure', 'communication networks', 'research collaboration'
        ]
    },
    
    'Education & Culture': {
        'keywords': [
            'education', 'school', 'university', 'learning', 'knowledge',
            'culture', 'cultural', 'heritage', 'diversity', 'language',
            'literacy', 'educational', 'academic', 'scholarship', 'student',
            'teacher', 'curriculum', 'UNESCO', 'cultural exchange'
        ],
        'phrases': [
            'education for all', 'cultural diversity', 'educational development',
            'cultural heritage', 'knowledge sharing', 'educational cooperation',
            'cultural exchange', 'lifelong learning', 'educational reform'
        ]
    },
    
    'Migration & Refugees': {
        'keywords': [
            'migration', 'refugee', 'displacement', 'asylum', 'immigrant',
            'migrant', 'border', 'humanitarian', 'internally displaced',
            'stateless', 'repatriation', 'resettlement', 'UNHCR',
            'human trafficking', 'forced migration'
        ],
        'phrases': [
            'refugee crisis', 'forced displacement', 'asylum seekers',
            'migration flows', 'humanitarian assistance', 'refugee protection',
            'internal displacement', 'migration policy', 'refugee camps'
        ]
    },
    
    'Regional Cooperation': {
        'keywords': [
            'regional', 'cooperation', 'partnership', 'alliance', 'bilateral',
            'multilateral', 'neighboring', 'bloc', 'union', 'integration',
            'collaboration', 'joint', 'shared', 'collective', 'community'
        ],
        'phrases': [
            'regional cooperation', 'international partnership', 'bilateral relations',
            'multilateral cooperation', 'regional integration', 'joint initiatives',
            'collective action', 'shared responsibility', 'regional development'
        ]
    }
}

def calculate_theme_scores(text, theme_definitions):
    """Calculate relevance scores for each theme based on text content"""
    text_lower = text.lower()
    scores = {}
    
    for theme, definition in theme_definitions.items():
        score = 0
        
        # Score based on individual keywords
        for keyword in definition['keywords']:
            # Count occurrences with word boundaries to avoid partial matches
            keyword_pattern = r'\b' + re.escape(keyword.lower()) + r'\b'
            matches = len(re.findall(keyword_pattern, text_lower))
            score += matches * 1.0  # Base weight for keywords
        
        # Score based on phrases (higher weight)
        for phrase in definition['phrases']:
            phrase_pattern = re.escape(phrase.lower())
            matches = len(re.findall(phrase_pattern, text_lower))
            score += matches * 2.0  # Higher weight for phrases
        
        scores[theme] = score
    
    return scores

def classify_speech_themes(df, min_score=1.0, max_themes=3):
    """Classify speeches into themes based on content analysis"""
    
    print("Analyzing speech content for thematic classification...")
    
    # Initialize results
    theme_results = []
    
    for idx, row in df.iterrows():
        text = row['cleaned_text']
        
        # Calculate theme scores
        theme_scores = calculate_theme_scores(text, THEME_DEFINITIONS)
        
        # Sort themes by score
        sorted_themes = sorted(theme_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Select relevant themes (above minimum score)
        relevant_themes = [(theme, score) for theme, score in sorted_themes 
                          if score >= min_score][:max_themes]
        
        # Determine primary theme
        if relevant_themes:
            primary_theme = relevant_themes[0][0]
            primary_score = relevant_themes[0][1]
            all_themes = [theme for theme, score in relevant_themes]
        else:
            primary_theme = "General/Other"
            primary_score = 0
            all_themes = ["General/Other"]
        
        theme_results.append({
            'index': idx,
            'primary_theme': primary_theme,
            'primary_score': primary_score,
            'all_themes': all_themes,
            'theme_scores': theme_scores
        })
    
    return theme_results

def enhanced_clustering_with_themes(df, theme_results):
    """Create enhanced clustering that considers both content similarity and themes"""
    
    print("Creating enhanced thematic clusters...")
    
    # Add theme information to dataframe
    df_enhanced = df.copy()
    
    for result in theme_results:
        idx = result['index']
        df_enhanced.loc[idx, 'primary_theme'] = result['primary_theme']
        df_enhanced.loc[idx, 'theme_confidence'] = result['primary_score']
        df_enhanced.loc[idx, 'all_themes'] = ', '.join(result['all_themes'])
    
    # Create theme-based clusters
    theme_clusters = {}
    cluster_id = 0
    
    for theme in THEME_DEFINITIONS.keys():
        theme_mask = df_enhanced['primary_theme'] == theme
        if theme_mask.sum() > 0:
            df_enhanced.loc[theme_mask, 'thematic_cluster'] = cluster_id
            theme_clusters[cluster_id] = theme
            cluster_id += 1
    
    # Handle General/Other category
    general_mask = df_enhanced['primary_theme'] == 'General/Other'
    if general_mask.sum() > 0:
        df_enhanced.loc[general_mask, 'thematic_cluster'] = cluster_id
        theme_clusters[cluster_id] = 'General/Other'
    
    return df_enhanced, theme_clusters

def create_theme_analysis_report(df_enhanced, theme_clusters):
    """Create comprehensive theme analysis report"""
    
    print("\n" + "="*70)
    print("COMPREHENSIVE THEME ANALYSIS REPORT")
    print("="*70)
    
    total_speeches = len(df_enhanced)
    
    # Theme distribution
    theme_distribution = df_enhanced['primary_theme'].value_counts()
    
    print(f"\n📊 THEME DISTRIBUTION (Total Speeches: {total_speeches})")
    print("-" * 50)
    
    for theme, count in theme_distribution.items():
        percentage = (count / total_speeches) * 100
        print(f"{theme:25} | {count:4d} speeches ({percentage:5.1f}%)")
    
    # Top countries by theme
    print(f"\n🌍 TOP COUNTRIES BY THEME")
    print("-" * 50)
    
    for theme in theme_distribution.index[:5]:  # Top 5 themes
        theme_data = df_enhanced[df_enhanced['primary_theme'] == theme]
        top_countries = theme_data['country'].value_counts().head(3)
        
        print(f"\n{theme}:")
        for country, count in top_countries.items():
            print(f"  • {country}: {count} speeches")
    
    # Temporal analysis
    print(f"\n📅 TEMPORAL THEME EVOLUTION")
    print("-" * 50)
    
    # Show how themes evolved over years
    theme_year_analysis = df_enhanced.groupby(['year', 'primary_theme']).size().unstack(fill_value=0)
    
    for theme in theme_distribution.index[:3]:  # Top 3 themes
        theme_yearly = theme_year_analysis[theme] if theme in theme_year_analysis.columns else None
        if theme_yearly is not None:
            peak_year = theme_yearly.idxmax()
            peak_count = theme_yearly.max()
            print(f"{theme}: Peak in {peak_year} ({peak_count} speeches)")
    
    # High-confidence classifications
    high_confidence = df_enhanced[df_enhanced['theme_confidence'] >= 5.0]
    print(f"\n🎯 HIGH-CONFIDENCE CLASSIFICATIONS")
    print("-" * 50)
    print(f"Speeches with strong thematic signals: {len(high_confidence)} ({len(high_confidence)/total_speeches*100:.1f}%)")
    
    return theme_distribution

def extract_theme_keywords(df_enhanced):
    """Extract representative keywords for each theme"""
    
    print(f"\n🔍 THEME KEYWORD EXTRACTION")
    print("-" * 50)
    
    theme_keywords = {}
    
    for theme in df_enhanced['primary_theme'].unique():
        if theme == 'General/Other':
            continue
            
        theme_texts = df_enhanced[df_enhanced['primary_theme'] == theme]['cleaned_text'].tolist()
        
        if len(theme_texts) > 0:
            # Combine all texts for this theme
            combined_text = ' '.join(theme_texts)
            
            # Simple keyword extraction using frequency
            words = re.findall(r'\b[a-zA-Z]{4,}\b', combined_text.lower())
            word_freq = Counter(words)
            
            # Filter out common words and get top keywords
            common_words = {'that', 'this', 'with', 'have', 'will', 'from', 'they', 'been', 'their', 'would', 'could', 'should', 'also', 'more', 'such', 'only', 'like', 'time', 'very', 'can', 'may', 'must', 'through', 'over', 'under', 'within', 'between', 'among', 'during', 'before', 'after', 'above', 'below', 'while', 'where', 'when', 'what', 'which', 'whom', 'whose', 'these', 'those', 'some', 'many', 'much', 'most', 'other', 'another', 'every', 'each', 'both', 'all', 'any', 'none', 'one', 'two', 'three', 'first', 'second', 'third', 'last', 'next', 'previous', 'following', 'above', 'below', 'here', 'there', 'now', 'then', 'today', 'tomorrow', 'yesterday'}
            
            filtered_words = [(word, freq) for word, freq in word_freq.most_common(20) 
                            if word not in common_words and len(word) > 3]
            
            theme_keywords[theme] = filtered_words[:10]
            
            print(f"\n{theme}:")
            print(f"  Keywords: {', '.join([word for word, freq in filtered_words[:8]])}")
    
    return theme_keywords

# Apply the enhanced classification system
print("Starting enhanced thematic classification...")

# Classify speeches into themes
theme_results = classify_speech_themes(df_final, min_score=1.0, max_themes=3)

# Create enhanced clustering
df_enhanced, theme_clusters = enhanced_clustering_with_themes(df_final, theme_results)

# Generate comprehensive analysis report
theme_distribution = create_theme_analysis_report(df_enhanced, theme_clusters)

# Extract theme keywords
theme_keywords = extract_theme_keywords(df_enhanced)

# Update the main dataframe with new classifications
df_final = df_enhanced.copy()
df_final['theme'] = df_final['primary_theme']  # For compatibility with existing code

print("\n" + "="*70)
print("ENHANCED CLASSIFICATION COMPLETE!")
print("="*70)
print("✅ All speeches have been classified into thematic categories")
print("✅ No speeches marked as 'outliers' - all content is analyzed")
print("✅ Multi-dimensional theme analysis completed")
print("✅ Ready for visualization and further analysis")

# Show sample classifications
print(f"\n📋 SAMPLE CLASSIFICATIONS:")
print("-" * 50)

for theme in theme_distribution.head(3).index:
    sample_speech = df_final[df_final['theme'] == theme].iloc[0]
    print(f"\n{theme} - {sample_speech['country']} ({sample_speech['year']}):")
    print(f"  Preview: {sample_speech['cleaned_text'][:150]}...")
    print(f"  Confidence: {sample_speech['theme_confidence']:.1f}")

ENHANCED THEMATIC CLASSIFICATION SYSTEM
Starting enhanced thematic classification...
Analyzing speech content for thematic classification...


KeyboardInterrupt: 

In [None]:
# Part 7: Interactive Dashboard and Visualizations

print("="*50)
print("CREATING INTERACTIVE DASHBOARD")
print("="*50)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# 1. Main Cluster Visualization with Theme Labels
print("Creating main cluster visualization...")

fig_main = go.Figure()

# Create color mapping for themes
unique_themes = df_final['theme'].unique()
theme_colors = px.colors.qualitative.Set3[:len(unique_themes)]
theme_color_map = dict(zip(unique_themes, theme_colors))

# Add traces for each theme
for theme in unique_themes:
    if theme == "Noise":
        continue
    
    theme_data = df_final[df_final['theme'] == theme]
    
    fig_main.add_trace(go.Scatter(
        x=theme_data['umap_x'],
        y=theme_data['umap_y'],
        mode='markers',
        marker=dict(
            size=6,
            color=theme_color_map[theme],
            opacity=0.8,
            line=dict(width=0.5, color='white')
        ),
        text=[f"<b>{country} ({year})</b><br>Theme: {theme}<br>Cluster: {cluster}" 
              for country, year, cluster in zip(theme_data['country'], theme_data['year'], theme_data['cluster'])],
        hovertemplate='%{text}<br>X: %{x:.2f}<br>Y: %{y:.2f}<extra></extra>',
        name=theme,
        showlegend=True
    ))

# Add noise points if any
noise_data = df_final[df_final['theme'] == "Noise"]
if len(noise_data) > 0:
    fig_main.add_trace(go.Scatter(
        x=noise_data['umap_x'],
        y=noise_data['umap_y'],
        mode='markers',
        marker=dict(
            size=4,
            color='lightgray',
            opacity=0.4
        ),
        text=[f"<b>{country} ({year})</b><br>Theme: Noise" 
              for country, year in zip(noise_data['country'], noise_data['year'])],
        hovertemplate='%{text}<br>X: %{x:.2f}<br>Y: %{y:.2f}<extra></extra>',
        name="Noise",
        showlegend=True
    ))

fig_main.update_layout(
    title=dict(
        text="UN General Assembly Speech Themes (2010-2020)<br><sub>Interactive Cluster Analysis</sub>",
        x=0.5,
        font=dict(size=20)
    ),
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    width=1000,
    height=700,
    hovermode='closest',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.01
    )
)

fig_main.show()

# 2. Multi-faceted Analysis Dashboard
print("Creating comprehensive analysis dashboard...")

# Create subplot dashboard
fig_dashboard = make_subplots(
    rows=3, cols=2,
    subplot_titles=(
        'Theme Distribution Over Time',
        'Top Countries by Theme',
        'Cluster Sizes',
        'Year Distribution by Theme',
        'Country Diversity by Theme',
        'Theme Evolution Timeline'
    ),
    specs=[[{"type": "scatter"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "scatter"}]]
)

# 1. Theme distribution over time
theme_year_data = df_final[df_final['theme'] != 'Noise'].groupby(['year', 'theme']).size().reset_index(name='count')
for theme in theme_year_data['theme'].unique():
    theme_subset = theme_year_data[theme_year_data['theme'] == theme]
    fig_dashboard.add_trace(
        go.Scatter(
            x=theme_subset['year'],
            y=theme_subset['count'],
            mode='lines+markers',
            name=theme,
            line=dict(color=theme_color_map.get(theme, 'gray')),
            showlegend=False
        ),
        row=1, col=1
    )

# 2. Top countries by theme (stacked bar)
country_theme_data = df_final[df_final['theme'] != 'Noise'].groupby(['country', 'theme']).size().reset_index(name='count')
top_countries = df_final['country'].value_counts().head(10).index

for theme in unique_themes:
    if theme == 'Noise':
        continue
    theme_country_data = country_theme_data[
        (country_theme_data['theme'] == theme) & 
        (country_theme_data['country'].isin(top_countries))
    ]
    fig_dashboard.add_trace(
        go.Bar(
            x=theme_country_data['country'],
            y=theme_country_data['count'],
            name=theme,
            marker_color=theme_color_map.get(theme, 'gray'),
            showlegend=False
        ),
        row=1, col=2
    )

# 3. Cluster sizes
cluster_sizes = df_final[df_final['cluster'] != -1].groupby(['cluster', 'theme']).size().reset_index(name='size')
fig_dashboard.add_trace(
    go.Bar(
        x=[f"C{row['cluster']}: {row['theme']}" for _, row in cluster_sizes.iterrows()],
        y=cluster_sizes['size'],
        marker_color=[theme_color_map.get(theme, 'gray') for theme in cluster_sizes['theme']],
        showlegend=False
    ),
    row=2, col=1
)

# 4. Year distribution by theme
theme_year_totals = df_final[df_final['theme'] != 'Noise'].groupby('theme')['year'].apply(list).reset_index()
for _, row in theme_year_totals.iterrows():
    theme = row['theme']
    years = row['year']
    year_counts = pd.Series(years).value_counts().sort_index()
    fig_dashboard.add_trace(
        go.Bar(
            x=year_counts.index,
            y=year_counts.values,
            name=theme,
            marker_color=theme_color_map.get(theme, 'gray'),
            showlegend=False,
            opacity=0.7
        ),
        row=2, col=2
    )

# 5. Country diversity by theme
theme_diversity = df_final[df_final['theme'] != 'Noise'].groupby('theme')['country'].nunique().reset_index()
theme_diversity.columns = ['theme', 'unique_countries']
fig_dashboard.add_trace(
    go.Bar(
        x=theme_diversity['theme'],
        y=theme_diversity['unique_countries'],
        marker_color=[theme_color_map.get(theme, 'gray') for theme in theme_diversity['theme']],
        showlegend=False
    ),
    row=3, col=1
)

# 6. Theme evolution (proportion over time)
theme_proportions = df_final[df_final['theme'] != 'Noise'].groupby(['year', 'theme']).size().unstack(fill_value=0)
theme_proportions = theme_proportions.div(theme_proportions.sum(axis=1), axis=0)

for theme in theme_proportions.columns:
    fig_dashboard.add_trace(
        go.Scatter(
            x=theme_proportions.index,
            y=theme_proportions[theme],
            mode='lines+markers',
            name=theme,
            line=dict(color=theme_color_map.get(theme, 'gray')),
            showlegend=False
        ),
        row=3, col=2
    )

# Update layout
fig_dashboard.update_layout(
    height=1200,
    title_text="UN Speech Analysis Dashboard",
    showlegend=False
)

# Update axes labels
fig_dashboard.update_xaxes(title_text="Year", row=1, col=1)
fig_dashboard.update_yaxes(title_text="Number of Speeches", row=1, col=1)
fig_dashboard.update_xaxes(title_text="Country", row=1, col=2)
fig_dashboard.update_yaxes(title_text="Number of Speeches", row=1, col=2)
fig_dashboard.update_xaxes(title_text="Cluster", row=2, col=1)
fig_dashboard.update_yaxes(title_text="Size", row=2, col=1)
fig_dashboard.update_xaxes(title_text="Year", row=2, col=2)
fig_dashboard.update_yaxes(title_text="Number of Speeches", row=2, col=2)
fig_dashboard.update_xaxes(title_text="Theme", row=3, col=1)
fig_dashboard.update_yaxes(title_text="Unique Countries", row=3, col=1)
fig_dashboard.update_xaxes(title_text="Year", row=3, col=2)
fig_dashboard.update_yaxes(title_text="Proportion", row=3, col=2)

fig_dashboard.show()

# 3. Country-Theme Heatmap
print("Creating country-theme analysis...")

# Create country-theme matrix for top countries
top_countries_list = df_final['country'].value_counts().head(15).index
country_theme_matrix = df_final[
    (df_final['country'].isin(top_countries_list)) & 
    (df_final['theme'] != 'Noise')
].pivot_table(
    index='country', 
    columns='theme', 
    values='year',  # Use year as values, just to count
    aggfunc='count', 
    fill_value=0
)

# Normalize by row to show proportions
country_theme_proportions = country_theme_matrix.div(country_theme_matrix.sum(axis=1), axis=0)

fig_heatmap = go.Figure(data=go.Heatmap(
    z=country_theme_proportions.values,
    x=country_theme_proportions.columns,
    y=country_theme_proportions.index,
    colorscale='Viridis',
    hoverongaps=False,
    hovertemplate='Country: %{y}<br>Theme: %{x}<br>Proportion: %{z:.2f}<extra></extra>'
))

fig_heatmap.update_layout(
    title='Country Focus by Theme (Proportional)',
    xaxis_title='Theme',
    yaxis_title='Country',
    width=800,
    height=600
)

fig_heatmap.show()

print("Interactive dashboard created successfully!")
print("\nDashboard includes:")
print("1. Main cluster visualization with theme labels")
print("2. Multi-faceted analysis dashboard")
print("3. Country-theme heatmap")
print("\nAll visualizations are interactive - hover, zoom, and click to explore!")

CREATING INTERACTIVE DASHBOARD
Creating main cluster visualization...


Creating comprehensive analysis dashboard...


Creating country-theme analysis...


Interactive dashboard created successfully!

Dashboard includes:
1. Main cluster visualization with theme labels
2. Multi-faceted analysis dashboard
3. Country-theme heatmap

All visualizations are interactive - hover, zoom, and click to explore!


In [None]:
# Part 8: Generate Insights and Executive Summary

print("="*50)
print("GENERATING INSIGHTS AND EXECUTIVE SUMMARY")
print("="*50)

# Calculate key statistics for the executive summary
total_speeches = len(df_final)
total_countries = df_final['country'].nunique()
total_years = df_final['year'].nunique()
total_clusters = len([c for c in df_final['cluster'].unique() if c != -1])
noise_percentage = (df_final['cluster'] == -1).sum() / len(df_final) * 100

# Theme statistics
theme_stats = df_final[df_final['theme'] != 'Noise']['theme'].value_counts()
most_common_theme = theme_stats.index[0] if len(theme_stats) > 0 else "None"

print("EXECUTIVE SUMMARY")
print("="*50)
print(f"📊 Dataset Overview:")
print(f"   • Total speeches analyzed: {total_speeches:,}")
print(f"   • Countries represented: {total_countries}")
print(f"   • Time period: {df_final['year'].min()}-{df_final['year'].max()}")
print(f"   • Clusters identified: {total_clusters}")
print(f"   • Noise/outliers: {noise_percentage:.1f}%")

print(f"\n🎯 Key Findings:")
print(f"   • Most common theme: {most_common_theme} ({theme_stats.iloc[0]} speeches)")
print(f"   • Theme diversity: {len(theme_stats)} distinct themes identified")

# Temporal Analysis
print(f"\n📈 Temporal Trends:")
theme_evolution = df_final[df_final['theme'] != 'Noise'].groupby(['year', 'theme']).size().unstack(fill_value=0)
theme_trends = {}

for theme in theme_evolution.columns:
    early_period = theme_evolution[theme][2010:2013].sum()
    late_period = theme_evolution[theme][2017:2020].sum()
    if early_period > 0:
        trend = ((late_period - early_period) / early_period) * 100
        theme_trends[theme] = trend

# Sort by trend
growing_themes = sorted([(k, v) for k, v in theme_trends.items() if v > 20], key=lambda x: x[1], reverse=True)
declining_themes = sorted([(k, v) for k, v in theme_trends.items() if v < -20], key=lambda x: x[1])

if growing_themes:
    print(f"   • Growing themes: {growing_themes[0][0]} (+{growing_themes[0][1]:.0f}%)")
if declining_themes:
    print(f"   • Declining themes: {declining_themes[0][0]} ({declining_themes[0][1]:.0f}%)")

# Country Analysis
print(f"\n🌍 Country Patterns:")
# Find countries with most diverse themes
country_diversity = df_final[df_final['theme'] != 'Noise'].groupby('country')['theme'].nunique().sort_values(ascending=False)
most_diverse_country = country_diversity.index[0] if len(country_diversity) > 0 else "None"
print(f"   • Most thematically diverse: {most_diverse_country} ({country_diversity.iloc[0]} themes)")

# Find countries most focused on single themes
country_focus = {}
for country in df_final['country'].value_counts().head(10).index:
    country_data = df_final[(df_final['country'] == country) & (df_final['theme'] != 'Noise')]
    if len(country_data) > 0:
        dominant_theme = country_data['theme'].value_counts().iloc[0]
        total_speeches = len(country_data)
        focus_ratio = dominant_theme / total_speeches
        if focus_ratio > 0.5:  # More than 50% focused on one theme
            country_focus[country] = (country_data['theme'].value_counts().index[0], focus_ratio)

if country_focus:
    most_focused = max(country_focus.items(), key=lambda x: x[1][1])
    print(f"   • Most focused country: {most_focused[0]} ({most_focused[1][0]}, {most_focused[1][1]:.1%})")

# Generate detailed insights for each theme
print(f"\n" + "="*50)
print("DETAILED THEME ANALYSIS")
print("="*50)

def generate_theme_insights(theme):
    """Generate detailed insights for a specific theme"""
    theme_data = df_final[df_final['theme'] == theme]
    
    if len(theme_data) == 0:
        return
    
    print(f"\n🔍 {theme.upper()}")
    print(f"   Size: {len(theme_data)} speeches ({len(theme_data)/len(df_final)*100:.1f}% of total)")
    
    # Top countries
    top_countries = theme_data['country'].value_counts().head(3)
    print(f"   Leading countries: {', '.join([f'{c} ({n})' for c, n in top_countries.items()])}")
    
    # Time trend
    yearly_counts = theme_data['year'].value_counts().sort_index()
    peak_year = yearly_counts.idxmax()
    print(f"   Peak year: {peak_year} ({yearly_counts[peak_year]} speeches)")
    
    # Geographic spread
    unique_countries = theme_data['country'].nunique()
    print(f"   Geographic spread: {unique_countries} countries")
    
    # Get cluster info
    theme_clusters = theme_data['cluster'].unique()
    if len(theme_clusters) == 1 and theme_clusters[0] != -1:
        cluster_id = theme_clusters[0]
        if cluster_id in cluster_keywords:
            top_keywords = cluster_keywords[cluster_id]['keybert'][:5]
            print(f"   Key concepts: {', '.join([kw[0] for kw in top_keywords])}")
    
    # Representative speech
    if len(theme_data) > 0:
        # Find most representative speech (closest to cluster center if available)
        sample_speech = theme_data.sample(1).iloc[0]
        print(f"   Example ({sample_speech['country']}, {sample_speech['year']}):")
        print(f"   '{sample_speech['cleaned_text'][:150]}...'")

# Generate insights for each theme
for theme in sorted(theme_stats.index):
    generate_theme_insights(theme)

# Cross-cutting Analysis
print(f"\n" + "="*50)
print("CROSS-CUTTING INSIGHTS")
print("="*50)

print(f"\n🔄 Theme Relationships:")
# Analyze which countries appear in multiple themes
country_theme_diversity = df_final[df_final['theme'] != 'Noise'].groupby('country')['theme'].apply(list).to_dict()

multi_theme_countries = []
for country, themes in country_theme_diversity.items():
    if len(set(themes)) > 2:  # Countries with 3+ different themes
        theme_counts = pd.Series(themes).value_counts()
        multi_theme_countries.append((country, len(set(themes)), theme_counts.index[0]))

multi_theme_countries.sort(key=lambda x: x[1], reverse=True)

if multi_theme_countries:
    print(f"Countries with diverse thematic focus:")
    for country, theme_count, dominant_theme in multi_theme_countries[:5]:
        print(f"   • {country}: {theme_count} themes (primarily {dominant_theme})")

print(f"\n🌐 Regional Patterns:")
# Simple regional analysis based on common country groupings
regions = {
    'North America': ['United States of America', 'Canada', 'Mexico'],
    'Europe': ['United Kingdom', 'Germany', 'France', 'Italy', 'Spain', 'Netherlands', 'Sweden', 'Norway'],
    'Asia': ['China', 'Japan', 'India', 'South Korea', 'Indonesia', 'Thailand', 'Singapore'],
    'Middle East': ['Saudi Arabia', 'Iran', 'Turkey', 'Israel', 'United Arab Emirates', 'Egypt'],
    'Africa': ['South Africa', 'Nigeria', 'Kenya', 'Ethiopia', 'Morocco'],
    'Latin America': ['Brazil', 'Argentina', 'Chile', 'Colombia', 'Peru']
}

for region, countries in regions.items():
    region_data = df_final[df_final['country'].isin(countries) & (df_final['theme'] != 'Noise')]
    if len(region_data) > 0:
        dominant_theme = region_data['theme'].value_counts().index[0]
        theme_percentage = region_data['theme'].value_counts().iloc[0] / len(region_data) * 100
        print(f"   • {region}: {dominant_theme} ({theme_percentage:.0f}% of speeches)")

# Summary Statistics for Export
print(f"\n" + "="*50)
print("SUMMARY STATISTICS")
print("="*50)

summary_stats = {
    'total_speeches': total_speeches,
    'total_countries': total_countries,
    'total_clusters': total_clusters,
    'noise_percentage': noise_percentage,
    'themes': dict(theme_stats),
    'most_common_theme': most_common_theme,
    'time_period': f"{df_final['year'].min()}-{df_final['year'].max()}",
    'cluster_quality': silhouette_avg if 'silhouette_avg' in locals() else 'N/A'
}

print("Summary statistics dictionary created for export:")
for key, value in summary_stats.items():
    if key != 'themes':  # Skip the detailed themes dict for readability
        print(f"   {key}: {value}")

# Save results for further analysis
print(f"\n📁 Saving Results:")
df_final.to_csv('un_speeches_clustered.csv', index=False)
print("   • Clustered dataset saved as 'un_speeches_clustered.csv'")

# Save cluster profiles
import json
with open('cluster_profiles.json', 'w') as f:
    # Convert to JSON-serializable format
    json_profiles = {}
    for cluster_id, profile in cluster_profiles.items():
        json_profiles[str(cluster_id)] = {
            'theme': profile['theme'],
            'size': profile['size'],
            'keywords_keybert': profile['keywords_keybert'],
            'keywords_tfidf': profile['keywords_tfidf'],
            'top_countries': profile['top_countries'],
            'year_range': profile['year_range']
        }
    json.dump(json_profiles, f, indent=2, default=lambda o: o.item() if hasattr(o, 'item') else str(o))

print("   • Cluster profiles saved as 'cluster_profiles.json'")

print(f"\n✅ Analysis Complete!")
print(f"Found {total_clusters} distinct themes in UN General Assembly speeches from 2010-2020")
print(f"The analysis reveals key global diplomatic priorities and their evolution over time.")

GENERATING INSIGHTS AND EXECUTIVE SUMMARY
EXECUTIVE SUMMARY
📊 Dataset Overview:
   • Total speeches analyzed: 1,158
   • Countries represented: 196
   • Time period: 2010-2015
   • Clusters identified: 2
   • Noise/outliers: 0.3%

🎯 Key Findings:
   • Most common theme: General (1155 speeches)
   • Theme diversity: 1 distinct themes identified

📈 Temporal Trends:

🌍 Country Patterns:
   • Most thematically diverse: AFG (1 themes)
   • Most focused country: SUR (General, 100.0%)

DETAILED THEME ANALYSIS

🔍 GENERAL
   Size: 1155 speeches (99.7% of total)
   Leading countries: SUR (6), CZE (6), PSE (6)
   Peak year: 2012 (194 speeches)
   Geographic spread: 196 countries
   Example (ECU, 2013):
   'it is an honour, , for me to address the . things have changed since i was first here, and not only because time has passed and we have all become a l...'

CROSS-CUTTING INSIGHTS

🔄 Theme Relationships:

🌐 Regional Patterns:

SUMMARY STATISTICS
Summary statistics dictionary created for export:


In [None]:
# Part 9: Final Interactive Explorer

print("="*50)
print("CREATING FINAL INTERACTIVE EXPLORER")
print("="*50)

# This creates the ultimate interactive visualization that combines everything
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

# Create the master interactive explorer
def create_master_explorer():
    """Create a comprehensive interactive explorer"""
    
    # Main scatter plot with multiple view options
    fig = go.Figure()
    
    # Add buttons for different view modes
    buttons = []
    
    # 1. View by Theme (default)
    for theme in sorted(df_final['theme'].unique()):
        if theme == "Noise":
            continue
            
        theme_data = df_final[df_final['theme'] == theme]
        
        fig.add_trace(go.Scatter(
            x=theme_data['umap_x'],
            y=theme_data['umap_y'],
            mode='markers',
            marker=dict(
                size=8,
                color=theme_color_map.get(theme, 'gray'),
                opacity=0.8,
                line=dict(width=1, color='white'),
                symbol='circle'
            ),
            text=[
                f"<b>{country} ({year})</b><br>" +
                f"Theme: {theme}<br>" +
                f"Cluster: {cluster}<br>" +
                f"Text preview: {text[:100]}..."
                for country, year, cluster, text in zip(
                    theme_data['country'], 
                    theme_data['year'], 
                    theme_data['cluster'],
                    theme_data['cleaned_text']
                )
            ],
            hovertemplate='%{text}<br>Coordinates: (%{x:.2f}, %{y:.2f})<extra></extra>',
            name=theme,
            visible=True
        ))
    
    # Add noise points
    noise_data = df_final[df_final['theme'] == "Noise"]
    if len(noise_data) > 0:
        fig.add_trace(go.Scatter(
            x=noise_data['umap_x'],
            y=noise_data['umap_y'],
            mode='markers',
            marker=dict(
                size=4,
                color='lightgray',
                opacity=0.3,
                symbol='x'
            ),
            text=[f"<b>{country} ({year})</b><br>Status: Outlier" 
                  for country, year in zip(noise_data['country'], noise_data['year'])],
            hovertemplate='%{text}<br>Coordinates: (%{x:.2f}, %{y:.2f})<extra></extra>',
            name="Outliers",
            visible=True
        ))
    
    # Create dropdown for different view modes
    dropdown_buttons = [
        dict(
            label="View by Theme",
            method="update",
            args=[{"visible": [True] * len(fig.data)}]
        )
    ]
    
    # Add view by year option
    year_traces = []
    for year in sorted(df_final['year'].unique()):
        year_data = df_final[df_final['year'] == year]
        
        year_trace = go.Scatter(
            x=year_data['umap_x'],
            y=year_data['umap_y'],
            mode='markers',
            marker=dict(
                size=6,
                color=year,
                colorscale='Viridis',
                opacity=0.8,
                colorbar=dict(title="Year")
            ),
            text=[f"<b>{country} ({year})</b><br>Theme: {theme}" 
                  for country, theme in zip(year_data['country'], year_data['theme'])],
            hovertemplate='%{text}<br>Coordinates: (%{x:.2f}, %{y:.2f})<extra></extra>',
            name=str(year),
            visible=False
        )
        fig.add_trace(year_trace)
    
    # Add year view to dropdown
    year_visibility = [False] * len([t for t in fig.data if 'Noise' not in t.name and t.name in df_final['theme'].unique()]) + [False] + [True] * len(year_traces)
    dropdown_buttons.append(
        dict(
            label="View by Year",
            method="update",
            args=[{"visible": year_visibility}]
        )
    )
    
    # Update layout with controls
    fig.update_layout(
        title=dict(
            text="UN General Assembly Speech Analysis Explorer<br><sub>Interactive clustering of diplomatic themes (2010-2020)</sub>",
            x=0.5,
            font=dict(size=20)
        ),
        xaxis=dict(
            title='UMAP Dimension 1',
            gridcolor='lightgray',
            gridwidth=0.5
        ),
        yaxis=dict(
            title='UMAP Dimension 2',
            gridcolor='lightgray',
            gridwidth=0.5
        ),
        width=1200,
        height=800,
        hovermode='closest',
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.01,
            bgcolor="rgba(255,255,255,0.8)"
        ),
        updatemenus=[
            dict(
                buttons=dropdown_buttons,
                direction="down",
                showactive=True,
                x=0.02,
                y=0.98,
                bgcolor="rgba(255,255,255,0.8)",
                bordercolor="gray",
                borderwidth=1
            )
        ],
        annotations=[
            dict(
                text="Hover over points for details<br>Use legend to filter themes<br>Use dropdown to change view mode",
                showarrow=False,
                x=0.02,
                y=0.02,
                xref="paper",
                yref="paper",
                bgcolor="rgba(255,255,255,0.8)",
                bordercolor="gray",
                borderwidth=1,
                font=dict(size=10)
            )
        ]
    )
    
    return fig

# Create and display the master explorer
print("Creating master interactive explorer...")
master_fig = create_master_explorer()
master_fig.show()

# Create a summary table
print("\nCreating summary table...")

summary_table_data = []
for theme in sorted(df_final['theme'].unique()):
    if theme == "Noise":
        continue
    
    theme_data = df_final[df_final['theme'] == theme]
    
    # Get most common countries for this theme
    top_countries = theme_data['country'].value_counts().head(3)
    top_countries_str = ", ".join([f"{country} ({count})" for country, count in top_countries.items()])
    
    # Get year range
    year_range = f"{theme_data['year'].min()}-{theme_data['year'].max()}"
    
    # Get sample keywords (if available)
    sample_text = " ".join(theme_data['cleaned_text'].head(5))
    # Simple keyword extraction (you might want to use more sophisticated methods)
    words = sample_text.lower().split()
    common_words = pd.Series(words).value_counts().head(5)
    keywords = ", ".join(common_words.index[:5])
    
    summary_table_data.append({
        'Theme': theme,
        'Count': len(theme_data),
        'Percentage': f"{len(theme_data)/len(df_final)*100:.1f}%",
        'Year Range': year_range,
        'Top Countries': top_countries_str,
        'Sample Keywords': keywords
    })

# Create summary DataFrame
summary_df = pd.DataFrame(summary_table_data)
summary_df = summary_df.sort_values('Count', ascending=False)

print("\n" + "="*100)
print("THEME SUMMARY TABLE")
print("="*100)
print(summary_df.to_string(index=False))

# Create additional visualizations
def create_theme_timeline():
    """Create a timeline showing theme evolution over years"""
    
    # Prepare data for timeline
    timeline_data = df_final[df_final['theme'] != 'Noise'].groupby(['year', 'theme']).size().reset_index(name='count')
    
    fig = px.line(timeline_data, x='year', y='count', color='theme',
                  title='Theme Evolution Over Time',
                  labels={'count': 'Number of Speeches', 'year': 'Year'},
                  markers=True)
    
    fig.update_layout(
        width=1000,
        height=600,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.01
        )
    )
    
    return fig

def create_country_participation():
    """Create visualization of country participation by theme"""
    
    # Get top participating countries
    country_theme_counts = df_final[df_final['theme'] != 'Noise'].groupby(['country', 'theme']).size().reset_index(name='count')
    top_countries = df_final['country'].value_counts().head(15).index
    
    country_theme_filtered = country_theme_counts[country_theme_counts['country'].isin(top_countries)]
    
    fig = px.bar(country_theme_filtered, x='country', y='count', color='theme',
                 title='Top 15 Countries by Theme Participation',
                 labels={'count': 'Number of Speeches', 'country': 'Country'})
    
    fig.update_layout(
        width=1200,
        height=600,
        xaxis_tickangle=-45,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.01
        )
    )
    
    return fig

def create_theme_heatmap():
    """Create a heatmap showing theme distribution by year"""
    
    # Prepare heatmap data
    heatmap_data = df_final[df_final['theme'] != 'Noise'].pivot_table(
        index='theme', columns='year', values='country', aggfunc='count', fill_value=0
    )
    
    fig = px.imshow(heatmap_data,
                    title='Theme Distribution Heatmap by Year',
                    labels={'x': 'Year', 'y': 'Theme', 'color': 'Number of Speeches'},
                    aspect='auto')
    
    fig.update_layout(
        width=1000,
        height=600
    )
    
    return fig

# Create and display additional visualizations
print("\nCreating theme timeline...")
timeline_fig = create_theme_timeline()
timeline_fig.show()

print("\nCreating country participation chart...")
country_fig = create_country_participation()
country_fig.show()

print("\nCreating theme heatmap...")
heatmap_fig = create_theme_heatmap()
heatmap_fig.show()

# Print final statistics
print("\n" + "="*80)
print("FINAL ANALYSIS STATISTICS")
print("="*80)

total_speeches = len(df_final)
total_themes = len(df_final['theme'].unique()) - (1 if 'Noise' in df_final['theme'].unique() else 0)
total_countries = df_final['country'].nunique()
year_span = df_final['year'].max() - df_final['year'].min() + 1
noise_percentage = (len(df_final[df_final['theme'] == 'Noise']) / total_speeches * 100) if 'Noise' in df_final['theme'].unique() else 0

print(f"📊 Total Speeches Analyzed: {total_speeches:,}")
print(f"🏛️ Unique Countries: {total_countries}")
print(f"📅 Years Covered: {year_span} ({df_final['year'].min()}-{df_final['year'].max()})")
print(f"🎯 Themes Identified: {total_themes}")
print(f"📈 Successfully Clustered: {100-noise_percentage:.1f}%")
print(f"🔍 Outliers (Noise): {noise_percentage:.1f}%")

print(f"\n🏆 Most Active Countries:")
top_countries = df_final['country'].value_counts().head(5)
for i, (country, count) in enumerate(top_countries.items(), 1):
    print(f"   {i}. {country}: {count} speeches")

print(f"\n📋 Largest Themes:")
theme_counts = df_final[df_final['theme'] != 'Noise']['theme'].value_counts().head(5)
for i, (theme, count) in enumerate(theme_counts.items(), 1):
    percentage = count / total_speeches * 100
    print(f"   {i}. {theme}: {count} speeches ({percentage:.1f}%)")

print("\n" + "="*80)
print("ANALYSIS COMPLETE! 🎉")
print("="*80)
print("All visualizations have been generated and are ready for exploration.")
print("Use the interactive features to dive deeper into the diplomatic themes!")

CREATING FINAL INTERACTIVE EXPLORER
Creating master interactive explorer...



Creating summary table...

THEME SUMMARY TABLE
  Theme  Count Percentage Year Range             Top Countries      Sample Keywords
General   1155      99.7%  2010-2015 SUR (6), CZE (6), PSE (6) the, of, and, to, in

Creating theme timeline...



Creating country participation chart...



Creating theme heatmap...



FINAL ANALYSIS STATISTICS
📊 Total Speeches Analyzed: 1,158
🏛️ Unique Countries: 196
📅 Years Covered: 6 (2010-2015)
🎯 Themes Identified: 1
📈 Successfully Clustered: 99.7%
🔍 Outliers (Noise): 0.3%

🏆 Most Active Countries:
   1. SUR: 6 speeches
   2. CZE: 6 speeches
   3. PSE: 6 speeches
   4. IRL: 6 speeches
   5. PRK: 6 speeches

📋 Largest Themes:
   1. General: 1155 speeches (99.7%)

ANALYSIS COMPLETE! 🎉
All visualizations have been generated and are ready for exploration.
Use the interactive features to dive deeper into the diplomatic themes!
