In [None]:
!pip install bertopic sentence-transformers scikit-learn matplotlib pandas openpyxl

In [None]:

import sys

def check_and_install_packages():
    """Report whether every third-party dependency of this cell can be imported.

    Despite the name, nothing is installed: when an import fails, the pip and
    conda commands that would install the missing distributions are printed.

    Returns:
        True when every required module imports, False otherwise.
    """
    # (importable module name, distribution name used on the install command line)
    dependencies = (
        ('bertopic', 'bertopic'),
        ('sentence_transformers', 'sentence-transformers'),
        ('sklearn', 'scikit-learn'),
        ('matplotlib', 'matplotlib'),
        ('pandas', 'pandas'),
        ('openpyxl', 'openpyxl'),
    )

    def _importable(module_name):
        try:
            __import__(module_name)
        except ImportError:
            return False
        return True

    not_found = [dist for module, dist in dependencies if not _importable(module)]
    if not not_found:
        return True

    install_args = ' '.join(not_found)
    print("Missing required packages. Please install them using:")
    print(f"pip install {install_args}")
    print("\nOr with conda:")
    print(f"conda install -c conda-forge {install_args}")
    return False

# Stop the cell with an explanatory message when a dependency is missing,
# instead of letting one of the imports below raise.
if not check_and_install_packages():
    sys.exit("Please install the required packages and restart the kernel.")

import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

class NPMIOptimizer:
    def __init__(self, data_path, text_columns=['title'], selftext_column='selftext', min_cluster_size=30):
        """
        Initialize the NPMI optimizer for BERTopic with larger minimum cluster size
        
        Parameters:
        data_path: Path to the Excel file
        text_columns: List of columns to use for clustering (default: ['title'])
        selftext_column: Optional selftext column name
        min_cluster_size: Minimum size for clusters (default: 30)
        """
        self.data_path = data_path
        self.text_columns = text_columns
        self.selftext_column = selftext_column
        self.min_cluster_size = min_cluster_size
        self.df = None
        self.documents = None
        self.embeddings = None
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        
    def load_and_preprocess_data(self):
        """Read the spreadsheet, build one text per row, and embed every text.

        Populates ``self.df`` (the loaded sheet), ``self.documents`` (one string
        per row) and ``self.embeddings`` (output of the embedding model).
        """
        print("Loading data...")
        self.df = pd.read_excel(self.data_path)
        print(f"Loaded {len(self.df)} documents")

        sheet_columns = self.df.columns
        # Configured columns that are absent from the sheet are ignored.
        usable_columns = [name for name in self.text_columns if name in sheet_columns]
        has_selftext = self.selftext_column in sheet_columns

        documents = []
        for _, record in self.df.iterrows():
            pieces = [str(record[name]) for name in usable_columns if pd.notna(record[name])]

            if has_selftext:
                body = record[self.selftext_column]
                # Whitespace-only selftext is treated like a missing value.
                if pd.notna(body) and str(body).strip() != '':
                    pieces.append(str(body))

            joined = ' '.join(pieces)
            # Rows without any usable text get the literal placeholder 'empty'.
            documents.append(joined if joined.strip() else 'empty')

        self.documents = documents
        print("Text preprocessing completed")

        print("Generating embeddings...")
        self.embeddings = self.embedding_model.encode(self.documents, show_progress_bar=True)
        print("Embeddings generated")
        
    def reassign_outliers(self, df_temp, embeddings, topic_col='topic'):
        """Reassign outlier documents (-1 topic) to the nearest topic centroid.

        Parameters:
        df_temp: DataFrame whose ``topic_col`` column holds one topic id per
                 document; row order must match the row order of ``embeddings``
        embeddings: Array-like with one embedding per document
        topic_col: Name of the topic column (default: 'topic')

        Returns:
        A copy of ``df_temp`` in which every -1 entry of ``topic_col`` is
        replaced by the topic whose centroid (mean embedding) has the highest
        cosine similarity to that document. The input is returned as an
        unchanged copy when there are no outliers or no real topics.
        """
        # Everything below works on row positions / boolean masks, so the
        # result is correct for any DataFrame index, not only a RangeIndex.
        topics = df_temp[topic_col].to_numpy()
        outlier_mask = topics == -1

        if not outlier_mask.any():
            return df_temp.copy()

        topic_ids = sorted(set(topics[~outlier_mask].tolist()))

        if not topic_ids:
            return df_temp.copy()

        # Convert once instead of once per topic.
        embeddings = np.asarray(embeddings)

        topic_centroids = np.vstack(
            [embeddings[topics == topic].mean(axis=0) for topic in topic_ids]
        )

        sims = cosine_similarity(embeddings[outlier_mask], topic_centroids)
        reassigned_topics = [topic_ids[i] for i in sims.argmax(axis=1)]

        df_result = df_temp.copy()
        df_result.loc[outlier_mask, topic_col] = reassigned_topics

        return df_result
    
    def calculate_npmi(self, documents, topics, top_words_per_topic=10):
        """
        Calculate NPMI (Normalized Pointwise Mutual Information) for topic coherence

        For every topic the most frequent vocabulary terms of its documents are
        selected, each pair of those terms is scored with
        ``log(p12 / (p1 * p2)) / -log(p12)``, the pair scores are averaged per
        topic, and the per-topic means are averaged again.

        All three probabilities are document frequencies: the number of the
        topic's documents containing the term (or both terms), divided by the
        total number of documents. Pairs that never co-occur inside the topic
        are skipped, and topics with fewer than two documents or fewer than two
        usable terms contribute nothing.

        Parameters:
        documents: List of documents
        topics: List of topic assignments (same order as ``documents``); -1 is ignored
        top_words_per_topic: Number of top words to consider per topic

        Returns:
        Mean NPMI over all scorable topics, or 0.0 when no topic can be scored.
        """
        vectorizer = CountVectorizer(max_features=5000, stop_words='english',
                                     min_df=2, max_df=0.95, ngram_range=(1, 2))
        doc_word_matrix = vectorizer.fit_transform(documents)

        # 0/1 presence matrix. Probabilities must count documents, not term
        # occurrences; mixing summed term counts with document co-occurrence
        # counts would make p(w) and p(w1, w2) incomparable.
        presence = doc_word_matrix.copy()
        presence.data = np.ones_like(presence.data)

        topic_docs = defaultdict(list)
        for doc_idx, topic in enumerate(topics):
            if topic != -1:
                topic_docs[topic].append(doc_idx)

        if not topic_docs:
            return 0.0

        total_docs = len(documents)
        topic_npmis = []

        for doc_indices in topic_docs.values():
            if len(doc_indices) < 2:
                continue

            # Top words are still ranked by total term count inside the topic.
            word_freqs = np.asarray(doc_word_matrix[doc_indices].sum(axis=0)).ravel()
            top_idx = word_freqs.argsort()[-top_words_per_topic:][::-1]
            top_idx = top_idx[word_freqs[top_idx] > 0]

            if len(top_idx) < 2:
                continue

            # One sparse product gives every pairwise co-occurrence count:
            # entry (i, j) is the number of topic documents containing both
            # words, and the diagonal is each word's document frequency.
            topic_presence = presence[doc_indices][:, top_idx]
            cooc = (topic_presence.T @ topic_presence).toarray()
            doc_freq = cooc.diagonal()

            npmi_scores = []
            for i in range(len(top_idx)):
                for j in range(i + 1, len(top_idx)):
                    pair_count = cooc[i, j]
                    if pair_count == 0:
                        continue

                    p_w1 = doc_freq[i] / total_docs
                    p_w2 = doc_freq[j] / total_docs
                    p_w1_w2 = pair_count / total_docs

                    normalizer = -np.log(p_w1_w2)
                    if normalizer == 0:
                        # Both words occur in every document: the limit of
                        # NPMI is 1, and dividing would give 0/0.
                        npmi_scores.append(1.0)
                        continue

                    pmi = np.log(p_w1_w2 / (p_w1 * p_w2))
                    npmi_scores.append(pmi / normalizer)

            if npmi_scores:
                topic_npmis.append(np.mean(npmi_scores))

        return float(np.mean(topic_npmis)) if topic_npmis else 0.0
    
    def find_optimal_clusters(self, min_topics=2, max_topics=20, step=1):
        """
        Find optimal number of clusters using NPMI with minimum cluster size constraint
        
        Parameters:
        min_topics: Minimum number of topics to test
        max_topics: Maximum number of topics to test
        step: Step size for topic range
        """
        if self.documents is None:
            self.load_and_preprocess_data()
        
        results = []
        topic_range = range(min_topics, max_topics + 1, step)
        
        print(f"Testing {len(topic_range)} different cluster configurations...")
        print(f"Range: {min_topics} to {max_topics} topics")
        print(f"Minimum cluster size: {self.min_cluster_size}")
        
        for n_topics in topic_range:
            print(f"\nTesting {n_topics} topics...")
            
            try:
                from hdbscan import HDBSCAN
                
                hdbscan_model = HDBSCAN(
                    min_cluster_size=self.min_cluster_size,
                    metric='euclidean',
                    cluster_selection_method='eom'
                )
                
                topic_model = BERTopic(
                    nr_topics=n_topics,
                    hdbscan_model=hdbscan_model,
                    embedding_model=self.embedding_model,
                    calculate_probabilities=False,
                    verbose=False
                )
                
                topics, probabilities = topic_model.fit_transform(self.documents, self.embeddings)

                df_temp = pd.DataFrame({'topic': topics})
                
                df_reassigned = self.reassign_outliers(df_temp, self.embeddings)
                final_topics = df_reassigned['topic'].tolist()
                
                npmi_score = self.calculate_npmi(self.documents, final_topics)
                
                unique_topics = len(set(final_topics) - {-1})
                outlier_count = sum(1 for t in final_topics if t == -1)
                
                topic_counts = Counter([t for t in final_topics if t != -1])
                min_topic_size = min(topic_counts.values()) if topic_counts else 0
                max_topic_size = max(topic_counts.values()) if topic_counts else 0
                
                results.append({
                    'n_topics_requested': n_topics,
                    'n_topics_actual': unique_topics,
                    'npmi_score': npmi_score,
                    'outliers_before': sum(1 for t in topics if t == -1),
                    'outliers_after': outlier_count,
                    'min_topic_size': min_topic_size,
                    'max_topic_size': max_topic_size,
                    'topic_model': topic_model
                })
                
                print(f"  NPMI: {npmi_score:.4f}, Actual topics: {unique_topics}")
                print(f"  Topic sizes - Min: {min_topic_size}, Max: {max_topic_size}")
                print(f"  Outliers before/after: {results[-1]['outliers_before']}/{outlier_count}")
                
            except Exception as e:
                print(f"  Error with {n_topics} topics: {str(e)}")
                continue

        if results:
            valid_results = [r for r in results if r['n_topics_actual'] > 0]
            
            if valid_results:
                best_result = max(valid_results, key=lambda x: x['npmi_score'])
                
                print(f"\n" + "="*60)
                print("NPMI OPTIMIZATION RESULTS (Min Cluster Size: 30)")
                print("="*60)
                print(f"Requested topics: {best_result['n_topics_requested']}")
                print(f"ACTUAL TOPICS GENERATED: {best_result['n_topics_actual']} (This is what you got!)")
                print(f"Best NPMI score: {best_result['npmi_score']:.4f}")
                print(f"Topic size range: {best_result['min_topic_size']} - {best_result['max_topic_size']}")
                print(f"Outliers reassigned: {best_result['outliers_before']}")
                
                same_actual_topics = [r for r in valid_results if r['n_topics_actual'] == best_result['n_topics_actual']]
                if len(same_actual_topics) > 1:
                    print(f"\nNote: {len(same_actual_topics)} different requests produced {best_result['n_topics_actual']} topics:")
                    for r in same_actual_topics[:5]:  # Show first 5
                        print(f"  Request {r['n_topics_requested']} → {r['n_topics_actual']} topics (NPMI: {r['npmi_score']:.4f})")
                    if len(same_actual_topics) > 5:
                        print(f"  ... and {len(same_actual_topics) - 5} more")
                
                return results, best_result
            else:
                print("No valid results found!")
                return [], None
        else:
            print("No valid results found!")
            return [], None
    
    def plot_results(self, results):
        """Plot the outcome of find_optimal_clusters() in three panels.

        Panels: NPMI score against the number of topics actually produced
        (coloured by the requested number, best score starred); requested
        against actual topic count; smallest and largest topic size per
        request, with the configured minimum cluster size as a reference line.

        Parameters:
        results: List of result dicts as returned by find_optimal_clusters()
        """
        if not results:
            print("No results to plot!")
            return

        n_topics = [r['n_topics_requested'] for r in results]
        actual_topics = [r['n_topics_actual'] for r in results]
        npmi_scores = [r['npmi_score'] for r in results]

        plt.figure(figsize=(15, 5))

        plt.subplot(1, 3, 1)
        scatter = plt.scatter(actual_topics, npmi_scores, c=n_topics, cmap='viridis', s=60, alpha=0.7)
        plt.xlabel('Actual Topics Generated')
        plt.ylabel('NPMI Score')
        plt.title('NPMI Score vs Actual Topics Generated\n(Color = Requested Topics)')
        plt.grid(True, alpha=0.3)
        plt.colorbar(scatter, label='Requested Topics')

        best_idx = np.argmax(npmi_scores)
        plt.scatter(actual_topics[best_idx], npmi_scores[best_idx],
                    color='red', s=100, marker='*', label=f'Best: {actual_topics[best_idx]} actual topics')
        plt.legend()

        plt.subplot(1, 3, 2)
        plt.plot(n_topics, actual_topics, 'go-', linewidth=2, markersize=8, label='Actual')
        plt.plot(n_topics, n_topics, 'r--', alpha=0.5, label='Requested')
        plt.xlabel('Requested Topics')
        plt.ylabel('Actual Topics')
        plt.title('Requested vs Actual Topics')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.subplot(1, 3, 3)
        min_sizes = [r['min_topic_size'] for r in results]
        max_sizes = [r['max_topic_size'] for r in results]
        plt.plot(n_topics, min_sizes, 'b-', label='Min Topic Size', marker='o')
        plt.plot(n_topics, max_sizes, 'r-', label='Max Topic Size', marker='s')
        # Reference line follows the configured value rather than a fixed 30.
        plt.axhline(y=self.min_cluster_size, color='green', linestyle='--', alpha=0.7,
                    label=f'Min Cluster Size ({self.min_cluster_size})')
        plt.xlabel('Number of Topics')
        plt.ylabel('Topic Size')
        plt.title('Topic Size Ranges')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
    
    def get_final_model(self, best_result):
        """Get the final model with reassigned outliers"""
        if best_result is None:
            return None, None
        
        topic_model = best_result['topic_model']
        
        topics, probabilities = topic_model.fit_transform(self.documents, self.embeddings)
        df_temp = pd.DataFrame({'topic': topics})
        df_final = self.reassign_outliers(df_temp, self.embeddings)
        
        self.df['topic'] = df_final['topic']
        
        return topic_model, self.df

if __name__ == "__main__":
    # Script entry point: sweep topic counts, plot the sweep, then summarise
    # the best model. `optimizer`, `final_model` and `final_df` are looked up
    # by name in the labeling cell further down.
    optimizer = NPMIOptimizer(
        data_path="",
        text_columns=['title'],
        selftext_column='selftext',
        min_cluster_size=30,
    )
    optimizer.load_and_preprocess_data()

    results, best_result = optimizer.find_optimal_clusters(min_topics=1, max_topics=40, step=1)
    optimizer.plot_results(results)

    if best_result:
        final_model, final_df = optimizer.get_final_model(best_result)

        print("\nFinal topic distribution:")
        print(final_df['topic'].value_counts().sort_index())

        print("\nTop keywords per topic:")
        for topic_id in sorted(final_df['topic'].unique()):
            if topic_id == -1:
                # Outlier bucket has no keyword list to show.
                continue
            keywords = final_model.get_topic(topic_id)
            top_words = [term for term, _ in keywords[:5]]
            print(f"Topic {topic_id}: {', '.join(top_words)}")
                

In [None]:
import sys

def check_and_install_openai():
    """Report whether the OpenAI client library can be imported.

    Nothing is installed; when either import fails, the pip and conda install
    commands are printed.

    Returns:
        True when both ``openai`` and ``openai.OpenAI`` import, else False.
    """
    try:
        import openai
        from openai import OpenAI
    except ImportError:
        for line in (
            "Missing OpenAI package. Please install it using:",
            "pip install openai",
            "\nOr with conda:",
            "conda install -c conda-forge openai",
        ):
            print(line)
        return False
    else:
        return True

# Stop the cell with an explanatory message when the OpenAI package is
# missing, instead of letting the imports below raise.
if not check_and_install_openai():
    sys.exit("Please install the OpenAI package and restart the kernel.")

import openai
import time
import pandas as pd
import numpy as np
from openai import OpenAI
from scipy import stats
import random

class TopicLabeler:
    def __init__(self, api_key, final_df, documents, confidence_level=0.95):
        """
        Set up a labeler that asks an OpenAI chat model to name topic clusters.

        Parameters:
        api_key: Your OpenAI API key
        final_df: DataFrame with topic assignments (it is copied, not modified)
        documents: List of original documents, stored in a 'document' column
                   of the copy
        confidence_level: Confidence level for sampling (default: 0.95)
        """
        self.client = OpenAI(api_key=api_key)

        # Work on a private copy so the caller's frame does not gain a column.
        annotated = final_df.copy()
        annotated['document'] = documents
        self.final_df = annotated

        self.documents = documents
        self.confidence_level = confidence_level
        # topic id -> dict describing the generated label
        self.topic_labels = {}
        
    def calculate_sample_size(self, population_size, confidence_level=0.95, margin_error=0.05):
        """
        Calculate required sample size for given confidence level and margin of error

        Uses ``n = z^2 * p * (1 - p) / margin_error^2`` with the most
        conservative proportion p = 0.5. A population that is not larger than
        n is sampled completely; otherwise the finite-population correction
        ``n / (1 + (n - 1) / N)`` is applied and rounded up, with a floor of
        10 documents.

        Parameters:
        population_size: Size of the topic cluster
        confidence_level: Confidence level strictly between 0 and 1
                          (default: 0.95 for 95% CI)
        margin_error: Margin of error (default: 0.05 for 5%)

        Returns:
        Number of documents to sample (never more than ``population_size``).

        Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
        """
        if not 0 < confidence_level < 1:
            raise ValueError(
                f"confidence_level must be strictly between 0 and 1, got {confidence_level!r}"
            )

        # Conventional rounded z-scores for the common levels; any other level
        # gets its two-sided normal quantile instead of silently reusing 1.96.
        z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
        z = z_scores.get(confidence_level)
        if z is None:
            z = float(stats.norm.ppf((1 + confidence_level) / 2))

        p = 0.5

        n = (z**2 * p * (1-p)) / (margin_error**2)

        if population_size <= n:
            return population_size

        n_corrected = n / (1 + (n - 1) / population_size)
        return max(int(np.ceil(n_corrected)), min(10, population_size))
    
    def sample_topic_posts(self, topic_id, max_sample_size=50):
        """
        Sample posts from a topic using statistical sampling for 95% CI
        
        Parameters:
        topic_id: The topic to sample from
        max_sample_size: Maximum number of posts to sample (to keep API calls manageable)
        """
        topic_posts = self.final_df[self.final_df['topic'] == topic_id]
        population_size = len(topic_posts)
        
        if population_size == 0:
            return []
        
        sample_size = self.calculate_sample_size(population_size, self.confidence_level)
        
        sample_size = min(sample_size, max_sample_size, population_size)
        
        sampled_posts = topic_posts.sample(n=sample_size, random_state=42)
        
        print(f"Topic {topic_id}: Population={population_size}, Sample={sample_size} ({sample_size/population_size*100:.1f}%)")
        
        return sampled_posts['document'].tolist()
    
    def create_labeling_prompt(self, topic_id, sample_posts, top_keywords):
        """
        Create a comprehensive prompt for ChatGPT-4o topic labeling
        
        Parameters:
        topic_id: The topic ID
        sample_posts: List of sampled posts for this topic
        top_keywords: Top keywords from BERTopic for this topic
        """
        keywords_str = ", ".join(top_keywords)
        posts_str = "\n\n".join([f"Post {i+1}: {post[:300]}{'...' if len(post) > 300 else ''}" 
                                for i, post in enumerate(sample_posts)])
        
        prompt = f"""You are a helpful assistant for labeling text topics of posts on Reddit that were labelled as relevant to the topic of percieved politicization of tech companies.
TOPIC INFORMATION:
- Topic ID: {topic_id}
- Top keywords from algorithm: {keywords_str}
- Number of posts in cluster: {len(self.final_df[self.final_df['topic'] == topic_id])}
- Sample size: {len(sample_posts)} posts (95% confidence interval)

SAMPLE POSTS FROM THIS CLUSTER:
{posts_str}

INSTRUCTIONS:
1. Analyze the sample posts to understand the main theme
2. Create a clear, concise topic label (2-6 words)
3. The label should capture the essence of what makes this cluster distinct
5. Be specific enough to distinguish from other tech/political topics

RESPONSE FORMAT:
Topic Label: [Your 2-6 word label]
Brief Explanation: [1-2 sentences explaining why this label fits]

Example format:
Topic Label: Facebook Election Misinformation
Brief Explanation: This cluster focuses on posts discussing Facebook's role in spreading misinformation during elections and the political debates around content moderation policies."""

        return prompt
    
    def generate_topic_labels(self, final_model, delay_between_calls=1.1, max_retries=3):
        """
        Generate labels for all topics using ChatGPT-4o
        
        Parameters:
        final_model: The fitted BERTopic model
        delay_between_calls: Delay between API calls (default: 1.1 seconds)
        max_retries: Maximum number of retries for failed calls
        """
        unique_topics = sorted([t for t in self.final_df['topic'].unique() if t != -1])
        
        print(f"Generating labels for {len(unique_topics)} topics using ChatGPT-4o...")
        print(f"Using {self.confidence_level*100}% confidence interval sampling")
        print("=" * 60)
        
        for topic_id in unique_topics:
            print(f"\\nProcessing Topic {topic_id}...")
            
            try:
                topic_keywords = final_model.get_topic(topic_id)
                top_keywords = [word for word, score in topic_keywords[:8]]
            except:
                top_keywords = [f"keyword_{i}" for i in range(5)]
            
            sample_posts = self.sample_topic_posts(topic_id)
            
            if not sample_posts:
                print(f"No posts found for Topic {topic_id}")
                continue
            
            prompt = self.create_labeling_prompt(topic_id, sample_posts, top_keywords)
            
            success = False
            for attempt in range(max_retries):
                try:
                    response = self.client.chat.completions.create(
                        model="gpt-4o",
                        messages=[
                            {"role": "system", "content": "You are an expert at analyzing social media posts about technology and politics. You excel at identifying the key themes and creating concise, descriptive labels for topic clusters."},
                            {"role": "user", "content": prompt}
                        ],
                        temperature=0.3,
                        max_tokens=200
                    )
                    
                    full_response = response.choices[0].message.content.strip()
                    
                    if "Topic Label:" in full_response:
                        label = full_response.split("Topic Label:")[1].split("\\n")[0].strip()
                    else:
                        label = full_response.split("\\n")[0].strip()
                    
                    self.topic_labels[topic_id] = {
                        'label': label,
                        'full_response': full_response,
                        'sample_size': len(sample_posts),
                        'population_size': len(self.final_df[self.final_df['topic'] == topic_id]),
                        'keywords': top_keywords
                    }
                    
                    print(f"✓ Topic {topic_id}: {label}")
                    success = True
                    break
                    
                except Exception as e:
                    print(f"Attempt {attempt + 1} failed for Topic {topic_id}: {e}")
                    if attempt < max_retries - 1:
                        time.sleep(delay_between_calls * 2)
            
            if not success:
                print(f"Failed to label Topic {topic_id} after {max_retries} attempts")
                self.topic_labels[topic_id] = {
                    'label': f"Topic {topic_id} (Failed)",
                    'full_response': "Failed to generate",
                    'sample_size': len(sample_posts),
                    'population_size': len(self.final_df[self.final_df['topic'] == topic_id]),
                    'keywords': top_keywords
                }
            
            if success:
                time.sleep(delay_between_calls)
        
        print("\\n" + "=" * 60)
        print("TOPIC LABELING COMPLETED")
        print("=" * 60)
        
        return self.topic_labels
    
    def display_results(self):
        """Display the generated topic labels in a nice format"""
        if not self.topic_labels:
            print("No topic labels generated yet. Run generate_topic_labels() first.")
            return
        
        print("\\n GENERATED TOPIC LABELS:")
        print("=" * 80)
        
        for topic_id in sorted(self.topic_labels.keys()):
            info = self.topic_labels[topic_id]
            print(f"\\n TOPIC {topic_id}")
            print(f"   Label: {info['label']}")
            print(f"   Size: {info['population_size']} posts (sampled: {info['sample_size']})")
            print(f"   Keywords: {', '.join(info['keywords'][:5])}")
            if 'Brief Explanation:' in info['full_response']:
                explanation = info['full_response'].split('Brief Explanation:')[1].strip()
                print(f"   Explanation: {explanation}")
    
    def export_results(self, filename="topic_labels_results.csv"):
        """Export results to CSV"""
        if not self.topic_labels:
            print("No topic labels to export.")
            return
        
        export_data = []
        for topic_id, info in self.topic_labels.items():
            export_data.append({
                'topic_id': topic_id,
                'label': info['label'],
                'population_size': info['population_size'],
                'sample_size': info['sample_size'],
                'sampling_percentage': (info['sample_size'] / info['population_size']) * 100,
                'top_keywords': ', '.join(info['keywords'][:5]),
                'full_response': info['full_response']
            })
        
        df_export = pd.DataFrame(export_data)
        df_export.to_csv(filename, index=False)
        print(f"\\n Results exported to: {filename}")
        
        return df_export

def label_your_topics(final_df, documents, final_model, api_key):
    """
    Run the whole labeling workflow in one call.

    Builds a TopicLabeler at a 95% confidence level, generates a label for
    every topic, prints the labels and exports them to the default CSV file.

    Returns:
    (labeler, results_df): the labeler instance and whatever
    ``export_results()`` returned.
    """
    labeler = TopicLabeler(api_key=api_key, final_df=final_df,
                           documents=documents, confidence_level=0.95)

    # The labels are kept on the labeler itself; the return value is not needed here.
    labeler.generate_topic_labels(final_model)
    labeler.display_results()

    return labeler, labeler.export_results()

if __name__ == "__main__":
    import os

    # Read the key from the environment so that no secret has to be written
    # into (and saved with) the notebook.
    API_KEY = os.environ.get("OPENAI_API_KEY", "")

    if not API_KEY:
        # Without a key every request would fail and be retried for each topic.
        print("Please set the OPENAI_API_KEY environment variable before labeling topics.")
    elif 'final_df' in locals() and 'optimizer' in locals() and 'final_model' in locals():
        # These names are produced by the clustering cell.
        print("Starting topic labeling with ChatGPT-4o...")
        labeler, results_df = label_your_topics(
            final_df=final_df,
            documents=optimizer.documents,
            final_model=final_model,
            api_key=API_KEY
        )
        print("\n Topic labeling completed!")
    else:
        print("Please ensure you have 'final_df', 'optimizer', and 'final_model' variables loaded.")
        print("\nTo use manually:")
        print("labeler = TopicLabeler(api_key, final_df, documents)")
        print("labels = labeler.generate_topic_labels(final_model)")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def create_sentiment_plot(final_df, sentiment_column='llm_sentiment', figsize=(16, 10), topic_names=None):
    """
    Create sentiment analysis plot for current topic clusters

    Draws one stacked horizontal bar per topic (outlier topic -1 excluded)
    showing the share of each sentiment value, ordered by the share of the
    negative sentiment when such a column can be identified, and prints a
    short summary.

    Parameters:
    final_df: DataFrame with topic assignments and sentiment data
    sentiment_column: Name of the sentiment column (default: 'llm_sentiment')
    figsize: Figure size tuple
    topic_names: Optional dict mapping topic id to a display name; topics
                 without an entry are shown as "Topic <id>". Defaults to the
                 built-in names of the original analysis.

    Returns:
    DataFrame of sentiment proportions indexed by the displayed topic label,
    or None when the sentiment column is missing or no non-outlier rows exist.
    """
    if topic_names is None:
        topic_names = {
            0: "Elon Musk's Political Influence",
            1: "Zuckerberg's Political Entanglements",
            2: "Google Political Bias Allegations",
            3: "Amazon's Political Influence and Controversies",
            4: "Big Tech Censorship and Politics",
            5: "Facebook's Political Influence and Bias",
            6: "AI's Political Influence and Ethics",
            7: "Microsoft Political and Social Stances",
            8: "Meta's Political Controversies",
            9: "Tech Companies and Israel-Palestine Conflict",
            10: "Social Media Bans on Trump",
            11: "Tech Companies and Vaccine Misinformation",
            12: "Instagram Political Bias and Censorship",
            13: "Tech Privacy and Political Influence",
            14: "Parler Deplatforming Controversy",
            15: "Tech Companies and LGBTQ+ Issues"
        }

    if sentiment_column not in final_df.columns:
        print(f"Error: Column '{sentiment_column}' not found in dataframe.")
        print(f"Available columns: {list(final_df.columns)}")
        return

    df_filtered = final_df[final_df['topic'] != -1].copy()

    if df_filtered.empty:
        print("No posts outside the outlier topic; nothing to plot.")
        return

    sentiment_counts = df_filtered.groupby(['topic', sentiment_column]).size().unstack(fill_value=0)
    sentiment_props = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0)

    available_sentiments = sentiment_props.columns.tolist()

    # Map canonical sentiment names to the column that represents them,
    # matching case-insensitively on a substring; str() keeps this working
    # for non-string labels.
    palette = {'Positive': '#8EAC50', 'Neutral': '#FFE17B', 'Negative': '#FD8D14'}
    sentiment_mapping = {}
    for col in available_sentiments:
        col_lower = str(col).lower()
        if 'pos' in col_lower:
            sentiment_mapping['Positive'] = col
        elif 'neu' in col_lower:
            sentiment_mapping['Neutral'] = col
        elif 'neg' in col_lower:
            sentiment_mapping['Negative'] = col

    # Sort by the negative share. The exact names keep their original
    # precedence; any other spelling recognised by the mapping above
    # (e.g. 'NEGATIVE', 'neg') is now honoured as well.
    sort_col = next((name for name in ('Negative', 'negative') if name in available_sentiments),
                    sentiment_mapping.get('Negative'))
    if sort_col is not None:
        sentiment_props_sorted = sentiment_props.sort_values(by=sort_col, ascending=True)
    else:
        sentiment_props_sorted = sentiment_props.sort_index()
        print("Warning: No 'Negative' sentiment column found. Sorting by topic ID instead.")

    topic_labels = []
    for topic_id in sentiment_props_sorted.index:
        if topic_id in topic_names:
            label = topic_names[topic_id]
            # Keep the axis readable: long names are cut to 32 chars plus "...".
            if len(label) > 35:
                label = label[:32] + "..."
            topic_labels.append(f"T{topic_id}: {label}")
        else:
            topic_labels.append(f"Topic {topic_id}")

    sentiment_props_sorted.index = topic_labels

    if not sentiment_mapping:
        sentiment_order = available_sentiments
        colors = ['#A0C878', '#DDEB9D', '#EB5B00'][:len(sentiment_order)]
    else:
        canonical_order = [name for name in ('Positive', 'Neutral', 'Negative') if name in sentiment_mapping]
        sentiment_order = [sentiment_mapping[name] for name in canonical_order]
        colors = [palette[name] for name in canonical_order]

    fig, ax = plt.subplots(figsize=figsize)

    sentiment_props_sorted[sentiment_order].plot(
        kind='barh',
        stacked=True,
        color=colors,
        ax=ax
    )

    ax.set_title("Sentiment Proportion per Topic Cluster", fontsize=16, fontweight='bold', pad=20)
    ax.set_xlabel("Proportion of Posts", fontsize=12)
    ax.set_ylabel("Topic Cluster", fontsize=12)

    legend_labels = [str(s).title() for s in sentiment_order]
    ax.legend(legend_labels, title="Sentiment", bbox_to_anchor=(1.02, 1), loc='upper left')

    ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()

    print("SENTIMENT ANALYSIS SUMMARY")
    print("=" * 50)
    print(f"Total posts analyzed: {len(df_filtered):,}")
    print(f"Topics analyzed: {len(sentiment_props_sorted)}")
    print(f"Sentiment categories: {sentiment_order}")

    if 'Negative' in sentiment_mapping:
        # The heading is only printed when there is a ranking to show.
        print("\nMost negative topics (highest negative sentiment):")
        negative_col = sentiment_mapping['Negative']
        most_negative = sentiment_props_sorted.sort_values(by=negative_col, ascending=False).head(3)
        for i, (topic, row) in enumerate(most_negative.iterrows(), 1):
            print(f"  {i}. {topic}: {row[negative_col]:.1%} negative")

    plt.show()

    return sentiment_props_sorted

def print_sentiment_statistics(final_df, sentiment_column='llm_sentiment', topic_names=None):
    """
    Print detailed sentiment statistics for each topic

    Parameters:
    final_df: DataFrame with a 'topic' column and a sentiment label column
    sentiment_column: Name of the sentiment column (default: 'llm_sentiment')
    topic_names: Optional dict mapping topic id -> display name. When omitted,
        the built-in labels for topics 0-15 are used. Ids missing from the
        mapping are shown as "Topic <id>".

    Returns:
    None. The report is written to stdout; rows with topic == -1 are skipped.
    """
    if topic_names is None:
        topic_names = {
            0: "Elon Musk's Political Influence",
            1: "Zuckerberg's Political Entanglements",
            2: "Google Political Bias Allegations",
            3: "Amazon's Political Influence and Controversies",
            4: "Big Tech Censorship and Politics",
            5: "Facebook's Political Influence and Bias",
            6: "AI's Political Influence and Ethics",
            7: "Microsoft Political and Social Stances",
            8: "Meta's Political Controversies",
            9: "Tech Companies and Israel-Palestine Conflict",
            10: "Social Media Bans on Trump",
            11: "Tech Companies and Vaccine Misinformation",
            12: "Instagram Political Bias and Censorship",
            13: "Tech Privacy and Political Influence",
            14: "Parler Deplatforming Controversy",
            15: "Tech Companies and LGBTQ+ Issues"
        }

    # Check both required columns so a missing 'topic' column gets the same
    # friendly message as a missing sentiment column instead of a KeyError.
    for required in (sentiment_column, 'topic'):
        if required not in final_df.columns:
            print(f"Error: Column '{required}' not found.")
            return

    df_filtered = final_df[final_df['topic'] != -1]

    print("\nDETAILED SENTIMENT STATISTICS BY TOPIC")
    print("=" * 80)

    for topic_id in sorted(df_filtered['topic'].unique()):
        topic_data = df_filtered[df_filtered['topic'] == topic_id]
        topic_name = topic_names.get(topic_id, f"Topic {topic_id}")

        print(f"\n  TOPIC {topic_id}: {topic_name}")
        print("-" * 60)
        print(f"Total posts: {len(topic_data):,}")

        # Count once and derive proportions from the counts. value_counts()
        # drops missing labels, so proportions are relative to labelled posts.
        sentiment_counts = topic_data[sentiment_column].value_counts()
        labelled_total = sentiment_counts.sum()

        for sentiment, count in sentiment_counts.items():
            print(f"  {sentiment}: {count:,} posts ({count / labelled_total:.1%})")

if __name__ == "__main__":
    # Demo driver: runs the sentiment plot and the per-topic statistics on an
    # already-loaded `final_df`, using the first column whose name contains
    # "sentiment". Prints usage hints when the prerequisites are missing.
    if 'final_df' not in locals():
        print("Please ensure 'final_df' variable is loaded.")
        print("\nTo use this script:")
        print("sentiment_props = create_sentiment_plot(final_df, sentiment_column='your_sentiment_column')")
    else:
        print("Available columns in final_df:")
        print(final_df.columns.tolist())
        print()

        sentiment_cols = [name for name in final_df.columns if 'sentiment' in name.lower()]

        if not sentiment_cols:
            print("No sentiment columns found. Please ensure your dataframe has sentiment data.")
            print("Expected column names: 'llm_sentiment', 'sentiment', etc.")
        else:
            print(f"Found sentiment columns: {sentiment_cols}")
            sentiment_col = sentiment_cols[0]
            print(f"Using column: {sentiment_col}")

            # Plot first, then print the detailed per-topic breakdown.
            sentiment_props = create_sentiment_plot(final_df, sentiment_column=sentiment_col)

            print_sentiment_statistics(final_df, sentiment_column=sentiment_col)
        

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def create_weekly_topic_trends(final_df, date_column='created_utc', figsize=(14, 8)):
    """
    Create weekly topic trends plot for current topic clusters

    Draws a stacked-area chart of the (up to) five highest-volume topics plus
    an aggregated 'Others' series on a broken y-axis, after smoothing the
    weekly counts with a 4-week rolling mean.

    Parameters:
    final_df: DataFrame with topic assignments ('topic' column) and date data
    date_column: Name of the date column (default: 'created_utc'); numeric
        values are interpreted as Unix timestamps in seconds
    figsize: Figure size tuple

    Returns:
    DataFrame of smoothed weekly counts (top topics + 'Others'), or None when
    a required column is missing, the dates cannot be parsed, or no post with
    an assigned topic falls in 2020-2025.
    """

    topic_labels = {
        0: "Elon Musk's Political Influence",
        1: "Zuckerberg's Political Entanglements",
        2: "Google Political Bias Allegations",
        3: "Amazon's Political Influence and Controversies",
        4: "Big Tech Censorship and Politics",
        5: "Facebook's Political Influence and Bias",
        6: "AI's Political Influence and Ethics",
        7: "Microsoft Political and Social Stances",
        8: "Meta's Political Controversies",
        9: "Tech Companies and Israel-Palestine Conflict",
        10: "Social Media Bans on Trump",
        11: "Tech Companies and Vaccine Misinformation",
        12: "Instagram Political Bias and Censorship",
        13: "Tech Privacy and Political Influence",
        14: "Parler Deplatforming Controversy",
        15: "Tech Companies and LGBTQ+ Issues"
    }

    # Both columns are needed below; report whichever is missing.
    for required in (date_column, 'topic'):
        if required not in final_df.columns:
            print(f"Error: Column '{required}' not found in dataframe.")
            print(f"Available columns: {list(final_df.columns)}")
            return

    df = final_df.copy()

    try:
        # Any numeric dtype (not just int64/float64) is treated as epoch
        # seconds; without unit='s' pandas would read it as nanoseconds.
        if pd.api.types.is_numeric_dtype(df[date_column]):
            df[date_column] = pd.to_datetime(df[date_column], unit='s')
        else:
            df[date_column] = pd.to_datetime(df[date_column])
    except Exception as e:
        print(f"Error converting date column: {e}")
        return

    in_range = df[date_column].dt.year.between(2020, 2025)
    df_filtered = df[in_range & (df['topic'] != -1)]

    # Without this guard an empty selection fails later with an IndexError
    # while building the colour mapping.
    if df_filtered.empty:
        print("No posts with an assigned topic between 2020 and 2025; nothing to plot.")
        return

    print(f"Data range: {df_filtered[date_column].min()} to {df_filtered[date_column].max()}")
    print(f"Total posts: {len(df_filtered):,}")

    weekly_counts = df_filtered.groupby([pd.Grouper(key=date_column, freq='W'), 'topic']).size().unstack(fill_value=0)

    weekly_counts = weekly_counts.rename(columns=topic_labels)

    # May hold fewer than five entries when the data has fewer topics.
    top_5 = weekly_counts.sum().sort_values(ascending=False).head(5).index.tolist()

    print(f"\nTop {len(top_5)} topics by volume:")
    for i, topic in enumerate(top_5, 1):
        total = weekly_counts[topic].sum()
        print(f"  {i}. {topic}: {total:,} posts")

    weekly_counts['Others'] = weekly_counts.drop(columns=top_5).sum(axis=1)
    weekly_top5 = weekly_counts[top_5 + ['Others']]

    weekly_top5_smoothed = weekly_top5.rolling(window=4, min_periods=1).mean()

    # zip() stops at the shorter sequence, so this works for 1-5 top topics.
    palette = [
        '#A2C579',  # Light green
        '#98ABEE',  # Light blue
        '#1D24CA',  # Blue
        '#201658',  # Dark blue
        '#6554c0',  # Purple
    ]
    colors = dict(zip(top_5, palette))
    colors['Others'] = '#999999'  # Gray
    series_colors = [colors[col] for col in weekly_top5_smoothed.columns]

    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=figsize,
                                   gridspec_kw={'height_ratios': [1, 2]})

    # The same stacked areas are drawn on both axes; each axis then shows a
    # different y-range, which produces the broken-axis effect.
    weekly_top5_smoothed.plot.area(ax=ax1, stacked=True, legend=False, linewidth=0,
                                   alpha=0.95, color=series_colors)
    weekly_top5_smoothed.plot.area(ax=ax2, stacked=True, legend=True, linewidth=0,
                                   alpha=0.95, color=series_colors)

    weekly_totals = weekly_top5_smoothed.sum(axis=1)
    max_val = weekly_totals.max()
    ax1.set_ylim(max_val * 0.3, max_val + max_val * 0.1)
    ax2.set_ylim(0, max_val * 0.27)

    ax1.spines['bottom'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax1.tick_params(labeltop=False)
    ax2.xaxis.tick_bottom()

    # Diagonal tick marks at the break between the two axes.
    d = .015
    kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
    ax1.plot((-d, +d), (-d, +d), **kwargs)
    ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)

    kwargs.update(transform=ax2.transAxes)
    ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)
    ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

    ax2.set_xlabel("Date", fontsize=12)
    ax2.set_ylabel("Post Count", fontsize=12)
    fig.suptitle("Weekly Post Volume by Topic (2020–2025)", fontsize=16, fontweight='bold')

    ax2.legend(title="Topic", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

    plt.tight_layout()
    plt.subplots_adjust(hspace=0.05)

    total_weeks = len(weekly_top5_smoothed)
    avg_weekly = weekly_totals.mean()

    print("\nSummary:")
    print(f"  Time period: {total_weeks} weeks")
    print(f"  Average weekly posts: {avg_weekly:.1f}")
    print(f"  Peak week: {max_val:.0f} posts")

    plt.show()

    return weekly_top5_smoothed

def create_simple_topic_trends(final_df, date_column='created_utc', top_n=5):
    """
    Plot the weekly post counts of the busiest topics as plain line charts

    Simpler alternative to the broken-axis area chart: one line per topic,
    smoothed with a 4-week rolling mean.

    Parameters:
    final_df: DataFrame with a 'topic' column and a date column
    date_column: Name of the date column (default: 'created_utc'); int64 and
        float64 values are read as Unix timestamps in seconds
    top_n: Number of highest-volume topics to draw (default: 5)

    Returns:
    DataFrame of smoothed weekly counts, one column per plotted topic
    """
    topic_labels = {
        0: "Elon Musk's Political Influence",
        1: "Zuckerberg's Political Entanglements",
        2: "Google Political Bias Allegations",
        3: "Amazon's Political Influence and Controversies",
        4: "Big Tech Censorship and Politics",
        5: "Facebook's Political Influence and Bias",
        6: "AI's Political Influence and Ethics",
        7: "Microsoft Political and Social Stances",
        8: "Meta's Political Controversies",
        9: "Tech Companies and Israel-Palestine Conflict",
        10: "Social Media Bans on Trump",
        11: "Tech Companies and Vaccine Misinformation",
        12: "Instagram Political Bias and Censorship",
        13: "Tech Privacy and Political Influence",
        14: "Parler Deplatforming Controversy",
        15: "Tech Companies and LGBTQ+ Issues"
    }

    frame = final_df.copy()

    # Epoch-style numeric columns need an explicit unit; anything else is
    # handed to pandas' generic parser.
    raw_dates = frame[date_column]
    if raw_dates.dtype in ['int64', 'float64']:
        frame[date_column] = pd.to_datetime(raw_dates, unit='s')
    else:
        frame[date_column] = pd.to_datetime(raw_dates)

    # Keep 2020-2025 only, then drop the outlier cluster (-1).
    within_years = frame[date_column].dt.year.between(2020, 2025)
    kept = frame[within_years]
    kept = kept[kept['topic'] != -1]

    per_week = kept.groupby([pd.Grouper(key=date_column, freq='W'), 'topic']).size()
    weekly_counts = per_week.unstack(fill_value=0).rename(columns=topic_labels)

    volume_ranking = weekly_counts.sum().sort_values(ascending=False)
    leaders = volume_ranking.head(top_n).index.tolist()

    weekly_smoothed = weekly_counts[leaders].rolling(window=4, min_periods=1).mean()

    fig, ax = plt.subplots(figsize=(14, 8))

    palette = ['#A2C579', '#98ABEE', '#1D24CA', '#201658', '#6554c0']

    # The palette wraps around when more than five topics are requested.
    for position, label in enumerate(weekly_smoothed.columns):
        ax.plot(
            weekly_smoothed.index,
            weekly_smoothed[label],
            label=label,
            linewidth=2.5,
            color=palette[position % len(palette)],
        )

    ax.set_title(f"Top {top_n} Topics: Weekly Post Trends (2020-2025)", fontsize=14, fontweight='bold')
    ax.set_xlabel("Date", fontsize=12)
    ax.set_ylabel("Weekly Post Count", fontsize=12)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return weekly_smoothed

if __name__ == "__main__":
    # Demo driver: plots weekly topic trends for an already-loaded `final_df`,
    # using the first column whose name looks like a date/time field.
    if 'final_df' in locals():
        date_cols = [
            col for col in final_df.columns
            if any(hint in col.lower() for hint in ('date', 'time', 'created'))
        ]

        if date_cols:
            print(f"Found date columns: {date_cols}")
            date_col = date_cols[0]
            print(f"Using column: {date_col}")

            weekly_data = create_weekly_topic_trends(final_df, date_column=date_col)

        else:
            print("No date columns found. Please ensure your dataframe has date/time data.")
            print("Expected column names: 'created_utc', 'date', 'timestamp', etc.")
    else:
        print("Please ensure 'final_df' variable is loaded.")
        # Real newline: the previous "\\n" printed a literal backslash-n.
        print("\nTo use this script:")
        print("weekly_data = create_weekly_topic_trends(final_df, date_column='your_date_column')")

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def extract_representative_posts(final_df, embeddings, method='centroid_closest', n_posts=1):
    """
    Extract representative posts for each topic cluster

    Parameters:
    final_df: DataFrame with topic assignments ('topic' column). A 'document'
        column is used for the post text when present; otherwise the text is
        taken from the global `optimizer.documents`, or a placeholder is used.
    embeddings: Document embeddings. When it has one row per row of final_df
        it is sliced by row position; otherwise rows are looked up by the
        DataFrame index labels (the previous behaviour).
    method: 'centroid_closest', 'highest_similarity', or 'random'
    n_posts: Number of representative posts per topic (must be >= 1)

    Returns:
    Dictionary with topic_id as key and, as value, a dict holding
    'topic_name', 'cluster_size' and 'representative_posts' (a list of dicts
    with 'document_index', 'text' and 'similarity_score'; the score is only
    filled in for 'centroid_closest' and is None otherwise).

    Raises:
    ValueError: if method is not one of the supported names or n_posts < 1
    """
    valid_methods = ('centroid_closest', 'highest_similarity', 'random')
    if method not in valid_methods:
        # Previously an unknown method surfaced as an UnboundLocalError.
        raise ValueError(f"Unknown method '{method}'. Expected one of: {', '.join(valid_methods)}")
    if n_posts < 1:
        # A slice of [-0:] would silently return the whole cluster.
        raise ValueError("n_posts must be at least 1")

    topic_names = {
        0: "Elon Musk's Political Influence",
        1: "Zuckerberg's Political Entanglements",
        2: "Google Political Bias Allegations",
        3: "Amazon's Political Influence and Controversies",
        4: "Big Tech Censorship and Politics",
        5: "Facebook's Political Influence and Bias",
        6: "AI's Political Influence and Ethics",
        7: "Microsoft Political and Social Stances",
        8: "Meta's Political Controversies",
        9: "Tech Companies and Israel-Palestine Conflict",
        10: "Social Media Bans on Trump",
        11: "Tech Companies and Vaccine Misinformation",
        12: "Instagram Political Bias and Censorship",
        13: "Tech Privacy and Political Influence",
        14: "Parler Deplatforming Controversy",
        15: "Tech Companies and LGBTQ+ Issues"
    }

    embeddings = np.asarray(embeddings)

    df_with_docs = final_df.copy()
    if 'document' not in df_with_docs.columns:
        if 'optimizer' in globals():
            df_with_docs['document'] = optimizer.documents
        else:
            print("Warning: No document text found. Using index as placeholder.")
            df_with_docs['document'] = [f"Document {i}" for i in range(len(df_with_docs))]
    document_texts = df_with_docs['document']

    # Documents are attached to rows by position, so embeddings are sliced the
    # same way whenever they are row-aligned. Index labels only equal row
    # positions for a default RangeIndex.
    row_aligned = len(embeddings) == len(final_df)
    topic_values = final_df['topic'].to_numpy()

    representative_posts = {}
    unique_topics = sorted(t for t in final_df['topic'].unique() if t != -1)

    print(f"Extracting {n_posts} representative post(s) per topic using '{method}' method...")
    print("=" * 70)

    for topic_id in unique_topics:
        topic_positions = np.flatnonzero(topic_values == topic_id)
        if len(topic_positions) == 0:
            continue

        topic_indices = final_df.index[topic_positions]
        topic_texts = document_texts.iloc[topic_positions]
        if row_aligned:
            topic_embeddings = embeddings[topic_positions]
        else:
            topic_embeddings = embeddings[topic_indices.to_numpy()]
        cluster_size = len(topic_positions)

        similarities = None
        if method == 'centroid_closest':
            # Find documents closest to topic centroid
            centroid = np.mean(topic_embeddings, axis=0)
            similarities = cosine_similarity(topic_embeddings, centroid.reshape(1, -1)).flatten()
            top_indices = similarities.argsort()[-n_posts:][::-1]

        elif method == 'highest_similarity':
            # Mean similarity of each document to every other document in the
            # cluster, from one pairwise matrix: drop the self-similarity on
            # the diagonal and divide by the number of other documents.
            if cluster_size > 1:
                pairwise = cosine_similarity(topic_embeddings)
                avg_similarities = (pairwise.sum(axis=1) - np.diag(pairwise)) / (cluster_size - 1)
            else:
                avg_similarities = np.zeros(1)
            top_indices = np.argsort(avg_similarities)[-n_posts:][::-1]

        else:  # 'random'
            top_indices = np.random.choice(cluster_size, min(n_posts, cluster_size), replace=False)

        rep_posts = [
            {
                'document_index': topic_indices[idx],
                'text': topic_texts.iloc[idx],
                'similarity_score': similarities[idx] if similarities is not None else None
            }
            for idx in top_indices
        ]

        topic_name = topic_names.get(topic_id, f"Topic {topic_id}")
        representative_posts[topic_id] = {
            'topic_name': topic_name,
            'cluster_size': cluster_size,
            'representative_posts': rep_posts
        }

        print(f"\n  TOPIC {topic_id}: {topic_name}")
        print(f"Cluster size: {cluster_size} posts")
        print("-" * 50)

        for i, post in enumerate(rep_posts, 1):
            # str() keeps the preview from failing on non-string text (e.g. NaN).
            text = str(post['text'])
            if len(text) > 200:
                text = text[:200] + "..."

            print(f"Representative Post {i}:")
            if post['similarity_score'] is not None:
                print(f"  Similarity to centroid: {post['similarity_score']:.4f}")
            print(f"  Text: {text}")
            print()

    return representative_posts

def export_representative_posts(representative_posts, filename="representative_posts.csv"):
    """
    Flatten the representative-post dictionary and write it to a CSV file

    Parameters:
    representative_posts: Dict keyed by topic id; each value carries
        'topic_name', 'cluster_size' and a 'representative_posts' list
    filename: Destination CSV path (default: "representative_posts.csv")

    Returns:
    DataFrame with one row per representative post (the data that was written)
    """
    rows = []

    for topic_id, details in representative_posts.items():
        label = details['topic_name']
        size = details['cluster_size']

        # Rank is the 1-based position of the post within its topic's list.
        rows.extend(
            {
                'topic_id': topic_id,
                'topic_name': label,
                'cluster_size': size,
                'representative_post_rank': rank,
                'document_index': entry['document_index'],
                'similarity_score': entry['similarity_score'],
                'post_text': entry['text']
            }
            for rank, entry in enumerate(details['representative_posts'], start=1)
        )

    df_export = pd.DataFrame(rows)
    df_export.to_csv(filename, index=False)
    print(f"\n Representative posts exported to: {filename}")

    return df_export

def compare_extraction_methods(final_df, embeddings, n_posts=1):
    """
    Run every extraction strategy on the same data for side-by-side review

    Parameters:
    final_df: DataFrame with topic assignments
    embeddings: Document embeddings
    n_posts: Number of representative posts per topic for each strategy

    Returns:
    Dictionary keyed by method name ('centroid_closest', 'highest_similarity',
    'random') holding the result of extract_representative_posts for each
    """
    print("Comparing extraction methods...")
    print("=" * 50)

    comparison = {}
    for strategy in ('centroid_closest', 'highest_similarity', 'random'):
        # Announce the strategy before its (verbose) extraction output starts.
        print(f"\n Method: {strategy.upper()}")
        comparison[strategy] = extract_representative_posts(final_df, embeddings, strategy, n_posts)

    return comparison

def get_representative_posts_for_topic(final_df, embeddings, topic_id, n_posts=3):
    """
    Get multiple representative posts for a specific topic

    Prints the posts of one topic whose embeddings are most similar (cosine)
    to the topic centroid.

    Parameters:
    final_df: DataFrame with topic assignments ('topic' column). A 'document'
        column is used for the post text when present; otherwise the text is
        taken from the global `optimizer.documents`, or a placeholder is used.
    embeddings: Document embeddings. When it has one row per row of final_df
        it is sliced by row position; otherwise rows are looked up by the
        DataFrame index labels (the previous behaviour).
    topic_id: Topic to inspect
    n_posts: Maximum number of posts to show (must be >= 1)

    Returns:
    None. Results are written to stdout.

    Raises:
    ValueError: if n_posts < 1
    """
    if n_posts < 1:
        # A slice of [-0:] would silently list the whole cluster.
        raise ValueError("n_posts must be at least 1")

    topic_names = {
        0: "Elon Musk's Political Influence",
        1: "Zuckerberg's Political Entanglements",
        2: "Google Political Bias Allegations",
        3: "Amazon's Political Influence and Controversies",
        4: "Big Tech Censorship and Politics",
        5: "Facebook's Political Influence and Bias",
        6: "AI's Political Influence and Ethics",
        7: "Microsoft Political and Social Stances",
        8: "Meta's Political Controversies",
        9: "Tech Companies and Israel-Palestine Conflict",
        10: "Social Media Bans on Trump",
        11: "Tech Companies and Vaccine Misinformation",
        12: "Instagram Political Bias and Censorship",
        13: "Tech Privacy and Political Influence",
        14: "Parler Deplatforming Controversy",
        15: "Tech Companies and LGBTQ+ Issues"
    }

    topic_positions = np.flatnonzero((final_df['topic'] == topic_id).to_numpy())

    # Bail out before touching the embeddings when the topic is empty.
    if len(topic_positions) == 0:
        print(f"No documents found for topic {topic_id}")
        return

    topic_indices = final_df.index[topic_positions]

    # Slice by row position when embeddings are row-aligned with final_df.
    # Index labels only equal positions for a default RangeIndex.
    embeddings = np.asarray(embeddings)
    if len(embeddings) == len(final_df):
        topic_embeddings = embeddings[topic_positions]
    else:
        topic_embeddings = embeddings[topic_indices.to_numpy()]

    # Same text-source priority as extract_representative_posts.
    if 'document' in final_df.columns:
        documents = final_df['document'].iloc[topic_positions].tolist()
    elif 'optimizer' in globals():
        if len(optimizer.documents) == len(final_df):
            lookup = topic_positions
        else:
            lookup = topic_indices
        documents = [optimizer.documents[i] for i in lookup]
    else:
        documents = [f"Document {i}" for i in topic_indices]

    centroid = np.mean(topic_embeddings, axis=0)
    similarities = cosine_similarity(topic_embeddings, centroid.reshape(1, -1)).flatten()
    top_indices = similarities.argsort()[-n_posts:][::-1]

    topic_name = topic_names.get(topic_id, f"Topic {topic_id}")
    print(f"\n  TOPIC {topic_id}: {topic_name}")
    # Report the number actually shown; a small cluster may hold fewer than
    # n_posts documents.
    print(f"Showing top {len(top_indices)} representative posts:")
    print("=" * 60)

    for i, idx in enumerate(top_indices, 1):
        print(f"\n{i}. Similarity: {similarities[idx]:.4f}")
        print(f"   Document Index: {topic_indices[idx]}")
        print(f"   Text: {documents[idx]}")

if __name__ == "__main__":
    # Demo driver: extracts and exports three centroid-closest posts per topic
    # when both `final_df` and `optimizer` are already loaded; otherwise lists
    # the helper functions this cell provides.
    if not ('final_df' in locals() and 'optimizer' in locals()):
        print("Please ensure 'final_df' and 'optimizer' variables are loaded.")
        print("\nAvailable functions:")
        print("• extract_representative_posts(final_df, embeddings)")
        print("• get_representative_posts_for_topic(final_df, embeddings, topic_id)")
        print("• compare_extraction_methods(final_df, embeddings)")
    else:
        print("Extracting representative posts for all topics...")

        rep_posts = extract_representative_posts(
            final_df,
            optimizer.embeddings,
            method='centroid_closest',
            n_posts=3,
        )

        export_df = export_representative_posts(rep_posts)

        print(f"\n Extracted representative posts for {len(rep_posts)} topics")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def create_simple_topic_plot(embeddings, topic_assignments, figsize=(12, 8), save_path=None):
    """
    Create a static 2-D scatter plot of the documents, coloured by topic

    Embeddings are reduced with PCA (at most 50 components) and then projected
    to two dimensions with t-SNE. Documents are drawn as small points, each
    topic centroid as a star with a short label, and the legend lists every
    topic with the number of points actually plotted for it.

    Parameters:
    embeddings: 2-D array of document embeddings (one row per document)
    topic_assignments: Topic id for each document, same length as embeddings
    figsize: Figure size tuple
    save_path: Optional extra path to save the figure to. The figure is always
        saved to 'topic_scatter_plot.png' in the working directory as well.

    Returns:
    None. Prints a message and returns early when fewer than two documents
    are given.

    Raises:
    ValueError: if embeddings and topic_assignments differ in length
    """
    topic_names = {
        0: "T0: Elon Musk's Political Influence",
        1: "T1: Zuckerberg's Political Entanglements",
        2: "T2: Google Political Bias Allegations",
        3: "T3: Amazon's Political Influence and Controversies",
        4: "T4: Big Tech Censorship and Politics",
        5: "T5: Facebook's Political Influence and Bias",
        6: "T6: AI's Political Influence and Ethics",
        7: "T7: Microsoft Political and Social Stances",
        8: "T8: Meta's Political Controversies",
        9: "T9: Tech Companies and Israel-Palestine Conflict",
        10: "T10: Social Media Bans on Trump",
        11: "T11: Tech Companies and Vaccine Misinformation",
        12: "T12: Instagram Political Bias and Censorship",
        13: "T13: Tech Privacy and Political Influence",
        14: "T14: Parler Deplatforming Controversy",
        15: "T15: Tech Companies and LGBTQ+ Issues"
    }

    embeddings = np.asarray(embeddings)
    assignments = np.asarray(topic_assignments)

    # Fail fast: a mismatch would otherwise only surface after the expensive
    # t-SNE step, when the boolean mask is applied to the coordinates.
    if len(embeddings) != len(assignments):
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but topic_assignments has "
            f"{len(assignments)} entries"
        )
    if len(embeddings) < 2:
        print("Need at least two documents to compute a t-SNE projection; nothing to plot.")
        return

    print("Computing t-SNE...")
    n_samples, n_features = embeddings.shape
    # PCA cannot produce more components than samples or features, and t-SNE
    # needs perplexity < n_samples; cap both so small inputs still work.
    pca_50 = PCA(n_components=min(50, n_samples, n_features), random_state=42)
    embeddings_50d = pca_50.fit_transform(embeddings)

    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, n_samples - 1), max_iter=1000)
    coords = tsne.fit_transform(embeddings_50d)
    print("t-SNE completed!")

    fig, ax = plt.subplots(figsize=figsize)

    unique_topics = np.unique(assignments)  # sorted unique topic ids
    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_topics)))

    for i, topic in enumerate(unique_topics):
        topic_coords = coords[assignments == topic]
        topic_name = topic_names.get(topic, f"Topic {topic}")

        # Legend count comes from the data being plotted rather than from a
        # hard-coded size table that goes stale when the clustering changes.
        ax.scatter(topic_coords[:, 0], topic_coords[:, 1],
                  c=[colors[i]], alpha=0.7, s=8,  # Small point size (8)
                  label=f"{topic_name} (n={len(topic_coords)})")

        centroid = np.mean(topic_coords, axis=0)
        ax.scatter(centroid[0], centroid[1],
                  c=[colors[i]], s=200, marker='*',
                  edgecolors='black', linewidth=1.5)

        # Keep centroid labels short so they do not cover the points.
        display_name = topic_name
        if len(display_name) > 25:
            display_name = display_name[:22] + "..."

        ax.annotate(f"{display_name}",
                   centroid, xytext=(5, 5),
                   textcoords='offset points', fontsize=8,
                   bbox=dict(boxstyle='round,pad=0.3',
                           facecolor=colors[i], alpha=0.9),
                   ha='left')

    ax.set_xlabel('t-SNE Component 1', fontsize=12)
    ax.set_ylabel('t-SNE Component 2', fontsize=12)
    ax.grid(True, alpha=0.3)

    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)

    plt.tight_layout()
    # Default output file, written on every call regardless of save_path.
    plt.savefig('topic_scatter_plot.png', dpi=300, bbox_inches='tight')

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {save_path}")

    plt.show()



# Draw the topic scatter plot when the fitted optimizer and the topic
# assignments are both available; otherwise tell the user what is missing.
if not ('optimizer' in locals() and 'final_df' in locals()):
    print("Please ensure 'optimizer' and 'final_df' variables are loaded.")
else:
    create_simple_topic_plot(
        figsize=(15, 10),
        topic_assignments=final_df['topic'].tolist(),
        embeddings=optimizer.embeddings,
    )