In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import pickle
import re
from collections import Counter, defaultdict
from tqdm import tqdm
import networkx as nx
from typing import Dict, List, Tuple, Optional, Union, Any
import spacy
from spacy.tokens import Doc
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore')

In [2]:
# ML & NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

In [3]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded here; torch is imported in an earlier cell and
# keeps its own generator — confirm whether it needs seeding as well.
np.random.seed(42)

### 1. Data Understanding & Preparation

1. I will load and explore the dataset structure.
2. Then I will examine the articles' content, sources, publication dates, and metadata.
3. I will clean the data by removing web crawl artifacts (HTML tags, etc).
4. I will filter out irrelevant articles that don't focus on AI's impact on industries.

In [4]:
# Read the news dataset directly from its public URL (needs network access and the
# pyarrow engine), then report its shape and per-column dtypes / non-null counts.
import pandas as pd
df = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
print(f"Dataset shape: {df.shape}")
df.info() 

Dataset shape: (200083, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200083 entries, 0 to 200082
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   url       200083 non-null  object
 1   date      200083 non-null  object
 2   language  200083 non-null  object
 3   title     200083 non-null  object
 4   text      200083 non-null  object
dtypes: object(5)
memory usage: 7.6+ MB


In [5]:
# Preview the first rows of the loaded frame to sanity-check its columns.
df.head()  

Unnamed: 0,url,date,language,title,text
0,http://businessnewsthisweek.com/business/infog...,2023-05-20,en,Infogain AI Business Solutions Now Available i...,\n\nInfogain AI Business Solutions Now Availab...
1,https://allafrica.com/stories/202504250184.html,2025-04-25,en,Africa: AI Policies in Africa - Lessons From G...,\nAfrica: AI Policies in Africa - Lessons From...
2,https://asiatimes.com/2023/07/yang-lan-intervi...,2023-07-25,en,Yang Lan interviews academics on AI developmen...,\nYang Lan interviews academics on AI developm...
3,https://cdn.meritalk.com/articles/commerce-nom...,2025-02-04,en,Commerce Nominee Promises Increased Domestic A...,\nCommerce Nominee Promises Increased Domestic...
4,https://citylife.capetown/hmn/uncategorized/re...,2023-11-11,en,Revolutionizing the Manufacturing Industry: Th...,Revolutionizing the Manufacturing Industry:...


In [6]:
class AINewsAnalyzer:
    """Main class for analyzing AI-related news articles.

    Constructing an instance prepares the pickle cache directory, loads the
    SpaCy ``en_core_web_md`` pipeline, builds the domain-specific sentiment
    lexicons and reads the article dataset into ``self.df``.
    """
    
    def __init__(self, data_path: str, cache_dir: str = "cache"):
        """Initialize the analyzer with data path and cache directory.

        Args:
            data_path: Location of the parquet dataset (handed to ``pd.read_parquet``)
            cache_dir: Directory in which intermediate results are pickled
        """
        self.data_path = data_path
        self.cache_dir = cache_dir
        
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(cache_dir, exist_ok=True)
        
        # SpaCy
        print("Loading SpaCy model...")
        self.nlp = spacy.load("en_core_web_md")
        
        # Dictionaries for sentiment analysis
        self._create_sentiment_dictionaries()
        
        # Data
        self.df = self._load_data()
        print(f"Loaded dataset with {len(self.df)} articles")
    
    def _load_data(self) -> pd.DataFrame:
        """Load the dataset from the parquet file at ``self.data_path``"""
        return pd.read_parquet(self.data_path, engine='pyarrow')
    
    def _get_cache_path(self, filename: str) -> str:
        """Get full path for a cache file inside ``self.cache_dir``"""
        return os.path.join(self.cache_dir, filename)
    
    def _save_to_cache(self, obj: Any, filename: str) -> None:
        """Pickle ``obj`` into the cache under ``filename``.

        The pickle is written to a temporary sibling file and then moved into
        place, so an interrupted dump never leaves a truncated cache file that
        a later ``_load_from_cache`` would choke on.
        """
        cache_path = self._get_cache_path(filename)
        tmp_path = f"{cache_path}.tmp"
        try:
            with open(tmp_path, 'wb') as f:
                pickle.dump(obj, f)
            os.replace(tmp_path, cache_path)
        finally:
            # Only still present when the dump or the move failed
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    
    def _load_from_cache(self, filename: str) -> Any:
        """Load object from cache if it exists, otherwise return None.

        NOTE: unpickling can execute arbitrary code, so the cache directory must
        only ever contain files written by this analyzer.
        """
        cache_path = self._get_cache_path(filename)
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        return None
    
    def _create_sentiment_dictionaries(self) -> None:
        """Create dictionaries for domain-specific sentiment analysis.

        Sets ``self.positive_terms`` and ``self.negative_terms``; each maps a
        lowercase term to a weight (positive weights > 0, negative weights < 0).
        """
        # Positive terms related to AI
        self.positive_terms = {
            'opportunity': 1.0, 'enhance': 0.8, 'improve': 0.8, 'augment': 0.7, 
            'assist': 0.6, 'empower': 0.9, 'efficiency': 0.8, 'productivity': 0.8, 
            'innovation': 0.9, 'growth': 0.7, 'advancement': 0.8, 'collaborate': 0.7, 
            'partnership': 0.6, 'upskill': 0.9, 'complement': 0.7, 'benefit': 0.8,
            'progress': 0.7, 'create': 0.6, 'advantage': 0.7, 'potential': 0.5,
            'solution': 0.6, 'revolutionize': 0.8
        }
        
        # Negative terms related to AI
        self.negative_terms = {
            'replace': -0.8, 'eliminate': -0.9, 'displace': -0.8, 'threaten': -0.7, 
            'risk': -0.6, 'job loss': -0.9, 'unemployment': -0.9, 'layoff': -0.9, 
            'downsizing': -0.8, 'automation': -0.5, 'obsolete': -0.8, 'disruption': -0.6, 
            'inequality': -0.7, 'bias': -0.7, 'surveillance': -0.8, 'danger': -0.7,
            'concern': -0.5, 'worry': -0.6, 'fear': -0.7, 'threat': -0.8,
            'controversy': -0.6, 'problem': -0.6, 'challenge': -0.4
        }
    

### Data Cleaning and Preprocessing

In [12]:
def clean_and_filter_data(self, force_recompute: bool = False) -> pd.DataFrame:
        """
        Produce the cleaned, relevance-filtered article table.

        Side effects: ``self.df`` gains ``cleaned_text``, parsed ``date``,
        ``year``, ``month``, ``yearmonth`` and ``is_relevant`` columns, and rows
        whose date cannot be parsed are dropped from it. The filtered result is
        pickled as ``cleaned_data.pkl``.

        Args:
            force_recompute: Skip the cache lookup and rebuild the result

        Returns:
            Copy of the relevant rows with an extra ``source_domain`` column
            (or the cached object when one is found)
        """
        cache_file = "cleaned_data.pkl"

        # Serve the pickled result whenever the caller allows it and one exists
        cached = None if force_recompute else self._load_from_cache(cache_file)
        if cached is not None:
            print("Loaded cleaned data from cache")
            return cached

        print("Cleaning and filtering data...")

        # Strip crawl artifacts from the raw article body
        self.df['cleaned_text'] = self.df['text'].apply(self._clean_article)

        # Unparseable dates become NaT and those rows are discarded
        self.df['date'] = pd.to_datetime(self.df['date'], errors='coerce')
        self.df = self.df.dropna(subset=['date'])

        # Calendar features derived from the parsed date
        parsed = self.df['date'].dt
        self.df['year'] = parsed.year
        self.df['month'] = parsed.month
        self.df['yearmonth'] = parsed.strftime('%Y-%m')

        # Flag every article, then keep an independent copy of the relevant ones
        self.df['is_relevant'] = self.df['cleaned_text'].apply(self._is_relevant)
        relevant = self.df.loc[self.df['is_relevant']].copy()

        # Publisher domain, used for source analysis
        relevant['source_domain'] = relevant['url'].apply(self._extract_domain)

        self._save_to_cache(relevant, cache_file)

        print(f"Filtered to {len(relevant)} relevant articles")
        return relevant
    
def _clean_article(self, text: str) -> str:
        """
        Clean article text by removing HTML, extra whitespace, etc.
        
        Args:
            text: Raw article text
            
        Returns:
            Cleaned text
        """
        # Handling none or empty strings
        if not text or pd.isna(text):
            return ""
        
        # Removing the HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()
        
        # Removing the URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        
        # Removing the extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Removing the special characters
        text = re.sub(r'[^\w\s.,!?;:\'\"()-]', '', text)
        
        return text
    
def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        if not url or pd.isna(url):
            return ""
        
        try:
            # Domain using regex
            domain_match = re.search(r'https?://(?:www\.)?([^/]+)', url)
            if domain_match:
                return domain_match.group(1)
        except:
            pass
        
        return ""
    
def _is_relevant(self, text: str) -> bool:
        """
        Check if article is relevant to AI's impact on industries/jobs
        
        Args:
            text: Cleaned article text
            
        Returns:
            Boolean indicating relevance
        """
        if not text or pd.isna(text):
            return False
        
        text_lower = text.lower()
        
        # Checking for AI related terms
        ai_terms = ['ai', 'artificial intelligence', 'machine learning', 'deep learning', 
                   'neural network', 'llm', 'large language model', 'chatgpt', 'generative ai']
        
        # Checking for industry impact terms
        impact_terms = ['impact', 'effect', 'transform', 'disrupt', 'replace', 'automate',
                       'job', 'employment', 'workforce', 'career', 'industry', 'sector', 
                       'profession', 'work', 'labor market', 'skill']
        
        contains_ai = any(term in text_lower for term in ai_terms)
        contains_impact = any(term in text_lower for term in impact_terms)
        
        if not (contains_ai and contains_impact):
            return False
        
        #Proximity within same paragraph for better accuracy
        paragraphs = text_lower.split('\n')
        
        for para in paragraphs:
            para_has_ai = any(term in para for term in ai_terms)
            para_has_impact = any(term in para for term in impact_terms)
            
            if para_has_ai and para_has_impact:
                return True
        
        # Fallback for short texts without paragraphs, checking the  sentence proximity
        sentences = text_lower.split('.')
        
        ai_sentences = [i for i, sent in enumerate(sentences) if any(term in sent for term in ai_terms)]
        impact_sentences = [i for i, sent in enumerate(sentences) if any(term in sent for term in impact_terms)]
        
        # AI and impact sentences are close to each other within 3 sentences
        for ai_idx in ai_sentences:
            for impact_idx in impact_sentences:
                if abs(ai_idx - impact_idx) <= 3:
                    return True
        
        return False

### Named Entity Recognition

In [13]:
def extract_entities(self, 
                         df: pd.DataFrame, 
                         sample_size: int = 5000,
                         force_recompute: bool = False) -> pd.DataFrame:
        """
        Extract named entities from a (sampled) set of articles.

        The result is pickled under a file name derived from ``sample_size``
        only, so a cached result is returned whatever ``df`` is passed in; use
        ``force_recompute=True`` after the input data changes.
        
        Args:
            df: DataFrame with a ``cleaned_text`` column
            sample_size: Number of articles to process (SpaCy is computationally
                expensive); a falsy value, or one not smaller than ``len(df)``,
                processes every row
            force_recompute: Whether to ignore any cached result
            
        Returns:
            Sample/copy of ``df`` with ``extracted_entities`` plus
            ``top_organizations``, ``top_people`` and ``top_locations`` columns
        """
        cache_file = f"entity_sample_{sample_size}.pkl"
        
        # Cached results
        if not force_recompute:
            df_entities = self._load_from_cache(cache_file)
            if df_entities is not None:
                print(f"Loaded entity data for {len(df_entities)} articles from cache")
                return df_entities
        
        # Sample for entity extraction (fixed seed keeps the sample repeatable)
        if sample_size and sample_size < len(df):
            print(f"Taking sample of {sample_size} articles for entity extraction")
            sample_df = df.sample(sample_size, random_state=42)
        else:
            sample_df = df.copy()
        
        print("Extracting named entities...")
        
        # Limiting the text length to manage memory; missing/non-string text is
        # processed as an empty document instead of crashing on the slice
        max_chars = 10000
        texts = [
            text[:max_chars] if isinstance(text, str) else ""
            for text in sample_df['cleaned_text']
        ]
        
        # nlp.pipe streams the texts through the pipeline in batches, which is
        # considerably faster than calling the pipeline once per article
        entities_list = [
            self._extract_entities_from_doc(doc)
            for doc in tqdm(self.nlp.pipe(texts), total=len(texts))
        ]
        
        # Adding the entities
        sample_df['extracted_entities'] = entities_list
        
        # Separate columns for the top entities of each type
        for column, entity_type in (('top_organizations', 'organizations'),
                                    ('top_people', 'people'),
                                    ('top_locations', 'locations')):
            sample_df[column] = [
                self._get_top_entities(entities, entity_type)
                for entities in entities_list
            ]
        
        self._save_to_cache(sample_df, cache_file)
        
        return sample_df
    
def _extract_entities_from_doc(self, doc: Doc) -> Dict[str, Counter]:
        """Extract organizations, people, locations from SpaCy Doc"""
        entities = defaultdict(list)
        
        for ent in doc.ents:
            if ent.label_ == 'ORG':
                entities['organizations'].append(ent.text)
            elif ent.label_ == 'PERSON':
                entities['people'].append(ent.text)
            elif ent.label_ in ('GPE', 'LOC'):
                entities['locations'].append(ent.text)
            elif ent.label_ == 'DATE':
                entities['dates'].append(ent.text)
            elif ent.label_ == 'PRODUCT':
                entities['products'].append(ent.text)
        
        # Count frequencies
        entity_counts = {}
        for entity_type, items in entities.items():
            entity_counts[entity_type] = Counter(items)
        
        return entity_counts
    
def _get_top_entities(self, entity_dict: Dict[str, Counter], entity_type: str, n: int = 3) -> List[str]:
        """Get top n entities of a specific type"""
        if entity_type not in entity_dict:
            return []
        
        return [item for item, count in entity_dict[entity_type].most_common(n)]


### Industry and Job Detection

In [14]:
# NOTE(review): this cell sits under the "Industry and Job Detection" heading but
# re-defines extract_entities, already defined in the Named Entity Recognition
# cell; being later in the notebook, this definition is the one that takes effect.
def extract_entities(self, 
                         df: pd.DataFrame, 
                         sample_size: int = 5000,
                         force_recompute: bool = False) -> pd.DataFrame:
        """
        Extract named entities from a (sampled) set of articles.

        The result is pickled under a file name derived from ``sample_size``
        only, so a cached result is returned whatever ``df`` is passed in; use
        ``force_recompute=True`` after the input data changes.
        
        Args:
            df: DataFrame with a ``cleaned_text`` column
            sample_size: Number of articles to process (SpaCy is computationally
                expensive); a falsy value, or one not smaller than ``len(df)``,
                processes every row
            force_recompute: Whether to ignore any cached result
            
        Returns:
            Sample/copy of ``df`` with ``extracted_entities`` plus
            ``top_organizations``, ``top_people`` and ``top_locations`` columns
        """
        cache_file = f"entity_sample_{sample_size}.pkl"
        
        # Cached results
        if not force_recompute:
            df_entities = self._load_from_cache(cache_file)
            if df_entities is not None:
                print(f"Loaded entity data for {len(df_entities)} articles from cache")
                return df_entities
        
        # Sample for the entity extraction (fixed seed keeps the sample repeatable)
        if sample_size and sample_size < len(df):
            print(f"Taking sample of {sample_size} articles for entity extraction")
            sample_df = df.sample(sample_size, random_state=42)
        else:
            sample_df = df.copy()
        
        print("Extracting named entities...")
        
        # Limiting the text length to manage memory; missing/non-string text is
        # processed as an empty document instead of crashing on the slice
        max_chars = 10000
        texts = [
            text[:max_chars] if isinstance(text, str) else ""
            for text in sample_df['cleaned_text']
        ]
        
        # nlp.pipe streams the texts through the pipeline in batches, which is
        # considerably faster than calling the pipeline once per article
        entities_list = [
            self._extract_entities_from_doc(doc)
            for doc in tqdm(self.nlp.pipe(texts), total=len(texts))
        ]
        
        # Entities
        sample_df['extracted_entities'] = entities_list
        
        # Separate columns for the top entities of each type
        for column, entity_type in (('top_organizations', 'organizations'),
                                    ('top_people', 'people'),
                                    ('top_locations', 'locations')):
            sample_df[column] = [
                self._get_top_entities(entities, entity_type)
                for entities in entities_list
            ]
        
        self._save_to_cache(sample_df, cache_file)
        
        return sample_df
    
def _extract_entities_from_doc(self, doc: Doc) -> Dict[str, Counter]:
        """Extract organizations, people, locations from SpaCy Doc"""
        entities = defaultdict(list)
        
        for ent in doc.ents:
            if ent.label_ == 'ORG':
                entities['organizations'].append(ent.text)
            elif ent.label_ == 'PERSON':
                entities['people'].append(ent.text)
            elif ent.label_ in ('GPE', 'LOC'):
                entities['locations'].append(ent.text)
            elif ent.label_ == 'DATE':
                entities['dates'].append(ent.text)
            elif ent.label_ == 'PRODUCT':
                entities['products'].append(ent.text)
        
        # Frequencies
        entity_counts = {}
        for entity_type, items in entities.items():
            entity_counts[entity_type] = Counter(items)
        
        return entity_counts
    
def _get_top_entities(self, entity_dict: Dict[str, Counter], entity_type: str, n: int = 3) -> List[str]:
        """Get top n entities of a specific type"""
        if entity_type not in entity_dict:
            return []
        
        return [item for item, count in entity_dict[entity_type].most_common(n)]


### Technology Identification

In [15]:
def identify_technologies(self, 
                             df: pd.DataFrame,
                             force_recompute: bool = False) -> pd.DataFrame:
        """
        Identify AI technologies mentioned in articles.

        The input frame is left untouched; the annotation is added to a copy,
        matching how the sampling-based analysis methods treat their input.
        The result is pickled as ``tech_data.pkl``.
        
        Args:
            df: DataFrame with a ``cleaned_text`` column
            force_recompute: Whether to ignore any cached result
            
        Returns:
            Copy of ``df`` with an ``ai_technologies`` column holding, per
            article, a dict of technology category -> matched keywords
            (or the cached object when one is found)
        """
        cache_file = "tech_data.pkl"
        
        # Cached results
        if not force_recompute:
            df_tech = self._load_from_cache(cache_file)
            if df_tech is not None:
                print("Loaded technology data from cache")
                return df_tech
        
        print("Identifying AI technologies...")
        
        # Technology categories and their keywords
        technologies = {
            'machine_learning': [
                'machine learning', 'ml', 'supervised learning', 'unsupervised learning', 
                'reinforcement learning', 'decision trees', 'random forests', 'svm'
            ],
            'deep_learning': [
                'deep learning', 'neural networks', 'cnn', 'rnn', 'lstm', 'transformer models',
                'generative ai', 'gan', 'diffusion models'
            ],
            'nlp': [
                'natural language processing', 'nlp', 'language models', 'llm', 'chatbots',
                'sentiment analysis', 'named entity recognition', 'text classification'
            ],
            'computer_vision': [
                'computer vision', 'image recognition', 'object detection', 'facial recognition',
                'image segmentation', 'video analysis'
            ],
            'robotics': [
                'robotics', 'robots', 'automation', 'robotic process automation', 'rpa',
                'autonomous systems', 'drones', 'self-driving'
            ],
            'voice_ai': [
                'voice assistant', 'speech recognition', 'text to speech', 'voice synthesis',
                'voice computing', 'speech to text'
            ],
            'ai_infrastructure': [
                'gpu', 'tpu', 'cloud computing', 'ai chips', 'neural processors',
                'edge ai', 'ai hardware', 'quantum computing'
            ]
        }
        
        # Model names
        ai_models = [
            'gpt', 'chatgpt', 'gpt-4', 'gpt-3', 'llama', 'gemini', 'claude', 'bert', 
            'stable diffusion', 'dall-e', 'midjourney', 'bard', 'palm', 'chinchilla'
        ]
        
        # Work on a copy so the caller's DataFrame is not modified as a side effect
        df_tech = df.copy()
        
        # Technology identification
        df_tech['ai_technologies'] = df_tech['cleaned_text'].apply(
            lambda x: self._identify_technologies(x, technologies, ai_models)
        )
        
        self._save_to_cache(df_tech, cache_file)
        
        return df_tech
    
def _identify_technologies(self, 
                              text: str, 
                              technologies: Dict[str, List[str]], 
                              ai_models: List[str]) -> Dict[str, List[str]]:
        """Identify AI technologies mentioned in the text"""
        if not text or pd.isna(text):
            return {}
        
        text_lower = text.lower()
        
        found_techs = {}
        # Technology categories
        for tech_category, keywords in technologies.items():
            matched_keywords = [k for k in keywords if k in text_lower]
            if matched_keywords:
                found_techs[tech_category] = matched_keywords
        
        # AI models
        found_models = [model for model in ai_models if model.lower() in text_lower]
        if found_models:
            found_techs['specific_models'] = found_models
        
        return found_techs


### Sentiment Analysis

In [18]:
def analyze_sentiment(self, 
                         df: pd.DataFrame, 
                         sample_size: int = 10000,
                         force_recompute: bool = False) -> pd.DataFrame:
        """
        Analyze sentiment of articles regarding AI impact.

        The result is pickled under a file name derived from ``sample_size``
        only, so a cached result is returned whatever ``df`` is passed in.
        
        Args:
            df: DataFrame with a ``cleaned_text`` column and, optionally, a
                ``detected_industries`` column of list-like values; the first
                listed industry gives the sentiment its industry context.
                A missing column or an empty/missing value means no industry
                context is used for that article.
            sample_size: Number of articles to process; a falsy value, or one
                not smaller than ``len(df)``, processes every row
            force_recompute: Whether to ignore any cached result
            
        Returns:
            Sample/copy of ``df`` with ``sentiment_scores`` (dict per article)
            and ``sentiment_overall``, ``sentiment_ai_impact``,
            ``sentiment_industry`` columns
        """
        cache_file = f"sentiment_sample_{sample_size}.pkl"
        
        # Cached results
        if not force_recompute:
            df_sentiment = self._load_from_cache(cache_file)
            if df_sentiment is not None:
                print(f"Loaded sentiment data for {len(df_sentiment)} articles from cache")
                return df_sentiment
        
        if sample_size and sample_size < len(df):
            print(f"Taking sample of {sample_size} articles for sentiment analysis")
            sample_df = df.sample(sample_size, random_state=42)
        else:
            sample_df = df.copy()
        
        print("Analyzing sentiment...")
        
        def primary_industry(row):
            # First detected industry, or None when the column is absent or the
            # value is empty / None / NaN (previously a KeyError or TypeError)
            industries = row.get('detected_industries')
            if isinstance(industries, str):
                return industries or None
            try:
                return industries[0] if len(industries) > 0 else None
            except TypeError:
                return None
        
        # Sentiment analysis
        sample_df['sentiment_scores'] = sample_df.apply(
            lambda row: self._analyze_contextual_sentiment(
                row['cleaned_text'], 
                primary_industry(row)
            ), 
            axis=1
        )
        
        # Extracting the components of sentiment
        for column, key in (('sentiment_overall', 'overall'),
                            ('sentiment_ai_impact', 'ai_impact'),
                            ('sentiment_industry', 'industry_context')):
            sample_df[column] = sample_df['sentiment_scores'].apply(lambda scores, key=key: scores[key])
        
        self._save_to_cache(sample_df, cache_file)
        
        return sample_df
    
def _analyze_contextual_sentiment(self, 
                                     text: str, 
                                     industry: Optional[str] = None) -> Dict[str, float]:
        """
        Analyze sentiment with context awareness for AI impact
        
        Args:
            text: Article text
            industry: Specific industry to contextualize sentiment
            
        Returns:
            Dictionary with different sentiment scores
        """
        if not text or pd.isna(text):
            return {'overall': 0, 'ai_impact': 0, 'industry_context': 0}
        
        # Base sentiment with TextBlob
        base_sentiment = TextBlob(text).sentiment.polarity
        
        # AI impact sentiment using my custom dictionaries
        text_lower = text.lower()
        
        # Occurrences and getting the weighted sentiment for domain-specific terms
        positive_sentiment = sum(
            value * text_lower.count(term) 
            for term, value in self.positive_terms.items()
        )
        
        negative_sentiment = sum(
            value * text_lower.count(term) 
            for term, value in self.negative_terms.items()
        )
        
        # Domain-specific sentiment
        ai_impact_mentions = sum(text_lower.count(term) for term in self.positive_terms) + \
                             sum(text_lower.count(term) for term in self.negative_terms)
        
        if ai_impact_mentions > 0:
            ai_impact_sentiment = (positive_sentiment + negative_sentiment) / ai_impact_mentions
        else:
            ai_impact_sentiment = 0
        
        # Industry context sentiment
        industry_context_sentiment = 0
        if industry:
            # Industry-specific terms
            industry_terms = []
            
            if industry == 'healthcare':
                industry_terms = ['patient', 'doctor', 'nurse', 'hospital', 'medical', 'diagnosis', 'treatment']
            elif industry == 'finance':
                industry_terms = ['bank', 'investment', 'trading', 'financial', 'insurance', 'loan']
            elif industry == 'manufacturing':
                industry_terms = ['factory', 'production', 'assembly', 'industrial', 'manufacturing']
            elif industry == 'technology':
                industry_terms = ['software', 'hardware', 'startup', 'tech', 'computing', 'digital']
            elif industry == 'education':
                industry_terms = ['student', 'teacher', 'school', 'learning', 'education', 'university']
            
            if industry_terms:
                # Finding sentences containing industry terms
                sentences = text.split('.')
                industry_sentences = [s for s in sentences if any(term in s.lower() for term in industry_terms)]
                
                if industry_sentences:
                    # Sentiment for industry-specific sentences
                    industry_sentiment = np.mean([TextBlob(s).sentiment.polarity for s in industry_sentences])
                    industry_context_sentiment = industry_sentiment
        
        # Sentiment is a weighted average of the base sentiment and AI impact sentiment
        overall_sentiment = 0.4 * base_sentiment + 0.6 * ai_impact_sentiment
        
        return {
            'overall': overall_sentiment,
            'ai_impact': ai_impact_sentiment,
            'industry_context': industry_context_sentiment
        }

### Topic Modeling

In [19]:
def run_topic_modeling(self, 
                          df: pd.DataFrame,
                          num_topics: int = 10,
                          force_recompute: bool = False) -> Tuple[Any, np.ndarray]:
        """
        Run BERTopic topic modeling on articles.

        The fitted model and the topic assignments are pickled together under
        a file name derived from ``num_topics`` only, so a cached result is
        returned whatever ``df`` is passed in.
        
        Args:
            df: DataFrame with a ``cleaned_text`` column
            num_topics: Number of topics to reduce the model to (``nr_topics``)
            force_recompute: Whether to ignore any cached result
            
        Returns:
            Tuple of (topic_model, document_topics), where ``document_topics``
            holds one topic id per document in the order of ``df``
        """
        cache_file = f"topic_model_{num_topics}.pkl"
        
        # Cached results
        if not force_recompute:
            cached_data = self._load_from_cache(cache_file)
            if cached_data is not None:
                print("Loaded topic model from cache")
                return cached_data
        
        print("Running topic modeling...")
        
        documents = df['cleaned_text'].tolist()
        
        # Embedding
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        
        # Vectorizer with unigrams, bigrams and trigrams
        vectorizer_model = CountVectorizer(
            stop_words="english", 
            ngram_range=(1, 3),
            min_df=5,
            max_df=0.7
        )
        
        # BERTopic model
        topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            nr_topics=num_topics
        )
        
        topics, probs = topic_model.fit_transform(documents)
        
        # Refresh the topic representations. `topics` is passed by keyword so the
        # call does not depend on the positional order of update_topics'
        # parameters, and the configured vectorizer is passed explicitly so its
        # stop-word, n-gram and document-frequency settings are kept rather than
        # replaced by a default vectorizer built from n_gram_range alone.
        topic_model.update_topics(documents, topics=topics, vectorizer_model=vectorizer_model)
        
        self._save_to_cache((topic_model, topics), cache_file)
        
        return topic_model, topics

In [20]:
def create_visualizations(self, df: pd.DataFrame) -> Dict[str, plt.Figure]:
        """Build every available figure for ``df`` and return them keyed by name.

        The industry heatmap, sentiment timeline, technology timeline and job
        impact network are always created. The organization strategy comparison
        needs both ``top_organizations`` and ``ai_technologies`` columns, and
        the topic timeline needs ``self.topics`` to hold one entry per row.
        """
        # The three unconditional figures, built in this order
        figures = {
            'industry_heatmap': self._create_industry_impact_heatmap(df),
            'sentiment_timeline': self._create_sentiment_timeline(df),
            'technology_timeline': self._create_technology_timeline(df),
        }
        
        # AI organization strategy comparison (only with both input columns)
        org_inputs = ('top_organizations', 'ai_technologies')
        if all(column in df.columns for column in org_inputs):
            figures['org_strategy'] = self._create_organization_ai_strategy_comparison(df)
        
        # Job impact network
        figures['job_impact'] = self._create_tech_job_impact_network(df)
        
        # Topic distribution (only when topic assignments line up with df)
        topics_match = hasattr(self, 'topics') and len(self.topics) == len(df)
        if topics_match:
            figures['topic_timeline'] = self._plot_topics_over_time(df, self.topics)
        
        return figures
    
def _create_industry_impact_heatmap(self, df: pd.DataFrame) -> plt.Figure:
        """Create a heatmap of mean AI sentiment per industry and year.

        Args:
            df: Articles with ``detected_industries`` (one list-like per row),
                a datetime ``date`` column and ``sentiment_ai_impact`` or,
                as a fallback, ``sentiment_overall``

        Returns:
            The matplotlib Figure containing the heatmap

        Raises:
            KeyError: If a required column is missing
        """
        top_n_industries = 10
        
        # One row per (article, industry); articles without an industry drop out
        exploded_df = df.explode('detected_industries').dropna(subset=['detected_industries'])
        
        # Sentiment scores: prefer the AI-impact score, else the overall score
        if 'sentiment_ai_impact' in exploded_df.columns:
            exploded_df['sentiment_score'] = exploded_df['sentiment_ai_impact']
        else:
            exploded_df['sentiment_score'] = exploded_df['sentiment_overall']
        
        # Most frequently mentioned industries
        industry_counts = exploded_df['detected_industries'].value_counts()
        top_industries = industry_counts.head(top_n_industries).index.tolist()
        
        # Explicit copy so adding the year column below does not write into a
        # filtered view of exploded_df (chained-assignment hazard)
        industry_df = exploded_df[exploded_df['detected_industries'].isin(top_industries)].copy()
        
        # Grouping by industry and year
        industry_df['year'] = industry_df['date'].dt.year
        industry_year_sentiment = industry_df.groupby(['detected_industries', 'year'])['sentiment_score'].mean().reset_index()
        
        pivot_df = industry_year_sentiment.pivot(index='detected_industries', columns='year', values='sentiment_score')
        
        # Heatmap drawn on an explicit figure/axes pair rather than pyplot's
        # implicit "current figure"
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(pivot_df, annot=True, cmap='RdYlGn', center=0, fmt='.2f', ax=ax)
        ax.set_title('AI Impact Sentiment by Industry and Year')
        ax.set_ylabel('Industry')
        ax.set_xlabel('Year')
        fig.tight_layout()
        
        return fig

In [21]:
def _create_sentiment_timeline(self, df: pd.DataFrame, key_events: Optional[List[Dict[str, Any]]] = None) -> plt.Figure:
        """Create a monthly mean-sentiment timeline annotated with key events.

        The sentiment column is the first one present among
        ``sentiment_ai_impact``, ``sentiment_overall`` and ``sentiment``.
        ``df`` is not modified (no helper column is written back to it).

        Args:
            df: Articles with a datetime-like ``date`` column (read through the
                ``.dt`` accessor) and one of the sentiment columns above.
            key_events: Optional list of dicts with keys ``date`` (``'YYYY-MM'``),
                ``event`` (label text) and optionally ``y_pos`` (vertical offset
                of the label from the curve). Defaults to the built-in list below.

        Returns:
            The timeline figure, or an empty figure when no sentiment column
            exists.
        """
        # Pick the most specific sentiment column available
        sentiment_col = next(
            (col for col in ('sentiment_ai_impact', 'sentiment_overall', 'sentiment')
             if col in df.columns),
            None,
        )
        if not sentiment_col:
            print("No sentiment column found for timeline")
            return plt.figure()

        if key_events is None:
            # NOTE(review): these dates/labels are hard-coded and are not derived
            # from the data; confirm them before presenting the chart.
            key_events = [
                {'date': '2023-11', 'event': 'GPT-4 Release', 'y_pos': 0.1},
                {'date': '2024-03', 'event': 'AI Regulation Act', 'y_pos': -0.1},
                {'date': '2024-06', 'event': 'Major AI Job Study', 'y_pos': 0.2},
                {'date': '2024-09', 'event': 'New NLP Breakthrough', 'y_pos': -0.2}
            ]

        # Group on a derived key rather than adding a column to the caller's frame
        yearmonth = df['date'].dt.strftime('%Y-%m').rename('yearmonth')
        monthly_sentiment = df.groupby(yearmonth)[sentiment_col].mean().reset_index()

        # Real datetimes so the x axis is ordered and spaced chronologically
        monthly_sentiment['date'] = pd.to_datetime(monthly_sentiment['yearmonth'] + '-01')
        monthly_sentiment = monthly_sentiment.sort_values('date').reset_index(drop=True)

        fig, ax = plt.subplots(figsize=(15, 7))

        # Sentiment line plus a neutral reference line at zero
        ax.plot(monthly_sentiment['date'], monthly_sentiment[sentiment_col],
                marker='o', linestyle='-', color='blue', alpha=0.7)
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

        # Event annotations; with no monthly data there is nothing to anchor them to
        if not monthly_sentiment.empty:
            first_month = monthly_sentiment['date'].min()
            last_month = monthly_sentiment['date'].max()

            for event in key_events:
                event_date = pd.to_datetime(event['date'] + '-01')

                # Skip events outside the plotted date range
                if event_date < first_month or event_date > last_month:
                    continue

                # Anchor the arrow on the sentiment of the closest plotted month
                closest_idx = (monthly_sentiment['date'] - event_date).abs().idxmin()
                event_sentiment = monthly_sentiment.loc[closest_idx, sentiment_col]
                if pd.isna(event_sentiment):
                    continue  # no finite point to attach the annotation to

                ax.annotate(event['event'],
                            xy=(event_date, event_sentiment),
                            xytext=(event_date, event_sentiment + event.get('y_pos', 0.1)),
                            arrowprops=dict(arrowstyle='->', color='red'),
                            fontsize=10)

        ax.set_title('AI Sentiment Timeline with Key Events')
        ax.set_xlabel('Date')
        ax.set_ylabel('Sentiment Score (-1 to 1)')
        ax.grid(True, linestyle='--', alpha=0.5)
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()

        return fig

In [None]:
def _create_technology_timeline(self, df: pd.DataFrame) -> plt.Figure:
        """Create a timeline of cumulative AI-technology mentions per category.

        Every row's ``ai_technologies`` value is read as a mapping from technology
        category to its mentioned items. Each item of a list/tuple/set counts as
        one mention; any other value counts as a single mention. Rows whose value
        is not a dict (e.g. missing values) contribute nothing.

        Args:
            df: Articles with ``date`` and ``ai_technologies`` columns.

        Returns:
            A line chart of cumulative monthly mentions per technology category,
            or an empty figure when the column is absent or no mentions exist.
        """
        if 'ai_technologies' not in df.columns:
            print("No technology data found for timeline")
            return plt.figure()

        # One record per mention; zipping the two columns avoids the per-row
        # Series construction that iterrows() performs
        tech_rows = []
        for date, techs in zip(df['date'], df['ai_technologies']):
            if not isinstance(techs, dict):
                continue  # None/NaN cells have no .items() and carry no mentions

            for tech_category, tech_items in techs.items():
                if isinstance(tech_items, (list, tuple, set)):
                    mentions = len(tech_items)
                else:
                    mentions = 1
                tech_rows.extend(
                    {'date': date, 'tech_category': tech_category} for _ in range(mentions)
                )

        if not tech_rows:
            return plt.figure()

        tech_df = pd.DataFrame(tech_rows)
        tech_df['date'] = pd.to_datetime(tech_df['date'])

        # Mentions per month and technology category
        tech_df['yearmonth'] = tech_df['date'].dt.strftime('%Y-%m')
        monthly_tech = tech_df.groupby(['yearmonth', 'tech_category']).size().reset_index(name='count')

        # Real datetimes so months sort chronologically
        monthly_tech['date'] = pd.to_datetime(monthly_tech['yearmonth'] + '-01')
        monthly_tech = monthly_tech.sort_values('date')

        # Running total per category
        monthly_tech['cumulative'] = monthly_tech.groupby('tech_category')['count'].cumsum()

        # Months without a mention carry the previous total forward; months before
        # a category's first mention are zero
        pivot_df = (
            monthly_tech
            .pivot(index='date', columns='tech_category', values='cumulative')
            .ffill()
            .fillna(0)
        )

        # DataFrame.plot raises when there is no data to draw
        if pivot_df.empty:
            return plt.figure()

        # Draw on an explicit axes so no second, empty figure is left open
        fig, ax = plt.subplots(figsize=(14, 7))
        pivot_df.plot(kind='line', ax=ax)
        ax.set_title('AI Technology Adoption Over Time')
        ax.set_xlabel('Date')
        ax.set_ylabel('Cumulative Mentions')
        ax.legend(title='Technology', loc='upper left', bbox_to_anchor=(1, 1))
        ax.grid(True, linestyle='--', alpha=0.7)
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()

        return fig
    
def _create_organization_ai_strategy_comparison(self, df: pd.DataFrame, top_n_orgs: int = 10, top_n_techs: int = 7) -> plt.Figure:
        """Compare organizations by sentiment and technology focus in a bubble chart.

        Articles are expanded to one row per entry in ``top_organizations``. For
        the ``top_n_orgs`` most mentioned organizations, every element obtained by
        iterating the row's ``ai_technologies`` value (the keys, when it is a
        dict) is paired with the article sentiment. The chart shows, for the
        ``top_n_techs`` most frequent technologies, the mean sentiment (y) per
        organization (x) with bubble area proportional to the number of pairs.

        Sentiment is read from ``sentiment_overall`` if present, otherwise from
        ``sentiment``; when neither column exists it is taken as 0.

        Args:
            df: Articles with ``top_organizations`` and ``ai_technologies``.
            top_n_orgs: Number of most-mentioned organizations to include.
            top_n_techs: Number of most frequent technologies to draw.

        Returns:
            The bubble chart, or an empty figure when required columns are
            missing or no organization/technology pairs are found.
        """
        if 'top_organizations' not in df.columns or 'ai_technologies' not in df.columns:
            print("Required columns missing for organization comparison")
            return plt.figure()

        def has_items(value) -> bool:
            # len() avoids the ambiguous truth value of numpy arrays; scalars such
            # as None/NaN have no len() and are treated as "no data"
            try:
                return len(value) > 0
            except TypeError:
                return False

        # One row per (article, organization), restricted to the top organizations
        org_df = df.explode('top_organizations').dropna(subset=['top_organizations'])
        top_orgs = org_df['top_organizations'].value_counts().head(top_n_orgs).index.tolist()
        top_org_df = org_df[org_df['top_organizations'].isin(top_orgs)]

        # Resolve the sentiment source once instead of once per row
        if 'sentiment_overall' in top_org_df.columns:
            sentiments = top_org_df['sentiment_overall']
        elif 'sentiment' in top_org_df.columns:
            sentiments = top_org_df['sentiment']
        else:
            sentiments = [0] * len(top_org_df)

        # One record per (organization, technology) co-occurrence
        org_tech_data = []
        for org, techs, sentiment in zip(top_org_df['top_organizations'],
                                         top_org_df['ai_technologies'],
                                         sentiments):
            if not has_items(techs):
                continue

            for tech in techs:
                org_tech_data.append({
                    'organization': org,
                    'technology': tech,
                    'sentiment': sentiment,
                    'count': 1
                })

        if not org_tech_data:
            return plt.figure()

        org_tech_df = pd.DataFrame(org_tech_data)

        # Mean sentiment and number of co-occurrences per (organization, technology)
        org_tech_summary = org_tech_df.groupby(['organization', 'technology']).agg(
            sentiment=('sentiment', 'mean'),
            count=('count', 'sum')
        ).reset_index()

        # Restrict the chart to the most frequent technologies
        top_techs = org_tech_df['technology'].value_counts().head(top_n_techs).index.tolist()
        plot_data = org_tech_summary[org_tech_summary['technology'].isin(top_techs)]

        fig, ax = plt.subplots(figsize=(15, 10))

        # One scatter series per technology so each gets its own colour and legend entry
        for tech in top_techs:
            tech_data = plot_data[plot_data['technology'] == tech]
            ax.scatter(tech_data['organization'], tech_data['sentiment'],
                       s=tech_data['count'] * 50, alpha=0.7, label=tech)

        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
        ax.set_title('Organization AI Strategy Comparison')
        ax.set_xlabel('Organization')
        ax.set_ylabel('Sentiment Score (-1 to 1)')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title='Technology Focus', loc='upper left', bbox_to_anchor=(1, 1))
        ax.grid(True, alpha=0.3)
        fig.tight_layout()

        return fig
    
    def _create_tech_job_impact_network(self, df: pd.DataFrame, min_mentions: int = 2) -> plt.Figure:
        """Create a network linking AI technology categories to job roles.

        For every article, each element obtained by iterating ``ai_technologies``
        (the keys, when it is a dict) is connected to each entry of
        ``detected_jobs``. Every edge stores ``count`` (number of articles with
        that pair) and ``weight`` (mean sentiment of those articles). Sentiment is
        read from ``sentiment_overall`` if present, otherwise ``sentiment``;
        without either column it is 0. Missing (NaN/None) sentiment values are
        left out of the mean so one unscored article cannot turn an edge's weight
        into NaN; an edge with no scored article keeps the neutral weight 0.

        Args:
            df: Articles with ``ai_technologies`` and ``detected_jobs`` columns.
            min_mentions: Minimum ``count`` for an edge to be drawn. If no edge
                reaches it, all edges are drawn.

        Returns:
            The network figure, or an empty figure when required columns are
            missing or no technology/job pair exists.
        """
        if 'ai_technologies' not in df.columns or 'detected_jobs' not in df.columns:
            print("Required columns missing for job impact network")
            return plt.figure()

        def has_items(value) -> bool:
            # len() avoids the ambiguous truth value of numpy arrays; scalars such
            # as None/NaN have no len() and are treated as "no data"
            try:
                return len(value) > 0
            except TypeError:
                return False

        # Resolve the sentiment source once instead of once per row
        if 'sentiment_overall' in df.columns:
            sentiments = df['sentiment_overall']
        elif 'sentiment' in df.columns:
            sentiments = df['sentiment']
        else:
            sentiments = [0] * len(df)  # default neutral

        G = nx.Graph()

        for techs, jobs, sentiment in zip(df['ai_technologies'], df['detected_jobs'], sentiments):
            if not has_items(techs) or not has_items(jobs):
                continue

            has_score = not pd.isna(sentiment)

            for tech_category in techs:
                if not G.has_node(tech_category):
                    G.add_node(tech_category, type='technology')

                for job in jobs:
                    if not G.has_node(job):
                        G.add_node(job, type='job')

                    if G.has_edge(tech_category, job):
                        edge = G[tech_category][job]
                        edge['count'] += 1
                    else:
                        # 'scored' tracks how many sentiment values feed 'weight'
                        G.add_edge(tech_category, job, weight=0.0, count=1, scored=0)
                        edge = G[tech_category][job]

                    if has_score:
                        # Incremental mean over the scored articles only
                        edge['weight'] = (edge['weight'] * edge['scored'] + sentiment) / (edge['scored'] + 1)
                        edge['scored'] += 1

        if G.number_of_edges() == 0:
            return plt.figure()

        # Keep only relationships mentioned often enough; fall back to everything
        # rather than drawing an empty graph
        significant_edges = [(u, v) for u, v, d in G.edges(data=True) if d['count'] >= min_mentions]
        if not significant_edges:
            significant_edges = list(G.edges())

        SG = G.edge_subgraph(significant_edges).copy()

        # Fixed seed keeps the layout reproducible between runs
        pos = nx.spring_layout(SG, seed=42)

        fig, ax = plt.subplots(figsize=(15, 10))

        # Technologies and jobs get different colours; node size scales with degree
        tech_nodes = [n for n, d in SG.nodes(data=True) if d.get('type') == 'technology']
        job_nodes = [n for n, d in SG.nodes(data=True) if d.get('type') == 'job']
        tech_sizes = [SG.degree(n) * 50 for n in tech_nodes]
        job_sizes = [SG.degree(n) * 50 for n in job_nodes]

        nx.draw_networkx_nodes(SG, pos, nodelist=tech_nodes, node_color='lightblue',
                               node_size=tech_sizes, alpha=0.8, ax=ax)
        nx.draw_networkx_nodes(SG, pos, nodelist=job_nodes, node_color='lightgreen',
                               node_size=job_sizes, alpha=0.8, ax=ax)

        # Edge colour encodes mean sentiment (green > 0.2, red < -0.2, gray in
        # between); edge width grows with the number of mentions
        edge_colors = []
        edge_widths = []
        for _, _, d in SG.edges(data=True):
            if d['weight'] > 0.2:
                edge_colors.append('green')
            elif d['weight'] < -0.2:
                edge_colors.append('red')
            else:
                edge_colors.append('gray')
            edge_widths.append(1 + d['count'] * 0.5)

        nx.draw_networkx_edges(SG, pos, width=edge_widths, edge_color=edge_colors, alpha=0.6, ax=ax)
        nx.draw_networkx_labels(SG, pos, font_size=10, ax=ax)

        ax.set_title('AI Technology Impact on Job Roles')
        ax.axis('off')
        fig.tight_layout()

        return fig
    
    def _plot_topics_over_time(self, df: pd.DataFrame, topics: np.ndarray, top_n_topics: int = 5) -> plt.Figure:
        """Plot monthly article counts for the most frequent topics.

        Topic ``-1`` is treated as the outlier topic and is excluded *before* the
        top ``top_n_topics`` are selected, so the chart shows up to
        ``top_n_topics`` real topics even when ``-1`` is among the most frequent
        assignments.

        Args:
            df: Articles with a datetime-like ``date`` column (read through the
                ``.dt`` accessor).
            topics: Topic id per article, same length and order as ``df``.
            top_n_topics: Maximum number of topics to plot.

        Returns:
            A line chart of article counts per month and topic, or an empty
            figure when there is nothing to plot.
        """
        topic_df = pd.DataFrame({'date': df['date'], 'topic': topics})

        # Rank topics by frequency, ignoring the outlier topic up front
        topic_counts = Counter(topic for topic in topics if topic != -1)
        top_topics = [topic for topic, _ in topic_counts.most_common(top_n_topics)]

        if not top_topics:
            print("No topics available for timeline")
            return plt.figure()

        # .copy() so the column assignment below does not write into a slice
        topic_df = topic_df[topic_df['topic'].isin(top_topics)].copy()
        topic_df['yearmonth'] = topic_df['date'].dt.strftime('%Y-%m')

        # Articles per month and topic
        topic_time = topic_df.groupby(['yearmonth', 'topic']).size().reset_index(name='count')

        # Real datetimes so months sort and plot chronologically
        topic_time['date'] = pd.to_datetime(topic_time['yearmonth'] + '-01')
        topic_time = topic_time.sort_values('date')

        pivot_df = topic_time.pivot(index='date', columns='topic', values='count').fillna(0)

        # DataFrame.plot raises when there is no data to draw
        if pivot_df.empty:
            print("No topics available for timeline")
            return plt.figure()

        # Readable legend entries built from each topic's top three words, when a
        # fitted topic model is attached to this object
        topic_labels = {}
        if hasattr(self, 'topic_model'):
            for topic in top_topics:
                words = self.topic_model.get_topic(topic)
                if words:
                    label = ', '.join([word for word, _ in words[:3]])
                    topic_labels[topic] = f"Topic {topic}: {label}"
                else:
                    topic_labels[topic] = f"Topic {topic}"

        if topic_labels:
            pivot_df = pivot_df.rename(columns=topic_labels)

        # Draw on an explicit axes so no second, empty figure is left open
        fig, ax = plt.subplots(figsize=(12, 6))
        pivot_df.plot(kind='line', ax=ax)
        ax.set_title('Top Topics Over Time')
        ax.set_xlabel('Date')
        ax.set_ylabel('Article Count')
        ax.legend(title='Topic', loc='upper left', bbox_to_anchor=(1, 1))
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()

        return fig
