In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import re
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
# Set random seed for reproducibility
# (this seeds NumPy's legacy global RNG only; no other random source is seeded here)
np.random.seed(42)

# Create cache directory if it doesn't exist
# NOTE: "cache" is a relative path, so it resolves against the kernel's current
# working directory; the absolute location is printed below for reference.
cache_dir = "cache"
os.makedirs(cache_dir, exist_ok=True)
print(f"Using cache directory: {os.path.abspath(cache_dir)}")

def get_cache_path(filename):
    """
    Build the path of a file inside the cache directory

    Args:
        filename: Name of the cache file (relative to the cache directory)

    Returns:
        The module-level ``cache_dir`` joined with ``filename``. The file is
        not required to exist.
    """
    full_path = os.path.join(cache_dir, filename)
    return full_path

def save_to_cache(obj, filename):
    """
    Pickle an object into the cache directory

    Args:
        obj: Any picklable object
        filename: Name of the cache file to write (overwritten if present)

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    target_path = get_cache_path(filename)
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle)
    print(f"Saved {filename} to cache")

def load_from_cache(filename):
    """
    Load a pickled object from the cache directory

    Args:
        filename: Name of the cache file to read

    Returns:
        The unpickled object, or None when no file exists at the cache path
    """
    location = get_cache_path(filename)

    # Nothing cached under this name: signal it with None instead of raising
    if not os.path.exists(location):
        return None

    # NOTE: unpickling executes code embedded in the file, so only load cache
    # files that this project wrote itself.
    with open(location, 'rb') as handle:
        return pickle.load(handle)

Using cache directory: /Users/casey/Documents/GitHub/AI_impact_employment/cache


In [9]:
# Dictionaries for analysis
def create_dictionaries():
    """
    Build every lookup dictionary used by the feature-extraction steps

    The five ``create_*_dictionaries`` builders are looked up when this
    function is called, not when it is defined; in this notebook they are
    imported from the ``Visuals`` module, so that import must have succeeded
    before the first call.

    Returns:
        Dictionary with the keys 'sentiment', 'industry', 'job', 'technology'
        and 'impact', each holding the result of the matching builder
    """
    print("Creating dictionaries for analysis...")

    # Builders run top to bottom, in the same order as the keys are listed
    return {
        'sentiment': create_sentiment_dictionaries(),
        'industry': create_industry_dictionaries(),
        'job': create_job_dictionaries(),
        'technology': create_technology_dictionaries(),
        'impact': create_impact_dictionaries(),
    }

from Visuals import (
    create_sentiment_dictionaries,
    create_industry_dictionaries,
    create_job_dictionaries,
    create_technology_dictionaries,
    create_impact_dictionaries
)

ModuleNotFoundError: No module named 'Visuals'

In [5]:
# Feature Extraction
def detect_industries(text, industry_terms, industry_term_weights=None):
    """
    Rank the industries whose terms occur in an article

    Matching is a case-insensitive substring count on the lowercased text
    (terms themselves are used as given). Each term contributes
    ``occurrences * term weight * length weight`` to its industry, where the
    length weight is ``0.5 + len(term) / 20`` capped at 1.0, so terms shorter
    than ten characters count for less.

    Args:
        text: Article text
        industry_terms: Dictionary mapping industry -> iterable of terms
        industry_term_weights: Optional dictionary mapping
            industry -> {term: weight}; terms not listed use a weight of 1.0

    Returns:
        List of industries with at least one match, highest score first
        (ties keep the order of ``industry_terms``)
    """
    if not text or pd.isna(text):
        return []

    haystack = text.lower()
    totals = defaultdict(float)

    for industry, keywords in industry_terms.items():
        for keyword in keywords:
            hits = haystack.count(keyword)
            if hits <= 0:
                continue

            # Use the term-specific weight when one is configured
            if (industry_term_weights
                    and industry in industry_term_weights
                    and keyword in industry_term_weights[industry]):
                term_weight = industry_term_weights[industry][keyword]
            else:
                term_weight = 1.0

            # Longer terms are treated as more specific
            specificity = min(1.0, 0.5 + len(keyword) / 20.0)

            totals[industry] += hits * term_weight * specificity

    # Industry names only, ordered by descending score
    return sorted(totals, key=totals.get, reverse=True)

def detect_jobs(text, job_terms):
    """
    Rank the job categories whose terms occur in an article

    Matching is a case-insensitive substring count on the lowercased text
    (terms themselves are used as given). Each term contributes
    ``occurrences * length weight`` to its category, where the length weight
    is ``0.5 + len(term) / 20`` capped at 1.0.

    Args:
        text: Article text
        job_terms: Dictionary mapping job category -> iterable of terms

    Returns:
        List of job categories with at least one match, highest score first
        (ties keep the order of ``job_terms``)
    """
    if not text or pd.isna(text):
        return []

    haystack = text.lower()
    totals = defaultdict(float)

    for job_category, keywords in job_terms.items():
        for keyword in keywords:
            hits = haystack.count(keyword)
            if hits <= 0:
                continue

            # Longer terms are treated as more specific
            specificity = min(1.0, 0.5 + len(keyword) / 20.0)
            totals[job_category] += hits * specificity

    # Category names only, ordered by descending score
    return sorted(totals, key=totals.get, reverse=True)

def identify_technologies(text, technology_terms, ai_models):
    """
    Find the AI technologies and named models mentioned in an article

    Keywords are matched as substrings of the lowercased text (keywords are
    used as given); model names are lowercased before matching and reported
    in their original spelling.

    Args:
        text: Article text
        technology_terms: Dictionary mapping technology category -> keywords
        ai_models: List of AI model names

    Returns:
        Dictionary mapping each matched category to its matched keywords
        (longest first), plus a 'specific_models' entry listing matched
        model names when there are any. Empty dictionary for missing text.
    """
    if not text or pd.isna(text):
        return {}

    haystack = text.lower()
    results = {}

    # Technology categories
    for category, keywords in technology_terms.items():
        # Longest keywords first: longer terms are typically more specific
        hits = sorted((kw for kw in keywords if kw in haystack), key=len, reverse=True)
        if hits:
            results[category] = hits

    # AI models
    mentioned_models = []
    for model_name in ai_models:
        if model_name.lower() in haystack:
            mentioned_models.append(model_name)
    if mentioned_models:
        results['specific_models'] = mentioned_models

    return results

# Known major AI companies and organizations, in reporting-priority order
_KNOWN_ORGS = (
    'OpenAI', 'Google', 'Microsoft', 'Apple', 'Amazon', 'Meta', 'Facebook',
    'IBM', 'Anthropic', 'Cohere', 'NVIDIA', 'Intel', 'AMD', 'Baidu', 'Alibaba',
    'Tencent', 'Samsung', 'Tesla', 'DeepMind', 'Huawei', 'Oracle', 'SAP',
    'Salesforce', 'Adobe', 'Cisco', 'McKinsey', 'Accenture', 'Deloitte', 'PwC',
    'KPMG', 'MIT', 'Stanford', 'Harvard', 'Berkeley', 'Carnegie Mellon',
    'Oxford', 'Cambridge', 'ETH Zurich', 'Max Planck', 'DeepL', 'Stability AI'
)

# Single case-insensitive pattern that matches any known name as a whole word.
# The word boundaries stop "Intel" firing on "intelligence", "MIT" on "limit",
# "Meta" on "metadata", "SAP" on "asap", and so on.
_KNOWN_ORG_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(name) for name in _KNOWN_ORGS) + r')\b',
    re.IGNORECASE,
)

# Simple pattern for potential organizations (very simplified): a run of
# capitalized words, optionally followed by a corporate suffix
_ORG_CANDIDATE_RE = re.compile(
    r'\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)*(?:\s(?:Inc|LLC|Ltd|Corp|Corporation|Company|Technologies|AI|Labs))?)\b'
)

# Caps applied to the result
_MAX_HEURISTIC_ORGS = 5   # heuristic candidates are noisy, keep only a few
_MAX_ORGS = 10            # overall limit on returned names


def extract_organizations(text):
    """
    Extract organization names using simple heuristics
    (Note: This is a simplified version without spaCy for faster processing)

    Two sources are combined:
      1. A fixed list of well-known organizations, matched case-insensitively
         as whole words and reported in their canonical spelling, in list order.
      2. Multi-word capitalized phrases found by a regular expression, in order
         of first appearance, without repeats and without names already
         reported from the fixed list (at most 5 of these are kept).

    Args:
        text: Article text. Anything that is not a non-empty string
            (None, NaN, pd.NA, numbers, '') yields an empty result.

    Returns:
        List of at most 10 potential organization names, known organizations
        first
    """
    if not isinstance(text, str) or not text:
        return []

    # Known organizations: scan the text once, then report in list order
    mentioned = {match.group(0).lower() for match in _KNOWN_ORG_RE.finditer(text)}
    found_orgs = [org for org in _KNOWN_ORGS if org.lower() in mentioned]

    # Heuristic candidates: multi-word phrases only, each reported once
    seen = set(found_orgs)
    additional_orgs = []
    for match in _ORG_CANDIDATE_RE.finditer(text):
        candidate = match.group(1)
        if len(candidate.split()) > 1 and candidate not in seen:
            seen.add(candidate)
            additional_orgs.append(candidate)
            if len(additional_orgs) == _MAX_HEURISTIC_ORGS:
                break  # limit reached, no need to scan the rest of the text

    # Combine the results, prioritizing known organizations
    return (found_orgs + additional_orgs)[:_MAX_ORGS]

def analyze_sentiment(text, positive_terms, negative_terms, industry=None):
    """
    Analyze sentiment of the article regarding AI impact with domain-specific lexicon

    The overall score is a fixed-weight blend of four components:
      - base (0.2): TextBlob polarity of the full text
      - lexicon (0.4): sum of the weights of every lexicon hit divided by the
        number of hits (hits are substring counts on the lowercased text)
      - proximity (0.3): mean TextBlob polarity of the sentences that mention
        both an AI term and an employment term
      - industry (0.1): placeholder, currently always 0

    AI terms ('ai', 'artificial intelligence', 'machine learning') must appear
    as whole words, and employment terms ('job', 'work', 'employee', 'career',
    'industry') must start a word. A plain substring test would treat "said",
    "training" or "available" as mentions of AI and "network" as a mention of
    work, which made nearly every sentence count towards proximity.

    Args:
        text: Article text. Anything that is not a non-empty string
            (None, NaN, pd.NA, numbers, '') yields all-zero scores.
        positive_terms: Dictionary of positive terms and their weights
        negative_terms: Dictionary of negative terms and their weights
        industry: Optional industry for industry-specific analysis
            (accepted for interface compatibility; not used yet)

    Returns:
        Dictionary with the keys 'overall', 'base', 'lexicon', 'proximity'
        and 'industry'
    """
    if not isinstance(text, str) or not text:
        return {
            'overall': 0,
            'base': 0,
            'lexicon': 0,
            'proximity': 0,
            'industry': 0
        }

    # Base sentiment from TextBlob
    base_sentiment = TextBlob(text).sentiment.polarity

    text_lower = text.lower()

    def _tally(terms):
        """Return (number of hits, summed weight of hits) for one lexicon."""
        matches = 0
        score = 0
        for term, value in terms.items():
            count = text_lower.count(term)
            if count > 0:
                matches += count
                score += value * count
        return matches, score

    # 1. Calculate overall sentiment using domain-specific lexicon
    positive_matches, positive_score = _tally(positive_terms)
    negative_matches, negative_score = _tally(negative_terms)

    # 2. Calculate proximity between AI terms and impact terms
    ai_pattern = re.compile(r'\b(?:ai|artificial intelligence|machine learning)\b')
    # Leading boundary only, so inflections such as "jobs" or "workers" still match
    impact_pattern = re.compile(r'\b(?:job|work|employee|career|industry)')

    proximity_score = 0
    proximity_count = 0

    # Check sentences containing both AI and impact terms
    for sentence in text_lower.split('.'):
        if ai_pattern.search(sentence) and impact_pattern.search(sentence):
            proximity_score += TextBlob(sentence).sentiment.polarity
            proximity_count += 1

    # 3. Industry-specific sentiment (simplified: not implemented, always 0)
    industry_sentiment = 0

    # 4. Calculate final weighted sentiment scores
    # The two lexicon sums are added, not subtracted, so negative terms are
    # presumably stored with negative weights -- TODO confirm against the
    # sentiment dictionary.
    lexicon_sentiment = 0
    total_matches = positive_matches + negative_matches
    if total_matches > 0:
        lexicon_sentiment = (positive_score + negative_score) / total_matches

    proximity_sentiment = 0
    if proximity_count > 0:
        proximity_sentiment = proximity_score / proximity_count

    # Final weighted score
    weights = {
        'base': 0.2,
        'lexicon': 0.4,
        'proximity': 0.3,
        'industry': 0.1
    }

    final_sentiment = (
        weights['base'] * base_sentiment +
        weights['lexicon'] * lexicon_sentiment +
        weights['proximity'] * proximity_sentiment +
        weights['industry'] * industry_sentiment
    )

    return {
        'overall': final_sentiment,
        'base': base_sentiment,
        'lexicon': lexicon_sentiment,
        'proximity': proximity_sentiment,
        'industry': industry_sentiment
    }

def add_features_to_dataset(df, dictionaries):
    """
    Derive the industry, job, technology, organization and sentiment columns

    Every feature is computed from the 'cleaned_text' column. The input frame
    is copied first and is not modified.

    Args:
        df: DataFrame with topic modeling results (must have 'cleaned_text')
        dictionaries: Dictionary of analysis dictionaries; the 'sentiment',
            'industry', 'job' and 'technology' entries are used

    Returns:
        Copy of ``df`` with these columns added: detected_industries,
        detected_jobs, ai_technologies, top_organizations, sentiment_scores,
        sentiment_overall, sentiment_base, sentiment_lexicon,
        sentiment_proximity, sentiment_industry, primary_industry, primary_job
    """
    print("Adding features to dataset...")

    # Work on a copy so the caller's frame stays untouched
    df_enhanced = df.copy()

    # Unpack dictionaries
    sentiment_dict = dictionaries['sentiment']
    industry_dict = dictionaries['industry']
    job_dict = dictionaries['job']
    technology_dict = dictionaries['technology']

    def _industries_for(article):
        return detect_industries(
            article,
            industry_dict['industry_terms'],
            industry_dict['industry_term_weights']
        )

    def _jobs_for(article):
        return detect_jobs(article, job_dict['job_terms'])

    def _technologies_for(article):
        return identify_technologies(
            article,
            technology_dict['technology_terms'],
            technology_dict['ai_models']
        )

    def _sentiment_for(row):
        # The top-ranked industry (if any) is handed over as context
        return analyze_sentiment(
            row['cleaned_text'],
            sentiment_dict['positive_terms'],
            sentiment_dict['negative_terms'],
            row['detected_industries'][0] if len(row['detected_industries']) > 0 else None
        )

    def _first_or_none(ranked):
        return ranked[0] if len(ranked) > 0 else None

    # 1. Detect industries
    print("Detecting industries and jobs...")
    tqdm.pandas(desc="Industries")
    df_enhanced['detected_industries'] = df_enhanced['cleaned_text'].progress_apply(_industries_for)

    # 2. Detect jobs
    tqdm.pandas(desc="Jobs")
    df_enhanced['detected_jobs'] = df_enhanced['cleaned_text'].progress_apply(_jobs_for)

    # 3. Identify technologies
    print("Identifying AI technologies...")
    tqdm.pandas(desc="Technologies")
    df_enhanced['ai_technologies'] = df_enhanced['cleaned_text'].progress_apply(_technologies_for)

    # 4. Extract organizations (simplified version)
    print("Extracting organizations...")
    tqdm.pandas(desc="Organizations")
    df_enhanced['top_organizations'] = df_enhanced['cleaned_text'].progress_apply(extract_organizations)

    # 5. Analyze sentiment (row-wise: needs both the text and the industries)
    print("Analyzing sentiment...")
    tqdm.pandas(desc="Sentiment")
    df_enhanced['sentiment_scores'] = df_enhanced.progress_apply(_sentiment_for, axis=1)

    # Spread the score dictionary over one column per component
    for component in ('overall', 'base', 'lexicon', 'proximity', 'industry'):
        df_enhanced['sentiment_' + component] = df_enhanced['sentiment_scores'].apply(
            lambda scores, key=component: scores[key]
        )

    # 6. Add primary industry and job (top-ranked entry, None when nothing matched)
    df_enhanced['primary_industry'] = df_enhanced['detected_industries'].apply(_first_or_none)
    df_enhanced['primary_job'] = df_enhanced['detected_jobs'].apply(_first_or_none)

    return df_enhanced

def run_enhancement_pipeline():
    """
    Run the whole enhancement pipeline from cache to cache

    Reads 'data_with_topics.pkl' from the cache, builds the analysis
    dictionaries, adds the feature columns and writes the result back to the
    cache as 'enhanced_data_with_features.pkl'.

    Returns:
        Enhanced DataFrame, or None when the input file is not in the cache
    """
    print("Starting enhancement pipeline...")

    # 1. Load topic data
    df = load_from_cache('data_with_topics.pkl')
    if df is None:
        print("ERROR: Could not load data from data_with_topics.pkl")
        return None
    print(f"Loaded data with {len(df)} articles")

    # 2. Create dictionaries, then 3. add features to dataset
    df_enhanced = add_features_to_dataset(df, create_dictionaries())

    # 4. Save enhanced dataset
    save_to_cache(df_enhanced, 'enhanced_data_with_features.pkl')
    print("Enhancement pipeline complete!")
    print("Enhanced data saved to 'enhanced_data_with_features.pkl'")

    return df_enhanced

In [6]:
# Run the full pipeline when this code is executed as the main program.
# The returned DataFrame is discarded here; the result is read back from the
# cache file written by the pipeline.
if __name__ == "__main__":
    run_enhancement_pipeline()

Starting enhancement pipeline...
Loaded data with 184391 articles
Creating dictionaries for analysis...


NameError: name 'create_sentiment_dictionaries' is not defined