I wanted to improve each article's analysis with more features that go beyond text. I created dictionaries to detect industries, job roles, technologies, and organizations, and to perform my custom sentiment analysis.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import re
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
# Reproducibility: seed NumPy's global random number generator so that any
# NumPy-based randomness in this session is repeatable.
np.random.seed(42)

# Cache directory, relative to the current working directory. It is created
# if missing; exist_ok=True means an existing directory is not an error.
cache_dir = "cache"
os.makedirs(cache_dir, exist_ok=True)
print(f"Using cache directory: {os.path.abspath(cache_dir)}")

def get_cache_path(filename):
    """Return the path of ``filename`` inside the module-level cache directory."""
    location = os.path.join(cache_dir, filename)
    return location

def save_to_cache(obj, filename):
    """Pickle ``obj`` into the cache directory under ``filename``.

    Prints a confirmation line once the file has been written.
    """
    target = get_cache_path(filename)
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)
    print(f"Saved {filename} to cache")

def load_from_cache(filename):
    """Unpickle and return the cached object stored under ``filename``.

    Returns ``None`` when no such file exists in the cache directory.
    """
    location = get_cache_path(filename)
    if not os.path.exists(location):
        return None

    # NOTE: pickle.load executes arbitrary code embedded in the file, so the
    # cache directory must only ever contain files written by this project.
    with open(location, 'rb') as handle:
        return pickle.load(handle)

# Dictionary Functions

# I created a set of weighted keywords related to positive and negative sentiment and grouped them by themes.
# I will use this to score how positively or negatively AI is being discussed.
def create_sentiment_dictionaries():
    """Build the weighted positive/negative keyword lexicons.

    Each lexicon maps a lower-case keyword to a weight: positive keywords
    carry weights in (0, 1], negative keywords carry weights in [-1, 0).
    Keywords are organised by theme below and merged in that order.

    Returns:
        dict with two keys, ``'positive_terms'`` and ``'negative_terms'``,
        each holding a ``{keyword: weight}`` dict.
    """
    print("Creating sentiment dictionaries...")

    # Positive themes for AI in the workplace context
    positive_themes = [
        # Opportunity and Growth
        {
            'opportunity': 1.0, 'enhance': 0.8, 'improve': 0.8, 'augment': 0.7,
            'growth': 0.7, 'advancement': 0.8, 'upskill': 0.9, 'progress': 0.7,
            'potential': 0.5, 'revolutionize': 0.8, 'transform': 0.7,
        },
        # Productivity and Efficiency
        {
            'efficiency': 0.8, 'productivity': 0.8, 'streamline': 0.7,
            'optimize': 0.7, 'accelerate': 0.6, 'automate': 0.6,
        },
        # Collaboration and Assistance
        {
            'assist': 0.6, 'empower': 0.9, 'collaborate': 0.7, 'partnership': 0.6,
            'complement': 0.7, 'teamwork': 0.7, 'support': 0.6, 'aid': 0.6,
        },
        # Solution and Benefit
        {
            'solution': 0.6, 'benefit': 0.8, 'advantage': 0.7, 'value': 0.6,
            'solve': 0.7, 'facilitate': 0.6, 'enable': 0.7,
        },
        # Innovation and Creation
        {
            'innovation': 0.9, 'create': 0.6, 'invent': 0.7, 'develop': 0.6,
            'pioneer': 0.8, 'breakthrough': 0.9, 'novel': 0.7,
        },
    ]

    # Negative themes for AI in the workplace context
    negative_themes = [
        # Job Loss and Replacement
        {
            'replace': -0.8, 'eliminate': -0.9, 'displace': -0.8, 'substitute': -0.7,
            'job loss': -0.9, 'unemployment': -0.9, 'layoff': -0.9, 'redundant': -0.8,
            'downsizing': -0.8, 'obsolete': -0.8, 'outdated': -0.7,
        },
        # Risk and Threat
        {
            'threaten': -0.7, 'risk': -0.6, 'danger': -0.7, 'concern': -0.5,
            'worry': -0.6, 'fear': -0.7, 'threat': -0.8, 'harmful': -0.8,
        },
        # Problems and Challenges
        {
            'controversy': -0.6, 'problem': -0.6, 'challenge': -0.4, 'difficulty': -0.5,
            'obstacle': -0.5, 'hurdle': -0.4, 'barrier': -0.5,
        },
        # Social Issues
        {
            'inequality': -0.7, 'bias': -0.7, 'discrimination': -0.8, 'unfair': -0.7,
            'disparity': -0.7, 'divide': -0.6, 'exclusion': -0.7,
        },
        # Control and Privacy
        {
            'surveillance': -0.8, 'monitor': -0.6, 'control': -0.6, 'invasion': -0.7,
            'privacy': -0.7, 'intrusive': -0.7, 'oversight': -0.5,
        },
    ]

    # Flatten the themed groups into one lexicon each, keeping theme order.
    positive_terms = {}
    for theme in positive_themes:
        positive_terms.update(theme)

    negative_terms = {}
    for theme in negative_themes:
        negative_terms.update(theme)

    lexicons = {}
    lexicons['positive_terms'] = positive_terms
    lexicons['negative_terms'] = negative_terms
    return lexicons

# I created a set of keywords that belong to certain industries like healthcare, finance and manufacturing.
def create_industry_dictionaries():
    """Create the industry keyword lists and their per-term weights.

    All keywords are lower-case because they are matched against lower-cased
    article text. For that reason the abbreviation "IT" is spelled out as
    'information technology': a bare 'it' cannot be told apart from the
    pronoun once the text is lower-cased and would tag almost every article
    as technology.

    Returns:
        dict with two keys:
            'industry_terms': ``{industry: [keyword, ...]}``
            'industry_term_weights': ``{industry: {keyword: weight}}`` for the
                keywords that should count more than the default weight.
                Every weighted keyword also appears in 'industry_terms',
                otherwise its weight could never be applied.
    """
    print("Creating industry dictionaries...")

    industry_terms = {
        'healthcare': [
            'doctor', 'physician', 'nurse', 'hospital', 'clinic', 'patient', 'care',
            'medical', 'healthcare', 'health care', 'medicine', 'pharma', 'clinical'
        ],

        'finance': [
            'bank', 'banking', 'investment', 'investor', 'loan', 'credit',
            'financial', 'finance', 'trading', 'insurance', 'fintech'
        ],

        'manufacturing': [
            'factory', 'manufacturing', 'production', 'assembly', 'supply chain',
            'industrial', 'automotive', 'machinery', 'robotics', 'automation'
        ],

        'retail': [
            'store', 'shop', 'retail', 'e-commerce', 'customer', 'consumer',
            'inventory', 'merchandising', 'commerce', 'shopping'
        ],

        'education': [
            'school', 'university', 'college', 'student', 'teacher', 'professor',
            'education', 'learning', 'teaching', 'training', 'academic'
        ],

        'technology': [
            'software', 'hardware', 'tech', 'technology', 'computer', 'digital',
            # Spelled out instead of 'it', which collides with the pronoun.
            'information technology', 'internet', 'web', 'app', 'computing', 'cloud'
        ],

        'media': [
            'media', 'news', 'entertainment', 'publishing', 'content',
            'social media', 'journalist', 'writing', 'advertising'
        ],

        'legal': [
            # 'law' is listed so that its weight below is actually reachable.
            'legal', 'law', 'lawyer', 'attorney', 'law firm', 'regulatory', 'compliance',
            'court', 'litigation', 'judge', 'justice', 'contract'
        ]
    }

    # Each weighted term gets a higher weight the more specific it is
    # (e.g. hospital or e-commerce). This helps improve precision when
    # detecting which industry an article talks about.
    industry_term_weights = {
        'healthcare': {'hospital': 5, 'doctor': 4, 'patient': 3, 'medical': 2, 'healthcare': 5},
        'finance': {'bank': 5, 'investment': 4, 'financial': 3, 'loan': 2, 'finance': 5},
        'manufacturing': {'factory': 5, 'manufacturing': 5, 'production': 4, 'assembly': 3},
        'retail': {'store': 4, 'retail': 5, 'e-commerce': 5, 'consumer': 3},
        'education': {'school': 5, 'university': 5, 'student': 4, 'education': 5},
        'technology': {'software': 4, 'tech': 5, 'technology': 5, 'digital': 3},
        'media': {'media': 5, 'news': 4, 'content': 3, 'publishing': 4},
        'legal': {'lawyer': 5, 'legal': 5, 'law': 4, 'attorney': 5}
    }

    return {
        'industry_terms': industry_terms,
        'industry_term_weights': industry_term_weights
    }

# Same idea as industries but for job categories like engineering, creative and healthcare.
# Links AI mentions to who might be affected like teacher or developer.
def create_job_dictionaries():
    """Create the job-category keyword lists.

    All keywords are lower-case because they are matched against lower-cased
    article text; a keyword containing capitals could never match.

    Returns:
        dict with a single key ``'job_terms'`` mapping each job category to
        its list of keywords.
    """
    print("Creating job dictionaries...")

    job_terms = {
        'management': [
            'ceo', 'chief executive', 'cfo', 'cio', 'cto', 'coo', 'executive',
            'manager', 'supervisor', 'director', 'leadership', 'administration'
        ],

        'engineering': [
            'engineer', 'developer', 'programmer', 'coder', 'data scientist',
            'machine learning engineer', 'ai engineer', 'software engineer',
            'technical', 'architect', 'devops'
        ],

        'creative': [
            'designer', 'writer', 'artist', 'content creator', 'creative',
            'marketer', 'marketing', 'advertiser', 'author', 'editor'
        ],

        'education': [
            'teacher', 'professor', 'instructor', 'educator', 'faculty',
            'academic', 'trainer', 'teaching', 'tutor', 'lecturer'
        ],

        'healthcare': [
            'doctor', 'nurse', 'physician', 'surgeon', 'medical professional',
            'pharmacist', 'therapist', 'healthcare worker', 'clinician'
        ],

        'finance': [
            'banker', 'accountant', 'financial analyst', 'trader', 'investor',
            'broker', 'financial advisor', 'auditor', 'actuary'
        ],

        'service': [
            'customer service', 'retail worker', 'sales associate', 'cashier',
            'receptionist', 'assistant', 'representative', 'clerk'
        ],

        'manufacturing': [
            'factory worker', 'machine operator', 'assembler', 'production worker',
            'technician', 'mechanic', 'quality control', 'maintenance'
        ]
    }

    return {
        'job_terms': job_terms
    }

# Covers different AI domains like NLP, computer vision and robotics. I also included real-world AI model names like GPT to catch mentions of specific tools.
def create_technology_dictionaries():
    """Build the AI technology keyword lists and the list of named AI models.

    Returns:
        dict with two keys:
            'technology_terms': ``{technology_area: [keyword, ...]}``
            'ai_models': list of specific AI product/model names
    """
    print("Creating technology dictionaries...")

    # Keyword lists per technology area (insertion order is preserved).
    technology_terms = dict(
        machine_learning=[
            'machine learning', 'ml', 'artificial intelligence', 'ai', 'algorithm',
            'deep learning', 'neural network', 'data science',
        ],
        nlp=[
            'natural language processing', 'nlp', 'language model', 'llm',
            'large language model', 'chatbot', 'gpt', 'bert',
        ],
        computer_vision=[
            'computer vision', 'image recognition', 'object detection',
            'facial recognition', 'image processing',
        ],
        robotics=[
            'robot', 'robotics', 'automation', 'autonomous', 'self-driving',
            'robotic process automation', 'rpa',
        ],
        ai_infrastructure=[
            'gpu', 'cloud computing', 'edge computing', 'federated learning',
            'ai chip', 'compute', 'transformer',
        ],
    )

    # Named AI products / models
    ai_models = [
        'gpt', 'chatgpt', 'gpt-4', 'gpt-3', 'dall-e', 'bard', 'palm',
        'llama', 'claude', 'stable diffusion', 'midjourney', 'gemini',
    ]

    return dict(technology_terms=technology_terms, ai_models=ai_models)

# Feature Extraction Functions

# I created a function to detect industries mentioned in the article text. It scans the articles for industry keywords and scores each category.
# I multiply frequency * weight * term-length weight, so that longer and more precise terms count more.
# Then it returns the industries mentioned in the article, ordered from most to least likely.
def detect_industries(text, industry_terms, industry_term_weights=None):
    """Rank the industries mentioned in ``text``.

    Every keyword is counted as a whole word (optionally followed by a plural
    "s"/"es"), case-insensitively. Whole-word matching matters: plain
    substring counting scores "career" as 'care', "happy" as 'app' and
    "with" as 'it'.

    Each occurrence contributes ``weight * length_weight`` to its industry,
    where ``weight`` comes from ``industry_term_weights`` (default 1.0) and
    ``length_weight`` grows with keyword length from 0.5 up to a cap of 1.0.

    Args:
        text: Article text; empty, ``None`` or NaN yields an empty list.
        industry_terms: ``{industry: [keyword, ...]}``.
        industry_term_weights: Optional ``{industry: {keyword: weight}}``.

    Returns:
        Industries with at least one keyword hit, highest score first.
    """
    if not text or pd.isna(text):
        return []

    text_lower = text.lower()
    weights_by_category = industry_term_weights or {}

    # Accumulated weighted keyword score per industry.
    category_scores = defaultdict(float)

    for category, terms in industry_terms.items():
        term_weights = weights_by_category.get(category, {})
        for term in terms:
            # Lookarounds (rather than \b) also work for keywords that start
            # or end with a non-word character.
            pattern = rf"(?<!\w){re.escape(term.lower())}(?:e?s)?(?!\w)"
            count = len(re.findall(pattern, text_lower))
            if count == 0:
                continue

            # Term-specific weight, defaulting to 1.0 for unweighted keywords.
            weight = term_weights.get(term, 1.0)

            # Longer keywords are more specific, so they count more (capped at 1.0).
            length_weight = min(1.0, 0.5 + len(term) / 20.0)

            category_scores[category] += count * weight * length_weight

    # Highest score first; sorted() is stable, so ties keep dictionary order.
    ranked = sorted(category_scores.items(), key=lambda item: item[1], reverse=True)
    return [category for category, _ in ranked]

# I created a function to detect job categories mentioned in the article text. This is for identifying who the article is referring to: engineers, nurses, etc.
def detect_jobs(text, job_terms):
    """Rank the job categories mentioned in ``text``.

    Every keyword is counted as a whole word (optionally followed by a plural
    "s"/"es"), case-insensitively. Whole-word matching matters: plain
    substring counting finds 'cto' inside "doctor" and "factory", 'cio'
    inside "precious" and 'coo' inside "cool". Keywords are lower-cased
    before matching so mixed-case entries such as 'DevOps' can match.

    Each occurrence contributes a length-based weight (0.5 for very short
    keywords, growing with length up to a cap of 1.0) to its category.

    Args:
        text: Article text; empty, ``None`` or NaN yields an empty list.
        job_terms: ``{job_category: [keyword, ...]}``.

    Returns:
        Job categories with at least one keyword hit, highest score first.
    """
    if not text or pd.isna(text):
        return []

    text_lower = text.lower()

    # Accumulated weighted keyword score per job category.
    category_scores = defaultdict(float)

    for category, terms in job_terms.items():
        for term in terms:
            pattern = rf"(?<!\w){re.escape(term.lower())}(?:e?s)?(?!\w)"
            count = len(re.findall(pattern, text_lower))
            if count == 0:
                continue

            # Longer keywords are more specific, so they count more (capped at 1.0).
            length_weight = min(1.0, 0.5 + len(term) / 20.0)
            category_scores[category] += count * length_weight

    # Highest score first; sorted() is stable, so ties keep dictionary order.
    ranked = sorted(category_scores.items(), key=lambda item: item[1], reverse=True)
    return [category for category, _ in ranked]

# This finds the mentions of AI technologies in the text. I wanted this to track whether
# the article is talking about general tech like machine learning or specific tools like ChatGPT.
def identify_technologies(text, technology_terms, ai_models):
    """Find which AI technologies and named AI models ``text`` mentions.

    Keywords and model names are matched as whole words (optionally followed
    by a plural "s"/"es"), case-insensitively. Whole-word matching matters
    for the short entries: as plain substrings 'ai' is found in "said",
    'ml' in "html" and 'rpa' in "surpass".

    Args:
        text: Article text; empty, ``None`` or NaN yields an empty dict.
        technology_terms: ``{technology_area: [keyword, ...]}``.
        ai_models: Iterable of specific AI model / product names.

    Returns:
        ``{technology_area: [matched keywords, longest first]}`` for every
        area with at least one hit, plus a ``'specific_models'`` entry
        listing the matched model names (in ``ai_models`` order) when any
        were found.
    """
    if not text or pd.isna(text):
        return {}

    text_lower = text.lower()

    def is_mentioned(term):
        # Whole-word, plural-tolerant test against the lower-cased text.
        pattern = rf"(?<!\w){re.escape(term.lower())}(?:e?s)?(?!\w)"
        return re.search(pattern, text_lower) is not None

    found_techs = {}

    # Technology categories
    for tech_category, keywords in technology_terms.items():
        # Longest first, since longer terms are typically more specific.
        matched_keywords = sorted(
            (keyword for keyword in keywords if is_mentioned(keyword)),
            key=len,
            reverse=True,
        )
        if matched_keywords:
            found_techs[tech_category] = matched_keywords

    # AI models
    found_models = [model for model in ai_models if is_mentioned(model)]
    if found_models:
        found_techs['specific_models'] = found_models

    return found_techs

# This extracts organization names from the text. I used a simple heuristic to find known AI companies.
# If the article mentions them, I extract them and it helps link the content to real world actors.
def extract_organizations(text):
    """Return the known AI organizations mentioned in ``text``.

    Names are matched case-insensitively as whole words. Whole-word matching
    matters: as plain substrings "intelligence" is reported as Intel,
    "metadata" as Meta and "pineapple" as Apple.

    Args:
        text: Article text; empty, ``None`` or NaN yields an empty list.

    Returns:
        Up to five organization names, in the order they appear in the
        built-in list (not ordered by how often they are mentioned).
    """
    if not text or pd.isna(text):
        return []

    # Major AI companies and organizations
    known_orgs = [
        'OpenAI', 'Google', 'Microsoft', 'Apple', 'Amazon', 'Meta', 'Facebook',
        'IBM', 'Anthropic', 'NVIDIA', 'Intel', 'AMD', 'Tesla', 'DeepMind'
    ]

    # Lower-case once instead of once per organization.
    text_lower = text.lower()

    found_orgs = [
        org for org in known_orgs
        if re.search(rf"(?<!\w){re.escape(org.lower())}(?!\w)", text_lower)
    ]

    # Keep at most five, in list order.
    return found_orgs[:5]

# This is a custom sentiment function that combines:
# TextBlob sentiment analysis 
# My own weighted keyword scores which make it domain-specific
# Sentence level sentiment where AI and impact keywords are close together
# I combine all of these into a weighted score
def analyze_sentiment(text, positive_terms, negative_terms, industry=None):
    """Score how positively or negatively ``text`` discusses AI.

    Four components are computed and blended into ``'overall'``:

    * ``base`` (weight 0.2): TextBlob polarity of the whole text.
    * ``lexicon`` (weight 0.4): mean weight of all lexicon keyword hits.
      Keywords act as word stems: a hit must start at the beginning of a
      word ('improve' matches "improved"), so 'aid' no longer fires on
      "said" or "paid" as it would with plain substring counting.
    * ``proximity`` (weight 0.3): mean TextBlob polarity of the sentences
      (split on '.') that mention both an AI term and a work-impact term.
      AI terms must be whole words ('ai' is not found in "said"); impact
      terms must start a word ('work' matches "workers", not "network").
    * ``industry`` (weight 0.1): placeholder, currently always 0, so this
      weight contributes nothing to ``'overall'``.

    Args:
        text: Article text; empty, ``None`` or NaN yields all-zero scores.
        positive_terms: ``{keyword: positive weight}``.
        negative_terms: ``{keyword: negative weight}``.
        industry: Accepted for interface compatibility; not used yet.

    Returns:
        dict with keys 'overall', 'base', 'lexicon', 'proximity', 'industry'.
    """
    if not text or pd.isna(text):
        return {
            'overall': 0,
            'base': 0,
            'lexicon': 0,
            'proximity': 0,
            'industry': 0
        }

    # Base sentiment from TextBlob
    base_sentiment = TextBlob(text).sentiment.polarity

    text_lower = text.lower()

    def score_lexicon(terms):
        # Return (number of hits, summed weight of hits) for one lexicon.
        matches = 0
        total = 0
        for term, value in terms.items():
            # Anchor at a word start so stems match inflected forms but
            # never the middle of an unrelated word.
            stem_pattern = rf"(?<!\w){re.escape(term.lower())}"
            count = len(re.findall(stem_pattern, text_lower))
            if count > 0:
                matches += count
                total += value * count
        return matches, total

    # 1. Domain-specific lexicon sentiment
    positive_matches, positive_score = score_lexicon(positive_terms)
    negative_matches, negative_score = score_lexicon(negative_terms)

    # 2. Sentiment of sentences where AI and work-impact terms co-occur
    ai_terms = ['ai', 'artificial intelligence', 'machine learning']
    impact_terms = ['job', 'work', 'employee', 'career', 'industry']

    ai_pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(t) for t in ai_terms) + r")(?!\w)"
    )
    impact_pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(t) for t in impact_terms) + r")"
    )

    proximity_score = 0
    proximity_count = 0

    for sentence in text.split('.'):
        sentence = sentence.lower()
        if ai_pattern.search(sentence) and impact_pattern.search(sentence):
            proximity_score += TextBlob(sentence).sentiment.polarity
            proximity_count += 1

    # 3. Industry-specific sentiment (placeholder; `industry` is not used yet)
    industry_sentiment = 0

    # 4. Normalise the components
    lexicon_sentiment = 0
    total_matches = positive_matches + negative_matches
    if total_matches > 0:
        # negative_score is already negative, so adding gives the net weight.
        lexicon_sentiment = (positive_score + negative_score) / total_matches

    proximity_sentiment = 0
    if proximity_count > 0:
        proximity_sentiment = proximity_score / proximity_count

    # Final weighted score
    weights = {
        'base': 0.2,
        'lexicon': 0.4,
        'proximity': 0.3,
        'industry': 0.1
    }

    final_sentiment = (
        weights['base'] * base_sentiment +
        weights['lexicon'] * lexicon_sentiment +
        weights['proximity'] * proximity_sentiment +
        weights['industry'] * industry_sentiment
    )

    return {
        'overall': final_sentiment,
        'base': base_sentiment,
        'lexicon': lexicon_sentiment,
        'proximity': proximity_sentiment,
        'industry': industry_sentiment
    }

# Main Functions

# It runs all the setup functions above and returns a dictionary of all terms and weights.
def create_dictionaries():
    """Build every keyword dictionary used for feature extraction.

    Returns:
        dict keyed by 'sentiment', 'industry', 'job' and 'technology', each
        holding the result of the corresponding ``create_*_dictionaries``
        function. The builders run in that order.
    """
    builders = (
        ('sentiment', create_sentiment_dictionaries),
        ('industry', create_industry_dictionaries),
        ('job', create_job_dictionaries),
        ('technology', create_technology_dictionaries),
    )
    return {name: build() for name, build in builders}


# The function takes the dataset and the dictionaries and adds new features to the dataset.
# It detects industries, jobs, technologies, organizations and sentiment scores and returns the enhanced dataset with all the new features.
# I added a few more features like primary industry and job.
def add_features_to_dataset(df, dictionaries):
    """Return a copy of ``df`` enriched with dictionary-based features.

    Reads the ``'cleaned_text'`` column and adds: ``detected_industries``,
    ``detected_jobs``, ``ai_technologies``, ``top_organizations``,
    ``sentiment_scores`` (the full score dict), one ``sentiment_*`` column
    per score component, and ``primary_industry`` / ``primary_job`` (the
    first detected entry, or ``None`` when nothing was detected).

    Args:
        df: DataFrame with a ``'cleaned_text'`` column; it is not modified.
        dictionaries: Dict with 'sentiment', 'industry', 'job' and
            'technology' entries, as returned by ``create_dictionaries``.

    Returns:
        The enriched copy of ``df``.
    """
    print("Adding features to dataset...")

    # Work on a copy so the caller's frame stays untouched.
    df_enhanced = df.copy()

    sentiment_dict = dictionaries['sentiment']
    industry_dict = dictionaries['industry']
    job_dict = dictionaries['job']
    technology_dict = dictionaries['technology']

    def _industries_for(article_text):
        return detect_industries(
            article_text,
            industry_dict['industry_terms'],
            industry_dict['industry_term_weights'],
        )

    def _jobs_for(article_text):
        return detect_jobs(article_text, job_dict['job_terms'])

    def _technologies_for(article_text):
        return identify_technologies(
            article_text,
            technology_dict['technology_terms'],
            technology_dict['ai_models'],
        )

    def _sentiment_for(row):
        # The top-ranked industry (if any) is passed along as context.
        return analyze_sentiment(
            row['cleaned_text'],
            sentiment_dict['positive_terms'],
            sentiment_dict['negative_terms'],
            row['detected_industries'][0] if len(row['detected_industries']) > 0 else None,
        )

    def _first_or_none(items):
        return items[0] if len(items) > 0 else None

    cleaned = df_enhanced['cleaned_text']

    # Industries and jobs
    print("Detecting industries and jobs...")
    df_enhanced['detected_industries'] = cleaned.apply(_industries_for)
    df_enhanced['detected_jobs'] = df_enhanced['cleaned_text'].apply(_jobs_for)

    # AI technologies
    print("Identifying AI technologies...")
    df_enhanced['ai_technologies'] = df_enhanced['cleaned_text'].apply(_technologies_for)

    # Organizations
    print("Extracting organizations...")
    df_enhanced['top_organizations'] = df_enhanced['cleaned_text'].apply(extract_organizations)

    # Sentiment (row-wise, because it needs text and detected industries)
    print("Analyzing sentiment...")
    df_enhanced['sentiment_scores'] = df_enhanced.apply(_sentiment_for, axis=1)

    # Unpack each score component into its own column.
    score_columns = {
        'sentiment_overall': 'overall',
        'sentiment_base': 'base',
        'sentiment_lexicon': 'lexicon',
        'sentiment_proximity': 'proximity',
        'sentiment_industry': 'industry',
    }
    for column, component in score_columns.items():
        df_enhanced[column] = df_enhanced['sentiment_scores'].apply(
            lambda scores, component=component: scores[component]
        )

    # Primary (top-ranked) industry and job
    df_enhanced['primary_industry'] = df_enhanced['detected_industries'].apply(_first_or_none)
    df_enhanced['primary_job'] = df_enhanced['detected_jobs'].apply(_first_or_none)

    return df_enhanced

# Runs the pipeline
def run_enhancement_pipeline():
    """Load the topic-annotated articles, add features, and cache the result.

    Reads ``data_with_topics.pkl`` from the cache, enriches it via
    ``add_features_to_dataset`` and writes the outcome to
    ``enhanced_data_with_features.pkl``.

    Returns:
        The enriched DataFrame, or ``None`` when the input could not be loaded.
    """
    print("Starting enhancement pipeline...")

    # Input: articles with topics, produced by an earlier step.
    articles = load_from_cache('data_with_topics.pkl')
    if articles is None:
        print("ERROR: Could not load data from data_with_topics.pkl")
        return None

    print(f"Loaded data with {len(articles)} articles")

    # Build the dictionaries, then derive the features from them.
    enriched = add_features_to_dataset(articles, create_dictionaries())

    # Persist the enriched dataset.
    save_to_cache(enriched, 'enhanced_data_with_features.pkl')

    print("Enhancement pipeline complete!")
    print("Enhanced data saved to 'enhanced_data_with_features.pkl'")

    return enriched

Using cache directory: /Users/casey/Documents/GitHub/AI_impact_employment/cache


In [3]:
# Run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run_enhancement_pipeline()

Starting enhancement pipeline...
Loaded data with 184391 articles
Creating sentiment dictionaries...
Creating industry dictionaries...
Creating job dictionaries...
Creating technology dictionaries...
Adding features to dataset...
Detecting industries and jobs...
Identifying AI technologies...
Extracting organizations...
Analyzing sentiment...
Saved enhanced_data_with_features.pkl to cache
Enhancement pipeline complete!
Enhanced data saved to 'enhanced_data_with_features.pkl'
