I wanted to improve each articles analysis with more features that go beyond text. I created dictionaries to detect industries, job roles, technologies and organizations to perform my custom sentiment analysis.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import re
from collections import Counter, defaultdict
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from textblob import TextBlob

In [2]:
np.random.seed(42)

In [3]:
# Cache directory.
cache_dir = "cache"
os.makedirs(cache_dir, exist_ok=True)
print(f"Using cache directory: {os.path.abspath(cache_dir)}")

def get_cache_path(filename):
    return os.path.join(cache_dir, filename)

def save_to_cache(obj, filename):
    with open(get_cache_path(filename), 'wb') as f:
        pickle.dump(obj, f)
    print(f"Saved {filename} to cache")

def load_from_cache(filename):
    cache_path = get_cache_path(filename)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    return None

Using cache directory: /Users/casey/Documents/GitHub/AI_impact_employment/cache


In [4]:
# Dictionary Functions.

def create_sentiment_dictionaries():
    print("Creating sentiment dictionaries.")
    
    # Positive terms related to AI in the workplace context.
    positive_terms = {
        # Opportunity and growth.
        'opportunity': 1.0, 'enhance': 0.8, 'improve': 0.8, 'augment': 0.7,
        'growth': 0.7, 'advancement': 0.8, 'upskill': 0.9, 'progress': 0.7,
        'potential': 0.5, 'revolutionize': 0.8, 'transform': 0.7,

        # Productivity and efficiency.
        'efficiency': 0.8, 'productivity': 0.8, 'streamline': 0.7,
        'optimize': 0.7, 'accelerate': 0.6, 'automate': 0.6,

        # Collaboration and assistance.
        'assist': 0.6, 'empower': 0.9, 'collaborate': 0.7, 'partnership': 0.6,
        'complement': 0.7, 'teamwork': 0.7, 'support': 0.6, 'aid': 0.6,

        # Solution and benefit.
        'solution': 0.6, 'benefit': 0.8, 'advantage': 0.7, 'value': 0.6,
        'solve': 0.7, 'facilitate': 0.6, 'enable': 0.7,

        # Innovation and creation.
        'innovation': 0.9, 'create': 0.6, 'invent': 0.7, 'develop': 0.6,
        'pioneer': 0.8, 'breakthrough': 0.9, 'novel': 0.7
    }

    # Negative terms related to AI in the workplace context.
    negative_terms = {
        # Job Loss and replacement.
        'replace': -0.8, 'eliminate': -0.9, 'displace': -0.8, 'substitute': -0.7,
        'job loss': -0.9, 'unemployment': -0.9, 'layoff': -0.9, 'redundant': -0.8,
        'downsizing': -0.8, 'obsolete': -0.8, 'outdated': -0.7,

        # Risk and threat.
        'threaten': -0.7, 'risk': -0.6, 'danger': -0.7, 'concern': -0.5,
        'worry': -0.6, 'fear': -0.7, 'threat': -0.8, 'harmful': -0.8,

        # Problems and challenges.
        'controversy': -0.6, 'problem': -0.6, 'challenge': -0.4, 'difficulty': -0.5,
        'obstacle': -0.5, 'hurdle': -0.4, 'barrier': -0.5,

        # Social issues.
        'inequality': -0.7, 'bias': -0.7, 'discrimination': -0.8, 'unfair': -0.7,
        'disparity': -0.7, 'divide': -0.6, 'exclusion': -0.7,

        # Control and privacy.
        'surveillance': -0.8, 'monitor': -0.6, 'control': -0.6, 'invasion': -0.7,
        'privacy': -0.7, 'intrusive': -0.7, 'oversight': -0.5
    }
    
    return {
        'positive_terms': positive_terms,
        'negative_terms': negative_terms
    }

# Industry dictionaries based on my knowledge and common industry terms.
def create_industry_dictionaries():
    print("Creating industry dictionaries.")
    
    industry_terms = {
        'healthcare': [
            'doctor', 'physician', 'nurse', 'hospital', 'clinic', 'patient', 'care',
            'medical', 'healthcare', 'health care', 'medicine', 'pharma', 'clinical'
        ],

        'finance': [
            'bank', 'banking', 'investment', 'investor', 'loan', 'credit', 
            'financial', 'finance', 'trading', 'insurance', 'fintech'
        ],

        'manufacturing': [
            'factory', 'manufacturing', 'production', 'assembly', 'supply chain',
            'industrial', 'automotive', 'machinery', 'robotics', 'automation'
        ],

        'retail': [
            'store', 'shop', 'retail', 'e-commerce', 'customer', 'consumer',
            'inventory', 'merchandising', 'commerce', 'shopping'
        ],

        'education': [
            'school', 'university', 'college', 'student', 'teacher', 'professor',
            'education', 'learning', 'teaching', 'training', 'academic'
        ],

        'technology': [
            'software', 'hardware', 'tech', 'technology', 'computer', 'digital',
            'it', 'internet', 'web', 'app', 'computing', 'cloud'
        ],

        'media': [
            'media', 'news', 'entertainment', 'publishing', 'content', 
            'social media', 'journalist', 'writing', 'advertising'
        ],

        'legal': [
            'legal', 'lawyer', 'attorney', 'law firm', 'regulatory', 'compliance',
            'court', 'litigation', 'judge', 'justice', 'contract'
        ]
    }

    industry_term_weights = {
        'healthcare': {'hospital': 5, 'doctor': 4, 'patient': 3, 'medical': 2, 'healthcare': 5},
        'finance': {'bank': 5, 'investment': 4, 'financial': 3, 'loan': 2, 'finance': 5},
        'manufacturing': {'factory': 5, 'manufacturing': 5, 'production': 4, 'assembly': 3},
        'retail': {'store': 4, 'retail': 5, 'e-commerce': 5, 'consumer': 3},
        'education': {'school': 5, 'university': 5, 'student': 4, 'education': 5},
        'technology': {'software': 4, 'tech': 5, 'technology': 5, 'digital': 3},
        'media': {'media': 5, 'news': 4, 'content': 3, 'publishing': 4},
        'legal': {'lawyer': 5, 'legal': 5, 'law': 4, 'attorney': 5}
    }
    
    return {
        'industry_terms': industry_terms,
        'industry_term_weights': industry_term_weights
    }

# Job and technology dictionaries based on common job roles.
def create_job_dictionaries():
    print("Creating job dictionaries.")
    
    job_terms = {
        'management': [
            'ceo', 'chief executive', 'cfo', 'cio', 'cto', 'coo', 'executive',
            'manager', 'supervisor', 'director', 'leadership', 'administration'
        ],

        'engineering': [
            'engineer', 'developer', 'programmer', 'coder', 'data scientist',
            'machine learning engineer', 'ai engineer', 'software engineer',
            'technical', 'architect', 'DevOps'
        ],

        'creative': [
            'designer', 'writer', 'artist', 'content creator', 'creative',
            'marketer', 'marketing', 'advertiser', 'author', 'editor'
        ],

        'education': [
            'teacher', 'professor', 'instructor', 'educator', 'faculty',
            'academic', 'trainer', 'teaching', 'tutor', 'lecturer'
        ],

        'healthcare': [
            'doctor', 'nurse', 'physician', 'surgeon', 'medical professional',
            'pharmacist', 'therapist', 'healthcare worker', 'clinician'
        ],

        'finance': [
            'banker', 'accountant', 'financial analyst', 'trader', 'investor',
            'broker', 'financial advisor', 'auditor', 'actuary'
        ],

        'service': [
            'customer service', 'retail worker', 'sales associate', 'cashier',
            'receptionist', 'assistant', 'representative', 'clerk'
        ],

        'manufacturing': [
            'factory worker', 'machine operator', 'assembler', 'production worker',
            'technician', 'mechanic', 'quality control', 'maintenance'
        ]
    }
    
    return {
        'job_terms': job_terms
    }

# Technology dictionaries based on common AI and technology words.
def create_technology_dictionaries():
    print("Creating technology dictionaries.")
    
    technology_terms = {
        'machine_learning': [
            'machine learning', 'ml', 'artificial intelligence', 'ai', 'algorithm',
            'deep learning', 'neural network', 'data science'
        ],

        'nlp': [
            'natural language processing', 'nlp', 'language model', 'llm',
            'large language model', 'chatbot', 'gpt', 'bert'
        ],

        'computer_vision': [
            'computer vision', 'image recognition', 'object detection',
            'facial recognition', 'image processing'
        ],

        'robotics': [
            'robot', 'robotics', 'automation', 'autonomous', 'self-driving',
            'robotic process automation', 'rpa'
        ],

        'ai_infrastructure': [
            'gpu', 'cloud computing', 'edge computing', 'federated learning',
            'ai chip', 'compute', 'transformer'
        ]
    }

    ai_models = [
        'gpt', 'chatgpt', 'gpt-4', 'gpt-3', 'dall-e', 'bard', 'palm',
        'llama', 'claude', 'stable diffusion', 'midjourney', 'gemini'
    ]
    
    return {
        'technology_terms': technology_terms,
        'ai_models': ai_models
    }

In [5]:
# Feature Extraction Functions
# Detecting industries.
def detect_industries(text, industry_terms, industry_term_weights=None):
    if not text or pd.isna(text):
        return []
        
    text_lower = text.lower()
    category_scores = defaultdict(float)
    
    for category, terms in industry_terms.items():
        for term in terms:
            count = text_lower.count(term)
            if count > 0:
                weight = 1.0
                if industry_term_weights and category in industry_term_weights and term in industry_term_weights[category]:
                    weight = industry_term_weights[category][term]
                
                length_weight = min(1.0, 0.5 + len(term) / 20.0)
                score = count * weight * length_weight
                category_scores[category] += score
    
    sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
    return [category for category, _ in sorted_categories]

In [6]:
# Detecting jobs.
def detect_jobs(text, job_terms):
    if not text or pd.isna(text):
        return []
        
    text_lower = text.lower()
    category_scores = defaultdict(float)
    
    for category, terms in job_terms.items():
        for term in terms:
            count = text_lower.count(term)
            if count > 0:
                length_weight = min(1.0, 0.5 + len(term) / 20.0)
                score = count * length_weight
                category_scores[category] += score
    
    sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
    return [category for category, _ in sorted_categories]

# Detecting technologies.
def identify_technologies(text, technology_terms, ai_models):
    if not text or pd.isna(text):
        return {}
        
    text_lower = text.lower()
    found_techs = {}
    
    for tech_category, keywords in technology_terms.items():
        matched_keywords = [k for k in keywords if k in text_lower]
        if matched_keywords:
            matched_keywords.sort(key=len, reverse=True)
            found_techs[tech_category] = matched_keywords
    
    found_models = [model for model in ai_models if model.lower() in text_lower]
    if found_models:
        found_techs['specific_models'] = found_models
    
    return found_techs

# Detecting organizations.
def extract_organizations(text):
    if not text or pd.isna(text):
        return []
    
    known_orgs = [
        'OpenAI', 'Google', 'Microsoft', 'Apple', 'Amazon', 'Meta', 'Facebook',
        'IBM', 'Anthropic', 'NVIDIA', 'Intel', 'AMD', 'Tesla', 'DeepMind'
    ]
    
    found_orgs = []
    for org in known_orgs:
        if org.lower() in text.lower():
            found_orgs.append(org)
    
    return found_orgs[:5]

In [7]:
# Sentiment Analysis
# Managing negation detection for speed.
def fast_detect_negations(text, target_terms):
    if not text:
        return []
    
    text_lower = text.lower()
    negated_terms = []
    
    # Negation words.
    negation_words = ['not', 'never', 'no', "don't", "doesn't", "didn't", "won't", "can't"]
    
    for neg_word in negation_words:
        if neg_word in text_lower:
            # Looking for sentiment terms within 30 characters after negation.
            for term in target_terms:
                pattern = f"{neg_word}.{{0,30}}{re.escape(term)}"
                if re.search(pattern, text_lower):
                    negated_terms.append(term)
    
    return negated_terms

# Proximity analysis.
def fast_proximity_analysis(text, positive_terms, negative_terms):
    if not text:
        return 0
    
    text_lower = text.lower()
    sentences = text.split('.')
    
    ai_terms = ['ai', 'artificial intelligence', 'machine learning', 'automation']
    impact_terms = ['job', 'work', 'employee', 'career', 'industry', 'employment']
    
    proximity_scores = []
    
    for sentence in sentences:
        sentence_lower = sentence.lower()
        has_ai = any(term in sentence_lower for term in ai_terms)
        has_impact = any(term in sentence_lower for term in impact_terms)
        
        if has_ai and has_impact:
            # Sentiment scoring.
            sentence_score = 0
            for term, value in positive_terms.items():
                if term in sentence_lower:
                    sentence_score += value
            for term, value in negative_terms.items():
                if term in sentence_lower:
                    sentence_score += value
            
            proximity_scores.append(sentence_score)
    
    return np.mean(proximity_scores) if proximity_scores else 0

# Custom sentiment analysis.
def fast_enhanced_sentiment_analysis(text, positive_terms, negative_terms, industry=None):
    if not text or pd.isna(text):
        return {
            'overall': 0,
            'base_textblob': 0,
            'lexicon_normalized': 0,
            'proximity_enhanced': 0,
            'recency_weighted': 0,
            'negation_adjusted': 0,
            'word_count': 0,
            'negated_terms_count': 0
        }
    
    # Base sentiment from TextBlob.
    base_sentiment = TextBlob(text).sentiment.polarity
    
    # Normalized lexicon analysis.
    text_lower = text.lower()
    word_count = len(text_lower.split())
    
    positive_score = 0
    positive_matches = 0
    negative_score = 0
    negative_matches = 0
    
    all_sentiment_terms = list(positive_terms.keys()) + list(negative_terms.keys())
    
    # Negation detection.
    negated_terms = fast_detect_negations(text, all_sentiment_terms)
    
    # Scored terms.
    for term, value in positive_terms.items():
        count = text_lower.count(term)
        if count > 0:
            if term in negated_terms:
                value *= -0.5
            positive_matches += count
            positive_score += value * count
    
    for term, value in negative_terms.items():
        count = text_lower.count(term)
        if count > 0:
            if term in negated_terms:
                value *= -0.5
            negative_matches += count
            negative_score += value * count
    
    # Normalized by text length.
    total_matches = positive_matches + negative_matches
    if total_matches > 0 and word_count > 0:
        normalization_factor = np.log(word_count + 1)
        lexicon_normalized = (positive_score + negative_score) / (total_matches * normalization_factor)
    else:
        lexicon_normalized = 0
    
    # Proximity analysis.
    proximity_enhanced = fast_proximity_analysis(text, positive_terms, negative_terms)
    
    # Recency weight for AI terms appearing later. If the term appears in the last 60% of the text.
    recency_weight = 1.0
    ai_terms = ['ai', 'artificial intelligence', 'machine learning']
    for term in ai_terms:
        last_pos = text_lower.rfind(term)
        if last_pos > len(text) * 0.6:  
            recency_weight = 1.3
            break
    
    recency_weighted = lexicon_normalized * recency_weight
    
    # Negation adjustment.
    negation_penalty = len(negated_terms) * 0.1
    negation_adjusted = lexicon_normalized - negation_penalty
    
    # Combined score.
    weights = {
        'base_textblob': 0.35,       
        'lexicon_normalized': 0.30,   
        'proximity_enhanced': 0.25,
        'recency_weighted': 0.05,
        'negation_adjusted': 0.05
    }
    
    overall_sentiment = (
        weights['base_textblob'] * base_sentiment +
        weights['lexicon_normalized'] * lexicon_normalized +
        weights['proximity_enhanced'] * proximity_enhanced +
        weights['recency_weighted'] * (recency_weighted - lexicon_normalized) +
        weights['negation_adjusted'] * (negation_adjusted - lexicon_normalized)
    )
    
    return {
        'overall': overall_sentiment,
        'base_textblob': base_sentiment,
        'lexicon_normalized': lexicon_normalized,
        'proximity_enhanced': proximity_enhanced,
        'recency_weighted': recency_weighted,
        'negation_adjusted': negation_adjusted,
        'word_count': word_count,
        'negated_terms_count': len(negated_terms)
    }

In [8]:
# Main Functions
# Dictionaries for the enhanced features.
def create_dictionaries():
    dictionaries = {}
    
    dictionaries['sentiment'] = create_sentiment_dictionaries()
    dictionaries['industry'] = create_industry_dictionaries()
    dictionaries['job'] = create_job_dictionaries()
    dictionaries['technology'] = create_technology_dictionaries()
    
    return dictionaries

# Added the enhanced features to the dataset.
def add_fast_enhanced_features_to_dataset(df, dictionaries):
    
    df_enhanced = df.copy()
    
    sentiment_dict = dictionaries['sentiment']
    industry_dict = dictionaries['industry']
    job_dict = dictionaries['job']
    technology_dict = dictionaries['technology']
    
    # Feature detection.
    print("Detecting industries and jobs.")
    tqdm.pandas(desc="Industries")
    df_enhanced['detected_industries'] = df_enhanced['cleaned_text'].progress_apply(
        lambda x: detect_industries(
            x, 
            industry_dict['industry_terms'], 
            industry_dict['industry_term_weights']
        )
    )
    
    tqdm.pandas(desc="Jobs")
    df_enhanced['detected_jobs'] = df_enhanced['cleaned_text'].progress_apply(
        lambda x: detect_jobs(x, job_dict['job_terms'])
    )
    
    print("Identifying AI technologies.")
    tqdm.pandas(desc="Technologies")
    df_enhanced['ai_technologies'] = df_enhanced['cleaned_text'].progress_apply(
        lambda x: identify_technologies(
            x, 
            technology_dict['technology_terms'],
            technology_dict['ai_models']
        )
    )
    
    print("Extracting organizations.")
    tqdm.pandas(desc="Organizations")
    df_enhanced['top_organizations'] = df_enhanced['cleaned_text'].progress_apply(extract_organizations)
    
    print("Analyzing sentiment with fast enhanced model.")
    
    tqdm.pandas(desc="Sentiment Analysis")
    df_enhanced['enhanced_sentiment_scores'] = df_enhanced.progress_apply(
        lambda x: fast_enhanced_sentiment_analysis(
            x['cleaned_text'],
            sentiment_dict['positive_terms'],
            sentiment_dict['negative_terms'],
            x['detected_industries'][0] if len(x['detected_industries']) > 0 else None
        ),
        axis=1
    )
    
    # Extracted sentiment scores.
    df_enhanced['sentiment_overall_enhanced'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['overall'])
    df_enhanced['sentiment_base_textblob'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['base_textblob'])
    df_enhanced['sentiment_lexicon_normalized'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['lexicon_normalized'])
    df_enhanced['sentiment_proximity_enhanced'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['proximity_enhanced'])
    df_enhanced['sentiment_recency_weighted'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['recency_weighted'])
    df_enhanced['sentiment_negation_adjusted'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['negation_adjusted'])
    
    # Additional features.
    df_enhanced['article_word_count'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['word_count'])
    df_enhanced['negated_terms_count'] = df_enhanced['enhanced_sentiment_scores'].apply(lambda x: x['negated_terms_count'])
    
    # Primary categories for industries and jobs.
    df_enhanced['primary_industry'] = df_enhanced['detected_industries'].apply(
        lambda x: x[0] if len(x) > 0 else None
    )
    
    df_enhanced['primary_job'] = df_enhanced['detected_jobs'].apply(
        lambda x: x[0] if len(x) > 0 else None
    )
    
    return df_enhanced

In [9]:
def run_fast_enhanced_pipeline():
    
    # Loaded data.
    df = load_from_cache('data_with_topics.pkl')
    if df is None:
        print("Error: Could not load data.")
        return None
    
    print(f"Loaded data with {len(df)} articles")
    
    # Created the dictionaries.
    dictionaries = create_dictionaries()
    
    # Added features.
    df_enhanced = add_fast_enhanced_features_to_dataset(df, dictionaries)
    
    # Saved dataset.
    save_to_cache(df_enhanced, 'fast_enhanced_data_with_features.pkl')
    
    print("Fast enhanced pipeline completed.")
    print("Enhanced data saved to 'fast_enhanced_data_with_features.pkl'")
    
    return df_enhanced

In [10]:
# Runing the pipeline.
if __name__ == "__main__":
    df_enhanced = run_fast_enhanced_pipeline()
    
    if df_enhanced is not None:
        print("Pipeline completed successfully.")
        print(f"Enhanced dataset shape: {df_enhanced.shape}")
    else:
        print("Pipeline failed.")

Starting FAST enhanced pipeline...
Loaded data with 184391 articles
Creating sentiment dictionaries...
Creating industry dictionaries...
Creating job dictionaries...
Creating technology dictionaries...
Adding fast enhanced features to dataset...
Detecting industries and jobs...


Industries:   0%|          | 0/184391 [00:00<?, ?it/s]

Jobs:   0%|          | 0/184391 [00:00<?, ?it/s]

Identifying AI technologies...


Technologies:   0%|          | 0/184391 [00:00<?, ?it/s]

Extracting organizations...


Organizations:   0%|          | 0/184391 [00:00<?, ?it/s]

Analyzing sentiment with fast enhanced model...
Note: Using TextBlob + enhanced features (NO FinBERT for speed)


Sentiment Analysis:   0%|          | 0/184391 [00:00<?, ?it/s]

Saved fast_enhanced_data_with_features.pkl to cache
Fast enhanced pipeline complete!
Enhanced data saved to 'fast_enhanced_data_with_features.pkl'
Pipeline completed successfully!
Enhanced dataset shape: (184391, 29)


In [12]:
# Industry categories and times.
# Industry categories and their counts from the dataset.
def get_industry_categories_and_times(df):
    if 'detected_industries' not in df.columns:
        print("Error: detected industries column not found in dataset.")
        return None
    
    industry_counts = Counter()
    
    for industries in df['detected_industries']:
        for industry in industries:
            industry_counts[industry] += 1
    
    # Sorted by count.
    sorted_industries = sorted(industry_counts.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_industries

# Summary of industry categories and their counts.
def print_industry_summary(df):
    industry_summary = get_industry_categories_and_times(df)
    
    if industry_summary is None:
        print("No industry data available.")
        return
    
    print("Industry Categories Summary:")
    for industry, count in industry_summary:
        print(f"{industry}: {count} articles")
    
    print(f"Total unique industries: {len(industry_summary)}")
if __name__ == "__main__":
    # Load the dataset.
    df_enhanced = load_from_cache('fast_enhanced_data_with_features.pkl')
    
    if df_enhanced is not None:
        print_industry_summary(df_enhanced)
    else:
        print("Error: Could not load the dataset.")
        print("Run the fast enhanced pipeline.")

Industry Categories Summary:
technology: 184354 articles
media: 180245 articles
education: 146171 articles
healthcare: 128916 articles
retail: 125436 articles
finance: 122744 articles
legal: 118712 articles
manufacturing: 81758 articles
Total unique industries detected: 8


In [13]:
# Job types and times.
# Job types and their counts.
def get_job_types_and_times(df):
    if 'detected_jobs' not in df.columns:
        print("Error: detected jobs column not found in the dataset.")
        return None
    
    job_counts = Counter()
    
    for jobs in df['detected_jobs']:
        for job in jobs:
            job_counts[job] += 1
    
    # Sorted by count.
    sorted_jobs = sorted(job_counts.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_jobs

# Summary of job types and their counts.
def print_job_summary(df):
    job_summary = get_job_types_and_times(df)
    
    if job_summary is None:
        print("No job data available.")
        return
    
    print("Job Types Summary:")
    for job, count in job_summary:
        print(f"{job}: {count} articles")
    
    print(f"Total unique job types: {len(job_summary)}")
if __name__ == "__main__":
    # Load the dataset.
    df_enhanced = load_from_cache('fast_enhanced_data_with_features.pkl')
    
    if df_enhanced is not None:
        print_job_summary(df_enhanced)
    else:
        print("Error: Could not load enhanced dataset.")
        print("Run the fast enhanced pipeline.")

Job Types Summary:
management: 165572 articles
creative: 124566 articles
engineering: 82332 articles
education: 47065 articles
finance: 46949 articles
service: 39692 articles
healthcare: 19718 articles
manufacturing: 11444 articles
Total unique job types detected: 8
