In [37]:
# Topic modeling script for AI impact on employment analysis
# Set random seed for reproducibility
import numpy as np
np.random.seed(42)

import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [38]:
# Create cache directory if it doesn't exist
cache_dir = "cache"
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

def get_cache_path(filename):
    """Get full path for a cache file"""
    return os.path.join(cache_dir, filename)

def save_to_cache(obj, filename):
    """Save object to cache"""
    with open(get_cache_path(filename), 'wb') as f:
        pickle.dump(obj, f)

def load_from_cache(filename):
    """Load object from cache if it exists"""
    cache_path = get_cache_path(filename)
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    return None

In [39]:
def run_topic_modeling(df_input, num_topics=10, force_recompute=False):
    """
    Run topic modeling on articles
    
    Args:
        df_input: DataFrame with cleaned articles (must contain 'cleaned_text' column)
        num_topics: Number of topics to extract
        force_recompute: Whether to force recomputation
        
    Returns:
        Tuple of (topic_model, document_topics, feature_names)
    """
    if df_input is None:
        print("ERROR: Input DataFrame is None!")
        return None, None, None, None
        
    if 'cleaned_text' not in df_input.columns:
        print("ERROR: Input DataFrame has no 'cleaned_text' column!")
        print(f"Available columns: {df_input.columns.tolist()}")
        return None, None, None, None
    
    cache_file = f"topic_model_{num_topics}.pkl"
    
    # Results
    if not force_recompute:
        cached_data = load_from_cache(cache_file)
        if cached_data is not None:
            print("Loaded topic model from cache")
            return cached_data
    
    print("Running topic modeling...")
    
    documents = df_input['cleaned_text'].tolist()
    if not documents:
        print("ERROR: No documents found in the cleaned_text column!")
        return None, None, None, None
    
    # Use CountVectorizer with n-grams
    print("Creating document-term matrix...")
    vectorizer = CountVectorizer(
        max_df=0.7,
        min_df=10,
        max_features=10000,
        ngram_range=(1, 2),
        stop_words='english'
    )
    
    # Create document-term matrix
    try:
        X = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()
    except Exception as e:
        print(f"Error in vectorization: {e}")
        # Return placeholder if we encounter an error
        return None, np.array([0] * len(documents)), [], []
    
    # Train LDA model
    print(f"Training LDA model with {num_topics} topics...")
    lda = LatentDirichletAllocation(
        n_components=num_topics,
        max_iter=10,
        random_state=42,
        n_jobs=-1
    )
    
    # Fit model and transform documents
    try:
        doc_topic_dists = lda.fit_transform(X)
        
        # Get the most probable topic for each document
        doc_topics = doc_topic_dists.argmax(axis=1)
        
        # Extract top terms for each topic
        topic_terms = []
        for topic_idx, topic in enumerate(lda.components_):
            top_features_ind = topic.argsort()[:-10 - 1:-1]
            top_features = [feature_names[i] for i in top_features_ind]
            topic_terms.append(top_features)
        
        # Save topic model, document topics, and feature names
        result = (lda, doc_topics, feature_names, topic_terms)
        save_to_cache(result, cache_file)
        
        return result
    except Exception as e:
        print(f"Error in topic modeling: {e}")
        return None, np.array([0] * len(documents)), [], []

In [40]:
def plot_topics(lda, feature_names, n_top_words=10, save_path=None):
    """
    Plot the top words for each topic in the LDA model
    
    Args:
        lda: Trained LDA model
        feature_names: Names of features from vectorizer
        n_top_words: Number of top words to show per topic
        save_path: Path to save the figure, if None just displays
    """
    fig, axes = plt.subplots(5, 2, figsize=(15, 25), sharex=True)
    axes = axes.flatten()
    
    for topic_idx, topic in enumerate(lda.components_):
        if topic_idx >= len(axes):
            break
            
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]
        
        ax = axes[topic_idx]
        ax.barh(top_features, weights)
        ax.set_title(f'Topic {topic_idx + 1}', fontsize=20)
        ax.tick_params(axis='both', which='major', labelsize=14)
        ax.set_xlabel('Weight', fontsize=14)
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    else:
        plt.show()
    return fig

In [41]:
def plot_topics_over_time(df, topics, topic_terms, top_n_topics=5, save_path=None):
    """Plot the prevalence of top topics over time"""
    # Create DataFrame with topics
    topic_df = pd.DataFrame({'date': df['date'], 'topic': topics})
    
    # Counts of each topic
    topic_counts = {}
    for topic in range(max(topics) + 1):
        topic_counts[topic] = np.sum(topics == topic)
    
    # Get top topics
    top_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
    top_topics = [topic for topic, count in top_topics[:top_n_topics]]
    
    # Top topics by month
    topic_df = topic_df[topic_df['topic'].isin(top_topics)]
    topic_df['yearmonth'] = topic_df['date'].dt.strftime('%Y-%m')
    
    # Topics per month
    topic_time = topic_df.groupby(['yearmonth', 'topic']).size().reset_index(name='count')
    
    # Datetime
    topic_time['date'] = pd.to_datetime(topic_time['yearmonth'] + '-01')
    topic_time = topic_time.sort_values('date')
    
    # Create pivot table
    pivot_df = topic_time.pivot(index='date', columns='topic', values='count').fillna(0)
    
    # Add topic labels from terms
    topic_labels = {}
    for topic in top_topics:
        # Label from top 3 words
        label = ', '.join(topic_terms[topic][:3])
        topic_labels[topic] = f"Topic {topic}: {label}"
    
    plt.figure(figsize=(12, 6))
    
    if topic_labels:
        pivot_df = pivot_df.rename(columns=topic_labels)
    
    pivot_df.plot(kind='line', figsize=(12, 6))
    plt.title('Top Topics Over Time', fontsize=16)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Article Count', fontsize=14)
    plt.legend(title='Topic', loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    else:
        plt.show()
    return plt.gcf()

In [49]:
# First, check which file exists
import os

if os.path.exists('clean_filter.py'):
    file_to_use = 'clean_filter.py'
elif os.path.exists('clean_and_filter.py'):
    file_to_use = 'clean_and_filter.py'
else:
    raise FileNotFoundError("Neither clean_filter.py nor clean_and_filter.py found")

# Then use the file that exists
try:
    exec(open(file_to_use).read())
    print(f"Successfully executed {file_to_use}")
except Exception as e:
    print(f"Error executing {file_to_use}: {e}")

FileNotFoundError: Neither clean_filter.py nor clean_and_filter.py found

In [1]:
import os
print("Files in current directory:", os.listdir('.'))

Files in current directory: ['LDA.IPYNB', '.DS_Store', 'cache', 'Final_NLP_v2 (4).ipynb', 'clean_filter.py', '.git', 'Final_NLP_v2.ipynb']


In [2]:
import sys
import os

# Print current working directory to verify
print("Current working directory:", os.getcwd())

# Make sure the script is in the Python path
script_path = os.path.abspath('clean_filter.py')
print("Script path:", script_path)
print("Script exists:", os.path.exists(script_path))

# Import the module instead of using exec
if os.path.exists('clean_filter.py'):
    try:
        import clean_filter
        print("Successfully imported clean_filter module")
    except Exception as e:
        print(f"Error importing clean_filter: {e}")
else:
    print("clean_filter.py not found in specified path")

Current working directory: /Users/casey/Documents/GitHub/AI_impact_employment
Script path: /Users/casey/Documents/GitHub/AI_impact_employment/clean_filter.py
Script exists: True
Starting data preprocessing...
Loading dataset...
Dataset loaded successfully. Shape: (200083, 5)
Cleaning and processing text...


  text = BeautifulSoup(text, "html.parser").get_text()


After removing rows with invalid dates: 200083 rows
Filtering for relevance...
After filtering for relevance: 184391 rows
Saving data to cache...
Saved cleaned_data.pkl to cache
Saved cleaned_data_for_lda.pkl to cache
Saved cleaned_data_minimal.pkl to cache
Data preprocessing complete!
Processed 200083 articles, with 184391 relevant articles saved to cache
You can now run LDA.py for topic modeling
Successfully imported clean_filter module


In [3]:
if __name__ == "__main__":
    # Try loading from each of the possible cache files
    df_clean = None
    cache_files = ["cleaned_data_for_lda.pkl", "cleaned_data_minimal.pkl", "cleaned_data.pkl"]
    
    for cache_file in cache_files:
        print(f"Attempting to load {cache_file}...")
        try:
            df_clean = load_from_cache(cache_file)
            if df_clean is not None and 'cleaned_text' in df_clean.columns:
                print(f"Successfully loaded data from {cache_file}")
                print(f"DataFrame shape: {df_clean.shape}")
                print(f"Columns: {df_clean.columns.tolist()}")
                break
            else:
                if df_clean is None:
                    print(f"File {cache_file} not found or could not be loaded")
                else:
                    print(f"File {cache_file} loaded but 'cleaned_text' column is missing")
                df_clean = None
        except Exception as e:
            print(f"Error loading {cache_file}: {e}")
            df_clean = None
    
    if df_clean is None:
        print("ERROR: Could not load valid data from any cache file!")
        print("Please run clean_and_filter.py first to generate the necessary data files.")
        exit(1)
    
    # Verify we have the necessary data
    print(f"Data loaded with {len(df_clean)} articles")
    
    # Run topic modeling
    lda, doc_topics, feature_names, topic_terms = run_topic_modeling(
        df_clean, 
        num_topics=10, 
        force_recompute=True
    )
    
    if lda is None:
        print("ERROR: Topic modeling failed!")
        exit(1)
    
    # Add topic to each document
    df_clean['topic'] = doc_topics
    
    print("Creating topic visualizations...")
    
    # Plot top words for each topic
    fig1 = plot_topics(lda, feature_names, n_top_words=10, save_path="topic_words.png")
    
    # Plot topics over time
    fig2 = plot_topics_over_time(df_clean, doc_topics, topic_terms, top_n_topics=5, save_path="topics_over_time.png")
    
    # Save the enriched DataFrame with topics
    save_to_cache(df_clean, "data_with_topics.pkl")
    
    print("Topic modeling and visualization complete!")
    print("Saved topic visualization to 'topic_words.png'")
    print("Saved topics over time to 'topics_over_time.png'")
    print("Saved enriched data to cache as 'data_with_topics.pkl'")

Attempting to load cleaned_data_for_lda.pkl...
Error loading cleaned_data_for_lda.pkl: name 'load_from_cache' is not defined
Attempting to load cleaned_data_minimal.pkl...
Error loading cleaned_data_minimal.pkl: name 'load_from_cache' is not defined
Attempting to load cleaned_data.pkl...
Error loading cleaned_data.pkl: name 'load_from_cache' is not defined
ERROR: Could not load valid data from any cache file!
Please run clean_and_filter.py first to generate the necessary data files.


TypeError: object of type 'NoneType' has no len()