In [1]:
import os
import re
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Where the performer pages live, and the genre -> page-texts accumulator
folder_path = '/Users/aya/Desktop/Skole/social_graps/Aya/shared/performer_files'
genre_documents = defaultdict(list)  # each key maps to a list of cleaned page texts

# English stop words, kept in a set so the per-token membership test is cheap.
# NOTE(review): process_file() strips every non-word, non-space character
# before tokenization, so contractions presumably arrive as "dont", "youre",
# ... and will not match apostrophe-spelled entries of this list -- confirm
# whether that matters for the analysis.
stop_words = {*stopwords.words('english')}

# Helper function to process each file
def process_file(file_path):
    """Read one performer page and return its cleaned text and linked genres.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(text, genres)`` tuple. ``text`` is the lower-cased file content
        with every character that is neither a word character nor whitespace
        removed. ``genres`` is a list of the lower-cased targets of
        ``[[...]]`` links that consist only of word characters and
        whitespace, in first-seen order and without duplicates.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding. Undecodable bytes become U+FFFD, which the
    # punctuation filter below removes, so one bad byte cannot abort the run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        lowered = file.read().lower()

    # Extract genres from each Wikipedia page (adjust regex if needed).
    # dict.fromkeys drops repeats while keeping order: a page that links the
    # same genre several times must not be counted several times for it.
    genres = list(dict.fromkeys(re.findall(r'\[\[([\w\s]+)\]\]', lowered)))

    # Remove punctuation (the text is already lower-cased)
    text = re.sub(r'[^\w\s]', '', lowered)

    return text, genres

# Step 1: Load and aggregate text by genre (parallelized)
# Only regular files are processed; sub-directories returned by os.listdir
# would make open() fail. Sorting makes the load order, and therefore
# tie-breaking among equally frequent words later on, reproducible.
file_paths = [
    path
    for path in (os.path.join(folder_path, filename)
                 for filename in sorted(os.listdir(folder_path)))
    if os.path.isfile(path)
]

# Threads instead of processes: a process pool requires the workers to import
# process_file from __main__, which does not work when this code runs in an
# interactive session such as a notebook cell. Each task is one file read plus
# two regex passes, so a thread pool is sufficient here.
with ThreadPoolExecutor() as executor:
    # executor.map yields results in submission order and re-raises any
    # exception from a worker when its result is reached.
    for text, genres in executor.map(process_file, file_paths):
        # Collapse repeated genre links so a page contributes its text to a
        # genre at most once.
        for genre in dict.fromkeys(genres):
            genre_documents[genre].append(text)

# Step 2: Build a term-frequency (TF) list for every genre
tf_lists = {}
for genre in genre_documents:
    # Treat all pages of the genre as one large document
    genre_text = ' '.join(genre_documents[genre])

    # Tokenize and count in a single pass, skipping stop words
    frequencies = Counter(
        word for word in word_tokenize(genre_text) if word not in stop_words
    )

    # Discard words seen fewer than 5 times, then keep the 15 most frequent
    frequent = Counter({word: n for word, n in frequencies.items() if n >= 5})
    tf_lists[genre] = dict(frequent.most_common(15))

# Step 3: Print each genre's top words, followed by a separator line
separator = "\n" + "="*40 + "\n"
for genre in tf_lists:
    print(f"Top words for genre '{genre}':")
    for word, count in tf_lists[genre].items():
        print(f"{word}: {count}")
    print(separator)


[nltk_data] Downloading package punkt to /Users/aya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/aya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 