In [5]:
import os
import re
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from concurrent.futures import ThreadPoolExecutor
import nltk

# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Define path and stop words
folder_path = '/Users/aya/Desktop/Skole/social_graps/Aya/shared/performer_files'
stop_words = set(stopwords.words('english'))

# Dictionary to aggregate text by genre
genre_texts = defaultdict(list)

# Helper function to process each file
def process_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read().lower()
        
        # Extract genres from each Wikipedia page
        genres = re.findall(r'\[\[([\w\s]+)\]\]', text)
        
        # Clean text by removing punctuation
        text = re.sub(r'[^\w\s]', '', text)
        
        return text, genres

# Step 1: Use ThreadPoolExecutor to read and aggregate text by genre
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_file, os.path.join(folder_path, filename)) 
               for filename in os.listdir(folder_path)]
    
    for future in futures:
        text, genres = future.result()
        for genre in genres:
            genre_texts[genre].append(text)

# Step 2: Select the 15 most common genres for term frequency analysis
genre_counts = Counter({genre: sum(len(text.split()) for text in texts) for genre, texts in genre_texts.items()})
top_15_genres = [genre for genre, _ in genre_counts.most_common(15)]
top_15_texts = {genre: ' '.join(genre_texts[genre]) for genre in top_15_genres}

# Step 3: Create Term Frequency (TF) lists
tf_lists = {}
for genre, combined_text in top_15_texts.items():
    # Tokenize, remove stop words, and filter out rare words
    tokens = [word for word in word_tokenize(combined_text) if word not in stop_words]
    token_counts = Counter(tokens)
    filtered_counts = {word: count for word, count in token_counts.items() if count >= 5}

    # Get the top 15 words by frequency
    top_words = dict(Counter(filtered_counts).most_common(15))
    
    # Store the TF list for this genre
    tf_lists[genre] = top_words

# Step 4: Display the top 15 words for each genre
for genre, tf_list in tf_lists.items():
    print(f"Top words for genre '{genre}':")
    for word, count in tf_list.items():
        print(f"{word}: {count}")
    print("\n" + "="*40 + "\n")


[nltk_data] Downloading package punkt to /Users/aya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/aya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top words for genre 'rolling stone':
music: 57392
album: 49182
country: 48360
web: 32188
swift: 31205
urlstatuslive: 29623
new: 28138
ref: 26491
2023: 24872
magazine: 23930
first: 23841
released: 22693
song: 22177
awards: 20243
songs: 20055


Top words for genre 'allmusic':
country: 60435
music: 45468
album: 42675
web: 24654
songs: 19544
released: 19371
also: 18563
new: 17716
song: 17189
first: 17164
ref: 15901
records: 15769
single: 15665
year: 14768
albums: 13879


Top words for genre 'the new york times':
music: 18659
country: 15288
album: 15036
web: 11563
new: 10920
urlstatuslive: 10336
swift: 9901
news: 9826
2023: 9165
ref: 8883
first: 8480
magazine: 8215
song: 7553
award: 7552
released: 7180


Top words for genre 'country music':
country: 29791
music: 21363
album: 16217
scoperow: 9972
year: 9373
web: 8581
released: 8375
single: 8108
rowspan2: 7839
records: 7468
song: 7466
aligncenter: 6956
new: 6948
first: 6946
styletextaligncenter: 6686


Top words for genre 'entertainment weekl