In [1]:
import pandas as pd
import os
import spacy
import concurrent.futures
from tqdm import tqdm
import json

# Input folder and output file.
# batch_one_folder is listed for ".csv" files by the script below and is joined
# with each file name inside compute_frequencies.
batch_one_folder = "Batch_one"
# Path the aggregated word counts are written to as JSON.
output_file = "word_frequencies.json"

# spaCy pipeline used by extract_valid_words to tokenize and POS-tag text.
nlp = spacy.load("en_core_web_sm")

# Coarse POS tags (compared against token.pos_) of the tokens that are counted.
VALID_POS_TAGS = {"ADJ", "NOUN", "PROPN", "VERB"}

# Inclusive bounds checked in process_files_in_chunks against a word's count
# within a single file; a file's count outside this range is not added to the
# running total.
magic_threshold_min = 5
magic_threshold_max = 100

# Chunking parameters
num_files_per_chunk = 50  # Number of files submitted to one process pool at a time


def extract_valid_words(text):
    """Return the lowercased, purely alphabetic tokens of ``text`` whose
    spaCy part-of-speech tag is listed in ``VALID_POS_TAGS``.

    Tokens are returned in document order and duplicates are kept.
    """
    kept = []
    for tok in nlp(text):
        # Skip tokens with an unwanted part of speech.
        if tok.pos_ not in VALID_POS_TAGS:
            continue
        # Skip anything that is not made of letters only (numbers, punctuation, ...).
        if not tok.is_alpha:
            continue
        kept.append(tok.text.lower())
    return kept

def compute_frequencies(file):
    """Compute word frequencies from a single CSV file in ``batch_one_folder``.

    Args:
        file: Name (not a full path) of a CSV file inside ``batch_one_folder``.
            Only its ``concatenated_text`` column is read.

    Returns:
        dict mapping each word returned by ``extract_valid_words`` to the
        number of times it occurred in the file. Processing is best effort:
        if reading or tagging fails, the error is printed and the counts
        gathered up to that point are returned, so the result may be partial
        or empty.
    """
    filepath = os.path.join(batch_one_folder, file)
    local_frequencies = {}
    try:
        # The context manager closes the chunked reader even when an error
        # interrupts iteration part-way through the file.
        with pd.read_csv(
            filepath,
            on_bad_lines='skip',
            engine="c",
            usecols=["concatenated_text"],
            # Read the column as text. With inferred dtypes a numeric-looking
            # or mixed-type chunk yields ints/floats, which spaCy rejects; that
            # exception would abort the remainder of the file.
            dtype={"concatenated_text": str},
            chunksize=100000,
        ) as reader:
            for chunk in reader:
                # Missing cells are still NaN with dtype=str, so drop them.
                for text in chunk["concatenated_text"].dropna():
                    for word in extract_valid_words(text):
                        local_frequencies[word] = local_frequencies.get(word, 0) + 1
    except Exception as e:
        # Deliberately broad: one unreadable file must not stop the batch.
        print(f"Error processing {file}: {e}")
    return local_frequencies

def process_files_in_chunks(files):
    """Aggregate word counts over ``files``, handling them in fixed-size groups.

    The files are split into consecutive groups of ``num_files_per_chunk``.
    Each group is mapped over ``compute_frequencies`` in its own process pool,
    and the per-file results are then merged into a single dictionary.

    Args:
        files: Sequence of file names accepted by ``compute_frequencies``.

    Returns:
        dict mapping word -> summed count. A file contributes a word's count
        only when that file's count for the word lies within
        [``magic_threshold_min``, ``magic_threshold_max``] (inclusive). The
        thresholds apply per file, so the summed total may exceed the maximum.
    """
    f_total = {}
    # Ceiling division: an exact multiple of the chunk size must not report
    # one chunk too many (and an empty list has zero chunks).
    total_chunks = (len(files) + num_files_per_chunk - 1) // num_files_per_chunk
    chunk_starts = range(0, len(files), num_files_per_chunk)
    for chunk_number, start in enumerate(chunk_starts, start=1):
        chunk_files = files[start:start + num_files_per_chunk]
        print(f"Processing chunk {chunk_number} of {total_chunks}...")
        with concurrent.futures.ProcessPoolExecutor(max_workers=32) as executor:
            # list() drains the lazy map so the progress bar advances and all
            # results are available before the pool shuts down.
            results = list(tqdm(executor.map(compute_frequencies, chunk_files), total=len(chunk_files), desc="Processing"))

        for local_freq in results:
            for word, count in local_freq.items():
                # Per-file filter: only counts inside the threshold window are added.
                if magic_threshold_min <= count <= magic_threshold_max:
                    f_total[word] = f_total.get(word, 0) + count

    return f_total

# Collect the CSV inputs found directly inside the batch folder
filtered_files = [name for name in os.listdir(batch_one_folder) if name.endswith(".csv")]
print(f"Total files to process: {len(filtered_files)}")

# Count words across every file, one group of files at a time
word_frequencies = process_files_in_chunks(filtered_files)

# Persist the full word -> count mapping as pretty-printed JSON
with open(output_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(word_frequencies, ensure_ascii=False, indent=4))

# Show the ten highest counts; words with equal counts keep dictionary order
sorted_words = sorted(word_frequencies.items(), key=lambda item: -item[1])[:10]
sample_df = pd.DataFrame(data=sorted_words, columns=["Word", "Frequency"])
print("\nSample Word Frequencies:")
print(sample_df)


Total files to process: 263
Processing chunk 1 of 6...


Processing: 100%|██████████| 50/50 [14:47<00:00, 17.74s/it]  


Processing chunk 2 of 6...


Processing: 100%|██████████| 50/50 [13:28<00:00, 16.17s/it]  


Processing chunk 3 of 6...


Processing: 100%|██████████| 50/50 [15:17<00:00, 18.35s/it]  


Processing chunk 4 of 6...


Processing: 100%|██████████| 50/50 [18:10<00:00, 21.82s/it]  


Processing chunk 5 of 6...


Processing: 100%|██████████| 50/50 [16:58<00:00, 20.38s/it]  


Processing chunk 6 of 6...


Processing: 100%|██████████| 13/13 [06:38<00:00, 30.68s/it]



Sample Word Frequencies:
         Word  Frequency
0     doubler      23435
1        jpeg      22543
2  upvotebank      21099
3    rewarded      21077
4  aggregator      20987
5     minimum      19927
6        roll      19755
7        paid      19030
8   excellent      18926
9       tried      18890
