In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import spacy

# Load spaCy's "en_core_web_sm" pipeline once at module level;
# tokenize_and_filter() runs it on every text and relies on its POS tags.
nlp = spacy.load("en_core_web_sm")

# Function to tokenize and filter words based on POS tags and additional rules
def tokenize_and_filter(text):
    """
    Run the spaCy pipeline over ``text`` and return its content words.

    A token is kept when it is tagged as a verb, adjective or noun, does not
    carry a pronoun tag (PRP / PRP$), is longer than one character and is
    made up solely of alphabetic characters. Kept tokens are lower-cased and
    returned in document order.
    """
    wanted_pos = {"VERB", "ADJ", "NOUN"}
    pronoun_tags = {"PRP", "PRP$"}

    kept = []
    for tok in nlp(text):
        # Part-of-speech gate: only verbs, adjectives and nouns pass
        if tok.pos_ not in wanted_pos:
            continue
        # Drop anything carrying a pronoun fine-grained tag
        if tok.tag_ in pronoun_tags:
            continue
        surface = tok.text
        # Discard single characters and tokens with digits/punctuation
        if len(surface) <= 1 or not surface.isalpha():
            continue
        kept.append(surface.lower())
    return kept

# Function to update the word frequency dictionary
def update_word_count(word_counter, tokens):
    """
    Fold a batch of tokens into a running frequency tally.

    ``word_counter`` is modified in place via its ``update`` method;
    nothing is returned.
    """
    word_counter.update(tokens)

# Function to plot and save a bar chart from a dictionary
def save_bar_plot(data_dict, x_label, y_label, title, output_path, show=False,
                  top_n=10):
    """
    Plot the highest-valued entries of a dictionary as a horizontal bar chart
    and save it as an image file.

    The bars are horizontal: the dictionary keys are placed on the y-axis and
    their values on the x-axis, so ``x_label`` names the value axis and
    ``y_label`` names the key axis.

    Args:
        data_dict: Mapping of label -> numeric value (e.g. word -> count).
        x_label: Label for the x-axis (the values).
        y_label: Label for the y-axis (the keys).
        title: Title of the chart.
        output_path: File path the image is written to.
        show: If True, also display the plot after saving it.
        top_n: Maximum number of entries to plot, taken in descending order
            of value. Defaults to 10.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Sort by value (descending) and keep only the leading entries; sorted()
    # is stable, so ties keep the dictionary's iteration order.
    top_items = sorted(
        data_dict.items(), key=lambda item: item[1], reverse=True
    )[:top_n]
    # zip(*[]) cannot be unpacked into two names, so handle the empty case
    words, counts = zip(*top_items) if top_items else ([], [])

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(words, counts, color="skyblue")
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.invert_yaxis()  # Largest value at the top
        fig.tight_layout()

        # Save the plot to the specified file path
        fig.savefig(output_path)

        # Optionally show the plot
        if show:
            plt.show()
    finally:
        # Always release the figure, even if plotting or saving failed;
        # otherwise figures accumulate when this is called in a loop.
        plt.close(fig)

# Paths for input and output
output_folder = "steemit_tsv_filtered_output"
output_file = "word_frequencies.csv"
plots_folder = "bar_plots"

# Ensure the output folder for plots exists
os.makedirs(plots_folder, exist_ok=True)

# Corpus-wide word counts, accumulated across every input file
word_counter = Counter()

# Select the input files up front and sort them: os.listdir() returns entries
# in arbitrary order and may contain non-CSV entries, so enumerating it
# directly would make "the first file" unpredictable (or never a CSV at all).
csv_file_names = sorted(
    name for name in os.listdir(output_folder) if name.endswith(".csv")
)

for i, file_name in enumerate(csv_file_names):
    file_path = os.path.join(output_folder, file_name)

    # Read the file
    df = pd.read_csv(file_path)

    # Word counts for the current file only (drives its bar plot)
    file_word_counter = Counter()

    # Tokenize and filter the "text" column, skipping missing values
    for text in df["text"].dropna():
        tokens = tokenize_and_filter(text)
        update_word_count(word_counter, tokens)
        update_word_count(file_word_counter, tokens)

    # Generate and save a bar plot for the current file
    plot_file_name = f"{os.path.splitext(file_name)[0]}_bar_plot.png"
    plot_file_path = os.path.join(plots_folder, plot_file_name)

    # The chart is horizontal, so counts run along the x-axis and the words
    # are listed on the y-axis.
    save_bar_plot(
        file_word_counter,
        x_label="Count",
        y_label="Word",
        title=f"Top Words in {file_name}",
        output_path=plot_file_path,
        show=(i == 0),  # Show only for the first CSV file
    )

# Write the aggregated totals: one row per distinct word, most frequent
# first. Writing per-text counts would repeat the same word on many rows.
# Tokens are purely alphabetic, so no CSV quoting is needed, but they may be
# non-ASCII, hence the explicit encoding.
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.write("word,count\n")  # Write header
    outfile.writelines(
        f"{word},{count}\n" for word, count in word_counter.most_common()
    )

print(f"Word frequencies saved to '{output_file}'.")
print(f"Bar plots saved to the '{plots_folder}' folder.")