# In [None]:

import os
import pandas as pd
import spacy
import matplotlib.pyplot as plt

# spaCy English model; its part-of-speech tags drive the word filter below
nlp = spacy.load("en_core_web_sm")

# Input location of the per-day files and destination of everything we write
input_folder = "steemit_tsv_filtered_output"
output_folder = "results"
os.makedirs(output_folder, exist_ok=True)  # no-op when the folder is already there
output_file = os.path.join(output_folder, "daily_results.csv")

# Inclusive bounds on how many words a day's shortlist should contain
magic_threshold_min, magic_threshold_max = 5, 100

# Coarse POS tags a token must carry to be counted
VALID_POS_TAGS = {"PROPN", "VERB", "NOUN", "ADJ"}

# Tokenise a text with spaCy and keep only the lowercased, alphabetic content words
def extract_valid_words(text):
    """Return the lowercased tokens of *text* that pass the POS and alphabetic filters.

    A token is kept when its coarse part-of-speech tag is in ``VALID_POS_TAGS``
    and ``token.is_alpha`` is true. Token order and duplicates are preserved.
    """
    kept = []
    for token in nlp(text):
        if token.pos_ not in VALID_POS_TAGS:
            continue
        if not token.is_alpha:
            continue
        kept.append(token.text.lower())
    return kept

# Step 1: Corpus-wide word counts, accumulated over every input file
f_total = {}


def build_total_frequencies():
    """Add the valid-word counts of every ``.csv`` file in ``input_folder`` to ``f_total``.

    Each file is read with pandas (malformed rows are skipped) and the
    non-null entries of its ``concatenated_text`` column are passed through
    ``extract_valid_words``. ``f_total`` is updated in place; nothing is
    returned.
    """
    csv_names = [name for name in os.listdir(input_folder) if name.endswith(".csv")]
    for name in csv_names:
        frame = pd.read_csv(
            os.path.join(input_folder, name),
            sep=",",
            on_bad_lines='skip',
            engine="python",
        )
        texts = frame['concatenated_text'].dropna()
        for text in texts:
            for word in extract_valid_words(text):
                if word in f_total:
                    f_total[word] += 1
                else:
                    f_total[word] = 1

# Step 2-9: Process daily data and compute required values
def process_daily_data():
    """Score each day's words against the corpus totals, then shortlist, plot and save.

    For every ``.csv`` file in ``input_folder`` (visited in sorted name order):

    1. count the valid words in its ``concatenated_text`` column;
    2. score each word as ``daily count / total count`` using ``f_total``;
    3. raise a cut-off from 0.50 to 1.00 in steps of 0.01 and stop at the
       first one whose shortlist size lies within
       ``[magic_threshold_min, magic_threshold_max]``; if no cut-off
       qualifies, the shortlist for cut-off 1.00 is kept;
    4. for the first five days that have any words, save a bar plot of the
       100 best-scoring words to ``output_folder``.

    The per-day shortlists are finally written to ``daily_results.csv`` in
    ``output_folder``. Nothing is returned.

    ``build_total_frequencies()`` must have run beforehand so that ``f_total``
    holds an entry for every word encountered here.

    Raises:
        KeyError: if a word is missing from ``f_total`` or a file has no
            ``concatenated_text`` column.
    """
    daily_results = []
    plot_count = 0  # Counter to limit the number of bar plots to 5
    thresholds = [x / 100 for x in range(50, 101)]  # same for every file

    # Sorted so that the CSV row order and the choice of plotted days do not
    # depend on the arbitrary order in which os.listdir returns entries.
    for file in sorted(os.listdir(input_folder)):
        if not file.endswith(".csv"):
            continue

        filepath = os.path.join(input_folder, file)
        date = file.split(".")[0]  # file name up to its first dot
        df = pd.read_csv(filepath, sep=",", on_bad_lines='skip', engine="python")

        # Calculate daily word frequencies
        f_daily = {}
        for text in df['concatenated_text'].dropna():
            for word in extract_valid_words(text):
                f_daily[word] = f_daily.get(word, 0) + 1

        # Share of a word's total occurrences that fall on this day
        score_daily = {word: count / f_total[word] for word, count in f_daily.items()}

        # Shortlist words: the list shrinks as the cut-off rises, so stop at
        # the first cut-off whose shortlist size is inside the allowed range.
        sorted_words = sorted(score_daily.items(), key=lambda x: x[1], reverse=True)
        shortlisted_words = []
        for threshold in thresholds:
            shortlisted_words = [word for word, score in sorted_words if score >= threshold]
            if magic_threshold_min <= len(shortlisted_words) <= magic_threshold_max:
                break

        # Append to results
        daily_results.append({"date": date, "words": shortlisted_words})

        # Plot top 100 words (limited to 5 plots). A day without any valid
        # word is skipped: zip(*[]) yields nothing to unpack into two names.
        if plot_count < 5 and sorted_words:
            top_100 = sorted_words[:100]
            words, scores = zip(*top_100)
            plt.figure(figsize=(12, 6))
            plt.bar(words, scores)
            plt.xticks(rotation=90)
            plt.title(f"Top 100 Words for {date}")
            plt.xlabel("Words")
            plt.ylabel("Scores")
            plt.tight_layout()
            plot_path = os.path.join(output_folder, f"top_100_words_{date}.png")
            plt.savefig(plot_path)
            plt.close()
            print(f"Saved bar plot: {plot_path}")
            plot_count += 1

    # Save daily results to CSV. This has to happen inside the function:
    # daily_results is a local and does not exist at module level.
    if daily_results:
        results_path = os.path.join(output_folder, "daily_results.csv")
        try:
            daily_results_df = pd.DataFrame(daily_results)  # Convert to DataFrame
            daily_results_df.to_csv(results_path, index=False)  # Save to CSV
            print(f"Daily results saved to: {results_path}")
        except Exception as e:
            # Best effort: report the failure instead of aborting the run
            print(f"Error saving daily_results.csv: {e}")
    else:
        print("No daily results to save.")

# Run the pipeline in order: the corpus totals must exist before the daily
# pass, because the daily scores divide by them.
for step in (build_total_frequencies, process_daily_data):
    step()

print("Processing complete.")
