## Import Required Libraries
Add the necessary imports for TF-IDF processing:

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
import pickle

## Flatten Processed Data for TF-IDF Input
Flatten the label-keyed dictionary of tokenized tweets into two parallel lists: one space-joined document string per tweet, and the label that tweet belongs to.

In [2]:
def prepare_tfidf_input(processed_tweets):
    """
    Turns a label -> tokenized-tweets mapping into two parallel lists.

    Each tweet's tokens are joined with single spaces so the tweet becomes one
    document string; the label it was stored under is recorded at the same
    position in the labels list.

    Args:
        processed_tweets (dict): Maps each label to a list of tweets, where every
            tweet is itself a list of string tokens.
    Returns:
        tuple: (list of document strings, list of labels), aligned index by index
            and ordered by the dictionary's iteration order.
    """
    # Build (document, label) pairs in one pass, then split them apart.
    doc_label_pairs = [
        (" ".join(tokens), label)
        for label, tweet_list in processed_tweets.items()
        for tokens in tweet_list
    ]
    documents = [text for text, _ in doc_label_pairs]
    labels = [label for _, label in doc_label_pairs]
    return documents, labels


## Compute TF-IDF
Use TfidfVectorizer to transform the documents into a sparse TF-IDF matrix

In [3]:
def compute_tfidf(documents):
    """
    Fits a TF-IDF model on the documents and returns their weighted term matrix.

    Args:
        documents (list): One string per document (tokens already joined by spaces).

    Returns:
        tuple: (TF-IDF matrix produced by ``fit_transform``,
                feature names produced by ``get_feature_names_out``)
    """
    # NOTE(review): the vectorizer is built with its default settings, so it
    # applies its own tokenization to the joined strings rather than reusing
    # the upstream tokens as-is -- confirm that is intended.
    tfidf_model = TfidfVectorizer()
    weighted_terms = tfidf_model.fit_transform(documents)
    return weighted_terms, tfidf_model.get_feature_names_out()


## Save TF-IDF as a CSV File
Convert the sparse TF-IDF matrix to a dense DataFrame, append the document labels as a `Class` column, and save the result to a CSV file.

In [4]:
def save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_path):
    """
    Saves the TF-IDF matrix to a CSV file, one row per document.

    The written table has one column per feature followed by a "Class" column
    holding the document labels. No index column is written.

    Args:
        tfidf_matrix (sparse matrix or array-like): Computed TF-IDF matrix. An
            object exposing ``toarray()`` is densified first; anything else is
            handed to pandas unchanged.
        feature_names (list): List of feature names (vocabulary), one per column.
        labels (list): List of document labels, one per row.
        output_path (str): Path to save the CSV file. Missing parent directories
            are created.

    Raises:
        ValueError: If a feature is itself named "Class" (the label column would
            silently overwrite it), or if pandas rejects the input shapes.
    """
    label_column = "Class"

    # Materialise the names once so the membership test and the DataFrame
    # constructor see the same sequence (feature_names may be a NumPy array).
    columns = list(feature_names)
    if label_column in columns:
        raise ValueError(
            f"Feature name {label_column!r} collides with the label column; "
            "rename the feature before saving."
        )

    # Convert sparse matrix to dense (dense input passes straight through)
    # and create DataFrame. Densifying allocates rows x features values.
    if hasattr(tfidf_matrix, "toarray"):
        tfidf_dense = tfidf_matrix.toarray()
    else:
        tfidf_dense = tfidf_matrix
    tfidf_df = pd.DataFrame(tfidf_dense, columns=columns)

    # Add labels as a "Class" column
    tfidf_df[label_column] = labels

    # Make sure the destination directory exists so the function works on its
    # own. Only real filesystem paths are touched; a path without a directory
    # component (e.g. "out.csv") needs nothing created.
    if isinstance(output_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save to CSV
    tfidf_df.to_csv(output_path, index=False)
    print(f"TF-IDF matrix saved to {output_path}")

In [5]:
# Driver cell: load the preprocessed tweets, build the TF-IDF matrix, and export it.
# Both file paths below are relative, so they resolve against the kernel's
# current working directory.

# Step 1: Prepare input data

# Load processed_tweets from the pickle file
# NOTE: unpickling can execute arbitrary code; only load a file produced by a
# trusted step of this project.
with open("data/processed_tweets.pkl", "rb") as file:
    processed_tweets = pickle.load(file)

print("Processed tweets loaded successfully")
# Flatten the label -> tweets mapping into parallel document/label lists.
documents, labels = prepare_tfidf_input(processed_tweets)

# Step 2: Compute TF-IDF matrix
tfidf_matrix, feature_names = compute_tfidf(documents)

# Step 3: Save TF-IDF matrix to CSV
output_csv_path = "reports/tfidf_values.csv"
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)  # Create 'reports/' if it doesn't exist
save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_csv_path)


Processed tweets loaded successfully
TF-IDF matrix saved to reports/tfidf_values.csv
