## Import Required Libraries
Add the necessary imports for TF-IDF processing:

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
import pickle
import logging

# Configure the root logger so the INFO-level progress messages emitted by the
# functions below are shown, each prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## Flatten Processed Data for TF-IDF Input
Prepare the data by flattening the label-to-tweets dictionary into a list of document strings and a parallel list of their labels.

In [2]:
def prepare_tfidf_input(processed_tweets):
    """
    Turns a label -> tokenized-tweets mapping into two parallel lists.

    Each tweet's tokens are joined with single spaces to form one document
    string, and the label it was stored under is recorded at the same index.

    Args:
        processed_tweets (dict): Dictionary with labels as keys and lists of tokenized tweets as values.
    Returns:
        tuple: (list of documents, list of labels), ordered by dictionary
            iteration order and then by tweet position within each label.
    """
    # Build (document, label) pairs in one pass, then split them apart.
    pairs = [
        (" ".join(tokens), label)  # TF-IDF expects one string per document
        for label, tweets in processed_tweets.items()
        for tokens in tweets
    ]
    documents = [text for text, _ in pairs]
    labels = [tag for _, tag in pairs]

    logging.info(f"Prepared {len(documents)} documents and {len(set(labels))} unique labels for TF-IDF vectorization.")
    return documents, labels

## Compute TF-IDF
Use TfidfVectorizer to transform the documents into a sparse TF-IDF matrix

In [3]:
def compute_tfidf(documents, max_features=15000, max_df=0.98, min_df=0.0002, ngram_range=(1, 1)):
    """
    Computes the TF-IDF matrix for the given documents.

    The vectorizer settings are exposed as keyword arguments so they can be
    tuned without editing this function; the defaults reproduce the values
    that were previously hard-coded.

    Args:
        documents (list): List of tokenized and preprocessed documents as strings.
        max_features (int): Upper bound on the vocabulary size kept by the vectorizer.
        max_df (float or int): Terms with a document frequency above this
            threshold are dropped (a float is a proportion of documents).
        min_df (float or int): Terms with a document frequency below this
            threshold are dropped (a float is a proportion of documents).
        ngram_range (tuple): (min_n, max_n) range of n-gram sizes to extract.
            The default (1, 1) extracts unigrams only; pass (1, 2) to also
            include bigrams.

    Returns:
        tuple: (TF-IDF matrix, feature names)
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram_range,  # (1, 1) = unigrams only
    )
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Log the number of features after vectorization
    logging.info(f"Computed TF-IDF matrix with {tfidf_matrix.shape[1]} features (vocabulary size).")
    return tfidf_matrix, feature_names


## Save TF-IDF as a CSV File
Convert the sparse TF-IDF matrix to a dense table, append the labels as a `Class` column, and save the result to a CSV file.

In [4]:
def save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_path):
    """
    Saves the TF-IDF matrix to a CSV file.

    The matrix is converted to a dense array, wrapped in a DataFrame with one
    column per feature, and a trailing "Class" column holding the labels is
    added. The parent directory of ``output_path`` is created if it does not
    exist.

    Args:
        tfidf_matrix (sparse matrix): Computed TF-IDF matrix.
        feature_names (list): List of feature names (vocabulary).
        labels (list): List of document labels.
        output_path (str): Path to save the CSV file. May be a bare filename,
            in which case the file is written to the current working directory.
    """
    # Convert sparse matrix to dense format and create DataFrame.
    # NOTE: this materializes the full rows x features array in memory.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    # Add labels as a "Class" column
    tfidf_df["Class"] = labels

    # Ensure the output directory exists. os.path.dirname() returns "" for a
    # bare filename, and os.makedirs("") raises FileNotFoundError, so only
    # create the directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    tfidf_df.to_csv(output_path, index=False)
    logging.info(f"TF-IDF matrix saved to '{output_path}'.")

In [5]:
# Step 1: Prepare input data
# Load processed_tweets from the pickle file (path is relative to the
# notebook's working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a file produced by a trusted earlier step of this project.
with open("../data/processed_tweets.pkl", "rb") as file:
    processed_tweets = pickle.load(file)

logging.info("Processed tweets loaded successfully.")
# Flatten the label -> tweets mapping into parallel document/label lists.
documents, labels = prepare_tfidf_input(processed_tweets)

# Step 2: Compute TF-IDF matrix
tfidf_matrix, feature_names = compute_tfidf(documents)

# Step 3: Save TF-IDF matrix to CSV
# The labels are passed along so each row of the CSV carries its class.
output_csv_path = "../reports/tfidf_values.csv"
save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_csv_path)

2024-11-28 22:20:46,243 - INFO - Processed tweets loaded successfully.
2024-11-28 22:20:46,244 - INFO - Prepared 2999 documents and 3 unique labels for TF-IDF vectorization.
2024-11-28 22:20:46,256 - INFO - Computed TF-IDF matrix with 2682 features (vocabulary size).
2024-11-28 22:20:48,188 - INFO - TF-IDF matrix saved to '../reports/tfidf_values.csv'.
