## Import Required Libraries
Import the libraries needed for TF-IDF processing and configure logging:

In [1]:
import os
import pickle
import logging
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Configure the root logger: INFO threshold, with each record prefixed by
# its timestamp and severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


## Load Preprocessed Data for TF-IDF Input
Load the preprocessed documents and their associated labels from a pickle file.

In [None]:
def load_preprocessed_data(pickle_path):
    """
    Load preprocessed data (texts and labels) from a pickle file.

    The unpickled object must support lookup by the keys "texts" and
    "labels", and the two entries must have the same length (one label
    per text).

    Security note: ``pickle.load`` can execute arbitrary code during
    deserialization, so only pass paths to files produced by a trusted
    pipeline.

    Args:
        pickle_path (str): Path to the pickle file.

    Returns:
        tuple: (texts, labels) exactly as stored under the "texts" and
        "labels" keys of the pickled object.

    Raises:
        FileNotFoundError: If no file exists at ``pickle_path``.
        KeyError: If the payload has no "texts" or "labels" entry.
        ValueError: If the number of texts differs from the number of labels.
        Exception: Any other error raised while opening, unpickling, or
            reading the payload is logged and re-raised unchanged.
    """
    # Stage 1: read and deserialize the file.
    try:
        with open(pickle_path, "rb") as file:
            data = pickle.load(file)
    except FileNotFoundError:
        logging.error(f"Pickle file not found at {pickle_path}")
        raise
    except Exception as e:
        logging.error(f"Error loading pickle file: {str(e)}")
        raise

    # Stage 2: validate the payload. This is reported separately from
    # stage 1 so a malformed payload is not mistaken for an unreadable file.
    try:
        texts, labels = data["texts"], data["labels"]
    except Exception as e:
        logging.error(
            f"Unexpected payload in {pickle_path}: "
            f"expected 'texts' and 'labels' entries ({e!r})"
        )
        raise

    # Fail fast on misaligned data instead of after the TF-IDF fit, when the
    # labels are finally attached to the matrix rows.
    if len(texts) != len(labels):
        message = (
            f"Mismatched data in {pickle_path}: "
            f"{len(texts)} texts but {len(labels)} labels"
        )
        logging.error(message)
        raise ValueError(message)

    logging.info(f"Loaded preprocessed data from {pickle_path}")
    return texts, labels

## Compute TF-IDF
Use `TfidfVectorizer` to transform the documents into a sparse TF-IDF matrix.

In [None]:
def create_tfidf_matrix(documents, max_features=15000, max_df=0.99, min_df=0.00002, ngram_range=(1, 1)):
    """
    Fit a TF-IDF vectorizer on the documents and return the weighted matrix.

    Args:
        documents (list): Preprocessed documents, one string per document.
        max_features (int): Upper bound on the number of features kept.
        max_df (float): Maximum document frequency for a term (as a proportion).
        min_df (float): Minimum document frequency for a term (as a proportion).
        ngram_range (tuple): (min_n, max_n) n-gram sizes to extract,
            e.g. (1, 1) for unigrams only or (1, 2) for unigrams and bigrams.

    Returns:
        tuple: (TF-IDF sparse matrix, feature names as returned by the
        vectorizer's ``get_feature_names_out()``)

    Raises:
        Exception: Any error raised while building or fitting the vectorizer
            is logged and re-raised unchanged.
    """
    vectorizer_options = {
        "max_features": max_features,
        "max_df": max_df,
        "min_df": min_df,
        "ngram_range": ngram_range,
    }
    try:
        tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
        # Learn the vocabulary and weight the documents in one pass.
        weighted_matrix = tfidf_vectorizer.fit_transform(documents)
        vocabulary = tfidf_vectorizer.get_feature_names_out()
        logging.info(f"TF-IDF matrix created with {len(vocabulary)} features.")
    except Exception as e:
        logging.error(f"Error creating TF-IDF matrix: {str(e)}")
        raise
    return weighted_matrix, vocabulary


## Save TF-IDF as a CSV File
Convert the TF-IDF matrix to a dense table, add the labels as a `Class` column, and save it to a CSV file.

In [None]:
def save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_path):
    """
    Save the TF-IDF matrix along with feature names and labels to a CSV file.

    The matrix is converted to a dense array, wrapped in a DataFrame whose
    columns are the feature names, and given a trailing "Class" column that
    holds the labels. Missing parent directories of ``output_path`` are
    created; a bare filename (no directory component) is written to the
    current working directory.

    Note: densifying allocates rows x features values in memory, which can
    be large for big corpora or vocabularies.

    Args:
        tfidf_matrix (sparse matrix): Computed TF-IDF matrix; must provide
            ``toarray()``.
        feature_names (list): Feature names (vocabulary), one per matrix column.
        labels (list): Labels corresponding to the documents, one per matrix row.
        output_path (str): File path to save the CSV file.

    Raises:
        Exception: Any error raised while building or writing the table is
            logged and re-raised unchanged.
    """
    try:
        # Convert the sparse matrix to a dense format
        tfidf_dense = tfidf_matrix.toarray()

        # Create a DataFrame and add labels as a "Class" column
        df = pd.DataFrame(tfidf_dense, columns=feature_names)
        df["Class"] = labels

        # Ensure the output directory exists. dirname() is "" for a bare
        # filename, and os.makedirs("") raises FileNotFoundError, so only
        # create a directory when the path actually names one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save to CSV
        df.to_csv(output_path, index=False)
        logging.info(f"TF-IDF matrix saved to {output_path}")
    except Exception as e:
        logging.error(f"Error saving TF-IDF matrix to CSV: {str(e)}")
        raise


In [None]:
# Input and output locations, relative to the notebook's working directory
processed_tweets_path = "../data/processed_tweets.pkl"
output_csv_path = "../reports/tfidf_values.csv"

# Step 1: read the pickled documents and their labels
texts, labels = load_preprocessed_data(processed_tweets_path)

# Step 2: vectorize the documents using the function's default settings
tfidf_matrix, feature_names = create_tfidf_matrix(texts)

# Step 3: write the TF-IDF values and labels to the CSV report
save_tfidf_to_csv(tfidf_matrix, feature_names, labels, output_csv_path)


2024-11-28 23:43:21,227 - INFO - Processed tweets loaded successfully.
2024-11-28 23:43:21,228 - INFO - Prepared 2999 documents and 3 unique labels for TF-IDF vectorization.
2024-11-28 23:43:21,242 - INFO - Computed TF-IDF matrix with 2761 features.
2024-11-28 23:43:23,248 - INFO - TF-IDF matrix successfully saved to '../reports/tfidf_values.csv'.
