In [8]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

def load_text_files(directory):
    """
    Load the ``.txt`` files in a directory and return a list of their contents.

    Files are read in sorted filename order so the returned list (and
    anything built from it, such as the rows of a TF-IDF matrix) is
    reproducible; ``os.listdir`` on its own returns entries in arbitrary
    order. Only regular files directly inside ``directory`` are read:
    sub-directories are not searched, and a directory whose name happens to
    end in ``.txt`` is skipped instead of crashing ``open``.

    Parameters:
    directory (str): The path to the directory containing text files.

    Returns:
    list: The contents of each text file as a string (decoded as UTF-8),
        ordered by filename. Empty if no matching file exists.

    Raises:
    OSError: If the directory cannot be listed or a file cannot be opened.
    UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    # sorted() pins the document order; listdir order is filesystem-dependent.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(directory, filename)
        # Guard against directories (or other non-files) named like '*.txt'.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

def tfidf_vectorize(directory):
    """
    Perform TF-IDF vectorization on text files in a specified directory.

    The documents are loaded with ``load_text_files`` and passed to a
    ``TfidfVectorizer`` created with its default settings.

    Parameters:
    directory (str): The path to the directory containing text files.

    Returns:
    tuple: ``(tfidf_matrix, feature_names)`` where ``tfidf_matrix`` is the
        result of ``TfidfVectorizer.fit_transform`` on the loaded documents
        and ``feature_names`` is the result of ``get_feature_names_out()``.

    Raises:
    ValueError: If no text files were found in ``directory``. Raised
        explicitly here so the message names the real cause instead of
        surfacing as a vectorizer failure on an empty corpus.
    """
    # Load text documents
    documents = load_text_files(directory)

    # Fail early with an actionable message; fitting on an empty corpus
    # cannot produce a vocabulary.
    if not documents:
        raise ValueError(f"No .txt files found in directory: {directory!r}")

    # Fit the vectorizer and transform the documents in one pass
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Feature names come from the vocabulary learned during fitting
    feature_names = vectorizer.get_feature_names_out()

    return tfidf_matrix, feature_names

# Example usage:
# Here the folder 'data' contains a single text file named sharks.txt.
# Add more .txt files to the 'data' folder to vectorize them all together.

directory_path = 'data'
tfidf_matrix, feature_names = tfidf_vectorize(directory_path)

# Print the TF-IDF matrix (converted with toarray() so the values are shown)
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())
# Print the feature names returned alongside the matrix
print("Feature Names:\n", feature_names)


TF-IDF Matrix:
 [[0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.03335651 0.01667825 0.01667825 0.03335651
  0.18346078 0.01667825 0.01667825 0.01667825 0.01667825 0.23349554
  0.01667825 0.15010428 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.03335651
  0.08339126 0.03335651 0.01667825 0.01667825 0.01667825 0.03335651
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.03335651
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.01667825 0.01667825 0.01667825 0.01667825 0.01667825 0.01667825
  0.03335651 0.01667825 0.016678