In [9]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

def load_text_files(directory):
    """
    Load text files from a directory and return a list of their contents.

    Only regular files whose name ends in ".txt" are read; sub-directories
    and other entries are skipped. Files are read in sorted filename order
    so that the returned list (and anything derived from it, such as the
    row order of a TF-IDF matrix) is reproducible between runs.

    Parameters:
    directory (str): The path to the directory containing text files.

    Returns:
    list: A list containing the contents of each text file, ordered by
        filename. Empty if the directory holds no ".txt" files.

    Raises:
    OSError: If the directory cannot be listed or a file cannot be opened.
    UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip non-text entries and directories that merely end in ".txt".
        if not filename.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

def tfidf_vectorize(directory):
    """
    Build a TF-IDF representation of every text file found in a directory.

    The documents are read with load_text_files(), the number of documents
    is printed, and a default-configured TfidfVectorizer is fitted on them.

    Parameters:
    directory (str): The path to the directory containing text files.

    Returns:
    tuple: (matrix, names) where matrix is the result of
        TfidfVectorizer.fit_transform() on the documents and names is the
        result of get_feature_names_out() on the fitted vectorizer.
    """
    corpus = load_text_files(directory)
    print(len(corpus), "documents to be vectorized...")

    # Fit the vocabulary and transform the corpus in one pass.
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(corpus)

    # Feature names are only available once the vectorizer has been fitted.
    return weights, tfidf.get_feature_names_out()

# Example usage:
# Vectorize every .txt file found in the 'data' folder (for example, sharks.txt).
# Add more text files to that folder and re-run this cell to vectorize them all.

directory_path = 'data'
tfidf_matrix, feature_names = tfidf_vectorize(directory_path)

# Print the TF-IDF matrix, converted with .toarray() for display
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())
# Print the feature names returned alongside the matrix
print("Feature Names:\n", feature_names)


4 documents to be vectorized...
TF-IDF Matrix:
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.03152452 0.04938925 ... 0.         0.         0.        ]
 [0.         0.06557669 0.         ... 0.         0.03424618 0.03424618]
 [0.02754436 0.01758121 0.         ... 0.02754436 0.         0.        ]]
Feature Names:
 ['000' '10' '11' '12' '13' '14' '1442' '1569' '16th' '17' '1970' '200'
 '33' '359' '40' '419' '444' '458' '600' '71' '88' 'abortion' 'actively'
 'activities' 'adaptive' 'addition' 'adults' 'after' 'again' 'agaleus'
 'ago' 'aid' 'aids' 'all' 'alopiidae' 'also' 'although' 'ampullae' 'an'
 'anatomy' 'ancient' 'and' 'another' 'any' 'apart' 'apex' 'apparatus'
 'appeared' 'appendages' 'applied' 'approximately' 'aquatic' 'are' 'armor'
 'around' 'arteriosus' 'as' 'assumed' 'at' 'atherion' 'back' 'balance'
 'bars' 'batoidea' 'be' 'bearing' 'became' 'beckington' 'been' 'behaviour'
 'behind' 'being' 'believe' 'between' 'biologists' 'birth' 'bladd