In [8]:
import os

# Read every .txt file in a directory into a list of document strings
def load_corpus(directory):
    """Return the full text of each ``.txt`` file found in ``directory``.

    Entries are visited in sorted filename order, opened as UTF-8 and read
    in one go. Names that do not end in ``.txt`` are ignored.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A list of strings, one per ``.txt`` file, in sorted filename order.
    """
    text_names = [name for name in sorted(os.listdir(directory)) if name.endswith('.txt')]
    documents = []
    for name in text_names:
        path = os.path.join(directory, name)
        with open(path, 'r', encoding='utf-8') as handle:
            documents.append(handle.read())
    return documents

# Load the corpus.
# The folder can be overridden with the NLP_CORPUS_DIR environment variable so
# the notebook can run on machines other than the one it was written on; when
# the variable is not set, the original hard-coded location is used.
directory = os.environ.get(
    'NLP_CORPUS_DIR',
    'C://Users//AjayG//Downloads//NLP Files//NLP Files//',
)
corpus = load_corpus(directory)

# Display the number of documents loaded
print(f"Number of documents loaded: {len(corpus)}")


Number of documents loaded: 6


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words: fit a CountVectorizer on the corpus to get its document-term matrix
bow_vectorizer = CountVectorizer()
bow_dtm = bow_vectorizer.fit_transform(corpus)

# TF-IDF: fit a TfidfVectorizer on the same corpus to get its document-term matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_dtm = tfidf_vectorizer.fit_transform(corpus)

# Report the (documents, terms) dimensions of each matrix
print(f"BoW DTM shape: {bow_dtm.shape}")
print(f"TF-IDF DTM shape: {tfidf_dtm.shape}")


BoW DTM shape: (6, 8055)
TF-IDF DTM shape: (6, 8055)


In [10]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from scipy.spatial.distance import jaccard

# Pairwise document distances on the bag-of-words matrix
bow_cosine_dist = cosine_distances(bow_dtm)
bow_euclidean_dist = euclidean_distances(bow_dtm)

# Pairwise document distances on the TF-IDF matrix
tfidf_cosine_dist = cosine_distances(tfidf_dtm)
tfidf_euclidean_dist = euclidean_distances(tfidf_dtm)

# Compute Jaccard distances on term presence/absence: a term counts as present
# in a document when its entry in the document-term matrix is greater than zero.
bow_binary_dtm = bow_dtm > 0
tfidf_binary_dtm = tfidf_dtm > 0


def _pairwise_jaccard(binary_dtm):
    """Return the square matrix of Jaccard distances between the rows.

    Args:
        binary_dtm: Boolean document-term matrix exposing ``toarray()``
            (one row per document).

    Returns:
        A NumPy array whose entry ``[i, j]`` is the Jaccard distance between
        row ``i`` and row ``j``.
    """
    # Densify once up front instead of calling toarray() on both rows for
    # every pair; this also avoids iterating the sparse container directly.
    rows = binary_dtm.toarray()
    return np.array([[jaccard(row_a, row_b) for row_b in rows] for row_a in rows])


bow_jaccard_dist = _pairwise_jaccard(bow_binary_dtm)
tfidf_jaccard_dist = _pairwise_jaccard(tfidf_binary_dtm)

# Print one distance matrix under a descriptive heading
def print_distance_matrices(matrix, metric_name):
    """Print ``matrix`` preceded by a blank line and a heading.

    Args:
        matrix: The distance matrix to show; printed via its ``str`` form.
        metric_name: Label placed before "Distance Matrix:" in the heading.
    """
    heading = f"\n{metric_name} Distance Matrix:"
    print(heading, matrix, sep="\n")

# Show every distance matrix: cosine first, then Euclidean, then Jaccard,
# each for the BoW representation followed by the TF-IDF one.
labelled_matrices = [
    (bow_cosine_dist, "BoW Cosine"),
    (tfidf_cosine_dist, "TF-IDF Cosine"),
    (bow_euclidean_dist, "BoW Euclidean"),
    (tfidf_euclidean_dist, "TF-IDF Euclidean"),
    (bow_jaccard_dist, "BoW Jaccard"),
    (tfidf_jaccard_dist, "TF-IDF Jaccard"),
]
for dist_matrix, label in labelled_matrices:
    print_distance_matrices(dist_matrix, label)



BoW Cosine Distance Matrix:
[[0.         0.53802301 0.58331073 0.42462329 0.38855393 0.36745826]
 [0.53802301 0.         0.69794351 0.58970598 0.47125979 0.51287716]
 [0.58331073 0.69794351 0.         0.39843401 0.46198249 0.47645155]
 [0.42462329 0.58970598 0.39843401 0.         0.29234071 0.30632056]
 [0.38855393 0.47125979 0.46198249 0.29234071 0.         0.27713866]
 [0.36745826 0.51287716 0.47645155 0.30632056 0.27713866 0.        ]]

TF-IDF Cosine Distance Matrix:
[[0.         0.78913759 0.77091131 0.66706881 0.65115761 0.64905839]
 [0.78913759 0.         0.87103214 0.80234511 0.7145532  0.76938004]
 [0.77091131 0.87103214 0.         0.62656172 0.71390641 0.71235097]
 [0.66706881 0.80234511 0.62656172 0.         0.56159611 0.56446119]
 [0.65115761 0.7145532  0.71390641 0.56159611 0.         0.54761421]
 [0.64905839 0.76938004 0.71235097 0.56446119 0.54761421 0.        ]]

BoW Euclidean Distance Matrix:
[[   0.           19.92485885   17.97220076   25.05992817 9425.70904495
    1