In [18]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [19]:
# Load the newsgroup posts for the six topics of interest, requesting the
# 'all' subset with headers, footers and quotes removed from each post.
categories = [
    "alt.atheism",
    "sci.med",
    "sci.electronics",
    "comp.graphics",
    "talk.politics.guns",
    "sci.crypt",
]
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [20]:
# Hold out 20% of the documents (and their labels) as the test set;
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Vectorizing the data: two encoders that turn text into numeric features.
# Both use the built-in English stop-word list and keep at most 30000 features.

# CountVectorizer: counts the occurrences of each word
count_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=30000,
)

# TfidfVectorizer: assigns a weight to each word based on its frequency
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=30000,
)

In [22]:
# Fit both vectorizers on the training documents and encode the train and test sets.
# fit()       >> learns the vocabulary from the data (and, for `TfidfVectorizer`,
#                the term weights as well).
# transform() >> turns documents into numerical feature vectors using what was
#                learned during fit().
#
# The results are deliberately left as sparse matrices. Calling .toarray() here
# would materialise every zero entry: a 4517 x 30000 matrix at 8 bytes per entry
# is roughly 1 GB, and four such matrices were being built. `.shape` and
# `cosine_similarity` both work directly on the sparse form. If some later step
# truly needs a dense array, call .toarray() on that one matrix at that point.

train_tfidf_matrix = tfidf_vectorizer.fit_transform(X_train)  # Train data, TF-IDF weights
train_tf_matrix = count_vectorizer.fit_transform(X_train)  # Train data, raw term counts

# The test data is only transformed, so it is encoded with the vocabulary
# learned from the training data and never influences the fit.
test_tfidf_matrix = tfidf_vectorizer.transform(X_test)  # Test data, TF-IDF weights
test_tf_matrix = count_vectorizer.transform(X_test)  # Test data, raw term counts

# Checking the shapes: rows are documents, columns are vocabulary entries
print(f"Train TF-IDF matrix shape: {train_tfidf_matrix.shape}")
print(f"Train CountVectorizer matrix shape: {train_tf_matrix.shape}")
print(f"Test TF-IDF matrix shape: {test_tfidf_matrix.shape}")
print(f"Test CountVectorizer matrix shape: {test_tf_matrix.shape}")

Train TF-IDF matrix shape: (4517, 30000)
Train CountVectorizer matrix shape: (4517, 30000)
Test TF-IDF matrix shape: (1130, 30000)
Test CountVectorizer matrix shape: (1130, 30000)


In [0]:
# Pairwise cosine similarities that the nearest-neighbour cells below rely on
from sklearn.metrics.pairwise import cosine_similarity

# Training documents compared with each other, once per representation.
# With Y omitted, cosine_similarity compares X against itself.
train_tf_cosine_sim = cosine_similarity(train_tf_matrix)
train_tfidf_cosine_sim = cosine_similarity(train_tfidf_matrix)

# Test documents compared with the training documents:
# one row per test document, one column per training document.
test_tf_cosine_sim = cosine_similarity(X=test_tf_matrix, Y=train_tf_matrix)  # CountVectorizer features
test_tfidf_cosine_sim = cosine_similarity(X=test_tfidf_matrix, Y=train_tfidf_matrix)  # TF-IDF features

In [25]:
# Nearest-neighbour accuracy on the training set, from the TF-IDF cosine similarities
k = 7  # Number of neighbours that take part in each vote

# NOTE: row i of the train-vs-train matrix also holds document i's similarity
# to itself, so a document can appear among its own nearest neighbours.
training_correct = 0
for actual_label, similarity_row in zip(y_train, train_tfidf_cosine_sim):
    # argsort is ascending: reverse it and keep the k most similar positions
    nearest_neighbours = np.argsort(similarity_row)[::-1][:k].tolist()
    neighbour_labels = y_train[nearest_neighbours].tolist()
    # Majority vote; when counts tie, max() keeps the label that appears
    # first in the list, i.e. the one belonging to the closer neighbour
    voted_label = max(neighbour_labels, key=neighbour_labels.count)
    training_correct += int(voted_label == actual_label)

# Share of training documents whose voted label equals their true label
training_accuracy = training_correct / len(y_train)
print(f"Training Accuracy using Cosine Similarity: {training_accuracy:.2f}")

Training Accuracy using Cosine Similarity: 0.90


In [26]:
# Nearest-neighbour accuracy on the test set, from the TF-IDF cosine similarities.
# Reuses the neighbour count `k` set in the training-accuracy cell.
test_correct = 0
for actual_label, similarity_row in zip(y_test, test_tfidf_cosine_sim):
    # Each row compares one test document with every training document;
    # argsort is ascending, so reverse it and keep the k most similar positions
    nearest_neighbours = np.argsort(similarity_row)[::-1][:k].tolist()
    # The votes come from the training labels of those neighbours
    neighbour_labels = y_train[nearest_neighbours].tolist()
    # Majority vote; when counts tie, max() keeps the label that appears
    # first in the list, i.e. the one belonging to the closer neighbour
    voted_label = max(neighbour_labels, key=neighbour_labels.count)
    test_correct += int(voted_label == actual_label)

# Share of test documents whose voted label equals their true label
test_accuracy = test_correct / len(y_test)
print(f"Test Accuracy using Cosine Similarity: {test_accuracy:.2f}")

Test Accuracy using Cosine Similarity: 0.83
