# Using TF-IDF and cosine similarity

In [None]:
!pip install scikit-learn --quiet

In [33]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Dummy corpus -- every file name is treated as one document
file_names = [
    "report1.txt",
    "report2.txt",
    "data_summary.csv",
    "analysis_results.xlsx",
    "presentation.pptx",
    "meeting_notes.docx",
    "budget.xlsx",
    "project_plan.pdf",
]

# Step 2: Fit the vectorizer on the corpus and build its TF-IDF matrix
# (one row per entry of file_names, one column per vocabulary token)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(file_names)

# Step 3: Implement a function to perform a search query using cosine similarity
def search(query, tfidf_matrix, vectorizer, top_k=5):
    """Rank the indexed file names by cosine similarity to ``query``.

    Note: results are looked up in the module-level ``file_names`` list, so
    the rows of ``tfidf_matrix`` must be in the same order as that list.

    Args:
        query: Query string; it is vectorized with ``vectorizer.transform``.
        tfidf_matrix: TF-IDF matrix of the indexed documents.
        vectorizer: Vectorizer used to turn ``query`` into a TF-IDF vector.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(file_name, similarity)`` tuples ordered from the highest
        similarity to the lowest, with at most ``top_k`` entries. An empty
        list is returned when ``top_k`` is zero or negative.
    """
    # A non-positive top_k must yield no results. Without this guard the
    # slice ``[-top_k:]`` below turns into ``[0:]`` for top_k == 0 and would
    # return every document instead of none.
    if top_k <= 0:
        return []

    query_vec = vectorizer.transform([query])
    # The sparse query vector prints as lines such as
    # "(0, 12)\t0.6423280258820045", where:
    #   0                  - row index, i.e. the position of the query in the
    #                        list passed to transform() (always 0 here)
    #   12                 - column index, i.e. the position of the token in
    #                        the vectorizer's vocabulary
    #   0.6423280258820045 - TF-IDF weight of that token in the query
    print(f'-------------\nquery_vec: {query_vec}', end="\n-------------\n")
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # argsort() is ascending: keep the last top_k indices and reverse them so
    # the most similar document comes first.
    related_docs_indices = cosine_similarities.argsort()[-top_k:][::-1]
    return [(file_names[idx], cosine_similarities[idx]) for idx in related_docs_indices]

# Try the search function with a sample two-token query
query = "pptx csv"
results = search(query, tfidf_matrix, vectorizer)
for match in results:
    filename, score = match
    print(f"Filename: {filename}, Similarity Score: {score}")

# Dump the vocabulary and the dense TF-IDF matrix for reference
print("\nTF-IDF Matrix:")
feature_names = list(vectorizer.get_feature_names_out())
print(feature_names)
dense_matrix = tfidf_matrix.toarray()
print(dense_matrix)


-------------
query_vec:   (0, 7)	0.7071067811865475
  (0, 2)	0.7071067811865475
-------------
Filename: presentation.pptx, Similarity Score: 0.5000000000000001
Filename: data_summary.csv, Similarity Score: 0.5000000000000001
Filename: project_plan.pdf, Similarity Score: 0.0
Filename: budget.xlsx, Similarity Score: 0.0
Filename: meeting_notes.docx, Similarity Score: 0.0

TF-IDF Matrix:
['analysis_results', 'budget', 'csv', 'data_summary', 'docx', 'meeting_notes', 'pdf', 'pptx', 'presentation', 'project_plan', 'report1', 'report2', 'txt', 'xlsx']
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.76642984 0.
  0.64232803 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.76642984
  0.64232803 0.        ]
 [0.         0.         0.70710678 0.70710678 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.7

In [20]:
# Scratch cell: repeat the first two steps of search() by hand to inspect the
# raw similarity scores for a single-token query (no ranking, no name lookup).
query = "txt"
query_vec = vectorizer.transform([query])
cosine_similarities = cosine_similarity(
    query_vec,
    tfidf_matrix,
).flatten()
# Bare expression so the notebook displays the score array
cosine_similarities

array([0.64232803, 0.64232803, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])