In [None]:
# Reading the articles

In [None]:
# Initialize an empty list to store the raw text of each article
articles = []

# Numeric stems of the paper files to load (each file is named "<n>.txt")
l = [1,2,3,7,8,9,11,12,13,14,15,16,17,18,21,22,23,24,25,26]

# Loop through each file name
for i in l:
    # Open the file with latin-1 encoding; the `with` block guarantees the
    # file handle is closed again, even if reading raises.
    with open(rf"/kaggle/input/papers/ResearchPapers/{i}.txt" , encoding='latin-1') as f:
        # Read the whole content of the file
        s = f.read()

    # Append the content to the articles list
    articles.append(s)

# Print the total number of articles read
print(len(articles))

# List of stopwords to be excluded from the analysis
stopwords = ['a', 'is', 'the', 'of', 'all', 'and', 'to', 'can', 'be', 'as', 'once', 'for', 'at', 'am', 'are', 'has', 'have', 'had', 'up', 'his', 'her', 'in', 'on', 'no', 'we', 'do']


# Preprocessing

In [None]:
import re  # Importing the regular expression module
from nltk.stem.porter import PorterStemmer  # Importing Porter Stemmer from NLTK

port_stem = PorterStemmer()  # Initializing a Porter Stemmer object

# Function to perform stemming on content
def stemming(content):
    """Normalize raw text into a list of stemmed tokens.

    Every character that is neither a word character nor whitespace is
    deleted, the text is lower-cased and split on whitespace, tokens listed
    in the module-level ``stopwords`` are dropped, and each remaining token is
    passed through the Porter stemmer.

    Args:
        content: Raw text of a document or a query.

    Returns:
        List of stemmed tokens, in their original order.
    """
    # Raw string literal: in a normal string '\w' and '\s' are invalid escape
    # sequences, which modern Python reports with a warning.
    stemmed_content = re.sub(r'[^\w\s]', '', content)
    tokens = stemmed_content.lower().split()
    return [port_stem.stem(word) for word in tokens if word not in stopwords]

# Run every raw article through the stemming pipeline in one pass
temp = [stemming(raw_text) for raw_text in articles]

# From here on `articles` holds token lists instead of raw strings
articles = temp

# Spot check: show the tokens of the second article
print(articles[1])


In [None]:
# Vocabulary = union of the distinct tokens of all articles
vocab = set().union(*articles)

# Report how many unique tokens the corpus contains
print(len(vocab))

# Freeze the vocabulary as an alphabetically ordered list
vocab = sorted(vocab)


# TF-IDF from scratch
credit : https://www.kaggle.com/code/yassinehamdaoui1/creating-tf-idf-model-from-scratch

### TF

In [None]:
import pandas as pd
from collections import Counter

# One dictionary per article: every vocabulary word mapped to the number of
# times it occurs in that article (0 when the word is absent).
dicts = [
    {word: counts[word] for word in vocab}
    for counts in map(Counter, articles)
]

# Articles as rows, vocabulary words as columns
df = pd.DataFrame(dicts)

# Display the count table
df


In [None]:
def computeTF(wordDict, doc):
    """Compute term-frequency (TF) scores for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        doc: The document's tokens; only ``len(doc)`` is used, as the total
            word count that the raw counts are divided by.

    Returns:
        dict mapping every word of ``wordDict`` to ``count / len(doc)``.
        For an empty document every score is 0.0.
    """
    total_words = len(doc)

    if total_words == 0:
        # Dividing by the length of an empty document would raise
        # ZeroDivisionError; an empty document has no term frequencies.
        return {word: 0.0 for word in wordDict}

    # TF = (number of occurrences of word) / (total number of words)
    return {word: count / float(total_words) for word, count in wordDict.items()}

# TF scores per article: each count dictionary is normalized by the length of
# the article it was built from.
tfscores = [computeTF(dic, article) for dic, article in zip(dicts, articles)]

# Articles as rows, vocabulary words as columns
tf = pd.DataFrame(tfscores)

# Display the TF table
tf


### IDF

In [None]:
import math  # Importing the math module for mathematical operations

def computeIDF(docList):
    """Compute inverse document frequency (IDF) scores for a corpus.

    Args:
        docList: List of dicts, one per document, each mapping word -> count.
            The set of scored words is taken from the first document's keys.

    Returns:
        dict mapping each word to ``log10(N / (df + 1))``, where ``N`` is the
        number of documents and ``df`` is the number of documents in which the
        word's count is greater than zero. Because of the ``+ 1`` smoothing, a
        word that occurs in every document gets a slightly negative score.
        An empty ``docList`` yields an empty dict.
    """
    if not docList:
        return {}

    N = len(docList)  # Total number of documents in the corpus

    # Document frequency: in how many documents does each word actually occur?
    doc_freq = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0 and word in doc_freq:
                doc_freq[word] += 1

    # IDF = log10(N / (df + 1)); the +1 avoids division by zero for df == 0
    return {word: math.log10(N / (float(freq) + 1)) for word, freq in doc_freq.items()}

# Compute the IDF scores from the per-article word-count dictionaries
idfs = computeIDF(dicts)

# Show the resulting word -> IDF score mapping
print(idfs)

### TF-IDF

In [None]:
def computeTFIDF(tfBow, idfs):
    """Combine one document's TF scores with the corpus IDF scores.

    Args:
        tfBow: Mapping of word -> TF score for a single document.
        idfs: Mapping of word -> IDF score; must contain every word of ``tfBow``.

    Returns:
        dict mapping each word of ``tfBow`` to ``TF * IDF``.
    """
    return {word: tf_value * idfs[word] for word, tf_value in tfBow.items()}

# Weight every article's TF row by the corpus-wide IDF scores
TFIDF = [computeTFIDF(dict(tf_row), idfs) for _, tf_row in tf.iterrows()]

# TF-IDF table: articles as rows, vocabulary words as columns
tf_idf = pd.DataFrame(TFIDF)


In [None]:
# Display the TF-IDF DataFrame
tf_idf

# Query processing

In [None]:
def process_query(query):
    """Build TF-IDF vectors for every article plus the query.

    The query is preprocessed with ``stemming`` and treated as one additional
    document placed after the articles, so that it is weighted with the same
    TF and IDF computations as the corpus.

    Args:
        query: Raw query string.

    Returns:
        List of dicts (word -> TF-IDF score): one per article, in corpus
        order, followed by the query's vector as the last element. Only words
        of the module-level ``vocab`` are scored, so query terms that are not
        in the vocabulary do not contribute.
    """
    query_tokens = stemming(query)  # Preprocess the query: tokenize and stem

    # Build a new list so the module-level `articles` is left untouched
    documents = articles + [query_tokens]

    # Raw occurrence counts of every vocabulary word, per document
    count_dicts = []
    for doc in documents:
        word_counts = Counter(doc)
        count_dicts.append({word: word_counts.get(word, 0) for word in vocab})

    tfscores = []
    for counts, doc in zip(count_dicts, documents):
        if doc:
            # Normalize by the length of THIS document, not by the number of
            # documents in the combined list.
            tfscores.append(computeTF(counts, doc))
        else:
            # Empty document (e.g. a query consisting only of stopwords):
            # every term frequency is zero.
            tfscores.append(dict.fromkeys(counts, 0.0))

    # IDF scores over the articles plus the query
    idfs = computeIDF(count_dicts)

    # One TF-IDF dictionary per document; the query's is last
    return [computeTFIDF(tfs, idfs) for tfs in tfscores]


# Cosine Similarity

In [None]:
def dot_product(dict1, dict2):
    """Dot product of two vectors stored as dictionaries.

    Only keys present in both dictionaries contribute to the result; with no
    shared keys the result is 0.

    Args:
        dict1: Mapping of key -> numeric component.
        dict2: Mapping of key -> numeric component.

    Returns:
        Sum of ``dict1[k] * dict2[k]`` over the shared keys.
    """
    shared = set(dict1) & set(dict2)

    total = 0
    for term in shared:
        total += dict1[term] * dict2[term]

    return total

def magnitude(dictionary):
    """Euclidean norm of a vector stored as a dictionary.

    Args:
        dictionary: Mapping of key -> numeric component; keys are ignored.

    Returns:
        Square root of the sum of the squared values (0.0 for an empty dict).
    """
    return math.sqrt(sum(component ** 2 for component in dictionary.values()))

def compute_similarity(TFIDF, min_similarity=0.001):
    """Rank documents by cosine similarity to the query.

    Args:
        TFIDF: List of TF-IDF dictionaries. The LAST element is the query
            vector; all preceding elements are document vectors. The list is
            not modified.
        min_similarity: A document counts as a match only if its cosine
            similarity is strictly greater than this value.

    Returns:
        Tuple ``(sorted_indexes, similarity)``:
          * ``sorted_indexes``: 1-based positions (within ``TFIDF``) of the
            matching documents, most similar first.
          * ``similarity``: one entry per document, in input order; the cosine
            similarity for matches and ``-1`` for non-matches (including
            documents or queries whose vector has zero magnitude).
        Both are empty when ``TFIDF`` is empty.
    """
    if not TFIDF:
        return ([], [])

    # Split without pop() so the caller's list is left intact and the function
    # can safely be called again with the same argument.
    doc_vecs, query_vec = TFIDF[:-1], TFIDF[-1]

    # The query magnitude is the same for every document; compute it once.
    query_mag = magnitude(query_vec)

    similarity = []
    for doc_vec in doc_vecs:
        denom = magnitude(doc_vec) * query_mag

        # Cosine similarity is undefined for a zero vector; use the sentinel.
        if denom == 0:
            sim = -1
        else:
            sim = dot_product(doc_vec, query_vec) / denom

        # Scores at or below the threshold are recorded as "no match" (-1)
        similarity.append(sim if sim > min_similarity else -1)

    # Keep only matching documents, ordered by descending similarity
    matching = [index for index, sim in enumerate(similarity) if sim != -1]
    matching.sort(key=lambda index: similarity[index], reverse=True)

    # Report 1-based positions
    sorted_indexes = [index + 1 for index in matching]
    return (sorted_indexes, similarity)

In [None]:
# Example query (spelled correctly so that both terms can match the stemmed vocabulary)
test = 'Machine learning'
# TF-IDF vectors for all articles, with the query's vector as the last element
tfidf = process_query(test)
query_results, similarity_scores = compute_similarity(tfidf)
print(query_results)


# The formula to calculate cosine similarity between two vectors A and B is as follows:

# cosine_similarity(A, B) = (A · B) / (||A|| * ||B||)

# Where:

# (A · B) represents the dot product of vectors A and B.
# ||A|| represents the magnitude (or Euclidean norm) of vector A, calculated as the square root of the sum of the squares of its components.
# ||B|| represents the magnitude of vector B, calculated in the same way.
# The cosine similarity ranges from -1 to 1:

# If the cosine similarity is 1, the vectors point in exactly the same direction (their magnitudes may still differ).
# If the cosine similarity is -1, it means the vectors are pointing in opposite directions and are completely dissimilar.
# If the cosine similarity is 0, it means the vectors are orthogonal (perpendicular) and have no similarity.
# In the context of the provided code, cosine similarity is used to measure the similarity between the query vector and the TF-IDF vectors. The similarity scores are calculated using the cosine similarity formula, and the higher the score, the more similar the vectors are considered to be.

#  GUI

In [None]:
import tkinter as tk
from tkinter import ttk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt

def calculate_and_display():
    """Run the search for the text in the entry widget and show the outcome.

    Reads the query from ``entry``, ranks the documents against it, writes the
    ranked list into ``result_label`` and redraws the similarity bar chart.
    """
    ranked_docs, scores = compute_similarity(process_query(entry.get()))

    # Ranked document list as text
    result_label.config(text=f"Query Result: {ranked_docs}")

    # Per-document scores as a bar chart
    plot_similarity(scores)

def plot_similarity(similarity_scores):
    """Draw a bar chart of the similarity scores inside ``result_frame``.

    Any widgets already in ``result_frame`` are destroyed first, so each call
    replaces the previous chart. Negative scores are drawn in salmon, all
    others in sky blue.

    Args:
        similarity_scores: Sequence of numeric scores, one per document; bar
            ``k`` (1-based) corresponds to the ``k``-th score.
    """
    # Remove whatever a previous search drew
    for child in result_frame.winfo_children():
        child.destroy()

    doc_numbers = range(1, len(similarity_scores) + 1)
    bar_colors = ['salmon' if score < 0 else 'skyblue' for score in similarity_scores]

    fig = plt.Figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    ax.bar(doc_numbers, similarity_scores, color=bar_colors)
    ax.set_xlabel('Document Index')
    ax.set_ylabel('Similarity Score')
    ax.set_title('Similarity Scores for Documents')
    ax.set_xticks(doc_numbers)  # one tick per document

    # Embed the figure into the Tkinter frame
    canvas = FigureCanvasTkAgg(fig, master=result_frame)
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True)

# Create the main application window.
# NOTE(review): tk.Tk() needs a graphical display; the articles are read from
# a /kaggle/input path, where presumably no display is available — confirm
# where this cell is meant to run.
root = tk.Tk()
root.title("TF-IDF Search Engine")

# Initial window size and background colour
root.geometry("500x400")
root.configure(bg="#f0f0f0")

# Prompt label shown above the query entry
input_label = ttk.Label(root, text="Enter your search query:", background="#f0f0f0", font=("Arial", 12))
input_label.pack(pady=10)

# Entry widget for the query text; read by calculate_and_display()
entry = ttk.Entry(root, width=40, font=("Arial", 10))
entry.pack(pady=5)

# Search button: runs calculate_and_display() when clicked
calculate_button = ttk.Button(root, text="Search", command=calculate_and_display)
calculate_button.pack(pady=5)

# Frame that hosts the similarity plot; plot_similarity() clears and refills it
result_frame = ttk.Frame(root)
result_frame.pack(pady=10, padx=10, fill=tk.BOTH, expand=True)

# Label that shows the ranked query result; updated by calculate_and_display()
result_label = ttk.Label(root, text="", background="#f0f0f0", font=("Arial", 10, "italic"), wraplength=380)
result_label.pack(pady=10)

# Start the Tkinter event loop
root.mainloop()
