### Import packages

In [1]:
import nltk
from nltk.stem.snowball import SnowballStemmer  # type: ignore
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np # type: ignore
import pandas as pd # type: ignore
import tkinter as tk
from tkinter import filedialog
import re
from sklearn.metrics.pairwise import cosine_similarity
from icecream import ic

### Function to create dataframe and excel file

In [2]:
# function to create the excel files
def create_excel_dataframe(input , name, columns, index):
    df = pd.DataFrame(input, columns = columns, index = index) # Create a pandas dataframe
    df.to_excel(name) # Create an excel file
    return df

### Download the **stopwords** 

In [3]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/glados/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Remove the punctuation from the docs

In [4]:
def convert_to_list(doc) -> list:
    doc = doc.split()
    return doc

In [5]:
stemmer = SnowballStemmer(language='english')
dictionary = []

### Download wordnet and create a method to check that the word have meaning or not.

In [6]:
nltk.download('wordnet')

def is_meaningful(word) -> bool:
    
    if wn.synsets(word):
        return True
    else: 
        return False    

[nltk_data] Downloading package wordnet to /home/glados/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Method to create the dictionary

In [7]:
def create_dictionary(doc, dictionary) -> list:
    for word_index, word in enumerate(doc):
        
        word = word.lower()
        
        if word.isalnum() and word not in stop_words and is_meaningful(word) and len(word) > 1:
            
            stem_word = stemmer.stem(word)
            doc[word_index] = stem_word
            
            if stem_word not in dictionary:
                dictionary.append(stem_word)
    
    return doc

### Method to join the words together with one space distance

In [8]:
def join_document(doc) -> str:
    doc = ' '.join(doc)
    return doc

### Methods which contain the methods to process the document

In [9]:
def process_document(document) -> str:
    
    document = convert_to_list(document)
    
    document = create_dictionary(document, dictionary)
    
    document = join_document(document)
    
    return document

In [10]:
def select_file():
    global raw_documents
    
    file_name = filedialog.askopenfilename(title = "Select Dataset",
                                           initialdir = "IR-Project",
                                           filetypes = (("All files","*.*"), ("Text files", "*.txt")))
    
    with open(file_name, "r") as f:
        raw_documents = f.read()
        window.destroy()

In [11]:

window = tk.Tk()
window.geometry("200x100")
window.title("Select Dataset")

button = tk.Button(
    window,
            text = "Select",
            command = select_file).pack()


window.mainloop()

### Select a dataset to extract, then extracted the results in the two type of documents, the processed docs goes to **docs** directory, and the original docs saved to **Original docs** directory.

In [12]:

raw_dataset = raw_documents # the raw dataset that read from the one file

raw_dataset = raw_dataset.split(".I") # split the docs by index

raw_dataset.remove(raw_dataset[0]) # remove the empty first index

doc_index = 0
    

# write each index in 'raw_dataset'
for doc in raw_dataset:
    doc_index += 1
    doc_path = "./docs" + "/doc" + str(doc_index) + ".txt"
    doc = process_document(doc)
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)

doc_index = 0

for doc in raw_dataset:
    doc_index += 1
    doc_path = "./Original docs" + "/docs" + str(doc_index) + ".txt"
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)


### Dictionary:

In [13]:
df = pd.DataFrame(dictionary, columns=["Words"])
df

Unnamed: 0,Words
0,experiment
1,investig
2,aerodynam
3,wing
4,slipstream
...,...
3416,intuit
3417,squir
3418,standoff
3419,recover


In [14]:
df.to_excel("dictionary.xlsx")

### Initialized the numpy arrays for TF, IDF, TF-IDF:

In [15]:
term_doc_matrix = np.zeros((len(dictionary), doc_index))
tf_array = np.zeros((len(dictionary), doc_index))
idf_array = np.zeros((len(dictionary), 1))
tf_idf_array = np.zeros((len(dictionary), doc_index))

### Methods to calculate the TF, DF:

In [16]:
def calculate_term_frequency(doc, dictionary, doc_no):
    for word in dictionary:
        if word in doc:
            frequency = doc.count(word)
            row = dictionary.index(word)
            column = doc_no - 1
            tf_array[row, column] += frequency


def calculate_document_frequency(doc, dictionary):
    for word in dictionary:
        if word in doc:
            row = dictionary.index(word)
            idf_array[row, 0] += 1

### Reads the Documents one by one to calculate the TF, DF:

In [17]:
doc_names_list = []
for num in range(1, doc_index + 1):
    doc_path = "./docs" + "/doc" + str(num) + ".txt"
    doc_names_list.append("doc" + str(num) + ".txt")
    with open(doc_path, "r") as file:
        doc = file.read()
        calculate_term_frequency(doc, dictionary, num)
        calculate_document_frequency(doc, dictionary)

In [18]:
tf_data_frame = create_excel_dataframe(tf_array, index = dictionary, columns = doc_names_list, name = "term_frequency.xlsx")
tf_data_frame

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
investig,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
aerodynam,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wing,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
slipstream,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
squir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
standoff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
recover,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculate the Term Frequency **(TF)** for each word in each document:

$$
TF_{(w,d)}
=
\begin{cases}
\text{ if } \;\;\; tf_{(w,d)} > 0  \;\;\;\;\;\;\;\; 1 + \log(tf_{(w,d)})\\
\text{otherwise} \;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; 0 
\end{cases}
$$

In [19]:
row, column = np.shape(tf_array)

for r in range(row):
    for c in range(column):
        if tf_array[r, c] > 0:
            extracted_element = tf_array[r, c]
            tf_array[r, c] = 1 + np.log10(extracted_element)
        else:
            tf_array[r, c] = 0

### Calculate Inverse Document Frequency **(IDF)** for each word:

$$
IDF_{(w)} = \log(\frac{N}{df_{w}})
$$

In [20]:
count_of_documents = column
row, column = np.shape(idf_array)

for r in range(row):
    if idf_array[r, 0] > 0:
        extracted_element = idf_array[r, 0]
        idf_array[r, 0] = np.log10(count_of_documents / extracted_element)
    else:
        idf_array[r, 0] = 0

### Calculating the TF-IDF by multiplying peer to peer each row of TF array into IDF array:

In [21]:

row, column = np.shape(tf_idf_array)
r, c = 0, 0
while r < row:
        tf_idf_array[r, c]= tf_array[r, c] * idf_array[r, 0]
        c += 1
        if c == column:
            r += 1
            c = 0

### Export the TF, IDF, TF-IDF dataframes:

In [22]:
tf_dataframe = create_excel_dataframe(tf_array, "tf_excel.xlsx", doc_names_list, dictionary)

idf_dataframe = create_excel_dataframe(idf_array, "idf_excel.xlsx", ["IDF"], dictionary)

tf_idf_dataframe = create_excel_dataframe(tf_idf_array, "tf_idf_excel.xlsx", doc_names_list, dictionary)


In [23]:
tf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,1.477121,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,1.0,1.0,0.00000,0.0,0.0
investig,1.301030,1.00000,0.0,0.00000,0.0,0.00000,1.0,1.00000,1.477121,0.0,...,0.0,0.000000,1.0,0.000000,1.0,0.0,0.0,0.00000,0.0,1.0
aerodynam,1.301030,0.00000,0.0,0.00000,1.0,0.00000,0.0,0.00000,0.000000,0.0,...,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
wing,1.602060,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
slipstream,1.778151,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
squir,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
standoff,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0
recover,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0


In [24]:
idf_dataframe

Unnamed: 0,IDF
experiment,0.610834
investig,0.586221
aerodynam,0.876615
wing,0.786293
slipstream,1.970037
...,...
intuit,3.146128
squir,2.367977
standoff,2.845098
recover,1.714764


In [25]:
tf_idf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,0.902276,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.610834,0.000000,0.000000,0.000000,0.610834,0.610834,0.000000,0.0,0.000000
investig,0.762692,0.586221,0.0,0.000000,0.000000,0.000000,0.586221,0.586221,0.865920,0.0,...,0.000000,0.000000,0.586221,0.000000,0.586221,0.000000,0.000000,0.000000,0.0,0.586221
aerodynam,1.140503,0.000000,0.0,0.000000,0.876615,0.000000,0.000000,0.000000,0.000000,0.0,...,0.876615,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
wing,1.259688,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
slipstream,3.503023,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
squir,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,2.367977,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
standoff,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,2.845098,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
recover,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.714764,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


### **Query**:

In [227]:
query = input("Please enter what you want to search")
query

'a general theory of three dimensional flow in subsonic and supersonic turbo-machines of axial-radial-and mixed-flow types .'

In [228]:
query_lower = query.lower()

In [229]:
query_words_list = query_lower.split()

In [230]:
temp = []
for word in query_words_list:
    if word not in stop_words and is_meaningful(word) and len(word) > 1:
        temp.append(word)  
query_words_list = temp

In [231]:
temp = [stemmer.stem(word) for word in query_words_list]
query_words = set(temp) 
query_words_list = temp
query_words_list

['general',
 'theori',
 'three',
 'dimension',
 'flow',
 'subson',
 'superson',
 'type']

In [232]:
query_words_list = " ".join(query_words_list)
query_words_list

'general theori three dimension flow subson superson type'

In [233]:
query_words

{'dimension',
 'flow',
 'general',
 'subson',
 'superson',
 'theori',
 'three',
 'type'}

### Calculating the **TF-IDF** of **Query**:

In [234]:
query_idf_lst = [idf_array[dictionary.index(word)][0] for word in query_words if word in dictionary]
query_idf_lst

[1.0930495921948182,
 0.7196167743136628,
 0.5136707434935138,
 0.8719701864145581,
 0.2780716738551965,
 1.0,
 0.7147642715192507,
 0.6675615400843947]

In [235]:
query_tf_lst = [np.log10(query_words_list.count(word)) + 1 for word in query_words if query_words_list.count(word) > 0 and word in dictionary]
query_tf_lst

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [236]:
query_tfidf = [idf*query_tf_lst[count] for count, idf in enumerate(query_idf_lst)]
query_tfidf

[1.0930495921948182,
 0.7196167743136628,
 0.5136707434935138,
 0.8719701864145581,
 0.2780716738551965,
 1.0,
 0.7147642715192507,
 0.6675615400843947]

In [237]:
query_words = list(query_words)

### Add pad 0 to query_tfidf to make it equal to the tf-idf of the documents.

In [238]:
row, column = np.shape(tf_idf_array)

In [239]:
query_tfidf = np.pad(query_tfidf, (0, row - len(query_tfidf)), 'constant', constant_values = (0))

### Move the values to the Corresponding index.

In [240]:
for count, value in enumerate(query_tfidf):
    if count < len(query_words):
        if value > 0:
            if count != dictionary.index(query_words[count]):
                dest_index = dictionary.index(query_words[count])
                query_tfidf[dest_index] = value
                query_tfidf[count] = 0     

In [241]:
query_tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [242]:
np.shape(query_tfidf)

(3421,)

In [243]:
np.shape(tf_idf_dataframe["doc1.txt"].tolist())

(3421,)

### Calculate the **Cosine** similarity:

In [244]:
cosine_values = []
cosine_docs = []


for number in range(1, column + 1):
    
    doc_name = "doc"+ str(number) +".txt"
    doc_column = tf_idf_dataframe[doc_name].tolist()
    
    nominator = np.dot(query_tfidf, doc_column)
    
    doc_column_norm = np.linalg.norm(doc_column)
    query_tfidf_norm = np.linalg.norm(query_tfidf)
    
    denominator = doc_column_norm * query_tfidf_norm
    
    cosine_theta = nominator / denominator
    
    cosine_values.append(cosine_theta)
    cosine_docs.append(doc_name)

### Rank the **Cosine** Similarity:
Create a dictionary based on the *cosine_docs* and *cosine_values*, get the **items** from the created dictionary; and set the **key** argument equal to the value of the item; for the descending order, set the reverse argument to *True*.  

In [245]:
cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
cosine_rank

[('doc987.txt', 0.23521785265133516),
 ('doc445.txt', 0.22295844601271841),
 ('doc1368.txt', 0.18631442326605427),
 ('doc1339.txt', 0.18504107362917663),
 ('doc80.txt', 0.18051213115343837),
 ('doc791.txt', 0.16999756842928268),
 ('doc487.txt', 0.159587207443016),
 ('doc774.txt', 0.1573959153958866),
 ('doc775.txt', 0.15587277249846937),
 ('doc527.txt', 0.14941929131483406),
 ('doc176.txt', 0.1439874159548463),
 ('doc409.txt', 0.1425443279352183),
 ('doc38.txt', 0.13997475155607597),
 ('doc264.txt', 0.1388115555933149),
 ('doc680.txt', 0.13352614985854291),
 ('doc1281.txt', 0.13346757654389804),
 ('doc472.txt', 0.13012264569080506),
 ('doc216.txt', 0.12951100562659046),
 ('doc1006.txt', 0.12816732942028783),
 ('doc427.txt', 0.12809735098901465),
 ('doc710.txt', 0.12721530700176104),
 ('doc751.txt', 0.12647916349884689),
 ('doc467.txt', 0.1251851377343641),
 ('doc465.txt', 0.12445654829260312),
 ('doc1276.txt', 0.12051348995055998),
 ('doc1073.txt', 0.12002020538386528),
 ('doc137.txt',

### Create dataframe and excel file from the Cosine similarity:

In [246]:
df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
df.to_excel("cosine_similarity.xlsx")

In [216]:

window = tk.Tk()
window.geometry("200x200")

def choose():
    global files_path
    files_path = filedialog.askopenfilenames(
                parent = window,
                initialdir = "./Original docs",
                title = "Choose the docs",
                filetypes = [("text name","*.txt")]
                )
    
    
            
    window.destroy()
    
button = tk.Button(window,
                   text = 'Select',
                   command = choose,
                   font = ('Arial', 10))
button.pack()

window.mainloop()

In [217]:
files_path

()

In [218]:
temp = []
pattern = r'docs(\d+)\.txt'

for path in files_path:
    temp.append(re.findall(pattern, path))

In [219]:
temp

[]

In [220]:
query_tfidf

array([0.61083392, 0.78629255, 1.97003678, ..., 0.        , 0.        ,
       0.        ])

In [221]:
target_doc_numbers = [int(value[0]) for value in temp]


for doc in target_doc_numbers:
    for count, word in enumerate(query_words):
    
        word_index_in_dictionary = dictionary.index(word)
        query_tfidf[word_index_in_dictionary] += tf_idf_array[word_index_in_dictionary ,doc] 
        

In [222]:
query_tfidf

array([0.61083392, 0.78629255, 1.97003678, ..., 0.        , 0.        ,
       0.        ])

In [223]:
cosine_values = []
cosine_docs = []  

for number in range(1, column + 1):
    
    doc_name = "doc"+ str(number) +".txt"
    doc_column = tf_idf_dataframe[doc_name].tolist()
    
    nominator = np.dot(query_tfidf, doc_column)
    
    doc_column_norm = np.linalg.norm(doc_column)
    query_tfidf_norm = np.linalg.norm(query_tfidf)
    
    denominator = doc_column_norm * query_tfidf_norm
    
    cosine_theta = nominator / denominator
    
    cosine_values.append(cosine_theta)
    cosine_docs.append(doc_name)

In [224]:
cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
cosine_rank

[('doc137.txt', 0.1807135933391944),
 ('doc567.txt', 0.1459996855812259),
 ('doc19.txt', 0.14206407735540963),
 ('doc287.txt', 0.13993790863307015),
 ('doc1.txt', 0.12973789717671474),
 ('doc1272.txt', 0.12911006553245957),
 ('doc999.txt', 0.12909864838199314),
 ('doc1066.txt', 0.12879966385720298),
 ('doc51.txt', 0.1283838901735248),
 ('doc749.txt', 0.12677703000092827),
 ('doc877.txt', 0.12479104081542365),
 ('doc689.txt', 0.12182716200676662),
 ('doc606.txt', 0.1211338030167561),
 ('doc748.txt', 0.1178444655343333),
 ('doc1112.txt', 0.11773866389854433),
 ('doc801.txt', 0.11661059294281913),
 ('doc1352.txt', 0.11513249255869681),
 ('doc203.txt', 0.11331643475832172),
 ('doc1335.txt', 0.11285425097347612),
 ('doc634.txt', 0.11233884306752283),
 ('doc297.txt', 0.11172454229121674),
 ('doc860.txt', 0.11085498268796082),
 ('doc780.txt', 0.11065511890789512),
 ('doc711.txt', 0.11019576211405162),
 ('doc29.txt', 0.10856310872644352),
 ('doc390.txt', 0.10753272837684281),
 ('doc544.txt', 0

In [225]:
df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
df.to_excel("cosine_similarity_rerank.xlsx")

In [226]:
df

Unnamed: 0,cosine
doc137.txt,0.180714
doc567.txt,0.146000
doc19.txt,0.142064
doc287.txt,0.139938
doc1.txt,0.129738
...,...
doc1388.txt,0.000000
doc1389.txt,0.000000
doc1394.txt,0.000000
doc1398.txt,0.000000
