### Import packages

In [145]:
import GUI
import nltk
from nltk.stem.snowball import SnowballStemmer  # type: ignore
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np # type: ignore
import pandas as pd # type: ignore
import re
from icecream import ic

### Function to create dataframe and excel file

In [2]:
# function to create the excel files
def create_excel_dataframe(input , name, columns, index):
    df = pd.DataFrame(input, columns = columns, index = index) # Create a pandas dataframe
    df.to_excel(name) # Create an excel file
    return df

### Download the **stopwords** 

In [3]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/glados/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Remove the punctuation from the docs

In [4]:
def convert_to_list(doc) -> list:
    doc = re.sub(r"[^\w\s]", " ", doc)
    doc = doc.split()
    return doc

In [5]:
stemmer = SnowballStemmer(language='english')
dictionary = []

### Download wordnet and create a method to check that the word have meaning or not.

In [6]:
nltk.download('wordnet')

def is_meaningful(word) -> bool:
    
    if wn.synsets(word):
        return True
    else: 
        return False    

[nltk_data] Downloading package wordnet to /home/glados/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Method to create the dictionary

In [7]:
def create_dictionary(doc, dictionary) -> list:
    for word_index, word in enumerate(doc):
        
        word = word.lower()
        
        if word.isalnum() and word not in stop_words and is_meaningful(word) and len(word) > 1:
            
            stem_word = stemmer.stem(word)
            doc[word_index] = stem_word
            
            if stem_word not in dictionary:
                dictionary.append(stem_word)
    
    return doc

### Method to join the words together with one space distance

In [8]:
def join_document(doc) -> str:
    doc = ' '.join(doc)
    return doc

### Methods which contain the methods to process the document

In [9]:
def process_document(document) -> str:
    
    document = convert_to_list(document)
    
    document = create_dictionary(document, dictionary)
    
    document = join_document(document)
    
    return document

### Select a dataset to extract, then extracted the results in the two type of documents, the processed docs goes to **docs** directory, and the original docs saved to **Original docs** directory.

In [10]:
raw_document = GUI.select_file() # Select File GUI implementation

raw_dataset = raw_document # the raw dataset that read from the one file

raw_dataset = raw_dataset.split(".I") # split the docs by index

raw_dataset.remove(raw_dataset[0]) # remove the empty first index

doc_index = 0
    

# write each index in 'raw_dataset'
for doc in raw_dataset:
    doc_index += 1
    doc_path = "./docs" + "/doc" + str(doc_index) + ".txt"
    doc = process_document(doc)
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)

doc_index = 0

for doc in raw_dataset:
    doc_index += 1
    doc_path = "./Original docs" + "/docs" + str(doc_index) + ".txt"
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)

GUI.custom_window("700x35", "Extract Documents", f"There have been {doc_index} documents are extracted.")

### Dictionary:

In [11]:
df = pd.DataFrame(dictionary, columns=["Words"])
df

Unnamed: 0,Words
0,experiment
1,investig
2,aerodynam
3,wing
4,slipstream
...,...
3859,hopkin
3860,aeolotrop
3861,intuit
3862,recover


In [12]:
df.to_excel("dictionary.xlsx")

### Initialized the numpy arrays for TF, IDF, TF-IDF:

In [13]:
term_doc_matrix = np.zeros((len(dictionary), doc_index))
tf_array = np.zeros((len(dictionary), doc_index))
idf_array = np.zeros((len(dictionary), 1))
tf_idf_array = np.zeros((len(dictionary), doc_index))

### Methods to calculate the TF, DF:

In [14]:
def calculate_term_frequency(doc, dictionary, doc_no):
    for word in dictionary:
        if word in doc:
            frequency = doc.count(word)
            row = dictionary.index(word)
            column = doc_no - 1
            tf_array[row, column] += frequency


def calculate_document_frequency(doc, dictionary):
    for word in dictionary:
        if word in doc:
            row = dictionary.index(word)
            idf_array[row, 0] += 1

### Reads the Documents one by one to calculate the TF, DF:

In [15]:
doc_names_list = []
for num in range(1, doc_index + 1):
    doc_path = "./docs" + "/doc" + str(num) + ".txt"
    doc_names_list.append("doc" + str(num) + ".txt")
    with open(doc_path, "r") as file:
        doc = file.read()
        calculate_term_frequency(doc, dictionary, num)
        calculate_document_frequency(doc, dictionary)

### Calculate the Term Frequency **(TF)** for each word in each document:

$$
TF_{(w,d)}
=
\begin{cases}
\text{ if } \;\;\; tf_{(w,d)} > 0  \;\;\;\;\;\;\;\; 1 + \log(tf_{(w,d)})\\
\text{otherwise} \;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; 0 
\end{cases}
$$

In [16]:
row, column = np.shape(tf_array)

for r in range(row):
    for c in range(column):
        if tf_array[r, c] > 0:
            extracted_element = tf_array[r, c]
            tf_array[r, c] = 1 + np.log10(extracted_element)
        else:
            tf_array[r, c] = 0

            
GUI.custom_window("700x35", "Complete!", "The calculation of Term Frequency is have been successful.")

### Calculate Inverse Document Frequency **(IDF)** for each word:

$$
IDF_{(w)} = \log(\frac{N}{df_{w}})
$$

In [17]:
count_of_documents = column
row, column = np.shape(idf_array)

for r in range(row):
    if idf_array[r, 0] > 0:
        extracted_element = idf_array[r, 0]
        idf_array[r, 0] = np.log10(count_of_documents / extracted_element)
    else:
        idf_array[r, 0] = 0

GUI.custom_window("700x35", "Complete!", "The calculation of Inverse Document Frequency is have been successful.")

### Calculating the TF-IDF by multiplying peer to peer each row of TF array into IDF array:

In [18]:

row, column = np.shape(tf_idf_array)
r, c = 0, 0
while r < row:
        tf_idf_array[r, c]= tf_array[r, c] * idf_array[r, 0]
        c += 1
        if c == column:
            r += 1
            c = 0

### Export the TF, IDF, TF-IDF dataframes:

In [19]:
tf_dataframe = create_excel_dataframe(tf_array, "tf_excel.xlsx", doc_names_list, dictionary)

idf_dataframe = create_excel_dataframe(idf_array, "idf_excel.xlsx", ["IDF"], dictionary)

tf_idf_dataframe = create_excel_dataframe(tf_idf_array, "tf_idf_excel.xlsx", doc_names_list, dictionary)

GUI.custom_window("700x35", "Excel File", "Creating the Excel file is DONE!")

In [20]:
tf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,1.477121,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,1.0,1.0,0.00000,0.0,0.0
investig,1.301030,1.00000,0.0,0.00000,0.0,0.00000,1.0,1.00000,1.477121,0.0,...,0.0,0.000000,1.0,0.000000,1.0,0.0,0.0,0.00000,0.0,1.0
aerodynam,1.301030,0.00000,0.0,0.00000,1.0,0.00000,0.0,0.00000,0.000000,0.0,...,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
wing,1.602060,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
slipstream,1.778151,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hopkin,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
aeolotrop,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
intuit,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
recover,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0


In [21]:
idf_dataframe

Unnamed: 0,IDF
experiment,0.615928
investig,0.586221
aerodynam,0.876615
wing,0.788193
slipstream,1.970037
...,...
hopkin,3.146128
aeolotrop,3.146128
intuit,3.146128
recover,1.714764


In [22]:
tf_idf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,0.909801,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.615928,0.000000,0.000000,0.000000,0.615928,0.615928,0.000000,0.0,0.000000
investig,0.762692,0.586221,0.0,0.000000,0.000000,0.000000,0.586221,0.586221,0.865920,0.0,...,0.000000,0.000000,0.586221,0.000000,0.586221,0.000000,0.000000,0.000000,0.0,0.586221
aerodynam,1.140503,0.000000,0.0,0.000000,0.876615,0.000000,0.000000,0.000000,0.000000,0.0,...,0.876615,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
wing,1.262733,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
slipstream,3.503023,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
hopkin,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
aeolotrop,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
intuit,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
recover,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.714764,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


### **Query**:

In [95]:
query = input("Please enter what you want to search")

In [125]:
query_lower = query.lower()

In [126]:
query_words_list = query_lower.split()

In [127]:
# remove the stop words
for word in query_words_list:
    if word in stop_words or not is_meaningful(word):
        query_words_list.remove(word)  
query_words_list

['oh',
 'hell',
 'night',
 'city',
 'wake',
 'fuck',
 'samurai',
 'have',
 'city',
 'burn']

In [128]:
temp = [stemmer.stem(word) for word in query_words_list]
query_words = set(temp) 
query_words_list = temp
query_words_list

['oh',
 'hell',
 'night',
 'citi',
 'wake',
 'fuck',
 'samurai',
 'have',
 'citi',
 'burn']

In [129]:
query_words_list = " ".join(query_words_list)
query_words_list

'oh hell night citi wake fuck samurai have citi burn'

In [130]:
query_words_list = re.sub(r"[^\w\s]"," ", query_words_list)
query_words_list

'oh hell night citi wake fuck samurai have citi burn'

In [131]:
string = [query_words_list]

In [133]:
query_words

{'burn', 'citi', 'fuck', 'have', 'hell', 'night', 'oh', 'samurai', 'wake'}

### Calculating the **TF-IDF** of **Query**:

In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer



tfidf = TfidfVectorizer()
 

result = tfidf.fit_transform(string)

query_tfidf = np.squeeze(result.toarray())
query_tfidf

array([0.28867513, 0.57735027, 0.28867513, 0.28867513, 0.28867513,
       0.28867513, 0.28867513, 0.28867513, 0.28867513])

In [188]:
query_words = list(tfidf.vocabulary_.keys())
query_words

['oh', 'hell', 'night', 'citi', 'wake', 'fuck', 'samurai', 'have', 'burn']

#### Make the values of the words in query which is not present in dictionary documents to Zero(0).
for corresponding the query words and the dictionary, we obtain the indexes of the words in the dictionary, to for later change the rows of the tf_idf array.

In [184]:
query_words_in_dictionary = {}

for index, word in enumerate(query_words):
    
    if word not in dictionary:
        query_tfidf[index] = 0
    else:
        query_words_in_dictionary.update({word : dictionary.index(word)})
   
      
query_words_in_dictionary

{'night': 2908, 'citi': 3355, 'wake': 546, 'burn': 1521}

In [173]:
query_tfidf

array([0.        , 0.        , 0.28867513, 0.28867513, 0.28867513,
       0.        , 0.        , 0.        , 0.28867513])

### Add pad 0 to query_tfidf to make it equal to the tf-idf of the documents.

In [None]:
row, column = np.shape(tf_idf_array)

In [174]:
query_tfidf = np.pad(query_tfidf, (0, row - len(query_tfidf) + 1), 'constant', constant_values = (0))

3864

### Move the values to the Corresponding index.

In [191]:
for count, value in enumerate(query_tfidf):
    if count < len(query_words):
        if value > 0:
            if query_words[count] != dictionary.index(query_words[count]):
                dest_index = dictionary.index(query_words[count])
                query_tfidf[dest_index] = value
                query_tfidf[count] = 0

array([0., 0., 0., ..., 0., 0., 0.])

In [196]:
query_tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [202]:
np.shape(query_tfidf)

(3864,)

In [203]:
np.shape(tf_idf_dataframe["doc1.txt"].tolist())

(3864,)

### Calculate the **Cosine** similarity:

In [217]:
cosine_values = []
cosine_docs = []


for number in range(1, column + 1):
    
    doc_name = "doc"+ str(number) +".txt"
    doc_column = tf_idf_dataframe[doc_name].tolist()
    
    nominator = np.dot(query_tfidf, doc_column)
    
    doc_column_norm = np.linalg.norm(doc_column)
    query_tfidf_norm = np.linalg.norm(query_tfidf)
    
    denominator = doc_column_norm * query_tfidf_norm
    
    cosine_theta = nominator / denominator
    if cosine_theta != 0:
        cosine_values.append(cosine_theta)
        cosine_docs.append(doc_name)

### Rank the **Cosine** Similarity:
Create a dictionary based on the *cosine_docs* and *cosine_values*, get the **items** from the created dictionary; and set the **key** argument equal to the value of the item; for the descending order, set the reverse argument to *True*.  

In [224]:
cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
cosine_rank

[('doc1141.txt', 0.1393532065631248),
 ('doc1368.txt', 0.1308034769776183),
 ('doc621.txt', 0.1307060595428954),
 ('doc619.txt', 0.12806356552261228),
 ('doc1152.txt', 0.12641130345722604),
 ('doc154.txt', 0.12095343866999612),
 ('doc480.txt', 0.11978030130690938),
 ('doc979.txt', 0.11521314570680773),
 ('doc691.txt', 0.10789233146055996),
 ('doc1196.txt', 0.10512351070754462),
 ('doc446.txt', 0.10226948051965148),
 ('doc527.txt', 0.09498685425916992),
 ('doc409.txt', 0.09380464687746644),
 ('doc1080.txt', 0.09347627215069507),
 ('doc282.txt', 0.09296509163086875),
 ('doc121.txt', 0.08704986395167301),
 ('doc1183.txt', 0.08679012876250579),
 ('doc289.txt', 0.08506502643891949),
 ('doc907.txt', 0.08325558437234795),
 ('doc126.txt', 0.0818121685735506),
 ('doc695.txt', 0.08163024421184224),
 ('doc103.txt', 0.08162972944624305),
 ('doc721.txt', 0.08030291418129155),
 ('doc1184.txt', 0.07955235363275141),
 ('doc563.txt', 0.07829026775104976),
 ('doc622.txt', 0.07785301640893755),
 ('doc536

### Create dataframe and excel file from the Cosine similarity:

In [228]:
df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
df.to_excel("cosine_similarity.xlsx")