### Import packages

In [11]:
import GUI
import nltk
from nltk.stem.snowball import SnowballStemmer  # type: ignore
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np # type: ignore
import pandas as pd # type: ignore
from icecream import ic
import tkinter as tk
from tkinter import filedialog

### Function to create dataframe and excel file

In [12]:
# function to create the excel files
def create_excel_dataframe(input , name, columns, index):
    df = pd.DataFrame(input, columns = columns, index = index) # Create a pandas dataframe
    df.to_excel(name) # Create an excel file
    return df

### Download the **stopwords** 

In [13]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/glados/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Remove the punctuation from the docs

In [14]:
def convert_to_list(doc) -> list:
    doc = doc.split()
    return doc

In [15]:
stemmer = SnowballStemmer(language='english')
dictionary = []

### Download wordnet and create a method to check that the word have meaning or not.

In [16]:
nltk.download('wordnet')

def is_meaningful(word) -> bool:
    
    if wn.synsets(word):
        return True
    else: 
        return False    

[nltk_data] Downloading package wordnet to /home/glados/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Method to create the dictionary

In [17]:
def create_dictionary(doc, dictionary) -> list:
    for word_index, word in enumerate(doc):
        
        word = word.lower()
        
        if word.isalnum() and word not in stop_words and is_meaningful(word) and len(word) > 1:
            
            stem_word = stemmer.stem(word)
            doc[word_index] = stem_word
            
            if stem_word not in dictionary:
                dictionary.append(stem_word)
    
    return doc

### Method to join the words together with one space distance

In [18]:
def join_document(doc) -> str:
    doc = ' '.join(doc)
    return doc

### Methods which contain the methods to process the document

In [19]:
def process_document(document) -> str:
    
    document = convert_to_list(document)
    
    document = create_dictionary(document, dictionary)
    
    document = join_document(document)
    
    return document

### Select a dataset to extract, then extracted the results in the two type of documents, the processed docs goes to **docs** directory, and the original docs saved to **Original docs** directory.

In [20]:
raw_document = GUI.select_file() # Select File GUI implementation

raw_dataset = raw_document # the raw dataset that read from the one file

raw_dataset = raw_dataset.split(".I") # split the docs by index

raw_dataset.remove(raw_dataset[0]) # remove the empty first index

doc_index = 0
    

# write each index in 'raw_dataset'
for doc in raw_dataset:
    doc_index += 1
    doc_path = "./docs" + "/doc" + str(doc_index) + ".txt"
    doc = process_document(doc)
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)

doc_index = 0

for doc in raw_dataset:
    doc_index += 1
    doc_path = "./Original docs" + "/docs" + str(doc_index) + ".txt"
    
    
    with open(doc_path, "w") as document: # write each index into the separated doc
        document.write(doc)

GUI.custom_window("700x35", "Extract Documents", f"There have been {doc_index} documents are extracted.")

### Dictionary:

In [21]:
df = pd.DataFrame(dictionary, columns=["Words"])
df

Unnamed: 0,Words
0,experiment
1,investig
2,aerodynam
3,wing
4,slipstream
...,...
3416,intuit
3417,squir
3418,standoff
3419,recover


In [22]:
df.to_excel("dictionary.xlsx")

### Initialized the numpy arrays for TF, IDF, TF-IDF:

In [23]:
term_doc_matrix = np.zeros((len(dictionary), doc_index))
tf_array = np.zeros((len(dictionary), doc_index))
idf_array = np.zeros((len(dictionary), 1))
tf_idf_array = np.zeros((len(dictionary), doc_index))

### Methods to calculate the TF, DF:

In [24]:
def calculate_term_frequency(doc, dictionary, doc_no):
    for word in dictionary:
        if word in doc:
            frequency = doc.count(word)
            row = dictionary.index(word)
            column = doc_no - 1
            tf_array[row, column] += frequency


def calculate_document_frequency(doc, dictionary):
    for word in dictionary:
        if word in doc:
            row = dictionary.index(word)
            idf_array[row, 0] += 1

### Reads the Documents one by one to calculate the TF, DF:

In [25]:
doc_names_list = []
for num in range(1, doc_index + 1):
    doc_path = "./docs" + "/doc" + str(num) + ".txt"
    doc_names_list.append("doc" + str(num) + ".txt")
    with open(doc_path, "r") as file:
        doc = file.read()
        calculate_term_frequency(doc, dictionary, num)
        calculate_document_frequency(doc, dictionary)

### Calculate the Term Frequency **(TF)** for each word in each document:

$$
TF_{(w,d)}
=
\begin{cases}
\text{ if } \;\;\; tf_{(w,d)} > 0  \;\;\;\;\;\;\;\; 1 + \log(tf_{(w,d)})\\
\text{otherwise} \;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\;\; 0 
\end{cases}
$$

In [26]:
row, column = np.shape(tf_array)

for r in range(row):
    for c in range(column):
        if tf_array[r, c] > 0:
            extracted_element = tf_array[r, c]
            tf_array[r, c] = 1 + np.log10(extracted_element)
        else:
            tf_array[r, c] = 0

            
GUI.custom_window("700x35", "Complete!", "The calculation of Term Frequency is have been successful.")

### Calculate Inverse Document Frequency **(IDF)** for each word:

$$
IDF_{(w)} = \log(\frac{N}{df_{w}})
$$

In [27]:
count_of_documents = column
row, column = np.shape(idf_array)

for r in range(row):
    if idf_array[r, 0] > 0:
        extracted_element = idf_array[r, 0]
        idf_array[r, 0] = np.log10(count_of_documents / extracted_element)
    else:
        idf_array[r, 0] = 0

GUI.custom_window("700x35", "Complete!", "The calculation of Inverse Document Frequency is have been successful.")

### Calculating the TF-IDF by multiplying peer to peer each row of TF array into IDF array:

In [28]:

row, column = np.shape(tf_idf_array)
r, c = 0, 0
while r < row:
        tf_idf_array[r, c]= tf_array[r, c] * idf_array[r, 0]
        c += 1
        if c == column:
            r += 1
            c = 0

### Export the TF, IDF, TF-IDF dataframes:

In [29]:
tf_dataframe = create_excel_dataframe(tf_array, "tf_excel.xlsx", doc_names_list, dictionary)

idf_dataframe = create_excel_dataframe(idf_array, "idf_excel.xlsx", ["IDF"], dictionary)

tf_idf_dataframe = create_excel_dataframe(tf_idf_array, "tf_idf_excel.xlsx", doc_names_list, dictionary)

GUI.custom_window("700x35", "Excel File", "Creating the Excel file is DONE!")

In [30]:
tf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,1.477121,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,1.0,1.0,0.00000,0.0,0.0
investig,1.301030,1.00000,0.0,0.00000,0.0,0.00000,1.0,1.00000,1.477121,0.0,...,0.0,0.000000,1.0,0.000000,1.0,0.0,0.0,0.00000,0.0,1.0
aerodynam,1.301030,0.00000,0.0,0.00000,1.0,0.00000,0.0,0.00000,0.000000,0.0,...,1.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
wing,1.602060,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
slipstream,1.778151,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
squir,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0
standoff,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0
recover,0.000000,0.00000,0.0,0.00000,0.0,0.00000,0.0,0.00000,0.000000,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.0,0.0,0.00000,0.0,0.0


In [31]:
idf_dataframe

Unnamed: 0,IDF
experiment,0.610834
investig,0.586221
aerodynam,0.876615
wing,0.786293
slipstream,1.970037
...,...
intuit,3.146128
squir,2.367977
standoff,2.845098
recover,1.714764


In [32]:
tf_idf_dataframe

Unnamed: 0,doc1.txt,doc2.txt,doc3.txt,doc4.txt,doc5.txt,doc6.txt,doc7.txt,doc8.txt,doc9.txt,doc10.txt,...,doc1391.txt,doc1392.txt,doc1393.txt,doc1394.txt,doc1395.txt,doc1396.txt,doc1397.txt,doc1398.txt,doc1399.txt,doc1400.txt
experiment,0.902276,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.610834,0.000000,0.000000,0.000000,0.610834,0.610834,0.000000,0.0,0.000000
investig,0.762692,0.586221,0.0,0.000000,0.000000,0.000000,0.586221,0.586221,0.865920,0.0,...,0.000000,0.000000,0.586221,0.000000,0.586221,0.000000,0.000000,0.000000,0.0,0.586221
aerodynam,1.140503,0.000000,0.0,0.000000,0.876615,0.000000,0.000000,0.000000,0.000000,0.0,...,0.876615,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
wing,1.259688,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
slipstream,3.503023,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
intuit,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,3.146128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
squir,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,2.367977,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
standoff,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,2.845098,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
recover,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,1.714764,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


### **Query**:

In [118]:
query = input("Please enter what you want to search")

In [119]:
query_lower = query.lower()

In [120]:
query_words_list = query_lower.split()

In [121]:
temp = []
for word in query_words_list:
    if word not in stop_words and is_meaningful(word) and len(word) > 1:
        temp.append(word)  
query_words_list = temp

In [122]:
temp = [stemmer.stem(word) for word in query_words_list]
query_words = set(temp) 
query_words_list = temp
query_words_list

['paper', 'flow', 'visual', 'slender', 'conic', 'wing']

In [123]:
query_words_list = " ".join(query_words_list)
query_words_list

'paper flow visual slender conic wing'

In [124]:
string = [query_words_list]

In [125]:
query_words

{'conic', 'flow', 'paper', 'slender', 'visual', 'wing'}

### Calculating the **TF-IDF** of **Query**:

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
 
result = tfidf.fit_transform(string)

query_tfidf = np.squeeze(result.toarray())
query_tfidf

array([0.40824829, 0.40824829, 0.40824829, 0.40824829, 0.40824829,
       0.40824829])

In [127]:
query_words = list(tfidf.vocabulary_.keys())
query_words

['paper', 'flow', 'visual', 'slender', 'conic', 'wing']

#### Make the values of the words in query which is not present in dictionary documents to Zero(0).
for corresponding the query words and the dictionary, we obtain the indexes of the words in the dictionary, to for later change the rows of the tf_idf array.

In [128]:
query_words_in_dictionary = {}

for index, word in enumerate(query_words):
    
    if word not in dictionary:
        query_tfidf[index] = 0
    else:
        query_words_in_dictionary.update({word : dictionary.index(word)})
   
      
query_words_in_dictionary

{'paper': 455,
 'flow': 46,
 'visual': 2405,
 'slender': 614,
 'conic': 530,
 'wing': 3}

In [129]:
query_tfidf

array([0.40824829, 0.40824829, 0.40824829, 0.40824829, 0.40824829,
       0.40824829])

### Add pad 0 to query_tfidf to make it equal to the tf-idf of the documents.

In [130]:
row, column = np.shape(tf_idf_array)

In [131]:
query_tfidf = np.pad(query_tfidf, (0, row - len(query_tfidf)), 'constant', constant_values = (0))

### Move the values to the Corresponding index.

In [132]:
for count, value in enumerate(query_tfidf):
    if count < len(query_words):
        if value > 0:
            if query_words[count] != dictionary.index(query_words[count]):
                dest_index = dictionary.index(query_words[count])
                query_tfidf[dest_index] = value
                query_tfidf[count] = 0     

In [133]:
query_tfidf

array([0., 0., 0., ..., 0., 0., 0.])

In [134]:
np.shape(query_tfidf)

(3421,)

In [135]:
np.shape(tf_idf_dataframe["doc1.txt"].tolist())

(3421,)

### Calculate the **Cosine** similarity:

In [136]:
cosine_values = []
cosine_docs = []


for number in range(1, column + 1):
    
    doc_name = "doc"+ str(number) +".txt"
    doc_column = tf_idf_dataframe[doc_name].tolist()
    
    nominator = np.dot(query_tfidf, doc_column)
    
    doc_column_norm = np.linalg.norm(doc_column)
    query_tfidf_norm = np.linalg.norm(query_tfidf)
    
    denominator = doc_column_norm * query_tfidf_norm
    
    cosine_theta = nominator / denominator
    
    cosine_values.append(cosine_theta)
    cosine_docs.append(doc_name)

### Rank the **Cosine** Similarity:
Create a dictionary based on the *cosine_docs* and *cosine_values*, get the **items** from the created dictionary; and set the **key** argument equal to the value of the item; for the descending order, set the reverse argument to *True*.  

In [137]:
cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
cosine_rank

[('doc513.txt', 0.19607482853978583),
 ('doc633.txt', 0.17426988318102674),
 ('doc752.txt', 0.15542166632949103),
 ('doc683.txt', 0.1506298113376101),
 ('doc465.txt', 0.1441580383192786),
 ('doc601.txt', 0.1430629649785663),
 ('doc225.txt', 0.1408328477831821),
 ('doc680.txt', 0.1396010123458158),
 ('doc1197.txt', 0.13566204781162672),
 ('doc791.txt', 0.13503131972668983),
 ('doc545.txt', 0.13383587621702397),
 ('doc222.txt', 0.13322429744435446),
 ('doc1186.txt', 0.13264543295499495),
 ('doc288.txt', 0.13145920406995124),
 ('doc901.txt', 0.12752738339636596),
 ('doc902.txt', 0.12649629159171163),
 ('doc935.txt', 0.12031328714163154),
 ('doc247.txt', 0.11991277478133805),
 ('doc420.txt', 0.11830732045163463),
 ('doc921.txt', 0.11793800573249373),
 ('doc250.txt', 0.11660749412670403),
 ('doc1058.txt', 0.11589789621466565),
 ('doc45.txt', 0.1153669848910106),
 ('doc191.txt', 0.11506140717077723),
 ('doc602.txt', 0.11391494630350565),
 ('doc60.txt', 0.11389106095533336),
 ('doc19.txt', 0.

### Create dataframe and excel file from the Cosine similarity:

In [138]:
df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
df.to_excel("cosine_similarity.xlsx")

In [139]:

window = tk.Tk()
window.geometry("200x200")

def choose():
    global temp
    files_path = filedialog.askopenfilenames(
                parent = window,
                initialdir = "./Original docs",
                title = "Choose the docs",
                filetypes = [("text name","*.txt")]
                )
    
    temp = []
    for path in files_path:
        with open(path, "r") as f:
            temp.append(f.read())
            
    window.destroy()
    
button = tk.Button(window,
                   text = 'Select',
                   command = choose,
                   font = ('Arial', 10))
button.pack()

window.mainloop()

print(temp)

[' 1\n.T\nexperimental investigation of the aerodynamics of a\nwing in a slipstream .\n.A\nbrenckman,m.\n.B\nj. ae. scs. 25, 1958, 324.\n.W\nexperimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling ef

In [140]:
np.shape(temp)

(5,)

In [141]:
selected_docs = ' '.join(temp)
docs_query = selected_docs + query
docs_query

" 1\n.T\nexperimental investigation of the aerodynamics of a\nwing in a slipstream .\n.A\nbrenckman,m.\n.B\nj. ae. scs. 25, 1958, 324.\n.W\nexperimental investigation of the aerodynamics of a\nwing in a slipstream .\n  an experimental study of a wing in a propeller slipstream was\nmade in order to determine the spanwise distribution of the lift\nincrease due to slipstream at different angles of attack of the wing\nand at different free stream to slipstream velocity ratios .  the\nresults were intended in part as an evaluation basis for different\ntheoretical treatments of this problem .\n  the comparative span loading curves, together with\nsupporting evidence, showed that a substantial part of the lift increment\nproduced by the slipstream was due to a /destalling/ or\nboundary-layer-control effect .  the integrated remaining lift\nincrement, after subtracting this destalling lift, was found to agree\nwell with a potential flow theory .\n  an empirical evaluation of the destalling eff

In [142]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [143]:
query_lower = docs_query.lower()
query_words_list = query_lower.split()
query_words_list

['1',
 '.t',
 'experimental',
 'investigation',
 'of',
 'the',
 'aerodynamics',
 'of',
 'a',
 'wing',
 'in',
 'a',
 'slipstream',
 '.',
 '.a',
 'brenckman,m.',
 '.b',
 'j.',
 'ae.',
 'scs.',
 '25,',
 '1958,',
 '324.',
 '.w',
 'experimental',
 'investigation',
 'of',
 'the',
 'aerodynamics',
 'of',
 'a',
 'wing',
 'in',
 'a',
 'slipstream',
 '.',
 'an',
 'experimental',
 'study',
 'of',
 'a',
 'wing',
 'in',
 'a',
 'propeller',
 'slipstream',
 'was',
 'made',
 'in',
 'order',
 'to',
 'determine',
 'the',
 'spanwise',
 'distribution',
 'of',
 'the',
 'lift',
 'increase',
 'due',
 'to',
 'slipstream',
 'at',
 'different',
 'angles',
 'of',
 'attack',
 'of',
 'the',
 'wing',
 'and',
 'at',
 'different',
 'free',
 'stream',
 'to',
 'slipstream',
 'velocity',
 'ratios',
 '.',
 'the',
 'results',
 'were',
 'intended',
 'in',
 'part',
 'as',
 'an',
 'evaluation',
 'basis',
 'for',
 'different',
 'theoretical',
 'treatments',
 'of',
 'this',
 'problem',
 '.',
 'the',
 'comparative',
 'span',
 '

In [144]:
temp = []
for word in query_words_list:
    if word not in stop_words and is_meaningful(word) and len(word) > 1:
        temp.append(word)  
query_words_list = temp
query_words_list

['experimental',
 'investigation',
 'aerodynamics',
 'wing',
 'slipstream',
 'experimental',
 'investigation',
 'aerodynamics',
 'wing',
 'slipstream',
 'experimental',
 'study',
 'wing',
 'propeller',
 'slipstream',
 'made',
 'order',
 'determine',
 'distribution',
 'lift',
 'increase',
 'due',
 'slipstream',
 'different',
 'angles',
 'attack',
 'wing',
 'different',
 'free',
 'stream',
 'slipstream',
 'velocity',
 'ratios',
 'results',
 'intended',
 'part',
 'evaluation',
 'basis',
 'different',
 'theoretical',
 'treatments',
 'problem',
 'comparative',
 'span',
 'loading',
 'together',
 'supporting',
 'showed',
 'substantial',
 'part',
 'lift',
 'increment',
 'produced',
 'slipstream',
 'due',
 'effect',
 'integrated',
 'remaining',
 'lift',
 'subtracting',
 'found',
 'agree',
 'well',
 'potential',
 'flow',
 'theory',
 'empirical',
 'evaluation',
 'effects',
 'made',
 'specific',
 'configuration',
 'experiment',
 'simple',
 'shear',
 'flow',
 'past',
 'flat',
 'plate',
 'incompress

In [145]:
temp = [stemmer.stem(word) for word in query_words_list]
query_words = set(temp) 
query_words_list = temp
query_words

{'aerodynam',
 'aeronaut',
 'agre',
 'also',
 'analyt',
 'angl',
 'approxim',
 'attack',
 'basi',
 'bodi',
 'boundari',
 'classic',
 'compar',
 'comparison',
 'composit',
 'conduct',
 'configur',
 'conic',
 'consid',
 'constant',
 'curv',
 'depart',
 'determin',
 'differ',
 'discuss',
 'distribut',
 'due',
 'edg',
 'effect',
 'emit',
 'empir',
 'england',
 'equat',
 'evalu',
 'exist',
 'experi',
 'experiment',
 'expos',
 'featur',
 'flat',
 'flow',
 'fluid',
 'found',
 'free',
 'gradient',
 'heat',
 'high-spe',
 'incompress',
 'increas',
 'increment',
 'input',
 'institut',
 'integr',
 'intend',
 'intern',
 'investig',
 'laminar',
 'layer',
 'lead',
 'libbi',
 'lift',
 'linear',
 'load',
 'made',
 'may',
 'must',
 'necessari',
 'nose',
 'novel',
 'obtain',
 'one',
 'one-dimension',
 'order',
 'origin',
 'outsid',
 'paper',
 'part',
 'past',
 'plate',
 'polytechn',
 'possibl',
 'potenti',
 'present',
 'pressur',
 'problem',
 'produc',
 'propel',
 'rate',
 'ratio',
 'recent',
 'region',


In [146]:
query_words_list = " ".join(query_words_list)

string = [query_words_list]

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
 
result = tfidf.fit_transform(string)

query_tfidf = np.squeeze(result.toarray())

query_words = list(tfidf.vocabulary_.keys())

In [147]:
query_words_in_dictionary = {}

for index, word in enumerate(query_words):
    
    if word not in dictionary:
        query_tfidf[index] = 0
    else:
        query_words_in_dictionary.update({word : dictionary.index(word)})
        
        
row, column = np.shape(tf_idf_array)

query_tfidf = np.pad(query_tfidf, (0, row - len(query_tfidf)), 'constant', constant_values = (0))

In [148]:
for count, value in enumerate(query_tfidf):
    if count < len(query_words):
        if value > 0:
            if query_words[count] != dictionary.index(query_words[count]):
                dest_index = dictionary.index(query_words[count])
                query_tfidf[dest_index] = value
                query_tfidf[count] = 0           

In [149]:
cosine_values = []
cosine_docs = []  

for number in range(1, column + 1):
    
    doc_name = "doc"+ str(number) +".txt"
    doc_column = tf_idf_dataframe[doc_name].tolist()
    
    nominator = np.dot(query_tfidf, doc_column)
    
    doc_column_norm = np.linalg.norm(doc_column)
    query_tfidf_norm = np.linalg.norm(query_tfidf)
    
    denominator = doc_column_norm * query_tfidf_norm
    
    cosine_theta = nominator / denominator
    
    cosine_values.append(cosine_theta)
    cosine_docs.append(doc_name)

In [150]:
cosine_rank = sorted(dict(zip(cosine_docs, cosine_values)).items(), key = lambda x: x[1], reverse = True)
cosine_rank

[('doc2.txt', 0.40500554560432606),
 ('doc3.txt', 0.20574347040062585),
 ('doc5.txt', 0.1958340230795956),
 ('doc1149.txt', 0.18282790296495868),
 ('doc666.txt', 0.17854159238255077),
 ('doc134.txt', 0.1753591901494408),
 ('doc4.txt', 0.1735007770829745),
 ('doc1198.txt', 0.15448725758934187),
 ('doc508.txt', 0.15191031805239402),
 ('doc294.txt', 0.1462475386223091),
 ('doc1076.txt', 0.14027839273793655),
 ('doc1104.txt', 0.13961975581612057),
 ('doc1307.txt', 0.1390400392614134),
 ('doc962.txt', 0.13898212334461135),
 ('doc295.txt', 0.12722141625901925),
 ('doc25.txt', 0.1268819165556121),
 ('doc160.txt', 0.12577037801430166),
 ('doc568.txt', 0.12575065587053927),
 ('doc1383.txt', 0.12552857235858011),
 ('doc1281.txt', 0.12548272588065737),
 ('doc318.txt', 0.12479077850528907),
 ('doc94.txt', 0.12475585875737417),
 ('doc688.txt', 0.12468632406561245),
 ('doc421.txt', 0.12455953870853918),
 ('doc610.txt', 0.12368531413803635),
 ('doc655.txt', 0.12273995178624877),
 ('doc992.txt', 0.120

In [151]:
df = pd.DataFrame([doc[1] for doc in cosine_rank], index = [doc[0] for doc in cosine_rank], columns = ['cosine'])
df.to_excel("cosine_similarity_rerank.xlsx")