# Foundations of Data Science Final - Question 3

### Group members: Almut Bohnhoff, Anastasiya Strohnova, Kitti Kresznai, Natalie Schober

In [12]:
import numpy as np
import os


# building a word dictionary from the input list of documents
def compute_dict(path):
    """Build a corpus-wide word count and a per-document word list.

    Every entry of the directory ``path`` is opened as a text file. Each line
    is lower-cased, stripped of a fixed set of punctuation characters and
    split on whitespace.

    Args:
        path: Directory containing the ``.txt`` documents.

    Returns:
        A tuple ``(frequency, separate_text_documents)``. ``frequency`` maps
        each word to its total number of occurrences across all documents;
        ``separate_text_documents`` maps each file name to the list of its
        words in reading order. Files are processed in sorted name order, so
        the key order of both dictionaries is reproducible.

    Raises:
        ValueError: If an entry in ``path`` does not end in ``.txt``.
    """
    # One translation table, built once, deletes every unwanted punctuation
    # character in a single pass per line.
    strip_punctuation = str.maketrans('', '', '.,!?(){}[];/\\:')

    frequency = {}
    separate_text_documents = {}

    # os.listdir() gives entries in arbitrary order; sorting keeps the
    # vocabulary order identical from run to run and machine to machine.
    for file in sorted(os.listdir(path)):
        # error handling: check if the input files are text files
        if not file.endswith('.txt'):
            raise ValueError('The files found in the specified folder are not txt files.')

        text = []
        with open(os.path.join(path, file)) as f:
            for line in f:
                words = line.lower().translate(strip_punctuation).split()
                text.extend(words)
                for word in words:
                    frequency[word] = frequency.get(word, 0) + 1
        separate_text_documents[file] = text

    return frequency, separate_text_documents


# creating a word vector for each document, and for the search document
def create_word_vectors(frequency, documents, search_doc):
    """Encode each document and the search text as binary word vectors.

    Vector positions follow the iteration order of ``frequency``. An entry is
    1 when the corresponding word occurs in the text and 0 otherwise.

    Args:
        frequency: Mapping whose keys form the vocabulary (values are unused).
        documents: Mapping of document name to its list of words.
        search_doc: Raw search text. It is lower-cased, stripped of a fixed
            set of punctuation characters and split on whitespace before
            being matched against the vocabulary.

    Returns:
        A tuple ``(list_of_word_vectors, search_doc_vector)``: a dict mapping
        each document name to its vector, and the vector of the search text.
    """
    vocabulary = list(frequency)

    def to_vector(words):
        # Build the set once per text: set membership is constant-time,
        # whereas testing against the word list would rescan the whole
        # document for every vocabulary entry.
        present = set(words)
        return [1 if word in present else 0 for word in vocabulary]

    list_of_word_vectors = {
        file: to_vector(words) for file, words in documents.items()
    }

    # turn the search document into a word vector, cleaning it with the
    # same punctuation set used for the corpus documents
    strip_punctuation = str.maketrans('', '', '.,!?(){}[];/\\:')
    cleaned_search_doc = search_doc.lower().translate(strip_punctuation).split()
    search_doc_vector = to_vector(cleaned_search_doc)

    # return the word vectors per document, and the search document vector
    return list_of_word_vectors, search_doc_vector

#provide a list of documents that are similar to the given search
# document, in descending order of their similarity with the search document
def calculate_similarity(word_vectors, search_doc_vector):
    """Rank documents against the search vector by two measures.

    For every entry of ``word_vectors`` the dot product with, and the
    Euclidean distance to, ``search_doc_vector`` are computed.

    Args:
        word_vectors: Mapping of document name to its word vector.
        search_doc_vector: Word vector of the search document.

    Returns:
        A two-line string. The first line lists the document names by
        descending dot product, the second by ascending Euclidean distance;
        both are rendered as ``dict_keys([...])``.
    """
    query = np.array(search_doc_vector)

    dot_scores = {}
    distances = {}
    for name, raw_vector in word_vectors.items():
        candidate = np.array(raw_vector)
        dot_scores[name] = np.dot(candidate, query)
        distances[name] = np.linalg.norm(candidate - query)

    # sorted() is stable, so documents with equal scores keep their input
    # order in both rankings.
    by_dot = dict.fromkeys(sorted(dot_scores, key=dot_scores.get, reverse=True))
    by_distance = dict.fromkeys(sorted(distances, key=distances.get))

    dot_line = 'Result for dot product: {}'.format(by_dot.keys())
    distance_line = '\nResult for euclidean distance: {}'.format(by_distance.keys())
    return dot_line + distance_line


#The final function to call
def text_similarity(path, search_doc):
    """Rank the text files in ``path`` by similarity to ``search_doc``.

    Chains the three pipeline steps: build the vocabulary and per-document
    word lists, turn documents and search text into word vectors, then score
    the vectors.

    Args:
        path: Directory holding the documents to compare against.
        search_doc: The search text.

    Returns:
        The result string produced by ``calculate_similarity``.
    """
    vocabulary, corpus = compute_dict(path)
    doc_vectors, query_vector = create_word_vectors(vocabulary, corpus, search_doc)
    return calculate_similarity(doc_vectors, query_vector)



# Testing: run the whole pipeline once and print the two rankings.
# NOTE(review): the path below is specific to one machine; point it at a
# local folder that contains only .txt files (compute_dict raises ValueError
# for any other entry) before running.
path = "/Users/kitti/Documents/Python Scripts/Spyder Scripts/Test_textfiles/"
search_doc = 'The sky is blue, and the grass is green.'
print(text_similarity(path, search_doc))

Result for dot product: dict_keys(['file.txt', 'file2.txt', 'file3.txt'])
Result for euclidean distance: dict_keys(['file.txt', 'file2.txt', 'file3.txt'])


'C:\\Users\\kitti\\Documents\\Python Scripts\\Foundations-of-Data-Science-Final-main'