In [27]:
import os
import regex as re
import pickle
import math

def get_documents(dir, suffix):
    """Return the names of the entries in *dir* whose name ends with *suffix*.

    Only bare entry names are returned (not joined with *dir*), in whatever
    order ``os.listdir`` yields them. The match is a plain string-suffix
    test, so ``"txt"`` also matches a name such as ``"notes_txt"``.
    """
    return [name for name in os.listdir(dir) if name.endswith(suffix)]

def get_index(folder, documents):
    """Build a positional inverted index for *documents* and pickle it.

    Each file ``folder/document`` is read and scanned for runs of
    Latin-script letters (``\\p{Latin}+``; ``re`` in this file is the
    third-party ``regex`` package, which supports Unicode script
    properties). Words are lower-cased before being used as keys.

    Args:
        folder: Directory containing the document files.
        documents: Iterable of file names inside *folder*.

    Returns:
        A dict ``{word: {document: [start_offset, ...]}}`` where each offset
        is the character position of an occurrence in the document text.

    Side effects:
        Writes the index to ``master_index.idx`` in the current working
        directory using ``pickle``.
    """
    # Raw string: "\p" is not a valid Python string escape.
    word_pattern = re.compile(r"\p{Latin}+")

    index = dict()
    for document in documents:
        with open(os.path.join(folder, document)) as open_document:
            text = open_document.read()

        for match in word_pattern.finditer(text):
            word = match.group(0).lower()
            positions = index.setdefault(word, dict()).setdefault(document, list())
            positions.append(match.start())

    # Use a context manager so the pickle file is flushed and closed.
    with open('master_index.idx', "wb") as index_file:
        pickle.dump(index, index_file)
    return index


def get_total_number_words_in(document, folder='Selma'):
    """Return the number of words in the file ``folder/document``.

    A word is a maximal run of Latin-script letters (``\\p{Latin}+``), counted
    on the lower-cased text.

    Args:
        document: File name of the document.
        folder: Directory holding the document; defaults to ``'Selma'``.

    Returns:
        The word count as an int.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(os.path.join(folder, document)) as open_document:
        text = open_document.read()
    return len(re.findall(r'\p{Latin}+', text.lower()))


def term_frequency(term, document, document_number_words, index):
    """Return the relative frequency of *term* in *document*.

    Args:
        term: The word to look up.
        document: Document name; must be a key of *document_number_words*.
        document_number_words: Mapping ``{document: total word count}``.
        index: Mapping ``{word: {document: [positions, ...]}}``.

    Returns:
        ``occurrences / total words``, or ``0`` when the term does not occur
        in the document or the document has no words.

    Raises:
        KeyError: If *document* is missing from *document_number_words*.
    """
    words = document_number_words[document]
    positions = index.get(term, {}).get(document)
    # Handle the two expected "no data" cases explicitly rather than
    # swallowing every exception with a bare except.
    if not positions or not words:
        return 0
    return len(positions) / words

def inverse_document_frequency(term, documents, index):
    """Return the base-10 inverse document frequency of *term*.

    Args:
        term: The word to look up.
        documents: Sequence of document names forming the corpus.
        index: Mapping ``{word: {document: [positions, ...]}}``.

    Returns:
        ``log10(len(documents) / n)`` where ``n`` is the number of entries of
        *documents* that contain *term*, or ``0`` when no document does.
    """
    postings = index.get(term, {})
    # A membership test replaces the bare try/except used as control flow.
    document_count = sum(1 for document in documents if document in postings)
    if document_count == 0:
        return 0
    return math.log(len(documents) / document_count, 10)

def tf_idf(documents, index):
    """Compute the TF-IDF weight of every indexed term in every document.

    Args:
        documents: Sequence of document names forming the corpus.
        index: Mapping ``{word: {document: [positions, ...]}}``.

    Returns:
        A dict ``{document: {term: weight}}`` with an entry for every term
        of *index* in every document (``0`` where the term is absent).

    Word totals come from ``get_total_number_words_in``, which reads each
    document from disk.
    """
    document_number_words = {
        document: get_total_number_words_in(document) for document in documents
    }
    weights = {document: dict() for document in documents}

    for term in index:
        # The IDF depends only on the term, so compute it once per term
        # rather than once per (term, document) pair.
        idf = inverse_document_frequency(term, documents, index)
        for document in documents:
            tf = term_frequency(term, document, document_number_words, index)
            weights[document][term] = tf * idf
    return weights

def compare_documents(documents, index, tf_idf):
    """Return the pairwise cosine similarity of the documents' TF-IDF vectors.

    Args:
        documents: Sequence of document names.
        index: Mapping whose keys are the terms spanning the vector space.
        tf_idf: Mapping ``{document: {term: weight}}`` with a weight for
            every term of *index*.

    Returns:
        A dict ``{document_1: {document_2: similarity}}``. The similarity of
        a document with itself is reported as ``0.0`` (self-pairs are
        skipped), and a pair involving an all-zero vector is ``0.0``.
    """
    # Euclidean norm of each document's weight vector.
    norm = dict()
    for document in documents:
        ins_sqr = 0
        for term in index:
            ins_sqr += math.pow(tf_idf[document][term], 2)
        norm[document] = math.sqrt(ins_sqr)

    cos_sim = {document: dict() for document in documents}
    for document_1 in documents:
        for document_2 in documents:
            denominator = norm[document_1] * norm[document_2]
            # Self-pairs are skipped, and a zero norm means the cosine is
            # undefined; report 0.0 instead of raising ZeroDivisionError.
            if document_1 == document_2 or denominator == 0:
                cos_sim[document_1][document_2] = 0.0
                continue
            dot_product = 0
            for term in index:
                dot_product += tf_idf[document_1][term] * tf_idf[document_2][term]
            cos_sim[document_1][document_2] = dot_product / denominator

    return cos_sim

# Index every entry of the "Selma" folder whose name ends in "txt".
documents = get_documents("Selma", "txt")
index = get_index('Selma', documents)

# NOTE: this rebinds the name ``tf_idf`` from the function to its result.
tf_idf = tf_idf(documents, index)
print(tf_idf["bannlyst.txt"]["et"])

cos_sim = compare_documents(documents, index, tf_idf)
print(cos_sim)

# Print each document's similarity scores, one value per line, with a
# blank separator before each document's group.
for document, scores in cos_sim.items():
    print("\n")
    for second_doc, score in scores.items():
        print("{0:.6f}".format(score) + "|")



6.2846093167673765e-06
{'bannlyst.txt': {'bannlyst.txt': 0.0, 'gosta.txt': 0.005229877077878004, 'herrgard.txt': 0.0012990047699401013, 'jerusalem.txt': 0.007318172253124071, 'kejsaren.txt': 0.0021458005452060277, 'marbacka.txt': 0.005096498280768068, 'nils.txt': 0.005343494386428176, 'osynliga.txt': 0.0059846289209750676, 'troll.txt': 0.007336242553428816}, 'gosta.txt': {'bannlyst.txt': 0.005229877077878004, 'gosta.txt': 0.0, 'herrgard.txt': 0.015142549355563969, 'jerusalem.txt': 0.008540109292371853, 'kejsaren.txt': 0.0065098346457967485, 'marbacka.txt': 0.02856110652389178, 'nils.txt': 0.025006368728038183, 'osynliga.txt': 0.05574757215207852, 'troll.txt': 0.032494044336534515}, 'herrgard.txt': {'bannlyst.txt': 0.0012990047699401013, 'gosta.txt': 0.015142549355563969, 'herrgard.txt': 0.0, 'jerusalem.txt': 0.007621512812700546, 'kejsaren.txt': 0.0008924889218526751, 'marbacka.txt': 0.011697992749605354, 'nils.txt': 0.014146983656255958, 'osynliga.txt': 0.02125636466641977, 'troll.txt