## Install pymupdf before running the code! ##

In [None]:
!pip install pymupdf

## Main code ##

In [1]:
import pymupdf
import re
from math import sqrt

# step 2: bag of words creation
def vectorize_text(sentences, words):
    """Convert text into a single bag-of-words count vector.

    Args:
        sentences: iterable of strings; each one is split on whitespace.
        words: list of words that make up the bag, one vector slot per entry.

    Returns:
        List of ints of length ``len(words)`` where element ``i`` is the number
        of times ``words[i]`` occurs in ``sentences``. If a word is listed more
        than once in ``words``, only its first slot is incremented.
    """
    # Map every word to its first position once, instead of scanning the
    # list for each token; setdefault keeps the earliest index, exactly
    # like list.index would.
    positions = {}
    for position, word in enumerate(words):
        positions.setdefault(word, position)

    vector = [0] * len(words)
    for sentence in sentences:
        for token in sentence.split():
            position = positions.get(token)
            if position is not None:
                vector[position] += 1
    return vector

# step 1: parsing pdf to text and its tokenization
def parse_file(file, stop_words=None, word_counts=None):
    """Extract the text of a PDF, split it into sentences and count its words.

    Every page is lower-cased and its "fi"/"fl" ligatures are expanded. Then
    each character that is neither a word character nor a full stop is
    replaced by a space, digit runs are removed, repeated whitespace is
    collapsed and the text is split on full stops.

    Args:
        file: PDF to read, passed as-is to ``pymupdf.open``.
        stop_words: optional collection of words to leave out of the counts.
            Defaults to the notebook-level ``stopwords``.
        word_counts: optional dict updated in place with
            ``word -> number of occurrences``. Defaults to the notebook-level
            ``dict_for_bag``.

    Returns:
        List of sentence strings from all pages, in page order. Stop words
        and short words are still present in the sentences; they are only
        excluded from the counts.
    """
    if stop_words is None:
        stop_words = stopwords
    if word_counts is None:
        word_counts = dict_for_bag
    excluded = set(stop_words)  # constant-time membership test inside the word loop

    sentences = []
    # the context manager closes the document even if a page fails to parse
    with pymupdf.open(file) as pdf:
        for page in pdf:
            # expand the f-ligatures so that e.g. "ﬁnd" is counted as "find"
            text = page.get_text().replace("ﬁ", "fi").replace("ﬂ", "fl").lower()
            # Replace everything except word characters and full stops with
            # spaces. Inside a character class "|" is a literal, so it must
            # not be listed, otherwise pipes would survive the cleaning.
            text = re.sub(r"[^\w.]", " ", text)
            text = re.sub(r"\d+\s*", " ", text)  # drop digit runs
            text = re.sub(r"\s{2,}", " ", text)  # collapse repeated whitespace
            text = re.sub(r"\s\.", ".", text)  # no whitespace before a full stop
            for sentence in text.split("."):
                sentences.append(sentence)
                for word in sentence.split():
                    # skip short words and stop words
                    if len(word) <= 2 or word in excluded:
                        continue
                    word_counts[word] = word_counts.get(word, 0) + 1
    return sentences

# cosine similarity without numpy
def cosine_similarity(first_vector, second_vector):
    """Return the cosine of the angle between two equally long vectors.

    Args:
        first_vector: sequence of numbers.
        second_vector: sequence of numbers of the same length.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        ``0.0`` if either vector has zero length (all zeros or empty), since
        the angle is undefined there. ``None``, after printing an error
        message, if the vectors differ in length.
    """
    if len(first_vector) != len(second_vector):
        print("Error: vectors have different dimmentionality")
        return None
    dot_product = sum(a * b for a, b in zip(first_vector, second_vector))
    len_first = sqrt(sum(x ** 2 for x in first_vector))
    len_second = sqrt(sum(x ** 2 for x in second_vector))
    # guard against division by zero for all-zero or empty vectors
    if len_first == 0 or len_second == 0:
        return 0.0
    return dot_product / (len_first * len_second)

In [2]:
# stop words are taken from the NLTK English stop-word list, since this is a high quality source
stopwords = ['i','me','my','myself','we','our','ours','ourselves','you',"you're","you've","you'll","you'd",'your','yours','yourself','yourselves','he','him','his','himself','she',"she's",'her','hers','herself','it',"it's",'its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that',"that'll",'these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don',"don't",'should',"should've",'now','d','ll','m','o','re','ve','y','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'ma','mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn',"wouldn't"]

# make sure to rename the article files as below and put them next to the notebook
dist_rep = "Distributed Representations.pdf"
att_is_all = "Attention is All You Need.pdf"


# parse_file adds word -> number of occurrences to this shared dict for every parsed file;
# it is reset here so that re-running the cell does not double the counts
dict_for_bag = {}
# parsing texts
dist_rep_in_sent = parse_file(dist_rep)
att_is_all_in_sent = parse_file(att_is_all)

# step 2: shrinking bag of words
# keeping only the words with at least 4 appearances across both articles
# this shrinks the bag of words to approximately 400 words
amount_of_appearances_to_stay = 4
dict_for_bag = {
    word: count
    for word, count in dict_for_bag.items()
    if count >= amount_of_appearances_to_stay
}

word_list = list(dict_for_bag.keys())
# check out words from bag of words, if you want
#print(f"Bag of words: {word_list}")
print(f"Bag of words size: {len(word_list)}")

# step 3: generating vectors for both texts and calculating cosine similarity
dist_rep_vec = vectorize_text(dist_rep_in_sent, word_list)
# stored under its own name so that att_is_all keeps holding the file path
att_is_all_vec = vectorize_text(att_is_all_in_sent, word_list)

print(f"Cosine similarity of two articles: {cosine_similarity(dist_rep_vec, att_is_all_vec)}")

Bag of words size: 392
Cosine similarity of two articles: 0.3444794116278851


## Conclusion ##

Step 4: interpret cosine similarity

For bag-of-words count vectors, which have no negative components, cosine similarity ranges between 0 and 1: 0 means that the texts share no words from the bag, and 1 means that their word-frequency vectors point in the same direction (as they do for identical texts). In this case, I got a cosine similarity of 0.344, which means that the articles discuss related topics or themes, but they likely have distinct perspectives, writing styles, or focus areas. I think the articles share some conceptual overlap and vocabulary, but they are not highly similar.