# Imports

In [12]:
import os
import win32com.client
from collections import Counter
import re
import math


# Functions

In [20]:
def extract_text_from_doc(doc_path):
    # Create a COM object for Word
    word_app = win32com.client.Dispatch("Word.Application")

    # Open the document
    doc = word_app.Documents.Open(doc_path)

    # Extract text from paragraphs
    text = []
    for paragraph in doc.Paragraphs:
        text.append(paragraph.Range.Text)

    # Close the document and Word application
    doc.Close()
    word_app.Quit()

    return '\n'.join(text)

# calculating bag of words

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = text.split()
    return words

def calculate_bag_of_words(text):
        content = text
        words = preprocess_text(content)
        bag_of_words = Counter(words)
        return bag_of_words

# similarities

def jaccard_similarity(bag1, bag2):
    set1 = set(bag1.keys())
    set2 = set(bag2.keys())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

def cosine_similarity(bag1, bag2):
    dot_product = sum(bag1[word] * bag2.get(word, 0) for word in bag1)
    magnitude1 = math.sqrt(sum(value ** 2 for value in bag1.values()))
    magnitude2 = math.sqrt(sum(value ** 2 for value in bag2.values()))
    return dot_product / (magnitude1 * magnitude2)

def euclidean_distance(bag1, bag2):
    common_words = set(bag1.keys()) & set(bag2.keys())
    squared_diff_sum = sum((bag1[word] - bag2.get(word, 0)) ** 2 for word in common_words)
    return math.sqrt(squared_diff_sum)

# spot different words

def different_words_between_bags(bag1, bag2):
    words_in_bag1 = set(bag1.keys())
    words_in_bag2 = set(bag2.keys())
    
    words_only_in_bag1 = words_in_bag1 - words_in_bag2
    words_only_in_bag2 = words_in_bag2 - words_in_bag1
    
    return words_only_in_bag1, words_only_in_bag2

def different_words_indices(text1, text2):
    words1 = preprocess_text(text1)
    words2 = preprocess_text(text2)
    
    different_indices = []
    
    for index, (word1, word2) in enumerate(zip(words1, words2)):
        if word1 != word2:
            different_indices.append((index, word1))
            
    
    return different_indices


# Paths

In [14]:
# r"path"
mailink_path = 'your_document.doc'
engage_path = 'engage_doc.doc'


# Bag of Words

In [15]:

# extracted_mailink = extract_text_from_doc(mailink_path)
# extracted_engage = extract_text_from_doc(engage_path)

extracted_mailink = "A linear model is a mathematical representation that assumes a linear relationship between input variables and the output."
extracted_engage = "A linear model is a polish representation that assumes a german relationship between input variables and the output again and again."


In [26]:

bag_of_words_mailink = calculate_bag_of_words(extracted_mailink)
bag_of_words_engage = calculate_bag_of_words(extracted_engage)

print("Bag of Words:", bag_of_words_mailink)
print("Bag of Words:", bag_of_words_engage)


print("Euclidean distance: ", euclidean_distance(bag_of_words_mailink, bag_of_words_engage))
# print("Cosine distance: ", cosine_similarity(bag_of_words_mailink, bag_of_words_engage))
# print("Jaccard distance: ", jaccard_similarity(bag_of_words_mailink, bag_of_words_engage))

different_indices = different_words_indices(extracted_engage, extracted_mailink)
different_indices2 = different_words_indices(extracted_mailink, extracted_engage)


print("extracted_engage len", len(extracted_engage.split()))
print("extracted_mailink len", len(extracted_mailink.split()))

print("Different words indices", different_indices2)

print("Different words indices", different_indices)
print("Different words indices", different_indices2)



# print("Different words indices", different_words)









Bag of Words: Counter({'a': 3, 'linear': 2, 'model': 1, 'is': 1, 'mathematical': 1, 'representation': 1, 'that': 1, 'assumes': 1, 'relationship': 1, 'between': 1, 'input': 1, 'variables': 1, 'and': 1, 'the': 1, 'output': 1})
Bag of Words: Counter({'a': 3, 'and': 2, 'again': 2, 'linear': 1, 'model': 1, 'is': 1, 'polish': 1, 'representation': 1, 'that': 1, 'assumes': 1, 'german': 1, 'relationship': 1, 'between': 1, 'input': 1, 'variables': 1, 'the': 1, 'output': 1})
Euclidean distance:  1.4142135623730951
Cosine distance:  0.8453639444116164
Jaccard distance:  0.7777777777777778
extracted_engage len 21
extracted_mailink len 18
Different words indices [(5, 'mathematical'), (10, 'linear')]
Different words indices [(5, 'polish'), (10, 'german')]
Different words indices [(5, 'mathematical'), (10, 'linear')]
