In [33]:
from docx import Document
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import xml.etree.ElementTree as ET
from docx.opc.constants import RELATIONSHIP_TYPE as RT

from lxml import etree
import zipfile

In [31]:
def extract_comments(filename):
    """Return the plain text of every comment stored in a .docx file.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``docx.Document``.

    Returns
    -------
    list of str
        One entry per ``w:comment`` element, in document order of the
        comments part. An empty list is returned when the document has no
        comments relationship.
    """
    doc = Document(filename)
    try:
        # The comments relationship belongs to the main document part, not to
        # the package itself; asking the package raised KeyError every time,
        # which made this function always return [].
        comments_part = doc.part.part_related_by(
            'http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments'
        )
    except KeyError:
        return []
    root = ET.fromstring(comments_part.blob)
    # Match the fully-qualified tag so only w:comment elements are selected.
    comment_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}comment'
    # The visible text sits in nested w:p/w:r/w:t descendants, so `elem.text`
    # alone is None/whitespace; itertext() concatenates every text node.
    return [''.join(elem.itertext()) for elem in root.iter(comment_tag)]

In [34]:
# Prefix -> namespace URI map used by the XPath queries on WordprocessingML.
ooXMLns = {'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
def get_document_comments(docxFileName):
    """Read all comments of a .docx file straight from its zip archive.

    Parameters
    ----------
    docxFileName : str, path-like or binary file-like
        Anything ``zipfile.ZipFile`` accepts.

    Returns
    -------
    dict
        Maps each comment's ``w:id`` attribute (a string) to the comment's
        concatenated text. Empty when the archive has no
        ``word/comments.xml`` member.
    """
    # The context manager closes the archive handle even if reading fails.
    with zipfile.ZipFile(docxFileName) as docxZip:
        try:
            commentsXML = docxZip.read('word/comments.xml')
        except KeyError:
            # ZipFile.read raises KeyError for a missing member, i.e. the
            # document carries no comments part at all.
            return {}
    et = etree.XML(commentsXML)
    comments_dict = {}
    for c in et.xpath('//w:comment', namespaces=ooXMLns):
        comment_id = c.xpath('@w:id', namespaces=ooXMLns)[0]
        # string(.) concatenates every text node below the comment element.
        comments_dict[comment_id] = c.xpath('string(.)', namespaces=ooXMLns)
    return comments_dict

In [36]:
def paragraph_comments(paragraph,comments_dict):
    """Collect the text of every comment referenced from a paragraph's runs.

    Parameters
    ----------
    paragraph : object exposing ``runs``
        Each run must expose ``_r.xpath(str)`` returning the matching child
        elements (as python-docx runs do).
    comments_dict : dict
        Maps comment id strings to comment text.

    Returns
    -------
    list
        Comment texts in the order their references appear in the runs.

    Raises
    ------
    KeyError
        If a referenced comment id is not present in ``comments_dict``.
    """
    # Clark-notation name of the w:id attribute; reading it with .get() works
    # on any lxml element and needs no namespace map.
    id_attr = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id'
    comments = []
    for run in paragraph.runs:
        # Walk every reference in the run, not just the first one, so extra
        # commentReference children of the same run are not dropped.
        for reference in run._r.xpath("./w:commentReference"):
            comments.append(comments_dict[reference.get(id_attr)])
    return comments

In [35]:
def comments_with_reference_paragraph(docxFileName):
    """Pair each commented paragraph of a .docx file with its comments.

    The file is opened with ``Document`` and its comments are fetched with
    ``get_document_comments``; every paragraph is then checked with
    ``paragraph_comments``.

    Returns a list of single-entry dicts ``{paragraph text: [comment, ...]}``,
    one per paragraph that has at least one comment, in paragraph order.
    """
    parsed_doc = Document(docxFileName)
    id_to_text = get_document_comments(docxFileName)
    pairs = []
    # Without any comments no paragraph can contribute an entry.
    if not id_to_text:
        return pairs
    for para in parsed_doc.paragraphs:
        found = paragraph_comments(para, id_to_text)
        if not found:
            continue
        pairs.append({para.text: found})
    return pairs

In [62]:
# Locations of the three test documents.
# NOTE(review): path_A and path_B are absolute, user-specific paths, so the
# notebook only runs on the original machine; consider deriving them from a
# configurable data directory.
path_A = r"C:\Users\frbre\OneDrive\01 Dokumenter\01 Uni\SDS Thesis\data\Word_test\Lorem ipsum_A.docx"
path_B = "C:/Users/frbre/OneDrive/01 Dokumenter/01 Uni/SDS Thesis/data/Word_test/Lorem ipsum_B.docx"
# Raw string: "\d", "\W" and "\L" are invalid escape sequences in a normal
# literal and trigger a warning on current Python; the value is unchanged.
path_C = r".\data\Word_test\Lorem ipsum dolor_C.docx"

In [37]:
# Build the comment-id -> comment-text mapping for document A.
# NOTE(review): this cell's execution count (37) is lower than that of the
# cell defining path_A (62), so the saved output may predate the current
# paths; confirm with Restart & Run All.
doc_A = get_document_comments(path_A)
# Bare expression so the notebook displays the mapping.
doc_A

{'0': '35', '1': '02', '2': '28', '3': '04'}

In [39]:
# Pair every commented paragraph of document A with its comment texts.
# NOTE(review): executed as In [39], before the cell defining path_A (In [62]);
# the saved output may be stale.
ref_doc_A = comments_with_reference_paragraph(path_A)
# Bare expression so the notebook displays the list of {paragraph text: comments} dicts.
ref_doc_A

[{'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam vehicula tellus ut sem rhoncus sodales. Mauris porta ultricies ligula, sit amet placerat diam tristique nec. Nullam id orci efficitur justo fringilla malesuada in eu tortor. Maecenas lectus sem, porta in sapien vel, finibus dictum tellus. Pellentesque aliquam elit in tellus efficitur rhoncus. Integer id lacinia nisi, non elementum quam. Proin eros nunc, aliquet eget blandit in, efficitur et lacus. Mauris egestas ultrices lacus sit amet consectetur. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc at diam quis nulla rhoncus aliquet. Suspendisse in elit non nibh porttitor gravida nec eget velit. Etiam rutrum bibendum nulla, vel pellentesque sem. Vivamus aliquet vitae ipsum ut auctor. Vestibulum bibendum condimentum turpis sit amet laoreet. Nunc fermentum blandit sapien, sed pulvinar lorem laoreet id. Suspendisse vitae sagittis dolor, a viverra turpis. ': ['35']},
 {'Sed tristique at neque ut porttitor. Nullam

In [63]:
# Build the comment-id -> comment-text mapping for document B.
doc_B = get_document_comments(path_B)
# Bare expression so the notebook displays the mapping.
doc_B

{'0': '08', '1': '11', '2': '25', '3': '01', '4': '34', '5': '17'}

In [64]:
# Pair every commented paragraph of document B with its comment texts.
ref_doc_B = comments_with_reference_paragraph(path_B)
# Bare expression so the notebook displays the list of {paragraph text: comments} dicts.
ref_doc_B

[{'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec at molestie nisl, malesuada tempus lectus. Mauris feugiat rutrum mauris, in dictum est volutpat a. Pellentesque dictum metus id mi aliquam ultrices. Donec nec est et magna mollis consectetur. Sed tincidunt magna in enim efficitur ullamcorper. Proin facilisis eleifend urna, eget fermentum odio eleifend a. Integer molestie facilisis nulla vitae mattis. Cras non pretium arcu. Nulla facilisi. Morbi quis commodo libero. ': ['08']},
 {'Cras mattis ipsum ac nisi suscipit, quis semper lectus porta. Aliquam erat volutpat. Quisque ac sem tempor mauris hendrerit consectetur. Vestibulum volutpat lorem vel nisi ullamcorper semper. Vestibulum eget tortor a ligula semper rhoncus non ut sem. Curabitur scelerisque mollis euismod. In interdum laoreet purus ut sollicitudin. Phasellus ut tempor dolor. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nam interdum lacus sed mi dictum euismod. ': ['11']},
 {'Aliquam ac mauris eu e