###  Script aims to cross-reference an excel file (containing a dictionary) with a word document by searching for dictionary entries within the text of the document.

### EN text:

In [8]:
import pandas as pd
import re
from docx import Document
import warnings

warnings.filterwarnings('ignore')

def extract_text_runs(paragraph):
    return ''.join([run.text for run in paragraph.runs])

# Read xlsx file
excel_file = 'dictionary.xlsx'
df = pd.read_excel(excel_file, engine='openpyxl')
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Load Word doc
docx_file = 'en.docx'
document = Document(docx_file)

# Extract all text from the Word document, including runs
docx_text = ' '.join([extract_text_runs(paragraph) for paragraph in document.paragraphs])

# Find values in the Word document
found_values_en = []
for value in df.iloc[:, 0]:  # Reads 1 col
    if isinstance(value, str):
        search_pattern = re.compile(r'\b' + re.escape(value) + r'\w*', re.IGNORECASE)  # Updated pattern to match strings that start with the given value
        matches = search_pattern.finditer(docx_text)
        if matches:
            found_values_en.extend(match.group() for match in matches)


print("Found values (without duplicates):", ', '.join(set(map(str, found_values_en))))
print("Number of words: ", len(set(found_values_en)))

Found values (without duplicates): metric, Delivery, scorecards, Account Manager, Business Coach, Scorecard, scorecard, compliance, Compliance
Number of words:  9


In [9]:
# run in order to see original amount of matches
print("Found values (with duplicates):", ', '.join(set(map(str, found_values_en))))
print("Number of words: ", len(found_values_en))

Found values (with duplicates): metric, Delivery, scorecards, Account Manager, Business Coach, Scorecard, scorecard, compliance, Compliance
Number of words:  38


### DE text:

In [10]:
# import pandas as pd
# import re
# from docx import Document

def extract_text_runs(paragraph):
    return ''.join([run.text for run in paragraph.runs])

# Read xlsx file
excel_file = 'dictionary.xlsx'
df = pd.read_excel(excel_file, engine='openpyxl')
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Load Word doc
docx_file = 'de.docx'
document = Document(docx_file)

# Extract all text from the Word document, including runs
docx_text = ' '.join([extract_text_runs(paragraph) for paragraph in document.paragraphs])

# Find values in the Word document
found_values_de = []
for value in df.iloc[:, 2]:  # Reads 1 col
    if isinstance(value, str):
        search_pattern = re.compile(r'\b' + re.escape(value) + r'\w*', re.IGNORECASE)  # Updated pattern to match strings that start with the given value
        matches = search_pattern.finditer(docx_text)
        if matches:
            found_values_de.extend(match.group() for match in matches)



print("Found values (excluding duplicates):", ', '.join(set(map(str, found_values_de))))
print("Number of words: ", len(set(found_values_de)))

Found values (excluding duplicates): Fahrzeuginspektion, Sicherheit, Metrikziele, Account Manager, Scorecards, Delivery Vehicle Inspection Checklist (DVIC), Business Coach, Metrik, Einhaltungsmetrik, Metrikdefinition, Scorecard, Kundenfeedback, Einhaltung, Metrikbeiträge, Metriken, Fahrer, Metrikgewichtungen
Number of words:  17


In [11]:
print("Found values (including duplicates):", ', '.join(map(str, found_values_de)))
print("Number of words: ", len(found_values_de))

Found values (including duplicates): Account Manager, Account Manager, Account Manager, Business Coach, Einhaltungsmetrik, Einhaltung, Einhaltung, Einhaltungsmetrik, Einhaltungsmetrik, Einhaltung, Einhaltung, Einhaltung, Einhaltung, Einhaltung, Einhaltung, Fahrer, Sicherheit, Sicherheit, Scorecard, Scorecard, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecards, Scorecard, Scorecard, Scorecard, Scorecard, Fahrzeuginspektion, Account Manager, Account Manager, Account Manager, Kundenfeedback, Delivery Vehicle Inspection Checklist (DVIC), Metrikziele, Metrik, Metrikziele, Metrikdefinition, Metrikdefinition, Metrikziele, Metrikdefinition, Metrik, Metrik, Metrikgewichtungen, Metrikgewichtungen, Metrikgewichtungen, Metrikbeiträge, Metriken, Metrik
Number of words:  55
