In [64]:
import win32com.client

In [92]:
# Obtain a Word.Application COM object and keep its window hidden while the
# notebook drives it.
word = win32com.client.Dispatch("Word.Application")
word.Visible = False  

In [94]:
# Open the input file through Word and extract its text into `text`.
# NOTE(review): the path is an absolute, machine-specific location, and the
# document is not closed in this cell.
doc = word.Documents.Open(r"C:\Users\91943\OneDrive\Desktop\Coding_Test_1\test_folder\input.docx")
text = doc.Content.Text

In [41]:
import re


Extracting quoted regions

In [95]:
# (start, end) offsets of every quoted passage in `text`; a passage may open
# with a straight or left curly quote and close with a straight or right curly one.
quoted_spans = [quote_match.span() for quote_match in re.finditer(r'["“](.+?)["”]', text)]


In [45]:
# Check whether an index falls inside a quoted region
def is_inside_quotes(index, spans=None):
    """Return True when ``index`` lies inside any half-open ``(start, end)`` span.

    Args:
        index: Character offset to test.
        spans: Iterable of ``(start, end)`` pairs. When omitted, the
            notebook-level ``quoted_spans`` list is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        bool: True if ``start <= index < end`` for at least one span.
    """
    if spans is None:
        # Fall back to the module-level list built from the input document.
        spans = quoted_spans
    return any(start <= index < end for start, end in spans)


In [46]:
# Report how many quoted spans were detected.
print(f"Found {len(quoted_spans)} quoted sections.")


Found 12 quoted sections.


Defining replacement rules (UK terminology)

In [102]:
import requests

# Load full American to British spelling dictionary from GitHub.
url = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/american_spellings.json"

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear exception instead
# of an obscure JSON decoding failure.
response = requests.get(url, timeout=30)
response.raise_for_status()
american_to_british_dict = response.json()

print(f"✅ Loaded {len(american_to_british_dict)} American to British spelling rules.")


✅ Loaded 1800 American to British spelling rules.


In [97]:
# Abbreviation -> spelled-out phrase (keys are the lowercase forms only)
acronym_expansions = dict(
    eg="for example",
    etc="and so on",
    ie="that is",
    vs="versus",
)

# Build one case-insensitive, whole-word regex per American spelling, paired
# with the British spelling that should replace it.
compiled_replacements = []
for am_word, br_word in american_to_british_dict.items():
    whole_word_regex = re.compile(rf'\b{re.escape(am_word)}\b', flags=re.IGNORECASE)
    compiled_replacements.append((whole_word_regex, br_word))

# corrected_text will be the final output.

# Lower-cased American -> British lookup. Tokens produced by the split below
# are either pure word-character runs or pure separator runs, so a whole-word,
# case-insensitive regex can only ever match a token in its entirety; a single
# dict probe therefore replaces the scan over every compiled pattern.
# Each token is converted at most once (no chained re-substitution).
spelling_lookup = {
    am_word.lower(): br_word for am_word, br_word in american_to_british_dict.items()
}


def _convert_unquoted(chunk):
    """Return ``chunk`` with acronyms expanded and American spellings replaced.

    Punctuation and spacing are preserved. Tokens that start with an uppercase
    letter, or that are entirely uppercase, are left untouched (likely proper
    nouns or acronyms).
    """
    # The capturing group keeps the separators in the token list.
    tokens = re.split(r'(\W+)', chunk)
    for j, token in enumerate(tokens):
        if not token.strip():
            continue  # empty or whitespace-only token
        if token[0].isupper() or token.isupper():
            continue  # capitalised word or all-caps acronym
        lowered = token.lower()
        if lowered in acronym_expansions:
            tokens[j] = acronym_expansions[lowered]
        elif lowered in spelling_lookup:
            tokens[j] = spelling_lookup[lowered]
    return ''.join(tokens)


# Walk the quoted spans once: convert the text between them and copy the quoted
# text through verbatim. Every iteration moves the cursor forward, so the loop
# always terminates.
pieces = []
cursor = 0
for start, end in sorted(quoted_spans):
    if start < cursor:
        continue  # defensive: ignore a span overlapping one already copied
    pieces.append(_convert_unquoted(text[cursor:start]))
    pieces.append(text[start:end])
    cursor = end
pieces.append(_convert_unquoted(text[cursor:]))
corrected_text = ''.join(pieces)

print("✅ Conversion completed.")
print("\nPreview:\n", corrected_text[:300])


✅ Conversion completed.

Preview:
 On Monday, we organise a significant conference for example in London, hosted by the World Health Organization. The event aims to address pressing global policy issues, and Dr Manmohan Singh will deliver the opening remarks. Dr Manmohan Singh has been a key figure in economic policy, and he plans to


Task 2 

In [104]:
import collections

# Honorifics that may introduce a person's name (extend the list if needed)
titles = ["Dr", "Mr", "Mrs", "Ms", "Prof"]

# Matches "<title> <Firstname> <Lastname>", e.g. "Dr Manmohan Singh"
title_alternation = '|'.join(titles)
name_pattern = re.compile(r'\b(?:' + title_alternation + r')\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b')

# Insertion-ordered record of each titled full name found in the text
full_name_mentions = collections.OrderedDict()
# Tally keyed by last name, filled in while scanning the text
last_name_count = collections.Counter()

# Find all full names with titles.
# last_name_count must record how many distinct people carry a last name, not
# how many times a name is mentioned: counting every mention would push anyone
# mentioned twice above 1 and make them look like they share their surname.
seen_people = set()
for match in name_pattern.finditer(corrected_text):
    full_name = match.group()
    title, first, last = full_name.split()
    person = (first, last)
    if person not in seen_people:
        seen_people.add(person)
        last_name_count[last] += 1
    full_name_mentions[full_name] = {"title": title, "first": first, "last": last}

print("✅ Found full titled names:")
for name in full_name_mentions:
    print("-", name)


✅ Found full titled names:
- Dr Manmohan Singh
- Dr Aishwarya Rai


Replacing repeated full names with title and last name

In [99]:
# Function to do the name substitution
def replace_names(text, mentions=None, last_name_counts=None):
    """Shorten repeated titled full names to "<title> <last name>".

    The first mention of each full name is kept in full; every later mention
    is replaced by the title plus last name. Names whose last name count is
    not exactly 1 are left untouched everywhere.

    Args:
        text: The text to process.
        mentions: Mapping of full name -> {"title", "first", "last"}.
            Defaults to the notebook-level ``full_name_mentions``.
        last_name_counts: Mapping of last name -> count. Defaults to the
            notebook-level ``last_name_count``.

    Returns:
        str: The text with later mentions shortened.
    """
    if mentions is None:
        mentions = full_name_mentions
    if last_name_counts is None:
        last_name_counts = last_name_count

    replaced = text
    for full_name, meta in mentions.items():
        if last_name_counts[meta["last"]] != 1:
            # Shared last name -> keep the full name always
            continue

        short = f'{meta["title"]} {meta["last"]}'
        pattern = re.compile(rf'\b{re.escape(full_name)}\b')

        first_mention = pattern.search(replaced)
        if first_mention is None:
            continue

        # Split after the first mention and substitute only in the remainder.
        # Rewriting the string at offsets collected beforehand would go wrong
        # once an earlier replacement has shortened the text.
        head = replaced[:first_mention.end()]
        tail = replaced[first_mention.end():]
        # A callable replacement keeps `short` literal (no backslash escapes).
        replaced = head + pattern.sub(lambda _match: short, tail)
    return replaced

# Apply name replacements; corrected_text is rebound to the result returned by
# replace_names.
corrected_text = replace_names(corrected_text)
print("✅ Name shortening rules applied.")


✅ Name shortening rules applied.


Initials in names

In [108]:
# Put a period after a lone capital letter when the next word is capitalised
# (treated as an initial followed by a last name).
# NOTE(review): this also fires on a one-letter word such as "A" or "I" that
# happens to precede a capitalised word - confirm that is acceptable.
initial_before_surname = re.compile(r'\b([A-Z]) (?=[A-Z][a-z]+\b)')
corrected_text = initial_before_surname.sub(r'\1. ', corrected_text)
print("✅ Initials with periods fixed.")


✅ Initials with periods fixed.


In [109]:
# Start Word again
word = win32com.client.Dispatch("Word.Application")
word.Visible = False

# Save the output file (adjust the path as needed)
output_path = r"C:\Users\91943\OneDrive\Desktop\Coding_Test_1\test_folder\output.docx"

# try/finally guarantees the hidden Word instance is shut down even when
# creating or saving the document fails; otherwise an invisible Word process
# would be left running.
try:
    # Create a new document
    doc = word.Documents.Add()
    try:
        # Add "Corrected:" heading followed by the processed text
        doc.Content.Text = "Corrected:\n\n" + corrected_text
        doc.SaveAs(output_path)
    finally:
        # SaveChanges=False: never let the hidden window block on a save prompt
        doc.Close(False)
finally:
    # Close Word
    word.Quit()

print(f"✅ Output saved to:\n{output_path}")


✅ Output saved to:
C:\Users\91943\OneDrive\Desktop\Coding_Test_1\test_folder\output.docx
