In [None]:
import language_tool_python
import re

tool = language_tool_python.LanguageTool('en-US')

In [24]:
def clean_text(text):
    corrections = []
    
    # Remove extra spaces (more than one space between words)
    new_text = re.sub(r'\s+', ' ', text)
    if new_text != text:
        corrections.append({
            'message': 'Multiple spaces found and replaced with a single space.',
            'original_text': text,
            'fixed_text': new_text
        })
    text = new_text
    
    # Remove spaces before punctuation
    new_text = re.sub(r'\s([?.!"])', r'\1', text)
    if new_text != text:
        corrections.append({
            'message': 'Spaces before punctuation removed.',
            'original_text': text,
            'fixed_text': new_text
        })
    text = new_text
    
    # Remove unwanted characters or special characters (including $$)
    new_text = re.sub(r'[^\w\s.,!?-]', '', text)  # Removing non-alphanumeric characters except common punctuation
    if new_text != text:
        corrections.append({
            'message': 'Removed unwanted special characters (e.g., $$).',
            'original_text': text,
            'fixed_text': new_text
        })
    text = new_text
    
    # Optionally, handle multiple punctuations (like "...." -> ".")
    new_text = re.sub(r'([.?!])\1+', r'\1', text)
    if new_text != text:
        corrections.append({
            'message': 'Reduced multiple punctuation marks to a single one.',
            'original_text': text,
            'fixed_text': new_text
        })
    text = new_text

    return text, corrections

In [31]:
def check_grammar_with_replacements(text):
    # Clean the text before checking for grammar
    cleaned_text, cleaning_corrections = clean_text(text)
    
    # Check for grammar issues using LanguageTool
    matches = tool.check(cleaned_text)
    
    # Split the cleaned text into words and calculate character ranges for each word
    words = cleaned_text.split()
    word_ranges = []
    current_offset = 0


    for word in words:
        start_offset = current_offset
        end_offset = start_offset + len(word)
        word_ranges.append((start_offset, end_offset))
        current_offset = end_offset + 1  # Account for space after each word

    # Prepare a list to store corrections
    corrections = []

    for match in matches:
        # Extract information about the grammar issue
        original_text = text[match.offset: match.offset + match.errorLength]

        # Find the word index using word_ranges
        word_index = next((i for i, (start, end) in enumerate(word_ranges) if start <= match.offset < end),-1)

        # Construct the correction object
        correction = {
            "word_index": word_index,  # Word-based index
            "character_offset": match.offset,  # Character-based offset
            "original_text": original_text,
            "message": match.message,
            "replacements": match.replacements or ["No suggestions"],  # Handle empty replacements
        }

        corrections.append(correction)

    # Return the corrections in a structured format
    return {
        "original_text": text,
        "cleaned_text": cleaned_text,  # Include cleaned version of the text
        "cleaning_corrections": cleaning_corrections,  # Corrections from clean_text()
        "grammar_corrections": correction  # Grammar corrections from LanguageTool
    }



In [32]:
# text = "The quickk brown fox jumps over the lazi dog."
text = "I wentt to the$$ store, I bought milk.I went to the store; I bought milk."
# text = """
# In times of emergency, firemen is the brave ones who risk their lives to save others, while policemen work tirelessly to enforce law and order on our streets. These men are just naturally inclined towards such roles, given there physical strength and cowrage. Firemen and policemen undergo rigorous training that prepare them for the challenging situations they face everyday, showing that some jobs simply fit men better. Women might work as policewomen or lady firefighters, but its often a tough fit for them as compared to their male colleagues. In the business world, a successful businessman is admire for his ability to negotiate and lead a team effectively. Many companies prefer male chairmen since they are knowed for their decisiveness and strategic thinking. Even at lower levels, salesmen is often seen as more persuasive than their female counterparts, as people tend to trust men in these roles. Women on the other hand, usually pursue careers as secretaries or assistants, providing the vital support to their male bosses whom handle the main responsibilities.
# """

# Call the function to get corrections
result = check_grammar_with_replacements(text)

# Print the results
import json
print(json.dumps(result, indent=4))

{
    "original_text": "I wentt to the$$ store, I bought milk.I went to the store; I bought milk.",
    "cleaned_text": "I wentt to the store, I bought milk.I went to the store I bought milk.",
    "cleaning_corrections": [
        {
            "message": "Removed unwanted special characters (e.g., $$).",
            "original_text": "I wentt to the$$ store, I bought milk.I went to the store; I bought milk.",
            "fixed_text": "I wentt to the store, I bought milk.I went to the store I bought milk."
        }
    ],
    "grammar_corrections": {
        "word_index": 7,
        "character_offset": 36,
        "original_text": "k",
        "message": "Add a space between sentences.",
        "replacements": [
            " I"
        ]
    }
}


In [65]:
def wew(text):
    # Check for grammar and spelling issues using LanguageTool
    matches = tool.check(text)

    # Split the text into words and calculate character ranges for each word (including punctuation)
    words, word_ranges = split_text_with_punctuation(text)

    # Prepare a list to store corrections
    corrections = []

    for match in matches:
        # Extract information about the grammar issue
        original_text = text[match.offset: match.offset + match.errorLength]
        
        # Find the word index using word_ranges (improved handling of punctuation)
        word_index = get_word_index_from_offset(word_ranges, match.offset)

        # Construct the correction object
        correction = {
            "word_index": word_index,  # Word-based index
            "character_offset": match.offset,  # Character-based offset
            "original_text": original_text,
            "message": match.message,
            "replacements": match.replacements or ["No suggestions"],  # Handle empty replacements
            "error_category": match.ruleId,  # Include rule category (useful for further insights)
        }

        corrections.append(correction)

    # Return the corrections in a structured format
    return {
        "original_text": text,
        "corrections": corrections
    }

def split_text_with_punctuation(text):
    """Splits the text into words and punctuation while keeping track of offsets."""
    # Regex to split on spaces, punctuation, and special characters
    words_with_punctuation = re.findall(r'\S+|[.,!?;()[]{}":]', text)
    
    word_ranges = []
    current_offset = 0
    for word in words_with_punctuation:
        start_offset = current_offset
        end_offset = start_offset + len(word)
        word_ranges.append((start_offset, end_offset))
        current_offset = end_offset + 1  # Account for space or punctuation after the word
    return words_with_punctuation, word_ranges

def get_word_index_from_offset(word_ranges, offset):
    """Finds the word index based on character offset."""
    for i, (start, end) in enumerate(word_ranges):
        if start <= offset < end:
            return i
    return -1  # Return -1 if no match is found, which shouldn't happen

def fix_punctuation(text):
    """
    Automatically fix common punctuation errors, like incorrect semicolon placement,
    misplaced commas, and missing punctuation marks.
    """
    corrected_text = text

    # Step 1: Remove redundant punctuation marks like '!!', ';;', ',,,', etc.
    corrected_text = re.sub(r'([.,!?;])\1+', r'\1', corrected_text)  # Collapsing multiple punctuation marks into one.

    # Step 2: Ensure there is a space after a period (between sentences)
    corrected_text = re.sub(r'([a-zA-Z0-9])\.(?=\S)', r'\1. ', corrected_text)

    # Step 3: Ensure there is a space after a semicolon (for readability between clauses)
    corrected_text = re.sub(r'([;,])(?=\S)', r'\1 ', corrected_text)

    # Step 4: Fix incorrect semicolon usage: semicolons should separate independent clauses
    corrected_text = re.sub(r'(\w+); (\w+)\s+(\w+)', r'\1; \2 \3', corrected_text)

    # Step 5: Fix extra spaces or missing spaces after punctuation marks
    corrected_text = re.sub(r'\s*([.,!?;()"])', r' \1', corrected_text)
    
    # Step 6: Remove unnecessary spaces before punctuation
    corrected_text = re.sub(r'\s+([.,!?;()"])', r'\1', corrected_text)

    # Step 7: Fix case where punctuation is at the start of the sentence (e.g. "!!I went...")
    corrected_text = re.sub(r'([.,!?;])(?=\S)', r' \1', corrected_text)

    return corrected_text.strip()

In [66]:
# text = "The quickk brown fox jumps over the lazi dog."
text = "!!I bought milk. I went;; to the store; I bought milk."
# text = """
# In times of emergency, firemen is the brave ones who risk their lives to save others, while policemen work tirelessly to enforce law and order on our streets. These men are just naturally inclined towards such roles, given there physical strength and cowrage. Firemen and policemen undergo rigorous training that prepare them for the challenging situations they face everyday, showing that some jobs simply fit men better. Women might work as policewomen or lady firefighters, but its often a tough fit for them as compared to their male colleagues. In the business world, a successful businessman is admire for his ability to negotiate and lead a team effectively. Many companies prefer male chairmen since they are knowed for their decisiveness and strategic thinking. Even at lower levels, salesmen is often seen as more persuasive than their female counterparts, as people tend to trust men in these roles. Women on the other hand, usually pursue careers as secretaries or assistants, providing the vital support to their male bosses whom handle the main responsibilities.
# """

# Call the function to get corrections
result = wew(text)

# Print the results
import json
print(result)
print(json.dumps(result, indent=4))

{'original_text': '!!I bought milk. I went;; to the store; I bought milk.', 'corrections': [{'word_index': 0, 'character_offset': 2, 'original_text': 'I', 'message': 'Add a space between sentences.', 'replacements': [' I'], 'error_category': 'SENTENCE_WHITESPACE'}]}
{
    "original_text": "!!I bought milk. I went;; to the store; I bought milk.",
    "corrections": [
        {
            "word_index": 0,
            "character_offset": 2,
            "original_text": "I",
            "message": "Add a space between sentences.",
            "replacements": [
                " I"
            ],
            "error_category": "SENTENCE_WHITESPACE"
        }
    ]
}
