In [1]:
import re
import os

### For the English and Hindi Datasets

In [6]:
def clean_text(text):
    """
    Reduce text to lowercase Devanagari / ASCII letters separated by single spaces.

    Every character that is not in U+0900-U+097F, not an ASCII letter and not
    whitespace is deleted outright (it is not replaced by a space, so
    "don't" becomes "dont"). Whitespace runs are then collapsed to one space,
    the ends are trimmed, and the result is lowercased.

    Args:
    - text: Raw input string.

    Returns:
    - The cleaned, lowercased string.
    """
    # Drop everything outside the allowed character set.
    letters_only = re.compile(r'[^\u0900-\u097Fa-zA-Z\s]').sub('', text)
    # Squeeze whitespace runs to a single space and trim both ends.
    squeezed = re.compile(r'\s+').sub(' ', letters_only).strip()
    return squeezed.lower()

In [7]:
# Raw inputs and the paths their cleaned versions are written to.
english_uncleaned_file = "english_uncleaned_dataset.txt"
english_outfile_file = "english_dataset.txt"

indian_language_uncleaned_file = "indian_language_uncleaned_dataset.txt"
indian_language_outfile_file = "indian_language_dataset.txt"

# Each dataset is read whole, passed through clean_text, and written out.
for raw_path, cleaned_path in (
    (english_uncleaned_file, english_outfile_file),
    (indian_language_uncleaned_file, indian_language_outfile_file),
):
    with open(raw_path, 'r', encoding='utf-8') as infile, \
            open(cleaned_path, 'w', encoding='utf-8') as outfile:
        outfile.write(clean_text(infile.read()))

### For the Marathi Dataset

In [13]:
def extract_sentences(input_dir, output_file):
    """
    Concatenate every ``.cms.txt`` file found under ``input_dir`` into one file.

    The directory tree is walked recursively in sorted order, so the merged
    output is the same on every run regardless of the order in which the
    filesystem lists entries. Each file's content is written followed by a
    newline. Files that cannot be read or decoded as UTF-8 are reported on
    stdout and skipped.

    Args:
    - input_dir: Root directory to search recursively.
    - output_file: Path to the output file; it is overwritten if it exists.
    """
    output_abspath = os.path.abspath(output_file)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for root, dirs, files in os.walk(input_dir):
            # Sorting in place makes os.walk descend into subdirectories
            # in a stable, reproducible order.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith('.cms.txt'):
                    continue
                file_path = os.path.join(root, file)
                # Never feed the output back into itself when it lives
                # under input_dir and happens to match the suffix.
                if os.path.abspath(file_path) == output_abspath:
                    continue
                try:
                    with open(file_path, 'r', encoding='utf-8') as infile:
                        content = infile.read()
                except (OSError, UnicodeError) as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                # Written outside the try block so a failure to write the
                # output is not misreported as a read error and swallowed.
                outfile.write(content + '\n')

# Corpus root to scan, and the file that receives the merged text.
input_directory, output_filepath = 'Downloads/MarathiData', 'Downloads/MarathiData/output.txt'

extract_sentences(input_dir=input_directory, output_file=output_filepath)

In [17]:
def clean_tags(input_file, output_file):
    """
    Copy ``input_file`` to ``output_file`` line by line with markup tags removed.

    On each line the ``<doc>``, ``</doc>``, ``<text>`` and ``</text>`` tags are
    deleted, then any ``<docno>...</docno>`` element is deleted together with
    its content. Matching is case-sensitive. Lines that are empty or
    whitespace-only after this are dropped; all others are written unchanged
    apart from the removed tags.

    Args:
    - input_file: Path to the input file.
    - output_file: Path to the output file.
    """
    wrapper_tag = re.compile(r'</?(doc|text)>')
    docno_element = re.compile(r'<docno>.*?</docno>')

    with open(input_file, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for raw_line in infile:
                # Wrapper tags go first, then the whole docno element.
                untagged = docno_element.sub('', wrapper_tag.sub('', raw_line))
                if not untagged.strip():
                    # Nothing but whitespace remains: skip the line.
                    continue
                outfile.write(untagged)

# Strip the markup tags from the merged file into a new file.
input_filepath, output_filepath = "Downloads/MarathiData/output.txt", "Downloads/MarathiData/output_no_tags.txt"

clean_tags(input_file=input_filepath, output_file=output_filepath)

In [None]:
def clean_and_convert_to_single_line(input_file, output_file):
    """
    Strip bracketed datetime stamps and flatten the document onto one line.

    A stamp is a bracketed string of the form
    ``[ Weekday, Month D, YYYY HH:MM:SS am ]`` (AM/PM in either case). After
    the stamps are removed, every run of whitespace, newlines included, is
    replaced by a single space and leading/trailing whitespace is dropped.

    Args:
    - input_file: Path to the input file.
    - output_file: Path to the output file.
    """
    bracketed_timestamp = re.compile(
        r'\[\s*[A-Za-z]+,\s+[A-Za-z]+\s+\d{1,2},\s+\d{4}\s+\d{2}:\d{2}:\d{2}\s+[apAP][mM]\s*\]'
    )

    with open(input_file, 'r', encoding='utf-8') as infile:
        raw_text = infile.read()

    without_timestamps = bracketed_timestamp.sub('', raw_text)
    # split() with no argument breaks on any whitespace, so joining the
    # pieces with a space yields a single line.
    flattened = ' '.join(without_timestamps.split())

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write(flattened)

# Remove datetime stamps from the tag-free file and flatten it to one line.
input_filepath, output_filepath = "Downloads/MarathiData/output_no_tags.txt", "Downloads/MarathiData/output_cleaned.txt"

clean_and_convert_to_single_line(input_file=input_filepath, output_file=output_filepath)

In [15]:
def remove_punctuation_and_normalize(input_file, output_file):
    """
    Remove punctuation (including Marathi punctuation) and normalize spacing.

    The characters ``! ? . , : ; ' " ( ) [ ] { }`` and the Devanagari danda
    (U+0964) and double danda (U+0965) are deleted. They are not replaced by
    a space. Every run of whitespace is then collapsed to a single space and
    leading/trailing whitespace is dropped, so the output is one line.

    Args:
    - input_file: Path to the input file.
    - output_file: Path to the output file.
    """
    # The danda characters are written as \u escapes so the pattern cannot be
    # corrupted by a mis-decoded source file. The earlier literal had turned
    # into Cyrillic letters, which left the dandas in place and stripped
    # those letters from the text instead.
    punctuation_pattern = r'[\u0964\u0965!?.,:;\'"()\[\]{}]+'

    with open(input_file, 'r', encoding='utf-8') as infile:
        content = infile.read()

    # Remove punctuation
    content = re.sub(punctuation_pattern, '', content)
    # Normalize spaces
    normalized_content = ' '.join(content.split())

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write(normalized_content)

# Final stage: strip punctuation from the flattened text to produce the dataset file.
input_filepath, output_filepath = "Downloads/MarathiData/output_cleaned.txt", "Downloads/MarathiData/marathi_dataset.txt"

remove_punctuation_and_normalize(input_file=input_filepath, output_file=output_filepath)