Displaying the number of .txt files in a Dholuo folder

In [1]:
import os

folder_path = 'Dholuo'  # Path to the "Dholuo" folder

# Function to count .txt files in a directory
def count_txt_files(directory):
    txt_count = 0
    for file_name in os.listdir(directory):
        if file_name.endswith('.txt') and os.path.isfile(os.path.join(directory, file_name)):
            txt_count += 1
    return txt_count

# Count .txt files in the specified folder
txt_file_count = count_txt_files(folder_path)

print(f"Number of .txt files in '{folder_path}': {txt_file_count}")


Number of .txt files in 'Dholuo': 99


Merging the Files

In [2]:
import os
import chardet

# Function to read and process the files
def merge_files(directory, output_file):
    merged_content = []
    
    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'rb') as file:
                raw_data = file.read()
                result = chardet.detect(raw_data)
                encoding = result['encoding']
                if encoding is None:
                    print(f"Could not detect encoding for file {filename}, skipping.")
                    continue
                
                with open(file_path, 'r', encoding=encoding) as text_file:
                    lines = text_file.readlines()
                    for i in range(0, len(lines), 2):
                        if lines[i].startswith("O:") and lines[i+1].startswith("T:"):
                            merged_content.append(lines[i].strip())
                            merged_content.append(lines[i+1].strip())
                        else:
                            print(f"Warning: Mismatched lines in file {filename} at line {i+1}")
    
    # Write the merged content to the output file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for line in merged_content:
            outfile.write(line + '\n')

# Specify the directory containing the files and the output file name
input_directory = 'Dholuo'
output_file = 'combined.txt'

# Merge the files
merge_files(input_directory, output_file)
print(f'Merged content written to {output_file}')




Merged content written to combined.txt


Dislaying the original data

In [3]:

file_path = 'combined.txt'

# Initialize counters
line_count = 0
word_count = 0
char_count = 0

# Open and read the file with utf-8 encoding
with open(file_path, 'r', encoding='utf-8') as file:
    contents = file.read()
    
    # Count lines
    lines = contents.split('\n')
    line_count = len(lines)
    
    # Count words and characters
    for line in lines:
        words = line.split()
        word_count += len(words)
        char_count += len(line)

# Display the file contents
# print("File Contents:")
# print(contents)
# print("\nStatistics:")
print(f"Lines: {line_count}")
# print(f"Words: {word_count}")
# print(f"Characters: {char_count}")


Lines: 8073



Confirming every Original sentence has a translation

In [4]:
def check_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Initialize flags and counters
    previous_was_o = False
    all_correct = True
    o_count = 0
    t_count = 0
    
    for i, line in enumerate(lines):
        if line.startswith('O:'):
            o_count += 1
            if previous_was_o:
                print(f"Line {i} is an 'O:' line without a corresponding 'T:' line.")
                all_correct = False
            previous_was_o = True
        elif line.startswith('T:'):
            t_count += 1
            if not previous_was_o:
                print(f"Line {i+1} is a 'T:' line without a preceding 'O:' line.")
                all_correct = False
            previous_was_o = False
        else:
            print(f"Line {i+1} does not start with 'O:' or 'T:'.")
            all_correct = False
    
    # Check if the file ended with an 'O:' line
    if previous_was_o:
        print(f"The file ends with an 'O:' line without a corresponding 'T:' line.")
        all_correct = False

    if all_correct and o_count == t_count:
        print("All 'O:' and 'T:' lines are correctly paired.")
    else:
        print("There are discrepancies in the 'O:' and 'T:' line pairs.")

# Calling the function with the path to the text file
check_file('combined.txt')


All 'O:' and 'T:' lines are correctly paired.


Cleaning and Testing if it can be tokenized

In [10]:
import re


def clean_text(line):
    # Convert to lowercase
    line = line.lower()
    # Preserve numbers, commas, and full stops; remove other non-alphabetic characters
    cleaned_line = re.sub(r'[^a-z0-9\s,.]', '', line)  # Remove non-alphabetic characters except spaces, numbers, commas, and full stops
    cleaned_line = re.sub(r'\s+', ' ', cleaned_line)  # Replace multiple spaces with a single space
    cleaned_line = cleaned_line.strip()  # Remove leading and trailing whitespace
    # Ensure the line ends with a full stop
    if not cleaned_line.endswith('.'):
        cleaned_line += '.'
    return cleaned_line

def clean_file(input_filename, output_filename):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Initialize flags and a list to hold the valid pairs
    previous_was_o = False
    valid_pairs = []
    
    for i, line in enumerate(lines):
        if line.startswith('O:'):
            if previous_was_o:
                # Previous 'O:' line didn't have a corresponding 'T:' line, skip this 'O:'
                previous_was_o = True
            else:
                previous_was_o = True
                current_o_line = line
        elif line.startswith('T:'):
            if previous_was_o:
                # Valid 'O:' followed by a valid 'T:'
                cleaned_o_line = clean_text(current_o_line[2:])  # Clean the text after 'O: '
                cleaned_t_line = clean_text(line[2:])  # Clean the text after 'T: '
                valid_pairs.append(f"O: {cleaned_o_line}\n")
                valid_pairs.append(f"T: {cleaned_t_line}\n")
                previous_was_o = False
            else:
                # 'T:' line without a preceding 'O:' line, skip this 'T:'
                previous_was_o = False
        else:
            # Line does not start with 'O:' or 'T:', ignore and reset flag
            previous_was_o = False

    # Write the valid pairs to a new file
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.writelines(valid_pairs)

    print(f"Cleaned file written to {output_filename}")

    return valid_pairs

def tokenize_text(text_lines):
    # Join the lines to a single string
    text = ' '.join(text_lines)
    # Tokenize the text
    tokens = word_tokenize(text)
    return tokens

# Call the function with the path to your input and output text files
input_file = 'combined.txt'
output_file = 'N_combined.txt'
cleaned_lines = clean_file(input_file, output_file)


with open(output_file, 'r', encoding='utf-8') as file:
    cleaned_lines = file.readlines()  # Read lines into a list

# Display the first 5 lines
for line in cleaned_lines[:5]:
    print(line.strip())  # Use strip() to remove any leading/trailing whitespace characters


Cleaned file written to N_combined.txt
O: 6am dala fm news 9th march 2021 headline.
T: vidokezi vya habari ya saa kumi na mbili asubuhi tarehe 932021 katika stesheni ya dala.
O: migawo ma ochung ne weche medo remo, kenya national blood transfusion services otudore gi migawo mar coalition of blood for africacoba echenro mar neno ni jopiny ogolo remo maromo units gana 3 ei ndalo 3 manyalo konyo mine mawito remo mathoth kinde magikonyore.
T: wasimamizi wa huduma za kutiwa damu mishapani,kenya national blood transfusion services wameshirikiana na sekta ya coalition of blood for africa coba kuhakikisha kuwa wananchi wametoa damu units tatu kwa siku tatu ambayo inawezakuwasaidia wanawake wanaopoteza damu nyingi wakati wa kujifungua.
O: chienrono ma otisi ni keep mothers alive ibiro itayo egwenge 3 epachoka kaka yor hingo odiochieng mine ebuo piny mangima ka ochopo egikone kawuono.


Separating the lines that start with O: with the ones that start with T:

In [6]:
def separate_file(input_filename, output_filename_o, output_filename_t):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    o_lines = []
    t_lines = []
    
    previous_was_o = False
    
    for line in lines:
        if line.startswith('O:'):
            o_lines.append(line)
            previous_was_o = True
        elif line.startswith('T:'):
            if previous_was_o:
                t_lines.append(line)
                previous_was_o = False
            else:
                previous_was_o = False

    # Write the lines to separate files
    with open(output_filename_o, 'w', encoding='utf-8') as output_file_o:
        output_file_o.writelines(o_lines)
    
    with open(output_filename_t, 'w', encoding='utf-8') as output_file_t:
        output_file_t.writelines(t_lines)

    print(f"'O:' lines written to {output_filename_o}")
    print(f"'T:' lines written to {output_filename_t}")

    return o_lines, t_lines

# Call the function with the path to your input and output text files
input_file = 'N_combined.txt'
output_file_o = 'N_combined_O.txt'
output_file_t = 'N_combined_T.txt'
o_lines, t_lines = separate_file(input_file, output_file_o, output_file_t)

# Display the content of the new cleaned files and count the number of lines
print(f"Number of 'O:' lines: {len(o_lines)}")
print(f"Number of 'T:' lines: {len(t_lines)}")


'O:' lines written to N_combined_O.txt
'T:' lines written to N_combined_T.txt
Number of 'O:' lines: 4036
Number of 'T:' lines: 4036


Removing the O: and T: in each file

In [15]:
def strip_prefixes_and_save(input_filename, output_filename):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    cleaned_lines = []

    for line in lines:
        # Strip leading 'O:' or 'T:' and any surrounding whitespace
        cleaned_line = line.lstrip('O:').lstrip('T:').strip()
        cleaned_lines.append(cleaned_line)

    # Write cleaned lines to output file
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(cleaned_lines))

    # print(f"Cleaned lines written to {output_filename}")

    return cleaned_lines

def count_lines(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return len(lines)

# Clean 'O:' lines file
input_file_o = 'N_combined_O.txt'
output_file_o_cleaned = 'N_combined_O_cleaned.txt'
cleaned_o_lines = strip_prefixes_and_save(input_file_o, output_file_o_cleaned)

# Clean 'T:' lines file
input_file_t = 'N_combined_T.txt'
output_file_t_cleaned = 'N_combined_T_cleaned.txt'
cleaned_t_lines = strip_prefixes_and_save(input_file_t, output_file_t_cleaned)

# Count lines in cleaned 'O:' file
num_o_lines = count_lines(output_file_o_cleaned)
# print(f"Number of lines in 'N_combined_O_cleaned.txt': {num_o_lines}")

# Count lines in cleaned 'T:' file
num_t_lines = count_lines(output_file_t_cleaned)
# print(f"Number of lines in 'N_combined_T_cleaned.txt': {num_t_lines}")

# print(num_o_lines, num_t_lines)


To save the two files in a csv

In [8]:
import csv

def strip_prefixes_and_save(input_filename, output_filename):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    cleaned_lines = []

    for line in lines:
        # Strip leading 'O:' or 'T:' and any surrounding whitespace
        cleaned_line = line.lstrip('O:').lstrip('T:').strip()
        cleaned_lines.append(cleaned_line)

    # Write cleaned lines to output file
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(cleaned_lines))

    print(f"Cleaned lines written to {output_filename}")

    return cleaned_lines

def display_file_contents(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    return lines

def count_lines(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return len(lines)

# Clean 'O:' lines file
input_file_o = 'N_combined_O.txt'
output_file_o_cleaned = 'N_combined_O_cleaned.txt'
cleaned_o_lines = strip_prefixes_and_save(input_file_o, output_file_o_cleaned)

# Clean 'T:' lines file
input_file_t = 'N_combined_T.txt'
output_file_t_cleaned = 'N_combined_T_cleaned.txt'
cleaned_t_lines = strip_prefixes_and_save(input_file_t, output_file_t_cleaned)

# Display cleaned 'O:' lines file and count lines
o_lines = display_file_contents(output_file_o_cleaned)
num_o_lines = count_lines(output_file_o_cleaned)
print(f"Number of lines in 'N_combined_O_cleaned.txt': {num_o_lines}")

# Display cleaned 'T:' lines file and count lines
t_lines = display_file_contents(output_file_t_cleaned)
num_t_lines = count_lines(output_file_t_cleaned)
print(f"Number of lines in 'N_combined_T_cleaned.txt': {num_t_lines}")

# Write to CSV file
csv_filename = 'cleaned_data.csv'

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['O', 'T'])  # Write header
    max_len = max(len(o_lines), len(t_lines))
    for i in range(max_len):
        o = o_lines[i].strip() if i < len(o_lines) else ''
        t = t_lines[i].strip() if i < len(t_lines) else ''
        writer.writerow([o, t])

print(f"Cleaned data written to '{csv_filename}' with {max_len} rows.")

Cleaned lines written to N_combined_O_cleaned.txt
Cleaned lines written to N_combined_T_cleaned.txt
Number of lines in 'N_combined_O_cleaned.txt': 4036
Number of lines in 'N_combined_T_cleaned.txt': 4036
Cleaned data written to 'cleaned_data.csv' with 4036 rows.


Splitting the data to training, Validation and Testing data.

In [9]:
import math

def divide_and_save(input_filename, train_filename, val_filename, test_filename, train_ratio=0.75, val_ratio=0.23):
    with open(input_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    total_lines = len(lines)
    
    # Calculate counts for each set
    train_lines_count = math.ceil(total_lines * train_ratio)
    val_lines_count = math.ceil(total_lines * val_ratio)
    test_lines_count = total_lines - train_lines_count - val_lines_count

    # Divide lines into sets
    train_lines = lines[:train_lines_count]
    val_lines = lines[train_lines_count:train_lines_count + val_lines_count]
    test_lines = lines[train_lines_count + val_lines_count:]

    # Write training lines to file
    with open(train_filename, 'w', encoding='utf-8') as train_file:
        train_file.write(''.join(train_lines))

    # Write validation lines to file
    with open(val_filename, 'w', encoding='utf-8') as val_file:
        val_file.write(''.join(val_lines))

    # Write testing lines to file
    with open(test_filename, 'w', encoding='utf-8') as test_file:
        test_file.write(''.join(test_lines))

    print(f"Training lines ({train_lines_count} lines) written to {train_filename}")
    print(f"Validation lines ({val_lines_count} lines) written to {val_filename}")
    print(f"Testing lines ({test_lines_count} lines) written to {test_filename}")

    return train_lines_count, val_lines_count, test_lines_count, train_lines, val_lines, test_lines

def display_and_count(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    print(f"Contents of '{filename}':")
    for line in lines:
        print(line.rstrip())  # rstrip() to remove newline characters at the end

    num_lines = len(lines)
    print(f"Number of lines in '{filename}': {num_lines}")

    return num_lines, lines

# Clean 'O:' lines file
input_file_o_cleaned = 'N_combined_O_cleaned.txt'
train_file_o = 'N_combined_O_train.txt'
val_file_o = 'N_combined_O_val.txt'
test_file_o = 'N_combined_O_test.txt'
train_count_o, val_count_o, test_count_o, train_lines_o, val_lines_o, test_lines_o = divide_and_save(input_file_o_cleaned, train_file_o, val_file_o, test_file_o)

# Clean 'T:' lines file
input_file_t_cleaned = 'N_combined_T_cleaned.txt'
train_file_t = 'N_combined_T_train.txt'
val_file_t = 'N_combined_T_val.txt'
test_file_t = 'N_combined_T_test.txt'
train_count_t, val_count_t, test_count_t, train_lines_t, val_lines_t, test_lines_t = divide_and_save(input_file_t_cleaned, train_file_t, val_file_t, test_file_t)

Training lines (3027 lines) written to N_combined_O_train.txt
Validation lines (929 lines) written to N_combined_O_val.txt
Testing lines (80 lines) written to N_combined_O_test.txt
Training lines (3027 lines) written to N_combined_T_train.txt
Validation lines (929 lines) written to N_combined_T_val.txt
Testing lines (80 lines) written to N_combined_T_test.txt
