In [8]:
import re
import pandas as pd

# Open and read the file
with open('Tswana_sentence.txt', 'r', encoding='utf-8', errors='replace') as f:
    text = f.read()

# Print some info about the raw text
print(f"Text length: {len(text)} characters")
print(f"First 100 characters: {text[:100]}")
print(f"Number of periods in text: {text.count('.')}")

# A better regex for Tswana text that handles sentence boundaries
sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$|(?<=["""])\s*(?=[A-Z])', text)

# Clean up sentences
sentences = [s.strip() for s in sentences if s.strip()]

# Print some info about what we found
print(f"Found {len(sentences)} sentences")

# Create a list of dictionaries with sentence ID and text
data = []
for i, sentence in enumerate(sentences, 1):
    sent_id = f"AWI-Tswana_{i}"
    data.append({"sent_id": sent_id, "text": sentence})

# Convert the list to a DataFrame
df = pd.DataFrame(data)

# Display information about the DataFrame
print(f"\nDataFrame created with {len(df)} rows")
print("\nFirst 3 rows:")
print(df.head(3))
print("\nLast 3 rows:")
print(df.tail(3))

# Print the length of the DataFrame
print(f"\nTotal number of sentences: {len(df)}")

# Export the DataFrame to a CSV file
df.to_csv('Tswana_sentences.csv', index=False)
print("\nSaved to Tswana_sentences.csv")

Text length: 6534 characters
First 100 characters: ��M A S U N G A :   B a g w e b i   b a n g w e   l e   m a p o l o t i k i   a   m o t s e   w a   
Number of periods in text: 27
Found 1 sentences

DataFrame created with 1 rows

First 3 rows:
        sent_id                                               text
0  AWI-Tswana_1  ��M A S U N G A :   B a g w e b i   b a n g w ...

Last 3 rows:
        sent_id                                               text
0  AWI-Tswana_1  ��M A S U N G A :   B a g w e b i   b a n g w ...

Total number of sentences: 1

Saved to Tswana_sentences.csv


In [12]:
import re

input_file = 'Tswana_sentence.txt'
output_file = 'Tswana_conll.txt'

# Try correct encoding
with open(input_file, 'r', encoding='utf-16', errors='replace') as f:
    text = f.read()

# Split into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

with open(output_file, 'w', encoding='utf-8') as f:
    sent_id = 1
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        tokens = sentence.split()
        f.write(f"# sent_id = {sent_id}\n")
        f.write(f"# text = {sentence}\n")
        for idx, token in enumerate(tokens, 1):
            # Write 10 columns per line (word-level annotation)
            f.write(f"{idx}\t{token}\t_\t_\t_\t_\t_\t_\t_\t_\n")
        f.write('\n')  # Empty line between sentences
        sent_id += 1

print(f"Data converted to CoNLL-U format and saved to {output_file}")


Data converted to CoNLL-U format and saved to Tswana_conll.txt


In [10]:
print(type(text))

<class 'str'>
