In [1]:
import pandas as pd


# Step 1: Load Wordlist (skipping first 25 lines, taking column 2 only)
def load_wordlist(filepath, skip_lines=25):
    wordlist = set()
    try:
        with open(filepath, encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if idx < skip_lines:
                    continue  # skip initial lines
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    word = parts[1].strip().lower()
                    if word.isalpha():  # Only add alphabetic words
                        wordlist.add(word)
        print(f"Loaded {len(wordlist)} words from {filepath}.")
    except FileNotFoundError:
        print(f"Error: Wordlist file not found at {filepath}")
    return wordlist

# Step 2: Word Segmentation Function (Greedy Max Matching)
def segment_word(text, wordlist):
    """
    Segments a string into words using the Maximum Matching algorithm with a given wordlist.
    """
    text = text.lower()
    i = 0
    result = []
    while i < len(text):
        match = None
        # Check from longest possible match to shortest
        for j in range(len(text), i, -1):
            word = text[i:j]
            if word in wordlist:
                match = word
                result.append(match)
                i += len(word)
                break
        if not match:
            # No match found, treat as single char or unknown
            result.append(text[i])
            i += 1
    return ' '.join(result)

# Example Usage (similar to the __main__ block in other files)
if __name__ == "__main__":
    wordlist_file = 'tsn-za_web_2020_10K-words.txt'
    wordlist = load_wordlist(wordlist_file)

    input_filename = "podcast_transcriptions_chunked_sorted_numbered+english.csv"
    output_csv_file = "podcast_transcriptions_chunked_sents_segmented.csv"

    # Load the input CSV
    try:
        df = pd.read_csv(input_filename)
        print(f"Loaded data from {input_filename}.")
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_filename}")
        exit()

    # Process each row in the 'transcription' column with the segment_word function
    # Ensure the 'transcription' column exists and handle potential NaN values
    if 'Transcription' in df.columns:
        df['Segmented_Transcription'] = df['Transcription'].apply(
            lambda x: segment_word(x, wordlist) if pd.notnull(x) else ''
        )
        print("Segmentation applied to 'transcription' column.")
    else:
        print("Error: 'transcription' column not found in the input CSV.")
        exit()


    # Save the processed DataFrame to a new CSV file
    df.to_csv(output_csv_file, index=False)
    print(f"Segmented data saved to {output_csv_file}.")

Loaded 17452 words from tsn-za_web_2020_10K-words.txt.
Loaded data from podcast_transcriptions_chunked_sorted_numbered+english.csv.
Segmentation applied to 'transcription' column.
Segmented data saved to podcast_transcriptions_chunked_sents_segmented.csv.


In [2]:
# %%
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("dsfsi/PuoBERTa-POS")
model = AutoModelForTokenClassification.from_pretrained("dsfsi/PuoBERTa-POS")

def split_sentences(text, max_length=30):
    """
    Splits text into sentences based on token length or specific keywords.
    """
    if pd.isna(text) or text.strip() == "":
        return []

    tokens = tokenizer.tokenize(text)
    sentences = []
    current = []

    for token in tokens:
        current.append(token)
        # Check if max length reached or token is a sentence boundary indicator
        if len(current) >= max_length or token.lower() in ['ke', 'mme', 'fa', 'jalo', 'le', 'ya', 'a', 'go', 'e']:
            sentences.append(tokenizer.convert_tokens_to_string(current).replace(" ##", "").strip())
            current = []

    # Add any remaining tokens as the last sentence
    if current:
        sentences.append(tokenizer.convert_tokens_to_string(current).replace(" ##", "").strip())

    return sentences

def process_csv_with_splitting(input_file, output_file):
    """
    Loads the segmented CSV, splits transcriptions into sentences,
    and saves the result to a new CSV.
    """
    try:
        df = pd.read_csv(input_file)
        print(f"Loaded data from {input_file}.")
    except FileNotFoundError:
        print(f"Error: Input CSV file not found at {input_file}")
        return

    # Ensure the 'Segmented_Transcription' column exists
    if 'Segmented_Transcription' in df.columns:
        # Apply the sentence splitting function to the 'Segmented_Transcription' column
        # Store the result as a list of sentences in a new column
        df['Split_Sentences'] = df['Segmented_Transcription'].apply(split_sentences)
        print("Sentence splitting applied to 'Segmented_Transcription' column.")
    else:
        print("Error: 'Segmented_Transcription' column not found in the input CSV.")
        return

    # Save the processed DataFrame to a new CSV file
    df.to_csv(output_file, index=False)
    print(f"Processed data with split sentences saved to {output_file}.")

if __name__ == "__main__":
    # Define input and output filenames
    input_csv_file = "podcast_transcriptions_chunked_sents_segmented.csv"
    output_csv_file = "podcast_transcriptions_chunked_sents_split.csv"

    # Process the CSV file
    process_csv_with_splitting(input_csv_file, output_csv_file)

# %%

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/877k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/332M [00:00<?, ?B/s]

Loaded data from podcast_transcriptions_chunked_sents_segmented.csv.
Sentence splitting applied to 'Segmented_Transcription' column.
Processed data with split sentences saved to podcast_transcriptions_chunked_sents_split.csv.


In [6]:
import re
import pandas as pd

class SetswanaLemmatizer:
    def __init__(self):
        # Initialize lookup tables for exceptions
        self.passive_exceptions = {'ungwa', 'wa', 'swa', 'nwa', 'hwa'}
        self.causative_exceptions = {'tataisa', 'gaisa', 'laisa', 'fisa'}
        self.applicative_exceptions = {'bela', 'sela', 'tlhatlhela'}
        self.reciprocal_exceptions = {'pana', 'gana', 'fapaana', 'rulagana'}
        self.neuter_passive_exceptions = {'sega', 'bega', 'anega', 'pega'}

        # Define transformation rules
        self.transformations = [
            self._remove_plural,
            self._remove_perfect_tense,
            self._remove_passive,
            self._remove_reciprocal,
            self._remove_applicative,
            self._remove_neuter_passive,
            self._remove_causative,
            self._remove_reversal,
            self._remove_reflexive,
            self._remove_object_markers,
            self._remove_iterative,
            self._fix_mood
        ]

        # Define perfect tense conversions
        self.perfect_conversions = {
            'etswe': 'lwa', 'otswe': 'lwa', 'utswe': 'lwa',
            'ditse': 'tsa', 'tswitse': 'tsa',
            'elle': 'aa',
            'ntse': 'nya',
            'tshtswe': 'tshwa',
            'sitswe': 'shwa', 'sitswe': 'swa',
            'tshtse': 'tsha',
            'ntswe': 'mapwa',
            'sitse': 'sa',
            'dile': 'la', 'tse': 'la',
            'lwe': 'wa',
            'ditswe': 'tswa', 'tswitswe': 'tswa', 'tsitswe': 'tswa',
            'ile': 'a',
            'nne': 'na',
            'nwe': 'nwa'
        }

        # Define reflexive transformations
        self.reflexive_transforms = {
            'a': 'ika',
            'e': 'ike',
            'i': 'iki',
            'o': 'iko',
            'u': 'iku',
            'w': 'ikw',
            'g': 'ikg',
            'b': 'ip',
            'l': 'it',
            'r': 'ith',
            's': 'itsh',
            'd': 'it',
            'h': 'iph',  # simplified - paper mentions more complex cases
            'f': 'iph'
        }

    def lemmatize(self, word):
        """
        Lemmatize a Setswana verb by applying transformation rules in sequence
        """
        original_word = word
        changed = True

        # Apply transformations until no more changes occur
        while changed:
            changed = False
            for transform in self.transformations:
                new_word = transform(word)
                if new_word != word:
                    word = new_word
                    changed = True
                    break  # restart transformations after each change

        return word if word != original_word else original_word

    def _remove_plural(self, word):
        """Remove plural suffix -ng"""
        if word.endswith('ng'):
            return word[:-2]
        return word

    def _remove_perfect_tense(self, word):
        """Remove perfect tense suffixes"""
        if word in self.passive_exceptions:
            return word

        for suffix, replacement in self.perfect_conversions.items():
            if word.endswith(suffix):
                return word[:-len(suffix)] + replacement

        # Special case for -ile (most common perfect tense)
        if word.endswith('ile'):
            return word[:-3] + 'a'

        return word

    def _remove_passive(self, word):
        """Remove passive suffixes"""
        if word in self.passive_exceptions:
            return word

        # Table I transformations from the paper
        passive_transforms = {
            'biwa': 'ba', 'jwa': 'ba',
            'fiwa': 'fa', 'swa': 'fa',
            'giwa': 'ga', 'gwa': 'ga',
            'piwa': 'pa', 'tswa': 'pa',
            'miwa': 'ma', 'ngwa': 'ma',
            'niwa': 'na', 'nwa': 'na',
            'nyiwa': 'nya', 'nywa': 'nya',
            'diwa': 'tsa', 'tswa': 'tsa',
            'tliwa': 'tlha', 'tlhwa': 'tlha',
            'tliwa': 'tla', 'tlhwa': 'tla',
            'tiwa': 'ta', 'twa': 'ta',
            'siwa': 'sa', 'swa': 'sa',
            'wiwa': 'wa',
            'wa': 'a'
        }

        for suffix, replacement in passive_transforms.items():
            if word.endswith(suffix):
                return word[:-len(suffix)] + replacement

        return word

    def _remove_causative(self, word):
        """Remove causative suffix -is-"""
        if word in self.causative_exceptions:
            return word

        if word.endswith('isha'):
            return word[:-4] + 'a'
        elif word.endswith('isa'):
            return word[:-3] + 'a'
        elif word.endswith('isisa'):  # intensity form
            return word[:-5] + 'a'

        return word

    def _remove_applicative(self, word):
        """Remove applicative suffix -el-"""
        if word in self.applicative_exceptions:
            return word

        if word.endswith('ela'):
            return word[:-3] + 'a'
        elif word.endswith('ele'):
            return word[:-3] + 'a'
        elif word.endswith('elwa'):
            return word[:-4] + 'a'

        return word

    def _remove_reciprocal(self, word):
        """Remove reciprocal suffix -an-"""
        if word in self.reciprocal_exceptions:
            return word

        if word.endswith('ana'):
            return word[:-3] + 'a'
        elif word.endswith('anya'):
            return word[:-4] + 'a'

        return word

    def _remove_neuter_passive(self, word):
        """Remove neuter-passive suffixes (-eg-, -al-, -agal-, -eseg-)"""
        if word in self.neuter_passive_exceptions:
            return word

        if word.endswith('ega'):
            return word[:-3] + 'a'
        elif word.endswith('ala'):
            return word[:-3] + 'a'
        elif word.endswith('agala'):
            return word[:-5] + 'a'
        elif word.endswith('esega'):
            return word[:-5] + 'a'

        return word

    def _remove_reversal(self, word):
        """Remove reversal suffix -olol-"""
        # As noted in the paper, most words with -olol- are basic forms
        # So we only handle specific cases that we know need transformation
        reversal_examples = {
            'bofolola': 'bofa',
            'kopolola': 'kopa'
        }

        return reversal_examples.get(word, word)

    def _remove_iterative(self, word):
        """Remove iterative suffix -ka-"""
        if 'kaka' in word:
            return re.sub(r'kaka$', '', word)
        elif word.endswith('ka'):
            return word[:-2]
        return word

    def _remove_reflexive(self, word):
        """Remove reflexive prefix i- with transformations"""
        if not word.startswith('i'):
            return word

        # Handle reflexive transformations from Table II
        for initial, prefix in self.reflexive_transforms.items():
            if word.startswith(prefix):
                # Remove the reflexive prefix and restore the original initial
                return initial + word[len(prefix):]

        # Simple case: just remove 'i' prefix
        if word.startswith('i'):
            return word[1:]

        return word

    def _remove_object_markers(self, word):
        """Remove object markers (first-person n-, third-person mo-)"""
        # First-person object marker n- becomes m- before certain consonants
        if word.startswith('m') and len(word) > 1:
            next_char = word[1]
            if next_char in {'p', 'b', 'ph', 'f'}:
                return 'b' + word[2:]  # n- becomes m- and original consonant was b/p/ph/f

        # Third-person object marker mo- contracted to m- and b- becomes -m
        if word.startswith('mm') and len(word) > 2:
            return 'b' + word[2:]  # e.g., mmetsa -> beta

        if word.startswith('n'):
            return word[1:]

        if word.startswith('mo'):
            return word[2:]

        return word

    def _fix_mood(self, word):
        """Fix mood by replacing -e with -a"""
        if word.endswith('e'):
            return word[:-1] + 'a'
        return word


def process_transcription(transcription, lemmatizer):
    """
    Process a transcription string with numbered sentences
    Returns a new string with lemmatized words
    """
    # Split into numbered sentences
    sentences = [s.strip() for s in transcription.split('\n') if s.strip()]

    processed_sentences = []
    for sentence in sentences:
        # Split into number and text (e.g., "1. e kitshedimosetso...")
        parts = sentence.split('.', 1)
        if len(parts) == 2:
            num_part, text_part = parts
            # Lemmatize each word in the text
            words = text_part.split()
            lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
            # Reconstruct the sentence
            processed_sentence = f"{num_part}. {' '.join(lemmatized_words)}"
            processed_sentences.append(processed_sentence)
        else:
            # If no number, just process the text
            words = sentence.split()
            lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
            processed_sentences.append(' '.join(lemmatized_words))

    return '\n'.join(processed_sentences)


def process_csv(input_file, output_file):
    """
    Process the CSV file, lemmatizing the transcription column
    """
    # Read the CSV file
    df = pd.read_csv(input_file)

    # Initialize lemmatizer
    lemmatizer = SetswanaLemmatizer()

    # Process each transcription from Split_Sentences

    df['Lemmatized_Transcription'] = df['Split_Sentences'].apply(
        lambda x: process_transcription(x, lemmatizer) if pd.notnull(x) else ''
    )

    # Save to new file
    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")


if __name__ == "__main__":
    # Example usage with the podcast transcriptions file
    input_filename = "podcast_transcriptions_chunked_sents_split.csv"
    output_filename = "lemmatized_transcriptions.csv"

    process_csv(input_filename, output_filename)

    print("\nTest Cases:")
    lemmatizer = SetswanaLemmatizer()
    test_words = [
        'supiwa',    # passive of supa (point)
        'supisa',    # causative of supa
        'supisisa',  # intensity of supa
        'supela',    # applicative of supa
        'supana',    # reciprocal of supa
        'ikopa',     # reflexive of kopa (ask)
        'iphenya',   # reflexive of fenya (win)
        'robakaka',  # iterative of roba (break)
        'bofolola',  # reversal of bofa (tie)
        'rapelang',  # plural of rapela (pray)
        'itshupile', # perfect reflexive of supa
        'mmetsa',    # third-person object marker of beta (ask)
        'mpona',     # first-person object marker of bona (see)
        'palame'     # mood form of palama (climb)
    ]

    print("Setswana Verb Lemmatizer Test Cases")
    print("=" * 40)
    for word in test_words:
        lemma = lemmatizer.lemmatize(word)
        print(f"{word:15} → {lemma}")


Processed data saved to lemmatized_transcriptions.csv

Test Cases:
Setswana Verb Lemmatizer Test Cases
supiwa          → supa
supisa          → supa
supisisa        → supa
supela          → supa
supana          → supa
ikopa           → opa
iphenya         → bhenya
robakaka        → roba
bofolola        → bofa
rapelang        → rapa
itshupile       → lshupa
mmetsa          → betsa
mpona           → bona
palame          → palama


In [7]:
def remove_stopwords(input_file, output_file, stopwords_file, num_stopwords=100):
    """
    Loads the processed CSV, removes top N stopwords from a file,
    and saves the result to a new CSV.

    Args:
        input_file (str): Path to the input CSV file (e.g., lemmatized_transcriptions.csv).
        output_file (str): Path to save the output CSV file.
        stopwords_file (str): Path to the stopwords text file (tswana_stopwords.txt).
        num_stopwords (int): The number of top stopwords to load from the file.
    """
    # Load the processed CSV
    df = pd.read_csv(input_file)

    # Load top N stopwords from the text file
    try:
        with open(stopwords_file, 'r', encoding='utf-8') as f:
            stopwords = [line.strip() for line in f][:num_stopwords]
    except FileNotFoundError:
        print(f"Error: Stopwords file not found at {stopwords_file}")
        return

    # Function to remove stopwords from a single string
    def remove_stopwords_from_text(text):
        if pd.isnull(text):
            return ""
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stopwords]
        return ' '.join(filtered_words)

    # Apply stop word removal to the lemmatized transcription column
    df['Lemmatized_Transcription_Normal'] = df['Lemmatized_Transcription'].apply(remove_stopwords_from_text)

    # Save the result to a new file
    df.to_csv(output_file, index=False)
    print(f"Data with stopwords removed saved to {output_file}")

if __name__ == "__main__":
    input_csv_file = "lemmatized_transcriptions.csv"
    output_csv_file = "lemmatized_transcriptions_nostopwords.csv"
    remove_stopwords(input_csv_file, output_csv_file, "tswana_stopwords.txt", num_stopwords=100)



Data with stopwords removed saved to lemmatized_transcriptions_nostopwords.csv


In [None]:
# prompt: Lemmatized_Transcription_Normal is like this 'e', 'kits h a dim o setso ko w d a u l lis a tswedi ', 'ko o so o y dan k ra leboga ea a tlola p on so kg a k ', 'go t l ho mn o k fi a h a fi t ha ra leke', 'k ho h l ea kapa ko a leba t hr baba a g loo ama eng', 'k bat h hona gona botsa thola aka', 'gora unibesiti star lan sh p a ta w '] i just want it as one sentenence

# Example of combining the list elements into a single string
lemmatized_transcription_list = ['e', 'kits h a dim o setso ko w d a u l lis a tswedi ', 'ko o so o y dan k ra leboga ea a tlola p on so kg a k ', 'go t l ho mn o k fi a h a fi t ha ra leke', 'k ho h l ea kapa ko a leba t hr baba a g loo ama eng', 'k bat h hona gona botsa thola aka', 'gora unibesiti star lan sh p a ta w ']

combined_sentence = ''.join(lemmatized_transcription_list)

combined_sentence
