In [12]:
# Import the paragoge normalizer from the seram_tokenizer package
from seram_tokenizer.paragog_normaliser import GeserParagogNormalizer

# Example usage of the GeserParagogNormalizer.
# The input is a list of sentences (a single sentence here).
text = ['aku fas motor a tura maniwa ra']

# Initialize the normalizer with the list of sentences
normalizer = GeserParagogNormalizer(text)

# Normalize the text; the recorded output shows the detached "a" / "ra"
# particles joined onto the preceding word ("motor a" -> "motora").
normalized_text = normalizer.normalize()

# Print the results
# NOTE(review): the second label reads "Tokens" although the value printed is
# the result of normalize(), not a tokenizer output — consider renaming it.
print(f"Original text: {text}")
print(f"Tokens: {normalized_text}")

Original text: ['aku fas motor a tura maniwa ra']
Tokens: ['aku fas motora tura maniwara']


In [7]:
# Import the tokenizer from the seram_tokenizer package
from seram_tokenizer import SeramTokenizer

# Sample sentence in the Seram language (a plain string, with punctuation)
text = "aku nugu ngasana habiba, aku atamari wanu karay."

# Initialize the tokenizer with the sentence
tokenizer = SeramTokenizer(text)

# Perform tokenization; in the recorded output the comma and the final
# full stop come back as separate tokens.
tokens = tokenizer.tokenize()

# Print the original sentence next to the resulting token list
print(f"Original text: {text}")
print(f"Tokens: {tokens}")

Original text: aku nugu ngasana habiba, aku atamari wanu karay.
Tokens: ['aku', 'nugu', 'ngasana', 'habiba', ',', 'aku', 'atamari', 'wanu', 'karay', '.']


In [None]:
# Disabled cell (kept for reference): would read 'dataset_seram_geser.csv',
# rename its columns to Indonesian/Geser, keep the Geser sentences that have
# at least two whitespace-separated words, and run them through
# GeserParagogNormalizer.
# import pandas as pd
# dataset = 'dataset_seram_geser.csv'
# df = pd.read_csv(dataset)
# df.columns = ['Indonesian', 'Geser']
# text = df['Geser'].tolist()
# filtered_text = [s for s in text if len(s.split()) >= 2]
# normalizer = GeserParagogNormalizer(filtered_text)
# text_normalizer = normalizer.normalize()

In [34]:
import re
# Quick experiment: find every run that starts with "na" followed by at least
# one word character in a sample sentence.
text = "i natagi bua sikolara"
# Raw string: in a normal string literal "\w" is an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12). The compiled
# pattern is unchanged. Note the pattern is unanchored, so it also matches
# "na..." in the middle of a word.
pattern = re.compile(r'na\w+')
matches = pattern.findall(text)
print(matches)

['natagi']


In [81]:
import re

# Read the word list from the dictionary file, one entry per line
with open('seram_tokenizer/data/geser_word.txt', 'r', encoding='utf-8') as file:
    word_list = file.read().splitlines()

# Regex for words ending in 'ra' (with at least one word character before it)
ra_suffix_pattern = re.compile(r'\w+ra$')

# Keep only the entries that consist of exactly one word
single_word_entries = [entry for entry in word_list if len(entry.split()) == 1]

# Dictionary entries that end in 'ra'
words_ending_with_ra = list(filter(ra_suffix_pattern.search, single_word_entries))

# Sample sentence to test against
text = "aku fas motor a tura maniwara"
text_split = text.split()

# Tokens of the sample sentence that match the 'ra' suffix pattern
ra_words = list(filter(ra_suffix_pattern.search, text_split))

# Keep the tokens that are not among the dictionary entries ending in 'ra'
final_words = [token for token in ra_words if token not in words_ending_with_ra]
final_words

['tura', 'maniwara']

In [94]:
# Same sample sentence, lower-cased before it is split into tokens
text = 'aku fas motor a tura maniwara'
lower = text.lower()
text_split = lower.split()
# Select the tokens matched by ra_suffix_pattern (compiled in an earlier cell)
ra_words = list(filter(ra_suffix_pattern.search, text_split))
ra_words

['tura', 'maniwara']

In [95]:

import os
from typing import List, Set
import pkg_resources
import re

# Inspired by the implementation at https://github.com/AbdullahAlabbas/Wordle/blob/main/play_wordle.py

def load_word_set(file_path: str) -> Set[str]:
    """
    Read a UTF-8 text file and collect its non-blank lines into a set.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Duplicates collapse because the result is
    a set, which also gives fast membership tests.

    Args:
        file_path (str): Location of the word file, one entry per line

    Returns:
        Set[str]: The distinct, stripped entries found in the file

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``
    """
    if os.path.exists(file_path):
        entries: Set[str] = set()
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    entries.add(cleaned)
        return entries

    raise FileNotFoundError(f"File not found at: {file_path}")


# Resolve the data files through pkg_resources, relative to this module's name
DICTIONARY_FILE_PATH = pkg_resources.resource_filename(
    __name__, 'seram_tokenizer/data/geser_word.txt'
)
VOCAL_FILE_PATH = pkg_resources.resource_filename(
    __name__, 'seram_tokenizer/data/vocal.txt'
)
CONSONANT_FILE_PATH = pkg_resources.resource_filename(
    __name__, 'seram_tokenizer/data/consonant.txt'
)

# Start from empty sets so every name is bound even if loading fails below
DICTIONARY_WORDS: Set[str] = set()
VOCAL_LETTERS: Set[str] = set()
CONSONANT_LETTERS: Set[str] = set()

# Fill the sets from disk. Failures are reported with print() instead of being
# raised; the loads run in order, so any set after the failing one keeps its
# empty default.
try:
    DICTIONARY_WORDS = load_word_set(DICTIONARY_FILE_PATH)
    VOCAL_LETTERS = load_word_set(VOCAL_FILE_PATH)
    CONSONANT_LETTERS = load_word_set(CONSONANT_FILE_PATH)
except FileNotFoundError as err:
    print(f"Error: {err}. Please ensure all necessary files exist.")
except Exception as err:
    print(f"An unexpected error occurred: {err}")

# Suffix patterns, compiled once: words ending in 'ra' and words ending in 'a'
RA_PATTERN = re.compile(r'\w+ra$')
A_PATTERN = re.compile(r'\w+a$')

# Dictionary entries without a space, i.e. single-word entries (computed once)
SINGLE_WORD_DICTIONARY_ENTRIES: Set[str] = {
    entry for entry in DICTIONARY_WORDS if ' ' not in entry
}

# Single-word entries whose spelling ends in 'ra'
LEMMA_WITH_RA: Set[str] = {
    lemma for lemma in SINGLE_WORD_DICTIONARY_ENTRIES if lemma.endswith('ra')
}

# Single-word entries whose spelling ends in 'a' (a superset of LEMMA_WITH_RA)
LEMMA_WITH_A: Set[str] = {
    lemma for lemma in SINGLE_WORD_DICTIONARY_ENTRIES if lemma.endswith('a')
}

In [97]:
# Display the single-word dictionary entries that end in 'ra'
LEMMA_WITH_RA

{'arumbaura',
 'asara',
 'balara',
 'bora',
 'butira',
 'cara',
 'dadafira',
 'datara',
 'gambara',
 'gambira',
 'gembira',
 'inu-inura',
 'ira',
 'jahera',
 'jambura',
 'juara',
 'kayara',
 'lebara',
 'madiara',
 'malura',
 'manira',
 'matara',
 'mera',
 'mumura',
 'naira',
 'nanara',
 'natara',
 'nyira',
 'odi-odira',
 'pera',
 'rantira',
 'rara',
 'salompara',
 'sapatura',
 'sarasara',
 'sawara',
 'sibura',
 'sikarura',
 'sinelara',
 'sira',
 'sukara',
 'tabara',
 'tetewara',
 'tinira',
 'tinumura',
 'togira',
 'tonsoara',
 'topira',
 'udara',
 'yayaira'}