In [2]:
from pathlib import Path
from typing import List
import nltk

In [3]:
class Phraser:
    """
    Represents a text corpus segmented into phrases.

    A 'phrase' is whatever ``_segment`` returns; it delegates to the standard
    nltk sentence tokenizer (``nltk.sent_tokenize``) for the configured language.

    Expects a UTF-8 encoded txt file. The whole file is read into memory at
    construction time, so the practical size limit is the available RAM.

    Attributes:
        language: Full language name handed to nltk (e.g. ``'spanish'``).
        raw_text: The loaded text. When ``normalize`` is true this holds the
            text *after* newlines were replaced by spaces, not the file's
            original content.
        phrases: The list of phrases produced by ``_segment``.
    """

    # Short language codes accepted by ``__init__`` mapped to the language
    # names that are passed to ``nltk.sent_tokenize``.
    _LANGUAGES = {'es': 'spanish', 'en': 'english', 'pt': 'portuguese'}

    def __init__(
        self,
        filehandle: str,
        normalize: bool = True,
        language: str = 'es',
    ):
        """
        Load, optionally normalize, and segment the text found at ``filehandle``.

        Args:
            filehandle: Path of the text file to read (passed straight to ``open``).
            normalize: When true, every newline in the text is replaced by a space
                before segmentation.
            language: One of the short codes ``'es'``, ``'en'`` or ``'pt'``.

        Raises:
            KeyError: If ``language`` is not one of the supported codes.
            OSError: If the file cannot be opened (raised by ``open``).
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # Validate the cheap argument first so that a typo in the language code
        # fails immediately, with a helpful message, instead of after the whole
        # file has already been read and normalized.
        if language not in self._LANGUAGES:
            supported = ', '.join(sorted(self._LANGUAGES))
            raise KeyError(
                f"Unsupported language code {language!r}; expected one of: {supported}"
            )
        self.language: str = self._LANGUAGES[language]

        self.raw_text: str = self._load_text(filehandle)
        self.raw_text = self._normalize(normalize)

        self.phrases: List[str] = self._segment()

    def _load_text(self, filehandle) -> str:
        """Return the full content of ``filehandle`` decoded as UTF-8."""
        with open(filehandle, 'r', encoding='utf-8') as f:
            return f.read()

    def _normalize(self, normalize) -> str:
        """
        Return ``self.raw_text`` with newlines replaced by single spaces.

        When ``normalize`` is falsy the text is returned unchanged.
        """
        if not normalize:
            return self.raw_text
        return self.raw_text.replace('\n', ' ')

    def _segment(self) -> List[str]:
        """
        Split ``self.raw_text`` into phrases using ``nltk.sent_tokenize``.

        NOTE(review): nltk needs its sentence-tokenizer data to be installed for
        the chosen language; confirm the environment provides it.
        """
        return nltk.sent_tokenize(self.raw_text, self.language)
