In [23]:
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords
from langdetect import detect

from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import demoji

# Lookup table from two-letter language codes to lowercase English language
# names. process_text() feeds detect()'s result through this table and hands
# the resulting name to the stopword list and the Snowball stemmer.
language_map = {
    "ar": "arabic",
    "az": "azerbaijani",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "fi": "finnish",
    "fr": "french",
    "de": "german",
    "el": "greek",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "kk": "kazakh",
    "ne": "nepali",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sl": "slovene",
    "es": "spanish",
    "sv": "swedish",
    "tg": "tajik",
    "tr": "turkish",
}

In [24]:

class TextProcessor:
    """Normalize free text: clean it, tokenize it, drop stopwords and stem.

    The language of each text is detected at processing time and mapped
    through ``language_map`` to pick the stopword list and the stemmer.
    Texts whose language cannot be detected or is not supported are still
    cleaned and tokenized; only the language-specific steps are skipped.
    """

    def __init__(self, use_stemming=True, use_stopwords=True):
        """Configure the processor.

        Args:
            use_stemming: When False, tokens are not stemmed.
            use_stopwords: When False, stopwords are not removed.
        """
        self.use_stemming = use_stemming
        self.use_stopwords = use_stopwords
        self.reg_exp_punctuation = r'[^\w\s]'
        self.reg_exp_hashtags = r'#\w+'
        self.reg_exp_usernames = r'@\w+'
        self.control_char_pattern = r'[\x00-\x1F\x7F-\x9F]'
        # "https?" accepts exactly http/https; the dot after "www" is escaped
        # and anchored on a word boundary so that ordinary words containing
        # "www" are not swallowed.
        self.reg_exp_web_link_pattern = r'https?://\S+|\bwww\.\S+'

    def process_text(self, text):
        """Process a text by cleaning, tokenizing, removing stopwords and stemming.

        Args:
            text: The text to be processed.
        Returns:
            The processed text, re-assembled from the remaining tokens. An
            input with nothing left after cleaning yields an empty result
            instead of raising.
        """
        # Imported locally so this class does not depend on an extra
        # module-level import being present.
        from langdetect.lang_detect_exception import LangDetectException

        # Remove special characters
        text = self.clean_text(text)

        # Get the language. detect() raises when the text has no usable
        # features (empty string, digits only, ...); treat that as unknown.
        try:
            language = language_map.get(detect(text))
        except LangDetectException:
            language = None

        # Transform the document in a list of tokens
        tokenizer = RegexpTokenizer(r'\w+')
        word_tokens = tokenizer.tokenize(text)

        # Remove stopwords (if its flag is enabled)
        if self.use_stopwords:
            word_tokens = self.remove_stopwords(word_tokens, language)

        # Do stemming (if its flag is enabled)
        if self.use_stemming:
            word_tokens = self.stem_text(word_tokens, language)

        # Transform the list of tokens back into a document
        return TreebankWordDetokenizer().detokenize(word_tokens)

    def stem_text(self, tokens, language):
        """Stem a list of tokens with the Snowball stemmer of the given language.

        Args:
            tokens: List of tokens to be stemmed.
            language: Language name used to select the stemmer, or None.
        Returns:
            The stemmed tokens. If the language is None or not supported by
            the stemmer, the original tokens are returned unchanged.
        """
        if language is None:
            return tokens
        try:
            stemmer = SnowballStemmer(language)
        except ValueError:
            # Unsupported language: leave the tokens as they are.
            return tokens

        return [stemmer.stem(word) for word in tokens]

    def remove_stopwords(self, tokens, language):
        """Remove the stopwords of the given language from a list of tokens.

        Args:
            tokens: List of tokens to be processed.
            language: Language name used to select the stopword list, or None.
        Returns:
            The tokens that are not stopwords. If the language is None or no
            stopword list exists for it, the original tokens are returned
            unchanged.
        """
        if language is None:
            return tokens
        try:
            stop_words = set(stopwords.words(language))
        except (OSError, ValueError):
            # A language without a stopword file surfaces as OSError from the
            # corpus reader; ValueError is kept for backward compatibility.
            return tokens

        # Comparison is case-insensitive; the token itself is kept as-is.
        return [word for word in tokens if word.lower() not in stop_words]

    def _strip_markup(self, text):
        """Replace links, hashtags, usernames, control chars and punctuation by spaces.

        The alternatives are ordered from most to least specific. The
        single-character punctuation class must come last: it also matches
        '@', so placing it earlier would consume the '@' on its own and
        leave the username behind in the text.
        """
        combined_pattern = '|'.join((
            self.reg_exp_web_link_pattern,
            self.reg_exp_hashtags,
            self.reg_exp_usernames,
            self.control_char_pattern,
            self.reg_exp_punctuation,
        ))
        return re.sub(combined_pattern, " ", text)

    def clean_text(self, text):
        """Clean the text: lowercase it, blank out special sequences, remove emojis.

        Args:
            text: The text to be cleaned (converted with str() first).
        Returns:
            The lowercased text with links, hashtags, usernames, control
            characters, punctuation and emojis replaced by spaces, stripped
            of leading and trailing whitespace.
        """
        text = str(text).lower()

        # Replace special characters
        text = self._strip_markup(text)

        # Remove emoji
        text = demoji.replace(text, " ")

        return text.strip()