In [1]:
# Raw data preprocessing

In [2]:
"""
This cell defines helpers that transform escaped Unicode into real Unicode characters for tokenization
"""
import re
import string


def _process_unicode(m):
    '''process(m) -> Unicode code point

    m is a regular expression match object that has groups below:
     1: high Unicode surrogate 4-digit hex code d800-dbff
     2: low  Unicode surrogate 4-digit hex code dc00-dfff
     3: None
    OR
     1: None
     2: None
     3: Unicode 4-digit hex code 0000-d700,e000-ffff
    '''
    if m.group(3) is None:
        # Construct code point from UTF-16 surrogates
        hi = int(m.group(1),16) & 0x3FF
        lo = int(m.group(2),16) & 0x3FF
        cp = 0x10000 | hi << 10 | lo
    else:
        cp = int(m.group(3),16)
    return chr(cp)

def process_unicode(m):
    """Replace escaped Unicode sequences in a string with the real characters.

    Despite its name, ``m`` is the input string (it is passed to ``re.sub``
    as the text to search), not a match object. Every occurrence of a
    literal backslash, the letter ``u`` and four hex digits is replaced; a
    high surrogate immediately followed by a low surrogate is combined into
    one character. The pattern only matches lowercase hex digits.

    Returns the string with all matched escapes substituted.
    """
    # The per-match callback must be the helper `_process_unicode`; passing
    # this function itself made re.sub call it with a match object, which
    # then failed inside the nested re.sub.
    return re.sub(
        r'\\u(d[89ab][0-9a-f]{2})\\u(d[cdef][0-9a-f]{2})|\\u([0-9a-f]{4})',
        _process_unicode,
        m,
    )

In [5]:
"""
This cell defines the function that removes punctuation and digits from a string
"""
def remove_punct(text):
    """Strip ASCII punctuation and ASCII digits from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every character from ``string.punctuation`` and every
        digit 0-9 deleted. Whitespace and non-ASCII characters are kept.
    """
    # One translate pass deletes both character sets; it replaces the former
    # per-character membership scan followed by a separate regex substitution.
    deletions = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(deletions)

In [8]:
from sklearn.base import TransformerMixin, BaseEstimator
import preprocessor as p


class RawDataTokenizer(TransformerMixin, BaseEstimator):
    """Transformer that cleans the text column of a DataFrame.

    ``transform`` applies these steps to the column named ``text_col``:

    1. if ``unescape_unicode`` is true, unescape Unicode via ``process_unicode``;
    2. run ``p.tokenize`` or ``p.clean`` (from the ``preprocessor`` module
       imported as ``p``), chosen by ``clean_type``;
    3. if ``remove_punct`` is true, strip punctuation and digits via the
       module-level ``remove_punct`` function;
    4. stemming is not implemented yet; the flag is accepted but has no effect.

    Parameters
    ----------
    unescape_unicode : bool, default True
    clean_type : str, default "Tokenize"
        ``"tokenize"`` or ``"clean"``; compared case-insensitively.
    remove_punct : bool, default True
    stemming : bool, default True
    text_col : str, default "Text"
        Name of the column holding the raw text.
    """

    def __init__(self, unescape_unicode=True, clean_type="Tokenize", remove_punct=True, stemming=True, text_col="Text"):
        # BaseEstimator.get_params()/clone()/repr() look each constructor
        # argument up as an attribute of the same name, so the values are
        # stored unmodified under their public names.
        self.unescape_unicode = unescape_unicode
        self.clean_type = clean_type
        self.remove_punct = remove_punct
        self.stemming = stemming
        self.text_col = text_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose text column has been cleaned.

        Raises
        ------
        ValueError
            If ``clean_type`` is neither "tokenize" nor "clean"
            (case-insensitive).
        """
        # Validate the configuration before touching any data. The comparison
        # is case-insensitive so the constructor default "Tokenize" is valid.
        mode = str(self.clean_type).lower()
        if mode == "tokenize":
            cleaner = p.tokenize
        elif mode == "clean":
            cleaner = p.clean
        else:
            raise ValueError(f"Invalid clean_type: {self.clean_type}")

        # Work on a copy so the caller's DataFrame is not mutated.
        X = X.copy()
        text = X[self.text_col]

        if self.unescape_unicode:
            text = text.apply(process_unicode)

        text = text.apply(cleaner)

        if self.remove_punct:
            # `remove_punct` here is the module-level function; the flag of
            # the same name lives on `self`.
            text = text.apply(remove_punct)

        if self.stemming:
            # TODO put stemming here
            pass

        X[self.text_col] = text
        return X
    
class TweetLengthExtractor(TransformerMixin, BaseEstimator):
    """Placeholder transformer; the extraction logic is still a TODO.

    ``fit`` returns the instance unchanged and ``transform`` currently
    returns ``None``.
    """

    def __init__(self):
        """Takes no configuration."""

    def fit(self, X, y=None):
        """No fitting is performed; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Not implemented yet: always returns ``None``."""
        # TODO
        return None