In [1]:
class Preprocessor:
  """Configurable text-cleaning pipeline for a CSV file with a ``text`` column.

  The instance is callable: ``Preprocessor(path)()`` reads the CSV at ``path``,
  applies the enabled cleaning steps to the ``text`` column in a fixed order
  and returns the resulting DataFrame.

  The individual ``*_method`` helpers come in two flavours:

  * helpers that take a DataFrame either modify it in place (returning
    ``None``) or return a filtered copy;
  * helpers that take a single string return the transformed string.

  NOTE(review): ``remove_stopwords_method`` reads a module-level collection
  named ``stops`` that is not defined in this class -- it must be provided by
  the surrounding module/notebook.
  """

  def __init__(self, path):
    """Store the CSV path and load the spaCy English pipeline.

    The dependency parser and NER components are disabled because only
    part-of-speech tags and lemmas are used (see ``lemmatization_method``).
    """
    self.path = path
    self.nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  # Read the data
  def read_file(self):
    """Read the CSV at ``self.path`` and return it as a DataFrame."""
    return pd.read_csv(self.path)

  # Remove rows with missing values
  def drop_missing_vals(self, df):
    """Drop, in place, every row of ``df`` that has a missing value in any column."""
    df.dropna(axis=0, how='any', inplace=True)

  # Identify long enough reviews
  def mark_long_reviews_method(self, df):
    """Keep only the rows whose text has between 20 (inclusive) and 100 (exclusive) words.

    A ``Num_words_text`` column holding the whitespace-separated word count
    is added to ``df`` in place; the filtered rows are returned as a new
    DataFrame.
    """
    df['Num_words_text'] = df['text'].apply(lambda x: len(str(x).split()))
    mask = (df['Num_words_text'] < 100) & (df['Num_words_text'] >= 20)
    # Return an independent copy so that later column assignments do not
    # operate on a slice of the original frame (SettingWithCopyWarning).
    return df[mask].copy()

  # Remove punctuation
  def remove_punctuation_method(self, text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return text.translate(table)

  # Remove newline characters
  def remove_newline_characters_method(self, df):
    """Replace, in place, every newline in ``df['text']`` with a single space."""
    # Literal replacement; ``regex`` is passed explicitly because its default
    # differs between pandas versions.
    df['text'] = df['text'].str.replace('\n', ' ', regex=False)

  # Remove non alphanumeric characters
  def remove_nonalphanumeric_characters_method(self, df):
    """Replace, in place, every character of ``df['text']`` that is not an
    ASCII letter, a digit or whitespace with a single space.
    """
    # ``regex=True`` is required: without it recent pandas versions treat the
    # pattern as a literal string and nothing is ever replaced.
    df['text'] = df['text'].str.replace(r'[^a-zA-Z0-9\s]', ' ', regex=True)

  # Lowercase the text
  def lowercase_text_method(self, df):
    """Lowercase ``df['text']`` in place."""
    df['text'] = df['text'].str.lower()

  # Normalize whitespaces
  def normalize_whitespace_method(self, text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading/trailing whitespace is collapsed to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', text)

  # Remove stopwords
  def remove_stopwords_method(self, text: str) -> str:
    """Return ``text`` without the whitespace-separated words found in ``stops``.

    ``stops`` is a module-level name defined outside this class.
    """
    return ' '.join(word for word in text.split() if word not in stops)

  # Lemmatization
  def lemmatization_method(self, text, allowed_postags=('NOUN', 'ADJ', 'ADV', 'PRON', 'PROPN')):
    """Lemmatize ``text`` with spaCy, keeping only tokens with an allowed POS tag.

    Args:
      text: the string to lemmatize (an iterable of string fragments is also
        accepted and concatenated without a separator).
      allowed_postags: coarse-grained POS tags (``token.pos_``) to keep.

    Returns:
      The lemmas of the kept tokens joined by single spaces.
    """
    # ``''.join`` is a no-op for a plain string.
    doc = self.nlp(''.join(text))
    lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return ' '.join(lemmas)

  # Stemming
  def stemming_method(self, text: str) -> str:
    """Return ``text`` with every whitespace-separated word Porter-stemmed."""
    # One stemmer per call instead of one per word.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word).strip() for word in text.split())

  # Remove words with digits
  def remove_digit_words_method(self, text: str) -> str:
    """Delete every word of ``text`` that contains at least one digit.

    Only the word itself is removed; the surrounding whitespace is left in
    place.
    """
    return re.sub(r'\b\w*\d\w*\b', '', text)

  # Remove short words
  def remove_short_words_method(self, text: str) -> str:
    """Return ``text`` keeping only the words longer than two characters."""
    return ' '.join(word for word in text.split() if len(word) > 2)

  # Remove short texts
  def remove_short_texts_method(self, df, length=5):
    """Return a copy of ``df`` keeping only rows whose text has more than ``length`` words."""
    word_counts = df['text'].map(lambda x: len(x.split()))
    # Copy so that the caller can safely modify the result in place.
    return df[word_counts > length].copy()

  def __call__(self,
               drop_missing=True,
               mark_long_reviews=False,
               remove_punctuation=True,
               remove_newline_characters=True,
               remove_nonalphanumeric_characters=True,
               lowercase_text=True,
               normalize_whitespace=True,
               remove_stopwords=True,
               lemmatize=True,
               stem=True,
               remove_digit_words=False,
               remove_short_words=False,
               remove_short_texts=True):
    """Read the CSV and run the enabled preprocessing steps in a fixed order.

    Each boolean flag enables the step of the same name. Steps run in the
    order of the parameters; whitespace normalization runs a second time
    after the word-removal steps because those can leave extra spaces
    behind.

    Returns:
      The processed DataFrame with a fresh 0..n-1 index.
    """
    df = self.read_file()

    if drop_missing:
      self.drop_missing_vals(df)

    if mark_long_reviews:
      df = self.mark_long_reviews_method(df)

    if remove_punctuation:
      df['text'] = df['text'].apply(self.remove_punctuation_method)

    if remove_newline_characters:
      self.remove_newline_characters_method(df)

    if remove_nonalphanumeric_characters:
      self.remove_nonalphanumeric_characters_method(df)

    if lowercase_text:
      self.lowercase_text_method(df)

    if normalize_whitespace:
      df['text'] = df['text'].map(self.normalize_whitespace_method)

    if remove_stopwords:
      df['text'] = df['text'].apply(self.remove_stopwords_method)

    if lemmatize:
      df['text'] = df['text'].map(self.lemmatization_method)

    if stem:
      df['text'] = df['text'].map(self.stemming_method)

    if remove_digit_words:
      df['text'] = df['text'].apply(self.remove_digit_words_method)

    if remove_short_words:
      df['text'] = df['text'].map(self.remove_short_words_method)

    # Second pass: the removal steps above can leave runs of spaces.
    if normalize_whitespace:
      df['text'] = df['text'].map(self.normalize_whitespace_method)

    if remove_short_texts:
      df = self.remove_short_texts_method(df)

    df.reset_index(inplace=True, drop=True)

    return df