In [5]:
import numpy as np

import re
import string
from xml.sax.saxutils import unescape

import demoji
import numpy as np
from bs4 import BeautifulSoup
from unidecode import unidecode

In [33]:
class TweetPreprocessor:
    """Clean raw tweet text through a configurable chain of string transforms.

    Each step is a callable taking a tweet string and returning a string.
    ``preprocess`` applies the chain to the ``text`` attribute of every tweet
    in every feed and stores the result on ``self.preprocessed_X``; the
    ``get_*_dataset`` methods expose that result as NumPy arrays.
    """

    # Patterns are compiled once at class creation instead of per tweet.
    _HASHTAG_RE = re.compile(r"#\S*")
    _URL_RE = re.compile(r"https?://\S*")
    _USER_RE = re.compile(r"@\S*")

    # Characters that survive ``remove_punctuation``: printable ASCII minus
    # punctuation, except '#', which is kept so the '#HASHTAG#'-style
    # placeholders survive until ``replace_tags`` runs.
    _KEPT_CHARS = frozenset(string.printable) - (
        frozenset(string.punctuation) - {"#"}
    )

    def __init__(self, preprocess_funcs=None):
        """Build a preprocessor.

        Args:
            preprocess_funcs: Optional list of ``str -> str`` callables applied
                in order to every tweet. When ``None``, the default pipeline
                defined below is used.
        """
        # Initialised here so the dataset getters return an empty array rather
        # than raising AttributeError when called before ``preprocess``.
        self.preprocessed_X = []

        if preprocess_funcs is None:
            # Order matters:
            #  * emojis are described before transliteration, which would
            #    otherwise mangle them;
            #  * accented characters are transliterated BEFORE
            #    ``remove_punctuation``, because that step drops every
            #    non-ASCII character and would delete them instead;
            #  * placeholders are rewritten to '[tag]' etc. only after
            #    punctuation removal, so the brackets are not stripped.
            self.preprocess_funcs = [
                self.tag_indicators,
                self.replace_xml_and_html,
                self.replace_emojis,
                self.replace_accented_chars,
                self.remove_punctuation,
                self.replace_tags,
                self.remove_hashtag_chars,
                self.remove_extra_spacing,
            ]
        else:
            self.preprocess_funcs = preprocess_funcs

    def preprocess(self, X):
        """Preprocess a dataset and store it on ``self.preprocessed_X``.

        Args:
            X: Iterable of tweet feeds; each feed is an iterable of objects
                exposing the raw tweet string as a ``text`` attribute.
        """
        self.preprocessed_X = [
            [self._process_single_tweet(tweet.text) for tweet in tweet_feed]
            for tweet_feed in X
        ]

    def _process_single_tweet(self, tweet):
        """Run one tweet string through every configured step, in order."""
        for step in self.preprocess_funcs:
            tweet = step(tweet)
        return tweet

    def tag_indicators(self, tweet: str) -> str:
        """Replace hashtags, URLs and user mentions with placeholders.

        The placeholders are '#HASHTAG#', '#URL#' and '#USER#'. Hashtags are
        substituted first; a URL containing '#' is still collapsed into a
        single '#URL#' because the URL pattern consumes the whole
        whitespace-delimited token.
        """
        tweet = self._HASHTAG_RE.sub("#HASHTAG#", tweet)
        tweet = self._URL_RE.sub("#URL#", tweet)
        return self._USER_RE.sub("#USER#", tweet)

    def replace_xml_and_html(self, tweet: str) -> str:
        """Unescape XML entities (&amp; &lt; &gt;) and strip HTML tags."""
        unescaped = unescape(tweet)
        return BeautifulSoup(unescaped, features="lxml").get_text()

    def remove_punctuation(self, tweet: str) -> str:
        """Drop punctuation (except '#') and any non-printable-ASCII character."""
        kept = self._KEPT_CHARS
        return "".join(c for c in tweet if c in kept)

    def replace_emojis(self, tweet: str) -> str:
        """Replace emojis with their ':'-delimited textual description."""
        return demoji.replace_with_desc(tweet, ":")

    def replace_tags(self, tweet: str) -> str:
        """Rewrite '#HASHTAG#', '#URL#', '#USER#' as '[tag]', '[url]', '[user]'."""
        tweet = tweet.replace("#HASHTAG#", "[tag]")
        tweet = tweet.replace("#URL#", "[url]")
        tweet = tweet.replace("#USER#", "[user]")
        return tweet

    def remove_hashtag_chars(self, tweet: str) -> str:
        """Remove every remaining '#' character."""
        return tweet.replace("#", "")

    def replace_accented_chars(self, tweet: str) -> str:
        """Transliterate accented / non-ASCII characters to ASCII."""
        return unidecode(tweet)

    def remove_extra_spacing(self, tweet: str) -> str:
        """Collapse whitespace runs to single spaces and trim both ends."""
        return " ".join(tweet.split())

    def get_individual_tweets_dataset(self) -> np.ndarray:
        """Return the preprocessed tweets as an array, one row per feed.

        When every feed holds the same number of tweets the result is a 2-D
        string array. When feed lengths differ, a 1-D object array of
        per-feed lists is returned instead, since NumPy refuses to build a
        ragged array implicitly.
        """
        feeds = self.preprocessed_X
        if len({len(feed) for feed in feeds}) > 1:
            return np.asarray(feeds, dtype=object)
        return np.asarray(feeds)

    def get_tweet_feed_dataset(self) -> np.ndarray:
        """Return one space-joined string per feed."""
        return np.asarray([" ".join(tweet_feed) for tweet_feed in self.preprocessed_X])