In [None]:
import re
import copy

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
def remove_links(text, regex=r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'):
    """Return ``text`` with URLs removed.

    The work is done in three passes:

    1. Every ``http``/``https`` occurrence is rewritten to ``" https"`` so
       that a link glued to the preceding word gets a word boundary in
       front of it and can be matched by ``regex``.
    2. Everything matched by ``regex`` is deleted.
    3. Any ``http``/``https`` fragment still present (for example a link
       that was cut off right after the scheme) is deleted.

    Note that passes 1 and 3 act on every ``http`` substring, including
    ones that are not part of a link.

    Args:
        text: The string to clean.
        regex: Pattern describing a link. The default matches links that
            start with ``http(s)://``, ``www.`` or a bare ``domain.tld/``.

    Returns:
        A new string without the matched links.
    """
    # The default pattern previously had stray literal spaces before the "/"
    # of the bare-domain alternative, so "example.com/page" never matched.
    spaced = re.sub("https|http", " https", text)
    without_links = re.sub(regex, "", spaced)
    return re.sub("https|http", "", without_links)

In [None]:
def remove_numbers_special_characters(text, regex="[^a-zA-Z@#]"):
    """Replace every character matched by ``regex`` with a single space.

    With the default pattern, anything that is not an ASCII letter, ``@``
    or ``#`` is replaced. One space is emitted per matched character, so
    the length of the string is preserved.
    """
    unwanted = re.compile(regex)
    return unwanted.sub(" ", text)

In [None]:
def remove_empty_hashtags(words):
    """Return a new list without the tokens that are exactly ``"#"``."""
    kept = []
    for token in words:
        if token == "#":
            continue
        kept.append(token)
    return kept

In [None]:
def remove_rt(words):
    """Return a new list without the tokens that are exactly ``"rt"``.

    The comparison is case-sensitive, so only the lowercase form is dropped.
    """
    return list(filter(lambda token: token != "rt", words))

In [None]:
def remove_mentions(words):
    """Return a new list without any token that contains an ``@``."""
    kept = []
    for token in words:
        if "@" not in token:
            kept.append(token)
    return kept

In [None]:
def remove_stopwords(words, language="english"):
    """Return a new list without the stopwords of ``language``.

    Args:
        words: Iterable of tokens to filter.
        language: Name passed to ``stopwords.words`` to select the
            stopword list.

    Returns:
        The tokens of ``words`` that are not in the stopword list, in
        their original order.
    """
    # Build the lookup set once; doing it inside the comprehension condition
    # reloaded the stopword list and rebuilt the set for every single token.
    # The list is now loaded even when ``words`` is empty.
    stop_set = frozenset(stopwords.words(language))
    return [word for word in words if word not in stop_set]

In [None]:
def stem_words(words, stemmer):
    """Return the result of ``stemmer.stem`` for each token, in order.

    ``stemmer`` can be any object exposing a ``stem(word)`` method.
    """
    stemmed = []
    for token in words:
        stemmed.append(stemmer.stem(token))
    return stemmed

In [None]:
def get_users(words):
    """Return the tokens that begin with ``@``, keeping their order."""
    mentions = []
    for token in words:
        if token.startswith("@"):
            mentions.append(token)
    return mentions

In [None]:
def pre_processing_words(tweets, stemmer=PorterStemmer(), language="english"):
    """Turn each tweet into a list of cleaned, stemmed tokens.

    Args:
        tweets: Iterable of tweet strings.
        stemmer: Object with a ``stem(word)`` method applied to every
            remaining token.
        language: Stopword language forwarded to ``remove_stopwords``.

    Returns:
        A list with one token list per tweet, in input order.
    """

    def _clean(tweet):
        # Strip links first, while the punctuation they rely on still exists.
        text = remove_links(copy.copy(tweet))

        # Keep only letters, '@' and '#', then normalise to lower case.
        text = remove_numbers_special_characters(text).lower()

        # Split into a list of words.
        tokens = text.split()

        # Drop empty hashtags, "rt" markers and mentions, in that order.
        for token_filter in (remove_empty_hashtags, remove_rt, remove_mentions):
            tokens = token_filter(tokens)

        # Drop stopwords, then stem whatever is left.
        tokens = remove_stopwords(tokens, language=language)
        return stem_words(tokens, stemmer)

    return [_clean(tweet) for tweet in tweets]

In [None]:
def pre_processing_users(authors, tweets, keep_at_symbol=False):
    """Extract the ``@``-prefixed tokens found in each tweet.

    Args:
        authors: Sequence parallel to ``tweets``. Only its length is
            checked; the values themselves are not used.
        tweets: Sequence of tweet strings.
        keep_at_symbol: Accepted but not used by this implementation;
            the returned tokens always keep their leading ``@``.

    Returns:
        A list with one list of lowercase ``@``-prefixed tokens per tweet.

    Raises:
        ValueError: If ``authors`` and ``tweets`` differ in length.
    """
    if len(authors) != len(tweets):
        raise ValueError("As listas de Autores e Tweets devem ter o mesmo tamanho!")

    mentions_per_tweet = []
    for _, tweet in zip(authors, tweets):
        # Remove links.
        text = remove_links(copy.copy(tweet))

        # Keep only letters, digits, '_' and '@'.
        text = remove_numbers_special_characters(text, "[^a-zA-Z0-9_@]")

        # Lower-case, split into words and keep the ones starting with '@'.
        mentions_per_tweet.append(get_users(text.lower().split()))

    return mentions_per_tweet