# Preprocessing

The function below preprocesses and normalises text for use in downstream NLP functions.
- Each step is run separately, so that steps can easily be commented out or further steps added as required.

In [None]:
# import required libraries
import re
import emoji
import contractions
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords
# Fetch the NLTK data that preprocess() relies on: the English stopword list
# and the Punkt tokenizer models loaded by word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.8.2 and later) load Punkt from the 'punkt_tab'
# package instead of 'punkt'; fetch both so word_tokenize() works on either.
nltk.download('punkt_tab')

def preprocess(text):

    '''
    Normalise a piece of raw text before nlp modelling.

    The steps run one after another, each rebinding ``text``, so a step can
    be commented out (or a new one inserted) without touching the others.

    Steps, in order:
      1. strip @mentions
      2. strip http/https URLs
      3. replace emojis with their text names
      4. expand contractions (e.g. I've > I have)
      5. drop English stopwords (case-insensitive)
      6. drop tokens made up only of punctuation
      7. lowercase

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''

    # remove twitter handles/mentions
    # (\B requires a non-word character, or the start of the text, before the
    # "@", so the "@example" in "user@example.com" is left alone)
    text = re.sub(r'\B@\w+', "", text)

    # remove url links
    text = re.sub(r'https?://\S+', "", text)

    # convert emojis to their text names (":name:" form); note that this
    # replaces each emoji with text rather than deleting it
    text = emoji.demojize(text)

    # remove contractions (e.g. I've > I have)
    text = contractions.fix(text)

    # remove stopwords
    token_list = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # the NLTK stopword list is lowercase and lowercasing is the last step,
    # so compare on the lowercased token to also catch "The", "I", "And", ...
    token_list = [
        token for token in token_list if token.lower() not in stop_words
    ]
    text = TreebankWordDetokenizer().detokenize(token_list)

    # remove punctuation
    token_list = word_tokenize(text)
    punctuation_chars = set(string.punctuation)
    # drop every token consisting solely of punctuation characters; a plain
    # `token in string.punctuation` substring test misses multi-character
    # tokens such as "...", "--" or the "``" / "''" quote tokens
    token_list = [
        token for token in token_list
        if not all(char in punctuation_chars for char in token)
    ]
    text = TreebankWordDetokenizer().detokenize(token_list)

    # convert to lowercase
    text = text.lower()

    return text