In [11]:
import nltk
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

In [12]:
# Removal of whitespace
def remove_whitespace(data : str):
    """Collapse every run of whitespace in ``data`` into a single space.

    The text is split on whitespace and the pieces are re-joined with one
    space, so leading and trailing whitespace is dropped as well.
    """
    words = data.split()
    return " ".join(words)
remove_whitespace("this   is s\
                    test")    

'this is s test'

In [13]:
# Remove urls
def remove_urls(text):
    """Strip URLs from ``text``.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character. Whitespace around the URL is left
    untouched, so a space that preceded the URL remains in the result.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
remove_urls("Check out this cool website! https://hyperperform.cs.up.ac.za/commits")

'Check out this cool website! '

In [14]:
# Word tokenization
def tokenize(data : str):
    """Split ``data`` into word and punctuation tokens via ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(data)
    return tokens
tokenize("I do really really hope this works!")

['I', 'do', 'really', 'really', 'hope', 'this', 'works', '!']

In [15]:
# Spelling Correction
# Lazily-built Speller instances keyed by language code, so the speller is
# constructed once per session instead of once per corrected word.
_speller_cache = {}

def correct_spelling(word : str):
    """Return ``word`` after English spelling correction with autocorrect.

    The ``Speller`` is created on first use and then reused. This matters
    when the function is called in a loop (``process_data`` calls it once
    per token), where rebuilding the speller every time is wasted work.

    Parameters
    ----------
    word : str
        The text to correct.

    Returns
    -------
    Whatever the ``Speller`` instance returns for ``word``.
    """
    if 'en' not in _speller_cache:
        _speller_cache['en'] = Speller(lang='en')
    return _speller_cache['en'](word)
correct_spelling("happpyness")

'happiness'

In [17]:
# Stopword removal
def remove_stopwords(tokens, ignore_case : bool = False):
    """Return the tokens that are not in NLTK's English stopword list.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter; their order is preserved.
    ignore_case : bool, default False
        By default tokens are compared to the stopword list exactly as
        written, so a capitalised token such as "I" is kept (this is the
        original behaviour). Pass ``True`` to compare the lower-cased token
        instead; the returned tokens keep their original casing.

    Returns
    -------
    list
        A new list containing the tokens that are not stopwords.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list for every token.
    stopword_set = set(stopwords.words('english'))
    if ignore_case:
        return [t for t in tokens if t.lower() not in stopword_set]
    return [t for t in tokens if t not in stopword_set]
remove_stopwords(nltk.word_tokenize("I am very interested to see what exactly this will give me only"))

['I', 'interested', 'see', 'exactly', 'give']

In [18]:
# Lemmatization
# Shared lemmatizer instance: built once here and reused by lemmatize_word.
lemmatizer = WordNetLemmatizer()
def lemmatize_word(word : str, pos : str = 'n'):
    """Lemmatize ``word`` with the module-level ``lemmatizer``.

    Parameters
    ----------
    word : str
        The token to lemmatize.
    pos : str, default 'n'
        Part-of-speech tag forwarded to ``lemmatizer.lemmatize``. The default
        ``'n'`` (noun) matches what the lemmatizer uses when no tag is given,
        so existing callers are unaffected. Pass e.g. ``'v'`` so that verb
        forms are reduced as verbs.

    Returns
    -------
    The value returned by ``lemmatizer.lemmatize``.
    """
    return lemmatizer.lemmatize(word, pos=pos)
lemmatizer.lemmatize("runs")

'run'

In [19]:
# Stemming
def stem_word(word :str):
    """Return the Porter stem of ``word``.

    A fresh ``PorterStemmer`` is created on each call.
    """
    return PorterStemmer().stem(word)
stem_word("running")
    

'run'

In [10]:
# Applying all the preprocessing steps
sample_text = "If I could live here, I would. i love Starbucks on Rosebank that much. The best blueberry muffins I have ever tasted. Great coffee as well :-)"

def process_data(raw_text : str, stem : bool = False):
    """Run the full preprocessing pipeline on ``raw_text`` and return tokens.

    Steps, in order: collapse whitespace, remove URLs, tokenize, spell-correct
    each token, drop stopwords, lemmatize each remaining token and, only when
    ``stem`` is true, stem each token.

    Parameters
    ----------
    raw_text : str
        The text to preprocess. (Previously this argument was ignored and the
        global ``sample_text`` was processed instead.)
    stem : bool, default False
        When true, apply ``stem_word`` to every token after lemmatization.
        Off by default, matching the earlier behaviour where the stemming
        step was commented out.

    Returns
    -------
    list
        The processed tokens.
    """
    cleaned = remove_urls(remove_whitespace(raw_text))
    # Spelling correction runs before stopword removal so that misspelled
    # stopwords are normalised first and can then be filtered out.
    corrected = [correct_spelling(token) for token in tokenize(cleaned)]
    tokens = [lemmatize_word(token) for token in remove_stopwords(corrected)]
    if stem:
        tokens = [stem_word(token) for token in tokens]
    return tokens

process_data(sample_text)
    


['If',
 'I',
 'could',
 'live',
 ',',
 'I',
 'would',
 '.',
 'love',
 'Starbucks',
 'Roseland',
 'much',
 '.',
 'The',
 'best',
 'blueberry',
 'muffin',
 'I',
 'ever',
 'tasted',
 '.',
 'Great',
 'coffee',
 'well',
 ':',
 '-',
 ')']