In [None]:
## DATA CLEANING
## 1. remove non ascii values
## 2. replace @mentions with AT_USER
## 3. convert data to lowercase
## 4. remove punctuations
## 5. remove numbers
## 6. remove stop words
## 7. stem_words
## 8. lemmatize_verbs
## 9. remove urls

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list.

    Each token is NFKD-normalized first, so accented letters decompose into
    a base letter plus combining marks; whatever still cannot be encoded as
    ASCII is then dropped. The output has one entry per input token, so a
    token made up only of non-ASCII characters becomes an empty string.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def remove_at(words):
    """Replace @username mentions with the placeholder AT_USER.

    Within each token, every "@" followed by one or more non-whitespace
    characters is substituted with the literal text AT_USER. Tokens that
    are empty after substitution are dropped from the result.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of tokens with mentions replaced.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    mention = re.compile(r'@[^\s]+')
    replaced = (mention.sub("AT_USER", word) for word in words)
    return [word for word in replaced if word != '']

def to_lowercase(words):
    """Return a new list holding the lowercased form of every token."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens left empty.

    Any character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted from the token.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Delete all digit runs from each token.

    Digits are removed, not converted to words. Tokens that are empty
    after the digits are removed (e.g. "42") are dropped from the result.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list of tokens with every run of digits deleted.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    digits = re.compile(r"\d+")
    stripped = (digits.sub("", word) for word in words)
    return [word for word in stripped if word != '']

def remove_stopwords(words):
    """Remove English stop words from a list of tokens.

    Matching is exact membership in ``stopwords.words('english')``, so it
    is case-sensitive.

    Args:
        words: iterable of string tokens.

    Returns:
        A new list containing only the tokens that are not stop words.
    """
    # Fetch the stop-word list once and keep it as a set; the previous
    # per-token call re-evaluated stopwords.words() for every word and then
    # scanned the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def remove_url(words):
    """Replace URLs in a list of tokens with the placeholder URL.

    A match must begin at the start of a line within the token with
    http:// or https://; it extends to the end of that line and also
    swallows any trailing CR/LF characters. Tokens that are empty after
    substitution are dropped.
    """
    url_pattern = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
    cleaned = []
    for token in words:
        replaced = url_pattern.sub('URL', token)
        if replaced:
            cleaned.append(replaced)
    return cleaned

def stem_words(words):
    """Return the Lancaster stem of every token, in input order.

    One LancasterStemmer instance is created per call and applied to each
    token in turn.
    """
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token as a verb, in input order.

    One WordNetLemmatizer instance is created per call; each token is
    lemmatized with the part-of-speech tag fixed to 'v'.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the cleaning pipeline over a list of tokens.

    Steps, in order: strip non-ASCII characters, lowercase, replace
    @mentions with AT_USER, replace URLs with URL, strip punctuation,
    strip digits. Stop-word removal is currently disabled.

    Args:
        words: list of string tokens.

    Returns:
        A new list of cleaned tokens; tokens emptied by a filtering step
        are dropped.
    """
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    # Mention and URL replacement must run BEFORE punctuation stripping:
    # once '@', ':' and '/' have been removed, neither pattern can match.
    # They run after lowercasing so the uppercase placeholders survive.
    words = remove_at(words)
    words = remove_url(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
#    words = remove_stopwords(words)
    return words

In [None]:
# Split each tweet into tokens with NLTK, then clean every token list with
# normalize(); the result is stored in the 'Words' column.
data['Words'] = data['TweetText'].apply(nltk.word_tokenize).apply(normalize)

# Preview the first few cleaned token lists.
data['Words'].head()

In [None]:
# Same tokenize-then-normalize steps for the test set, whose raw text is in
# the 'text' column.
testdata['Words'] = testdata['text'].apply(nltk.word_tokenize).apply(normalize)

# Preview the first few rows, including the new 'Words' column.
testdata.head()