In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import twitter_samples

In [3]:
twitter_samples?

In [4]:
# Load the tweet texts of each sentiment class from the NLTK `twitter_samples` corpus.
# NOTE(review): assumes the corpus is already available locally
# (e.g. fetched once with nltk.download('twitter_samples')) — confirm for fresh environments.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
len(all_positive_tweets)

5000

In [7]:
len(all_negative_tweets)

5000

### Split Data into 80% training set and 20% test set

In [8]:
# Fraction of each class used for training; the remainder is held out for testing.
TRAIN_FRACTION = 0.8

# Derive the cut-off from the data instead of hard-coding 4000, so the 80/20 split
# stays correct if the number of tweets per class ever changes.
n_train_pos = int(len(all_positive_tweets) * TRAIN_FRACTION)
n_train_neg = int(len(all_negative_tweets) * TRAIN_FRACTION)

# Split each class separately so both sets keep the same positive/negative balance.
train_pos = all_positive_tweets[:n_train_pos]
test_pos = all_positive_tweets[n_train_pos:]
train_neg = all_negative_tweets[:n_train_neg]
test_neg = all_negative_tweets[n_train_neg:]

In [9]:
# Feature lists: all positive tweets first, followed by all negative tweets.
train_x = [*train_pos, *train_neg]
test_x = [*test_pos, *test_neg]

In [15]:
# Labels aligned with train_x / test_x: 1 for the positive tweets (which come first),
# 0 for the negative tweets. Each class contributes a column vector and the two are
# stacked row-wise (axis=0), giving shape (n_pos + n_neg, 1).
# The previous version appended zeros of shape (n, 0) along axis=1, which added no
# elements at all, so the negative labels were silently missing.
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

## Preprocess Tweets
1. Remove URLs, retweet markers (`RT`) and hashtag `#` signs
2. Tokenize and Lowercase
3. Remove stopwords and punctuation
4. Stemming

In [16]:
# Pick a single positive tweet as a running example for the preprocessing steps below.
tweet = all_positive_tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [17]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [41]:
def process_tweet(tweet):
    ''' Process Tweet Function
    Input:
        tweet: a string containing the raw text of one tweet
    Output:
        tweet_stem: a list of stemmed tokens, in their original order, left
            after removing the retweet marker, hyperlinks, '#' signs,
            English stopwords and punctuation tokens
    '''

    # remove old style retweet text RT (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: match only the URL itself (up to the next whitespace).
    # The previous pattern used a greedy `.*`, which also deleted every word
    # that followed a link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # remove only the hash sign; the hashtag word itself is kept as a token
    tweet = re.sub(r'#', '', tweet)

    # tokenize the string: split the strings into individual words without blanks or tabs
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # a set gives constant-time membership tests instead of scanning the
    # stopword list once per token
    stopwords_english = set(stopwords.words('english'))

    stemmer = PorterStemmer()

    # Drop stopwords and punctuation, then stem what is left.
    # `string.punctuation` is a plain string, so this is a substring test:
    # single punctuation characters are dropped, while tokens such as ':)' or
    # '…' that do not occur in it are kept.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [42]:
process_tweet(tweet)

['beauti',
 'sunflow',
 'sunni',
 'friday',
 'morn',
 ':)',
 'sunflow',
 'favourit',
 'happi',
 'friday',
 '…']

In [18]:
# remove old-style retweet text RT (only when it starts the tweet)
tweet = re.sub(r'^RT[\s]+', '', tweet)

# remove hyperlinks: match only the URL itself (up to the next whitespace).
# A greedy `.*` here would also delete every word following a link on the same line.
tweet = re.sub(r'https?://\S+', '', tweet)

# remove only the hash sign; the hashtag word itself is kept
tweet = re.sub(r'#', '', tweet)

tweet

'My beautiful sunflowers on a sunny Friday morning off :) sunflowers favourites happy Friday off… '

In [22]:
TweetTokenizer?

In [28]:
# Tokenize the cleaned tweet into a list of word/emoticon tokens.
# Going by the option names (see the NLTK TweetTokenizer docs to confirm details):
#   preserve_case=False -> tokens are lowercased (the printed output below shows 'my', 'friday')
#   reduce_len=True     -> presumably shortens exaggerated character repetitions
#   strip_handles=True  -> presumably removes @user handles
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
tweet_tokens = tokenizer.tokenize(tweet)

In [29]:
print(tweet_tokens)

['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']


In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
stopwords.words?

In [33]:
stopwords_english = stopwords.words('english')

In [35]:
print(stopwords_english)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [36]:
# Keep, in order, only the tokens that are neither English stopwords nor punctuation.
tweet_clean = [
    token
    for token in tweet_tokens
    if not (token in stopwords_english or token in string.punctuation)
]

In [37]:
print(tweet_tokens)
print(tweet_clean)

['my', 'beautiful', 'sunflowers', 'on', 'a', 'sunny', 'friday', 'morning', 'off', ':)', 'sunflowers', 'favourites', 'happy', 'friday', 'off', '…']
['beautiful', 'sunflowers', 'sunny', 'friday', 'morning', ':)', 'sunflowers', 'favourites', 'happy', 'friday', '…']


In [38]:
# Reduce every remaining token to its stem with the Porter stemmer.
stemmer = PorterStemmer()

tweet_stem = [stemmer.stem(token) for token in tweet_clean]

print(tweet_stem)

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']
