In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import twitter_samples

In [2]:
twitter_samples?

## 0. Load Dataset

In [3]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
len(all_positive_tweets)

5000

In [5]:
len(all_negative_tweets)

5000

**Split Data into 80% training set and 20% test set**

In [6]:
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

In [7]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [8]:
train_y = np.append(np.ones((len(train_pos),1)), np.zeros((len(train_neg),1)), axis=0)
test_y = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)), axis=0)

In [29]:
test_y.shape

(2000, 1)

## 1. Preprocess Tweets
1. Remove URLs, twitter marks and styles
2. Tokenize and Lowercase
3. Remove stopwords and punctuation
4. Stemming

In [9]:
tweet = all_positive_tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [10]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [11]:
def process_tweet(tweet):
    ''' Process Tweet Function
    Input:
        tweet: a string containing a tweet
    Output:
        A list of words containing preprocessed tweet
    '''
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer()
    
    # remove old style retweet text RT
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize the string: split the strings into individual words without blanks or tabs
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    # Removing stopwords and punctuation and Stemming  
    tweet_stem = []

    for word in tweet_tokens:
        if word not in stopwords_english and word not in string.punctuation:
#             tweet_clean.append(word)
            stem_word = stemmer.stem(word)
            tweet_stem.append(stem_word)
    
    return tweet_stem

In [12]:
print(process_tweet(tweet))

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


## 2. Build Frequencies
* Build vocabulory dictionary from training data in the form of {(word,label):freq}. 
* train_x & train_y is the corpus of tweets that is used to build frequency dictionary
* Vocabulory is the set of unique words from corpus and its positive frequency is the number of times that word has appeared in positive tweets and negative frequency is the number of times that word has appeared in negative tweets.

In [13]:
def build_freqs(tweets, labels):
    ''' Build Frequencies
    Input:
        tweets: A list of tweets
        labels: An mx1 array with the sentiment label of each tweet (1 or 0)
    Output:
        freqs: A dictionary mapping each (word,sentiment) pair to its frequency
    '''
    
    labels_list = np.squeeze(labels).tolist()
    
    freqs = {} # empty dictionary
    
    for label,tweet in zip(labels_list,tweets):
        for word in process_tweet(tweet):
            pair = (word,label)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    
    return freqs

In [14]:
# Build frequency dictionary
freqs = build_freqs(train_x, train_y)

## 3. Extract Features
* Features are extracted from frequency dictionary.
* Feature of tweet m: X_m = [1, sum of positive frequencies, sum of negative frequencies]

In [15]:
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: raw tweet without any processing
        freqs: frequency dictionary (word,label):frequency
    Output:
        feature = [1, sum of positive frequencies, sum of negative frequencies]
    '''
    # process tweet
    words = process_tweet(tweet)
    
    # initialize 1X3 vector to populate with features later
    x = np.zeros((1,3))
    
    # bias term = 1
    x[0,0] = 1
    
    # calculate sum of positive frequencies and sum of negative frequencies
    for word in words:
        if (word,1.0) in freqs.keys():
            x[0,1] += freqs[(word,1.0)]
        if (word,0.0) in freqs.keys():
            x[0,2] += freqs[(word,0.0)]
    
    return x

In [16]:
# extract features from training data
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i,:] = extract_features(train_x[i], freqs)

## 4. Train Logistic Regression Model
* Write gradient descent function to minimize cost of training
* Call `Gradient Descent` function on training features X

In [17]:
def gradient_descent(x, y, theta, alpha, num_iteration):
    '''
    Input:
        x: matrix of features
        y: labels for feature matrix
        theta: weight vector
        alpha: learning rate
        num_iteration: number of iterations
    Output:
        J: final cost
        theta: final weight vectore
    '''
    
    m = len(x) #number of rows
    
    for i in range(0,num_iteration):
        # call sigmoid function
        z = np.dot(x,theta)
        h = 1/(1 + np.exp(-z))
        
        # calculate cost function
        J = (-1/float(m))*(np.dot(np.transpose(y), np.log(h)) + np.dot(np.transpose(1-y), np.log(1-h)))
        
        # update theta
        theta = theta - (alpha/m)*np.dot(np.transpose(x), (h-y))
        
    J = float(J)
    return J,theta

In [18]:
J,theta = gradient_descent(X, train_y, np.zeros((3,1)), 1e-9, 1500)

In [23]:
J

0.24216576985691673

In [24]:
theta

array([[ 7.25263662e-08],
       [ 5.23898548e-04],
       [-5.55169894e-04]])

## 5. Test Logistic Regression Model
* write predict_tweet() function to predict the sentiment of tweet
* write test_logistic_regression function to evaluate the model on test data

In [26]:
def predict_tweet(tweet, freqs, theta):
    '''
    Input:
        tweet: a string
        freqs: frequency dictionary
        theta: weight vector
    Output:
        y_predict: probability of tweet being positive or negative
    '''
    # extract features of tweet
    x = extract_features(tweet, freqs)
    
    # predict sentiment using updated weight vector from gradient_descent function
    z = np.dot(x,theta)
    y_predict = 1/(1 + np.exp(-z))
        
    return y_predict

In [28]:
tweets = ['This movie is good','I am happy','I am sad','This movie is bad']
for tweet in tweets:
    print('{} -> {}'.format(tweet, predict_tweet(tweet,freqs,theta)))

This movie is good -> [[0.51336031]]
I am happy -> [[0.51858011]]
I am sad -> [[0.48677873]]
This movie is bad -> [[0.49420623]]


In [30]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    '''
    Input:
        test_x: list of tweets
        test_y: corresponding labels for list of tweets
        freqs: frequency dictionary
        theta: weight vector    
    Output:
        accuracy: number of tweets classified correctly/ total number of tweets
    '''
    
    y_hat = [] #list for storing predictions
    
    for tweet in test_x:
        y_predict = predict_tweet(tweet, freqs, theta)
        
        if y_predict > 0.5:
            y_hat.append(1.0)
        else:
            y_hat.append(0)
            
    # Calculate Accuracy
    # y_hat: a list  test_y:m,1 array convert it to m, dimension for comparison
    test_y = np.squeeze(test_y)
    
    accuracy = sum(y_hat == test_y)/len(test_y)
    
    return accuracy

In [31]:
test_logistic_regression(test_x, test_y, freqs, theta)

0.995