In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import twitter_samples

In [3]:
twitter_samples?

In [4]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
len(all_positive_tweets)

5000

In [7]:
len(all_negative_tweets)

5000

### Split Data into 80% training set and 20% test set

In [8]:
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

In [9]:
train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [90]:
train_y = np.append(np.ones((len(train_pos),1)), np.zeros((len(train_neg),1)), axis=0)
test_y = np.append(np.ones((len(test_pos),1)), np.zeros((len(test_neg),1)), axis=0)

## Preprocess Tweets
1. Remove URLs, twitter marks and styles
2. Tokenize and Lowercase
3. Remove stopwords and punctuation
4. Stemming

In [16]:
tweet = all_positive_tweets[2277]
tweet

'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

In [17]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [44]:
def process_tweet(tweet):
    ''' Process Tweet Function
    Input:
        tweet: a string containing a tweet
    Output:
        A list of words containing preprocessed tweet
    '''
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer()
    
    # remove old style retweet text RT
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # remove hashtags
    tweet = re.sub(r'#', '', tweet)
    
    # tokenize the string: split the strings into individual words without blanks or tabs
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    
    # Removing stopwords and punctuation and Stemming  
    tweet_stem = []

    for word in tweet_tokens:
        if word not in stopwords_english and word not in string.punctuation:
#             tweet_clean.append(word)
            stem_word = stemmer.stem(word)
            tweet_stem.append(stem_word)
    
    return tweet_stem

In [45]:
print(process_tweet(tweet))

['beauti', 'sunflow', 'sunni', 'friday', 'morn', ':)', 'sunflow', 'favourit', 'happi', 'friday', '…']


## Build Frequencies
* Build vocabulory dictionary from training data in the form of {(word,label):freq}. 
* train_x & train_y is the corpus of tweets that is used to build frequency dictionary
* Vocabulory is the set of unique words from corpus and its positive frequency is the number of times that word has appeared in positive tweets and negative frequency is the number of times that word has appeared in negative tweets.

In [58]:
def build_freqs(tweets, labels):
    ''' Build Frequencies
    Input:
        tweets: A list of tweets
        labels: An mx1 array with the sentiment label of each tweet (1 or 0)
    Output:
        freqs: A dictionary mapping each (word,sentiment) pair to its frequency
    '''
    
    labels_list = np.squeeze(labels).tolist()
    
    freqs = {} # empty dictionary
    
    for label,tweet in zip(labels_list,tweets):
        for word in process_tweet(tweet):
            pair = (word,label)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    
    return freqs

In [72]:
# Build frequency dictionary
freqs = build_freqs(train_x, train_y)

## Feature Extraction
* Features are extracted from frequency dictionary.
* Feature of tweet m: X_m = [1, sum of positive frequencies, sum of negative frequencies]

In [71]:
def extract_features(tweet, freqs):
    '''
    Input:
        tweet: raw tweet without any processing
        freqs: frequency dictionary (word,label):frequency
    Output:
        feature = [1, sum of positive frequencies, sum of negative frequencies]
    '''
    # process tweet
    words = process_tweet(tweet)
    
    # initialize 1X3 vector to populate with features later
    x = np.zeros((1,3))
    
    # bias term = 1
    x[0,0] = 1
    
    # calculate sum of positive frequencies and sum of negative frequencies
    for word in words:
        if (word,1.0) in freqs.keys():
            x[0,1] += freqs[(word,1.0)]
        if (word,0.0) in freqs.keys():
            x[0,2] += freqs[(word,0.0)]
    
    return x

In [73]:
# extract features from training data
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i,:] = extract_features(train_x[i], freqs)

## Train Logistic Regression Model
* Write gradient descent function to minimize cost of training
* Call `Gradient Descent` function on training features X

In [77]:
def gradient_descent(x, y, theta, alpha, num_iteration):
    '''
    Input:
        x: matrix of features
        y: labels for feature matrix
        theta: weight vector
        alpha: learning rate
        num_iteration: number of iterations
    Output:
        J: final cost
        theta: final weight vectore
    '''
    
    m = len(x) #number of rows
    
    for i in range(0,num_iteration):
        # call sigmoid function
        z = np.dot(x,theta)
        h = 1/(1 + np.exp(-z))
        
        # calculate cost function
        J = (-1/float(m))*(np.dot(np.transpose(y), np.log(h)) + np.dot(np.transpose(1-y), np.log(1-h)))
        
        # update theta
        theta = theta - (alpha/m)*np.dot(np.transpose(x), (h-y))
        
    J = float(J)
    return J,theta

In [78]:
J,theta = gradient_descent(X, train_y, np.zeros((3,1)), 1e-9, 1500)

ValueError: shapes (1,4000) and (8000,1) not aligned: 4000 (dim 1) != 8000 (dim 0)

In [79]:
len(X)

8000

In [80]:
len(train_y)

4000