In [1]:
import nltk

In [2]:
nltk.download("twitter_samples")
nltk.download("stopwords")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples

In [4]:
all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

In [5]:
train_pos =all_positive_tweets[:4000]
train_neg = all_negative_tweets[:4000]

test_pos = all_positive_tweets[4000:]
test_neg = all_negative_tweets[4000:]

X_train = train_pos + train_neg
X_test = test_pos + test_neg

In [6]:
y_train = np.append(np.ones((len(train_pos), 1)), np.zeros(( len(train_neg), 1)))
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)))

In [7]:
print("train_y.shape = " + str(y_train.shape))
print("test_y.shape = " + str(y_test.shape))

train_y.shape = (8000,)
test_y.shape = (2000,)


#### Implementing Logistic Regression and Preprocessor

In [8]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

class LogisticRegression:
    def __init__(self, preprocessor):
        self.cost_history = []
        self.preprocessor = preprocessor

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def gradient_descent(self, X, y, theta, alpha=0.01, num_iters=1000, debug=False):
        """
        Input:
            X: matrix of features which is (m,n+1)
            y: corresponding labels of the input matrix x, dimensions (m,1)
            theta: weight vector of dimension (n+1,1)
            alpha: learning rate
            num_iters: number of iterations you want to train your model for
        Output:
            J: the final cost
            theta: your final weight vector
        """
        m = X.shape[0]
        for _ in range(num_iters):
            z = np.dot(X, theta)
            h = self.sigmoid(z)
            J = self.cost(m, h, y)
            theta -= (alpha / m) * np.dot(X.T, (h - y))
            self.cost_history.append(J)
            if debug and _ % 50 == 0:
                print("Cost : ", self.cost_history)
        self.theta = theta

        return J, theta

    def cost(self, m, h, y):
        return (-1.0 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

    def predict(self, tweet, freqs, theta):
        X = self.preprocessor.extract_features(tweet, freqs)
        z = np.dot(X, theta)
        return self.sigmoid(z)


class Preprocessor:
    def __init__(self):
        pass

    def process_tweet(self, tweet):
        """Process tweet function.
        Input:
            tweet: a string containing a tweet
        Output:
            tweets_clean: a list of words containing the processed tweet

        """
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words("english")
        # remove stock market tickers like $GE
        tweet = re.sub(r"\$\w*", "", tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r"^RT[\s]+", "", tweet)
        # remove hyperlinks
        tweet = re.sub(r"https?:\/\/.*[\r\n]*", "", tweet)
        # remove hashtags
        # only removing the hash # sign from the word
        tweet = re.sub(r"#", "", tweet)
        # tokenize tweets
        tokenizer = TweetTokenizer(
            preserve_case=False, strip_handles=True, reduce_len=True
        )
        tweet_tokens = tokenizer.tokenize(tweet)

        tweets_clean = []
        for word in tweet_tokens:
            if (
                word not in stopwords_english  # remove stopwords
                and word not in string.punctuation
            ):  # remove punctuation
                # tweets_clean.append(word)
                stem_word = stemmer.stem(word)  # stemming word
                tweets_clean.append(stem_word)

        return tweets_clean

    def build_freqs(self, tweets, ys):
        """Build frequencies.
        Input:
            tweets: a list of tweets
            ys: an m x 1 array with the sentiment label of each tweet
                (either 0 or 1)
        Output:
            freqs: a dictionary mapping each (word, sentiment) pair to its
            frequency
        """
        yslist = np.squeeze(ys).tolist()

     
        freqs = {}
        for y, tweet in zip(yslist, tweets):
            for word in self.process_tweet(tweet):
                pair = (word, y)
                if pair in freqs:
                    freqs[pair] += 1
                else:
                    freqs[pair] = 1

        return freqs

    def extract_features(self, tweet, freqs):
        """
        Input:
            tweet: a list of words for one tweet
            freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        Output:
            x: a feature vector of dimension (1,3)
        """

        word_l = self.process_tweet(tweet)
        x = np.zeros((1, 3))
        x[0, 0] = 1  # bias

        for word in word_l:
            positive_pair_key = (word, 1)
            negative_pair_key = (word, 0)

            x[0, 1] += freqs.get(positive_pair_key, 0)
            x[0, 2] += freqs.get(negative_pair_key, 0)

        assert x.shape == (1, 3)

        return x


preprocessor = Preprocessor()
logistic_regression = LogisticRegression(preprocessor)

In [9]:
freqs = preprocessor.build_freqs(X_train, y_train)

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 11337


In [10]:
print('This is an example of a positive tweet: \n', X_train[0])
print('\nThis is an example of the processed version of the tweet: \n', preprocessor.process_tweet(X_train[0]))

This is an example of a positive tweet: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

This is an example of the processed version of the tweet: 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [11]:
# Testing your function 
sigmoid = logistic_regression.sigmoid
if (sigmoid(0) == 0.5):
    print('SUCCESS!')
else:
    print('Oops!')

if (sigmoid(4.92) == 0.9927537604041685):
    print('CORRECT!')
else:
    print('Oops again!')

SUCCESS!
CORRECT!


In [12]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = logistic_regression.gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


### Extracting Features

In [13]:
tmp1 = preprocessor.extract_features(X_train[0], freqs)
print(tmp1)

[[1.00e+00 3.02e+03 6.10e+01]]


In [14]:
tmp2 = preprocessor.extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[1. 0. 0.]]


In [15]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i : ] = preprocessor.extract_features(X_train[i], freqs)
    
Y = y_train.reshape((y_train.shape[0], 1))




In [16]:
J, theta = logistic_regression.gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24215478.
The resulting vector of weights is [7e-08, 0.00052391, -0.00055517]


In [17]:

# Run this cell to test your function
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, logistic_regression.predict(tweet, freqs, theta)))

I am happy -> 0.518581
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530899
great great great -> 0.546274
great great great great -> 0.561562


  print( '%s -> %f' % (tweet, logistic_regression.predict(tweet, freqs, theta)))


In [18]:
my_tweet = 'I am learning :)'
logistic_regression.predict(my_tweet, freqs, theta)

array([[0.8163691]])

In [19]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = logistic_regression.predict(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    y_hat = np.array(y_hat)
    accuracy = list(y_hat == y_test).count(True) / len(y_test)
    return accuracy

In [20]:
tmp_accuracy = test_logistic_regression(X_test, y_test, freqs, logistic_regression.theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [21]:
print('Label Predicted Tweet')
for x,y in zip(X_test,y_test):
    y_hat = logistic_regression.predict(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', preprocessor.process_tweet(x))
        print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(preprocessor.process_tweet(x)).encode('ascii', 'ignore')))

Label Predicted Tweet
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996920	b'truli later move know queen bee upward bound movingonup'
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48663815	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697	b"i'm play brain dot braindot"
THE TWEET IS: I'm p

  print('%d\t%0.8f\t%s' % (y, y_hat, ' '.join(preprocessor.process_tweet(x)).encode('ascii', 'ignore')))


THE TWEET IS: I'm playing Brain Dots : ) #BrainDots http://t.co/R2JBO8iNww http://t.co/ow5BBwdEMY
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697	b"i'm play brain dot braindot"
THE TWEET IS: off to the park to get some sunlight : )
THE PROCESSED TWEET IS: ['park', 'get', 'sunlight']
1	0.49578796	b'park get sunlight'
THE TWEET IS: @msarosh Uff Itna Miss karhy thy ap :p
THE PROCESSED TWEET IS: ['uff', 'itna', 'miss', 'karhi', 'thi', 'ap', ':p']
1	0.48212905	b'uff itna miss karhi thi ap :p'
THE TWEET IS: @phenomyoutube u probs had more fun with david than me : (
THE PROCESSED TWEET IS: ['u', 'prob', 'fun', 'david']
0	0.50020391	b'u prob fun david'
THE TWEET IS: pats jay : (
THE PROCESSED TWEET IS: ['pat', 'jay']
0	0.50039295	b'pat jay'
THE TWEET IS: my beloved grandmother : ( https://t.co/wt4oXq5xCf
THE PROCESSED TWEET IS: ['belov', 'grandmoth']
0	0.50000002	b'belov grandmoth'


In [22]:
my_tweet = 'This is a ridiculously bright movie. The plot was terrible and I was sad until the ending!'
print(preprocessor.process_tweet(my_tweet))
y_hat = logistic_regression.predict(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['ridicul', 'bright', 'movi', 'plot', 'terribl', 'sad', 'end']
[[0.48139084]]
Negative sentiment
