In [2]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random

In [3]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\8086f\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\8086f\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
print(len(all_positive_tweets))
print(len(all_negative_tweets))


5000
5000


In [7]:
# Our selected sample. Complex enough to exemplify each step
tweet = all_positive_tweets[2277]
print(tweet)

My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i


In [8]:
# download the stopwords from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8086f\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
import re
import string
from nltk.stem import PorterStemmer
ps=PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [10]:
stopwords=nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [11]:
def clean_text(text):
  stopwords=nltk.corpus.stopwords.words('english')
  ps = PorterStemmer()
  text=re.sub(r'[^A-Z a-z]','',text)
  text=re.sub(r'^# @ _ ',' ',text)
  text=text.lower()
  text=text.split()
  text=[ps.stem(word) for word in text if word not in stopwords]
  text=" ".join(text)
  return text


In [12]:
all_positive_tweets[:5]

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
 '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

In [13]:
clean_text(all_positive_tweets[0])

'followfriday franceint pkuchli milipolpari top engag member commun week'

In [14]:
for i in range(len(all_positive_tweets)):
  all_positive_tweets[i]=clean_text(all_positive_tweets[i])

In [15]:
for i in range(len(all_negative_tweets)):
  all_negative_tweets[i]=clean_text(all_negative_tweets[i])

In [16]:
tweeTtok=all_positive_tweets+all_negative_tweets

tweeTtok[:5]

['followfriday franceint pkuchli milipolpari top engag member commun week',
 'lambja hey jame odd pleas call contact centr abl assist mani thank',
 'despiteoffici listen last night bleed amaz track scotland',
 'side congrat',
 'yeaaaah yippppi accnt verifi rqst succeed got blue tick mark fb profil day']

## Tokenize the string

In [17]:
from nltk.tokenize import word_tokenize
def process_tweet(tweets):
  tokenized_tweets = []  # Initialize an empty list to store tokenized tweets
  for tweet in tweets:
      tokenized_words = word_tokenize(tweet)
      tokenized_tweets.append(tokenized_words)
  return tokenized_tweets

In [21]:
tweeTtok=process_tweet(tweet)
tweeTtok[0]

['M']

In [22]:
import numpy as np

labels = np.append(np.ones((len(all_positive_tweets))), np.zeros((len(all_negative_tweets))))

In [23]:
labels.shape

(10000,)

In [24]:
def build_freqs(tweeTtok, ys):
    """Build frequencies.
    Input:
        tweeTtok: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    freqs = {}
    for y, tweet in zip(yslist, tweeTtok):
        for word in process_tweet(tweet):
            if isinstance(word, list):
                for w in word:
                    pair = (w, y)
                    if pair in freqs:
                        freqs[pair] += 1
                    else:
                        freqs[pair] = 1
            else:
                pair = (word, y)
                if pair in freqs:
                    freqs[pair] += 1
                else:
                    freqs[pair] = 1
    return freqs


In [None]:
freqs = build_freqs(tweeTtok, labels)

In [None]:
# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

In [27]:
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

In [30]:
# combine positive and negative labels
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)), axis=0)

In [31]:
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


In [32]:
train_x=process_tweet(train_x)


In [33]:
train_x[0]

['followfriday',
 'franceint',
 'pkuchli',
 'milipolpari',
 'top',
 'engag',
 'member',
 'commun',
 'week']

In [39]:
freqs = build_freqs(train_x, train_y)
#just print a value from dictionary
first_key = next(iter(freqs))
print(first_key, freqs[first_key])
# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

('followfriday', 1.0) 23
type(freqs) = <class 'dict'>
len(freqs) = 17865


In [40]:
# UNQ_C1 GRADED FUNCTION: sigmoid
def sigmoid(z):
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''

    # calculate the sigmoid of z
    h = 1/(1+np.exp(-z))

    return h

In [41]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    ### START CODE HERE ###
    # get 'm', the number of rows in matrix x
    m = x.shape[0]

    for i in range(0, num_iters):

        # get z, the dot product of x and theta
        z = np.dot(x,theta)

        # get the sigmoid of z
        h = sigmoid(z)

        # calculate the cost function
        J = (-1/m)*(np.matmul(np.transpose(y),np.log(h)) + np.matmul(np.transpose(1-y),np.log(1-h)))

        # update the weights theta
        theta = theta - (alpha/m)*np.dot(np.transpose(x),(h-y))

    ### END CODE HERE ###
    J = float(J)
    return J, theta

In [42]:
def extract_features(tweet, freqs, process_tweet=process_tweet):
    '''
    Input:
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''

    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3))

    #bias term is set to 1
    x[0,0] = 1

    ### START CODE HERE ###

    # loop through each word in the list of words
    for word in tweet:

        if (word, 1.0) in freqs.keys() :
        # increment the word count for the positive label 1
            x[0,1] += freqs[(word, 1.0)]
        if (word, 0.0) in freqs.keys() :
        # increment the word count for the negative label 0
            x[0,2] += freqs[(word, 0.0)]

    ### END CODE HERE ###
    assert(x.shape == (1, 3))
    return x

In [43]:
extract_features(train_x[0], freqs=freqs)

array([[  1., 174.,  60.]])

In [44]:
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

[[ 1. 65. 78.]]


In [45]:
tmp2 = extract_features('hi this is fijaz', freqs)
print(tmp2)

[[ 1.  7. 21.]]


In [46]:

# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = train_y


In [47]:
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}." )

The cost after training is 0.69123464.
The resulting vector of weights is [-0.0, 4.949e-05, -2.052e-05].


  J = float(J)


In [48]:
def predict_tweet(tweet, freqs, theta):
    '''
    Input:
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: the probability of a tweet being positive or negative
    '''
    ### START CODE HERE ###

    # extract the features of the tweet and store it into x
    x = extract_features(tweet, freqs)

    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))

    ### END CODE HERE ###

    return y_pred

In [49]:
# Run this cell to test your function
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.502489
I am bad -> 0.500021
this movie should have been great. -> 0.502135
great -> 0.500159
great great -> 0.500319
great great great -> 0.500478
great great great great -> 0.500637


  print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))


In [50]:
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.50025928]])

In [51]:
# UNQ_C5 GRADED FUNCTION: test_logistic_regression
def test_logistic_regression(test_x, test_y, freqs, theta, predict_tweet=predict_tweet):
    """
    Input:
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output:
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """

    ### START CODE HERE ###

    # the list for storing predictions
    y_hat = []

    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)

        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1.0)
        else:
            # append 0 to the list
            y_hat.append(0.0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator

    accuracy = 0

    for i in range(test_y.shape[0]):
        if test_y[i]==y_hat[i]:
            accuracy+=1

    accuracy=np.float64(accuracy/test_y.shape[0])

    ### END CODE HERE ###

    return accuracy


In [52]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.5040


In [53]:
# Feel free to change the tweet below
my_tweet = 'I won!'
def process_tweet(my_tweet):
  return word_tokenize(my_tweet)
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else:
    print('Negative sentiment')

['I', 'won', '!']
[[0.50010083]]
Positive sentiment
