In [3]:
import nltk
from  os import getcwd

In [4]:
# Register the sibling ../tmp2/ folder (resolved against the current working
# directory) as an additional place for NLTK to search for its data files.
filePath = getcwd() + "/../tmp2/"
nltk.data.path.append(filePath)

In [5]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Steps, in order:
      1. strip stock tickers such as ``$GE``, a leading old-style retweet
         marker ``RT``, hyperlinks, and ``#`` signs (the hashtag word is kept);
      2. tokenize with ``TweetTokenizer`` configured to lower-case the text,
         drop @handles and shorten elongated character runs;
      3. drop English stopwords and punctuation tokens;
      4. stem every remaining token with the Porter stemmer.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # The corpus reader returns a list; a frozenset makes the per-token
    # membership test below O(1) instead of a linear scan of the whole list.
    stopwords_english = frozenset(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks (the greedy `.*` also drops whatever follows the URL
    # on the same line)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is removed, the word stays
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so a multi-character token is dropped only when it
    # appears there as a contiguous run. Kept as-is so results do not change.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean


def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1); a plain list of labels works as well
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # zip needs one label per tweet, so collapse the (m, 1) column to a flat
    # list. np.squeeze alone turns a single label into a 0-d array whose
    # tolist() is a bare scalar (not iterable); atleast_1d keeps the
    # one-tweet case a one-element list. A list input passes through unchanged.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [6]:
#!wget https://raw.githubusercontent.com/ibrahimjelliti/Deeplearning.ai-Natural-Language-Processing-Specialization/master/1%20-%20Natural%20Language%20Processing%20with%20Classification%20and%20Vector%20Spaces/Labs/Week%201/utils.py

import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples


In [13]:
import nltk

# Fetch the NLTK packages used in this notebook: the sample tweets and the
# stopword lists.
for corpus_name in ('twitter_samples', 'stopwords'):
    nltk.download(corpus_name)
from nltk.corpus import twitter_samples


[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Load the tweets of the positive and the negative sample files as two lists
# of strings.
all_positive = twitter_samples.strings('positive_tweets.json')
all_negative = twitter_samples.strings('negative_tweets.json')
#print(len(all_positive))

In [15]:
# Data splitting: the first TRAIN_SIZE tweets of each class are used for
# training, the remaining tweets of each class are held out for testing.
TRAIN_SIZE = 4000
train_pos = all_positive[:TRAIN_SIZE]
test_pos = all_positive[TRAIN_SIZE:]
train_neg = all_negative[:TRAIN_SIZE]
test_neg = all_negative[TRAIN_SIZE:]
# Positive tweets come first, then negative ones; any label vector has to be
# laid out in that same order.
train_X = train_pos + train_neg
test_X = test_pos + test_neg

In [16]:
# Label columns of shape (m, 1): a 1.0 for every positive tweet stacked on top
# of a 0.0 for every negative tweet.
train_y = np.vstack((np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))))
test_y = np.vstack((np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))))

In [17]:
# Report the shapes of the two label arrays.
print("train_y.shape = " + str(train_y.shape))
print("test_y.shape = " + str(test_y.shape))

train_y.shape = (8000, 1)
test_y.shapr =(2000, 1)


In [18]:
# Build the (word, label) -> count dictionary from the training tweets.
freqs = build_freqs(train_X, train_y)


In [19]:
# Quick look at the frequency dictionary: its type and number of distinct keys.
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs)= {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs)= 11337


In [20]:
# Try process_tweet on the first training tweet: print the raw text, then the
# list of tokens the function returns for it.
print(" eg for positive tweet : \n" , train_X[0])
print("eg of processed positive tweet : \n" ,process_tweet(train_X[0]))

 eg for positive tweet : 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
eg of processed positive tweet : 
 ['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']


In [21]:
# Applying function
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated element-wise.

    Input:
        z: a number or a numpy array
    Output:
        h: the sigmoid of z (for an array, one value per element of z)
    """
    return 1 / (1 + np.exp(-z))

In [23]:
##testing function in sigmoid:
#if(sigmoid(0)==0.5):
#  print("SUCCESS")
#else:
#  print("oops!")
#if (sigmoid(4.92) == 0.9927537604041685):
#    print('CORRECT!')
#else:
#    print('Oops again!')

In [26]:
def gradientDescent(x,y, theta, alpha, num_iter):
    '''Train logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iter: number of iterations you want to train your model for
    Output:
        J: the final cost as a Python float. It is the cost of the weights in
           use at the start of the last iteration (i.e. before that
           iteration's update); if no iteration runs it is the cost of the
           initial theta.
        theta: your final weight vector
    '''
    m = x.shape[0]

    def cross_entropy(h):
        # Average binary cross-entropy between the labels y and predictions h.
        return -1/m*(np.dot(y.transpose(),np.log(h))+np.dot((1-y).transpose(), np.log(1-h)))

    # Cost of the starting weights: keeps J defined when num_iter <= 0 and the
    # loop body never executes.
    J = cross_entropy(sigmoid(np.dot(x, theta)))

    for _ in range(num_iter):
        z = np.dot(x, theta)  # linear scores, one per example
        h = sigmoid(z)        # predicted probabilities
        J = cross_entropy(h)
        # update the weights theta along the negative gradient of the cost
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h - y))

    # J is a (1, 1) array at this point; squeeze it to 0-d first because
    # float() on an array with ndim > 0 is deprecated in recent NumPy.
    J = float(np.squeeze(J))
    return J, theta


In [27]:
# Sanity check of gradientDescent on a small, seeded random problem.
np.random.seed(1)
# 10 x 3 feature matrix: a column of ones followed by two random columns
# scaled to [0, 2000).
tmp_x = np.hstack((np.ones((10, 1)), np.random.rand(10, 2) * 2000))
# 10 x 1 labels: 1.0 where a fresh uniform draw exceeds 0.35, otherwise 0.0.
tmp_y = (np.random.rand(10, 1) > 0.35).astype(float)

tmp_J, tmp_theta = gradientDescent(tmp_x, tmp_y, np.zeros((3, 1)), 1e-8, 700)
print(f"the cost after training is {tmp_J:.8f}.")
print(f"the resulting vector of weights is {[round(t,8) for  t in np.squeeze(tmp_theta)]}")

the cost after training is 0.67094970.
the resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


FEATURE EXTRACTION

In [28]:
def extract_features(tweet,freqs):
    """Turn a tweet into a 1 x 3 feature row.

    Input:
        tweet: a string containing a tweet
        freqs: a dictionary mapping (word, label) pairs to counts
    Output:
        x: a numpy array of shape (1, 3) holding
           [1 (bias), sum of the positive-label counts of the tweet's words,
            sum of the negative-label counts of the tweet's words];
           a word that occurs several times in the tweet is counted each time,
           and pairs missing from freqs contribute 0.
    """
    tokens = process_tweet(tweet)

    features = np.zeros((1, 3))
    features[0, 0] = 1  # bias term
    # total count of the tweet's words under the positive label 1
    features[0, 1] = sum(freqs.get((token, 1.0), 0) for token in tokens)
    # total count of the tweet's words under the negative label 0
    features[0, 2] = sum(freqs.get((token, 0.0), 0) for token in tokens)

    assert(features.shape == (1, 3))
    return features





TRAINING


In [29]:
# Build the training feature matrix, one 1 x 3 feature row per tweet, then fit
# the weights with gradient descent starting from all-zero weights.
X = np.zeros((len(train_X), 3))
for i, tweet in enumerate(train_X):
    X[i, :] = extract_features(tweet, freqs)

Y = train_y
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24215478.
The resulting vector of weights is [7e-08, 0.00052391, -0.00055517]


In [30]:
def predict_tweet(tweet, freqs , theta):
    """Score a tweet with the logistic-regression model.

    Input:
        tweet: a string containing a tweet
        freqs: a dictionary mapping (word, label) pairs to counts
        theta: weight vector, multiplied with the tweet's 1 x 3 feature row
    Output:
        y_pred: sigmoid of (features . theta); values above 0.5 are read as
                positive sentiment by the rest of the notebook
    """
    features = extract_features(tweet, freqs)
    return sigmoid(np.dot(features, theta))

In [31]:
# Test predict_tweet on a few hand-written examples.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # The prediction comes back as a one-element array; unwrap it to a plain
    # float before handing it to %f.
    print( '%s -> %f' % (tweet, float(np.squeeze(predict_tweet(tweet, freqs, theta)))))

I am happy -> 0.518581
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530899
great great great -> 0.546274
great great great great -> 0.561562


CHECKING PERFORMANCE

In [32]:
def test_LR(test_x , test_y , freqs , theta):
    """Measure the accuracy of the logistic-regression model on labelled tweets.

    Input:
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output:
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    Raises:
        ValueError: if the number of labels differs from the number of tweets
    """
    # Flatten the labels to one value per tweet, whatever their original layout.
    labels = np.ravel(test_y)
    if len(labels) != len(test_x):
        # Without this check a mismatch would be broadcast or compared as a
        # single scalar, producing a meaningless accuracy.
        raise ValueError(
            "test_x and test_y must have the same length, got "
            f"{len(test_x)} tweets and {len(labels)} labels"
        )

    # Threshold each predicted probability at 0.5 to get a hard 0/1 prediction.
    y_hat = np.array(
        [1.0 if predict_tweet(tweet, freqs, theta) > 0.5 else 0.0
         for tweet in test_x]
    )

    accuracy = (y_hat == labels).sum()/len(test_x)
    return accuracy


ERROR ANALYSIS

In [37]:
# Error analysis: list every test tweet whose thresholded prediction disagrees
# with its label, together with its processed form and predicted probability.
print('Label Predicted Tweet')
for x,y in zip(test_X,test_y):
    y_hat = predict_tweet(x, freqs, theta)

    if np.abs(y - (y_hat > 0.5)) > 0:
        # Process the tweet once and reuse the tokens for both printouts.
        processed = process_tweet(x)
        print('THE TWEET IS:', x)
        print('THE PROCESSED TWEET IS:', processed)
        # y and y_hat are one-element arrays; unwrap them explicitly because
        # implicit array-to-scalar conversion in %-formatting is deprecated in
        # recent NumPy.
        print('%d\t%0.8f\t%s' % (float(np.squeeze(y)), float(np.squeeze(y_hat)), ' '.join(processed).encode('ascii', 'ignore')))


Label Predicted Tweet
THE TWEET IS: @jaredNOTsubway @iluvmariah @Bravotv Then that truly is a LATERAL move! Now, we all know the Queen Bee is UPWARD BOUND : ) #MovingOnUp
THE PROCESSED TWEET IS: ['truli', 'later', 'move', 'know', 'queen', 'bee', 'upward', 'bound', 'movingonup']
1	0.49996920	b'truli later move know queen bee upward bound movingonup'
THE TWEET IS: @MarkBreech Not sure it would be good thing 4 my bottom daring 2 say 2 Miss B but Im gonna be so stubborn on mouth soaping ! #NotHavingit :p
THE PROCESSED TWEET IS: ['sure', 'would', 'good', 'thing', '4', 'bottom', 'dare', '2', 'say', '2', 'miss', 'b', 'im', 'gonna', 'stubborn', 'mouth', 'soap', 'nothavingit', ':p']
1	0.48663815	b'sure would good thing 4 bottom dare 2 say 2 miss b im gonna stubborn mouth soap nothavingit :p'
THE TWEET IS: I'm playing Brain Dots : ) #BrainDots
http://t.co/UGQzOx0huu
THE PROCESSED TWEET IS: ["i'm", 'play', 'brain', 'dot', 'braindot']
1	0.48370697	b"i'm play brain dot braindot"
THE TWEET IS: I'm p

In [39]:
# Classify a tweet typed in by the user. The prompt ends with ": " so the typed
# text does not run straight into it.
my_tweet = input("enter your tweet: ")
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
# Probabilities above 0.5 are reported as positive, everything else as negative.
if y_hat > 0.5:
    print('Positive sentiment')
else:
    print('Negative sentiment')

enter your tweeti found that drawing beautiful hence i am happy
['found', 'draw', 'beauti', 'henc', 'happi']
[[0.52363861]]
Positive sentiment
