In [5]:
import nltk
from os import getcwd
nltk.download('twitter_samples')
nltk.download('stopwords')
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

def preprocess(tweet):
    stemmer=PorterStemmer()
    stop_words=stopwords.words('english')
    tweet = re.sub(r'\$\w*', '', tweet)#remove stock market stickers
    tweet = re.sub(r'^RT[\s]+', '', tweet) #remove retweet
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet) #remove retweet
    tweet = re.sub(r'#', '', tweet) #remove hash
    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_words = tokenizer.tokenize(tweet)
    
    
    processed_tweet=[]
    for word in tweet_words:
        if word not in stop_words and word not in string.punctuation:
            stem_word=stemmer.stem(word)
            processed_tweet.append(stem_word)
    
    return processed_tweet
            

In [7]:
def build_freqs(tweets,ys):
    yslist = np.squeeze(ys).tolist()
    
    freq={}
    
    for y, tweet in zip(yslist,tweets):
        for word in preprocess(tweet):
            pair=(word,y)
            if pair in freq:
                freq[pair]+=1
            else:
                freq[pair]=1
    return freq
        
    

In [8]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')


In [9]:
# splitting into train and test sets

from sklearn.model_selection import train_test_split
X_train_pos, X_test_pos= train_test_split(all_positive_tweets, test_size = 0.2, random_state = 0)
X_train_neg, X_test_neg= train_test_split(all_negative_tweets, test_size = 0.2, random_state = 0)

X_train=X_train_pos+X_train_neg
X_test=X_test_pos+X_test_neg

Y_train = np.append(np.ones((len(X_train_pos), 1)), np.zeros((len(X_train_neg), 1)), axis=0)
Y_test = np.append(np.ones((len(X_test_pos), 1)), np.zeros((len(X_test_neg), 1)), axis=0)





In [10]:
freqs=build_freqs(X_train, Y_train)
print(freqs)

{("here'", 1.0): 21, ('life', 1.0): 41, ('lesson', 1.0): 4, ('teach', 1.0): 2, ('kid', 1.0): 16, ('weekend', 1.0): 55, (':)', 1.0): 2837, ('order', 1.0): 13, ('frightl', 1.0): 1, ('undead', 1.0): 1, ('plush', 1.0): 1, ('cushion', 1.0): 1, (':D', 1.0): 510, ('hope', 1.0): 105, ("they'll", 1.0): 5, ('soon', 1.0): 36, ('xxx', 1.0): 11, ('hi', 1.0): 142, ('gorgeou', 1.0): 8, ('nice', 1.0): 79, ('swim', 1.0): 3, ('si', 1.0): 3, ('get', 1.0): 170, ('birthday', 1.0): 57, ('shout', 1.0): 10, ('pleas', 1.0): 86, ('would', 1.0): 69, ('make', 1.0): 82, ('day', 1.0): 188, ('perfect', 1.0): 19, ('xx', 1.0): 33, ('derek', 1.0): 1, ('could', 1.0): 29, ('tri', 1.0): 33, ('use', 1.0): 39, ('parkshar', 1.0): 1, ('gloucestershir', 1.0): 1, ('onair', 1.0): 1, ('jame', 1.0): 7, ('bay', 1.0): 1, ('hold', 1.0): 4, ('back', 1.0): 78, ('river', 1.0): 3, ('0878 0388', 1.0): 1, ('1033', 1.0): 1, ('0272 3306', 1.0): 1, ('70', 1.0): 5, ('obvious', 1.0): 2, ('better', 1.0): 44, ('tweet', 1.0): 51, (':-)', 1.0): 550

In [12]:
def extract_features(tweet,freqs):
    word=preprocess(tweet)
    x=np.zeros((1,3))
    x[0,0]=1 #bias term
    
    for i in word:
        x[0,1]+=freqs.get((i,1.0),0) #positive label
        
        x[0,2]+=freqs.get((i,0.0),0) #negative label
    
    return x


In [13]:
X = np.zeros((len(X_train), 3))
for i in range(len(X_train)):
    X[i, :]= extract_features(X_train[i], freqs)

# training labels corresponding to X
Y = Y_train

In [14]:
def sigmoid(z): 
    
    # calculate the sigmoid of z
    h = 1 / (1 + np.exp(-z))
    
    
    return h

In [15]:
def gradientDescent(x, y, theta, alpha, num_iters):
    
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))    

        # update the weights theta
        theta = theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    
    J = float(J)
    return J, theta

In [16]:
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.67094970.
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


In [17]:


# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24378735.
The resulting vector of weights is [7e-08, 0.00052394, -0.00055517]


In [18]:
def predict_tweet(tweet, freqs, theta):
   
    x = extract_features(tweet,freqs)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
   
    
    return y_pred

In [19]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'I am in a terrible mood', 'she is crying', 'this is the greatest invention']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.518689
I am bad -> 0.494324
this movie should have been great. -> 0.514532
great -> 0.514541
I am in a terrible mood -> 0.500069
she is crying -> 0.495512
this is the greatest invention -> 0.500262


In [20]:
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, freqs, theta)

array([[0.81550726]])

In [21]:
my_tweet = "I am happy for you"
print(preprocess(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['happi']
[[0.51868919]]
Positive sentiment


In [50]:
def pred_review(review_list):
    pos_reviews=[]
    neg_reviews=[]
    for review in review_list:
        y=predict_tweet(review)
        if y>0.5:
            pos_reviews.append(review)
        else:
            neg_reviews.append(review)
    return pos_reviews, neg_reviews