In [1]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt            # library for visualization
import random
import numpy as np
import pandas as pd
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
import glob

path_train_neg=r'C:\Users\raja4\Downloads\aclImdb_v1.tar\aclImdb\train\neg'
path_train_pos=r'C:\Users\raja4\Downloads\aclImdb_v1.tar\aclImdb\train\pos'
file_list_neg = glob.glob(path_train_neg + "/*.txt")
file_list_pos = glob.glob(path_train_pos + "/*.txt")

In [2]:
neg_reviews=[]
for i in range(0,len(file_list_neg)):
    data=pd.read_table(file_list_neg[i])
    neg_reviews.append(data.columns.tolist()[0])
pos_reviews=[]
for i in range(0,len(file_list_pos)):
    data=pd.read_table(file_list_pos[i])
    pos_reviews.append(data.columns.tolist()[0])
reviews=pos_reviews+neg_reviews

In [3]:
def process_review(review):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    review = re.sub(r'\$\w*', '', str(review))
    review = re.sub(r'<br /><br />+', '', str(review))

    # remove hyperlinks
    review = re.sub(r'https?://[^\s\n\r]+', '', str(review))
    #remove suspension points
    review = re.sub("\.", " ", str(review))

    review = re.sub(r'www.[^\s\n\r]+', '', str(review))

    #review2 = re.sub(r'...+', '', review2)
    # remove hashtags
    # only removing the hash # sign from the word
    review = re.sub(r'#', '', str(review))
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    review_tokens = tokenizer.tokenize(review)

    reviews_clean = []
    for word in review_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            reviews_clean.append(stem_word)
    return reviews_clean

In [4]:
# choose the same tweet
review = pos_reviews[0]

print()
print('\033[92m')
print(review)
print('\033[94m')

# call the imported function
review_stem = process_review(review); # Preprocess a given tweet

print('preprocessed review:')
print(review_stem) # Print the result


[92m
Bromwell High is a cartoon comedy.It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
[94m
preprocessed review:
['bromwel', 'high', 'cartoon', 'comedi', 'ran', 'time', 'program', 'school', 'life', 'teacher', '35', 'year', 'teach', 'profess', 'lead', 'believ', 'bromwel', "

In [5]:
def build_freqs(reviews, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each review
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, review in zip(yslist, reviews):
        for word in process_review(review):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1    
    return freqs


In [6]:
labels = np.append(np.ones((len(pos_reviews))), np.zeros((len(neg_reviews))), axis =0)
freqs = build_freqs(reviews, labels)

In [7]:
#  extract_features
def extract_features(tweet, freqs, process_review=process_review):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_tweet tokenizes, stems, and removes stopwords
    word_l = process_review(tweet)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    ### START CODE HERE ###
    
    # loop through each word in the list of words
    for word in word_l:
        if (word,1.0) in freqs:
        # increment the word count for the positive label 1
            x[0,1] += freqs[(word,1.0)]
        if (word,0.0) in freqs:
        # increment the word count for the negative label 0
            x[0,2] += freqs[(word,0.0)]
        
    ### END CODE HERE ###
    assert(x.shape == (1, 3))
    return x

In [8]:
# UNQ_C1 GRADED FUNCTION: sigmoid
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    
    ### START CODE HERE ###
    # calculate the sigmoid of z
    h = 1/(1+np.exp(-z))
    ### END CODE HERE ###
    
    return h

In [9]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations we want to train our model for
    Output:
        J: the final cost
        theta: our final weight vector
    '''
    ### START CODE HERE ###
    # get 'm', the number of rows in matrix x
    m = len(x)
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J =((-1)/m)*(np.dot(np.transpose(y),np.log(h))+np.dot(np.transpose(1-y),np.log(1-h)))

        # update the weights theta
        theta = theta-(1/m)*alpha*np.dot(np.transpose(x),(h-y))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta


In [40]:
X = np.zeros((len(reviews), 3))
for i in range(len(reviews)):
    X[i, :]= extract_features(reviews[i], freqs)

# training labels corresponding to X
Y = np.reshape(labels,(len(labels),1))


# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 50000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

  J =((-1)/m)*(np.dot(np.transpose(y),np.log(h))+np.dot(np.transpose(1-y),np.log(1-h)))


The cost after training is nan.
The resulting vector of weights is [-6e-08, 0.0011103, -0.00098155]


In [53]:
theta=[-6e-08, 0.0011103, -0.00098155]

In [54]:
def predict_tweet(tweet, freqs, theta):
    '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    ### START CODE HERE ###
    
    # extract the features of the tweet and store it into x
    x =  extract_features(tweet, freqs)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
    ### END CODE HERE ###
    
    return y_pred

In [55]:
for tweet in ['I am happy', 'I am bad', 'this is a bad movie','this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))    

I am happy -> 0.597262
I am bad -> 0.006147
this is a bad movie -> 0.000250
this movie should have been great. -> 0.792008
great -> 0.989489
great great -> 0.999887
great great great -> 0.999999
great great great great -> 1.000000


In [43]:
path_test_neg=r'C:\Users\raja4\Downloads\aclImdb_v1.tar\aclImdb\test\neg'
path_test_pos=r'C:\Users\raja4\Downloads\aclImdb_v1.tar\aclImdb\test\pos'
file_list_neg_test = glob.glob(path_test_neg + "/*.txt")
file_list_pos_test = glob.glob(path_test_pos + "/*.txt")

In [44]:
test_neg_reviews=[]
for i in range(0,len(file_list_neg_test)):
    data=pd.read_table(file_list_neg_test[i])
    test_neg_reviews.append(data.columns.tolist()[0])
test_pos_reviews=[]
for i in range(0,len(file_list_pos_test)):
    data=pd.read_table(file_list_pos_test[i])
    test_pos_reviews.append(data.columns.tolist()[0])

test_x=test_pos_reviews+test_neg_reviews
test_y=np.append(np.ones((len(test_pos_reviews))), np.zeros((len(test_neg_reviews))))

In [45]:
# UNQ_C5 GRADED FUNCTION: test_logistic_regression
def test_logistic_regression(test_x, test_y, freqs, theta, predict_tweet=predict_tweet):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
  
    # the list for storing predictions
    y_hat =[]
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1.0)
        else:
            # append 0 to the list
            y_hat.append(0.0)
    
    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    a=np.reshape(np.asarray(y_hat),(len(np.asarray(y_hat)),1))
    b=np.reshape(test_y,(len(test_y),1))
    accuracy=0
    for i in range(len(a)):
        if a[i]==b[i]:
            accuracy+=1/len(a)
    
    


    ### END CODE HERE ###
    
    return accuracy

In [56]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

KeyboardInterrupt: 

In [17]:
import math
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    loglikelihood = {}
    logprior = 0

    ### START CODE HERE ###

    # calculate V, the number of unique words in the vocabulary
    vocab = set(sum(list(set(freqs.keys())),()))
    vocab.discard(1.0)
    vocab.discard(0.0)
    V=len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:
            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]
        # else, the label is negative
        else:
            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]
    
    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents
    D_pos = list(train_y).count(1)

    # Calculate D_neg, the number of negative documents
    D_neg = list(train_y).count(0)

    # Calculate logprior
    logprior =math.log( D_pos/D_neg)
    
    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        if (word,1.0) in freqs:
            freq_pos = freqs[(word,1.0)] 
        else :
            freq_pos=0
        if (word,0.0) in freqs:
            freq_neg = freqs[(word,0.0)]
        else :
            freq_neg=0
        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos+1)/(N_pos+V)
        p_w_neg = (freq_neg+1)/(N_neg+V)

        # calculate the log likelihood of the word
        loglikelihood[word] = math.log(p_w_pos/p_w_neg)

    ### END CODE HERE ###

    return logprior, loglikelihood

In [18]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    ### START CODE HERE ###
    # process the tweet to get a list of words
    word_l = process_review(tweet)

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    ### END CODE HERE ###

    return p

In [19]:
logprior, loglikelihood = train_naive_bayes(freqs, reviews, labels)
print(logprior)
print(len(loglikelihood))

0.0
76245


In [20]:
my_tweet = 'this movie is interesting'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -0.38796742591768973


In [21]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    ### START CODE HERE ###
    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error=0
    for i in range(len(y_hats)):
        if y_hats[i]!=test_y[i]:
            error +=1
    error=error/len(y_hats)

    # Accuracy is 1 minus the error
    accuracy = 1-error

    ### END CODE HERE ###

    return accuracy

In [22]:
freqs[('inter',0.0)]

6

In [23]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.8236


In [61]:
print('Truth Predicted Tweet')
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ''.join(x)))

Truth Predicted Tweet
1	0.00	I loved this movie from beginning to end.I am a musician and i let drugs get in the way of my some of the things i used to love(skateboarding,drawing) but my friends were always there for me.Music was like my rehab,life support,and my drug.It changed my life.I can totally relate to this movie and i wish there was more i could say.This movie left me speechless to be honest.I just saw it on the Ifc channel.I usually hate having satellite but this was a perk of having satellite.The ifc channel shows some really great movies and without it I never would have found this movie.Im not a big fan of the international films because i find that a lot of the don't do a very good job on translating lines.I mean the obvious language barrier leaves you to just believe thats what they are saying but its not that big of a deal i guess.I almost never got to see this AMAZING movie.Good thing i stayed up for it instead of going to bed..well earlier than usual.lol.I hope you al

In [25]:
def unigram_process(data):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    vectorizer = vectorizer.fit(data)
    return vectorizer
def bigram_process(data):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range=(1,2))
    vectorizer = vectorizer.fit(data)
    return vectorizer
def tfidf_process(data):
    from sklearn.feature_extraction.text import TfidfTransformer 
    transformer = TfidfTransformer()
    transformer = transformer.fit(data)
    return transformer

In [26]:
def stochastic_descent(Xtrain, Ytrain, Xtest):
    from sklearn.linear_model import SGDClassifier 
    clf = SGDClassifier(loss="hinge", penalty="l1", max_iter=10000)
    print ("SGD Fitting")
    clf.fit(Xtrain, Ytrain)
    print ("SGD Predicting")
    Ytest = clf.predict(Xtest)
    return Ytest

'''
ACCURACY finds the accuracy in percentage given the training and test labels 
Ytrain - One set of labels 
Ytest - Other set of labels 
'''
def accuracy(Ytrain, Ytest):
    assert (len(Ytrain)==len(Ytest))
    num =  sum([1 for i, word in enumerate(Ytrain) if Ytest[i]==word])
    n = len(Ytrain)  
    return (num)/n

In [27]:
uni_vectorizer = unigram_process(reviews)
print ("Fitting the unigram model")
Xtrain_uni = uni_vectorizer.transform(reviews)
print ("After fitting ")
Xtest_uni = uni_vectorizer.transform(test_x)

Fitting the unigram model
After fitting 


In [28]:
y_hat=stochastic_descent(Xtrain_uni,labels,Xtest_uni)

SGD Fitting
SGD Predicting


In [29]:
accuracy(y_hat,test_y)

0.85876

In [30]:
bi_vectorizer = bigram_process(reviews)
print ("Fitting the bigram model")
Xtrain_bi = bi_vectorizer.transform(reviews)
print ("After fitting ")
print ("Bigram Model on the Test Data--")
Xtest_bi = bi_vectorizer.transform(test_x)

Fitting the bigram model
After fitting 
Bigram Model on the Test Data--


In [31]:
y_hat_bi = stochastic_descent(Xtrain_bi, labels, Xtest_bi)

SGD Fitting
SGD Predicting


In [32]:
accuracy(y_hat_bi,test_y)

0.8672

In [33]:
uni_tfidf_transformer = tfidf_process(Xtrain_uni)
print ("Fitting the tfidf for unigram model")
Xtrain_tf_uni = uni_tfidf_transformer.transform(Xtrain_uni)
print ("After fitting TFIDF")
Xtest_tf_uni = uni_tfidf_transformer.transform(Xtest_uni)

Fitting the tfidf for unigram model
After fitting TFIDF


In [34]:
y_hat_tf_uni = stochastic_descent(Xtrain_tf_uni, labels, Xtest_tf_uni)

SGD Fitting
SGD Predicting


In [35]:
accuracy(y_hat_tf_uni,test_y)

0.87604

In [36]:
bi_tfidf_transformer = tfidf_process(Xtrain_bi)
print ("Fitting the tfidf for bigram model")
Xtrain_tf_bi = bi_tfidf_transformer.transform(Xtrain_bi)
print ("After fitting TFIDF")
Xtest_tf_bi = bi_tfidf_transformer.transform(Xtest_bi)

Fitting the tfidf for bigram model
After fitting TFIDF


In [37]:
y_hat_tf_bi = stochastic_descent(Xtrain_tf_bi, labels, Xtest_tf_bi)

SGD Fitting
SGD Predicting


In [38]:
accuracy(y_hat_tf_bi,test_y)

0.8652