<a href="https://colab.research.google.com/github/DavidBrynnHouse/NLP-Logistic-Regression/blob/main/NLP_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [121]:
import numpy as np
import nltk
from nltk.corpus import stopwords        
from nltk.stem import PorterStemmer        
from nltk.tokenize import RegexpTokenizer
import re
from nltk.corpus import twitter_samples
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


nltk.download('stopwords')
nltk.download('twitter_samples')
stops = set(stopwords.words('english'))

pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
m = len(pos_tweets) + len(neg_tweets)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


In [77]:
def preprocess_tweet(tweet):
    """
    attributes:
    tweet: list of strings that represents a single tweet in the dataset
    returns:
    tweet: lowercased and removed stopwords, punctuations, handles, stemmed, and url's
    """

    # instantiate stemmer
    stemmer = PorterStemmer()

    for word in tweet.split(' '):
        # remove stopwords and stem
        if word in stops:
            tweet = tweet.replace(word, '')
        else:
            tweet = tweet.replace(word, stemmer.stem(word))


    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)

    # remove hashtags
    tweet = re.sub(r'#', '', tweet)

    # lowercase tweet
    tweet = str.lower(tweet)

    # instantiate tokenizer class
    tokenizer = RegexpTokenizer(r'\w+')

    # tokenize tweet and remove punctuation
    tweet = tokenizer.tokenize(tweet)

    return tweet

In [78]:
tweet = 'Hello there Young man @How are you doing this morning? so archiving'
print(preprocess_tweet(tweet))

['hello', 'ng', 'man', 'how', 'morning', 'archiv']


In [198]:
def create_vocab_dict(pos_tweets, neg_tweets):
    """
    """
    vocab = {}
    for tweet in pos_tweets:
        for word in preprocess_tweet(tweet):
            if vocab.get(word):
                vocab[word] = (vocab[word][0] + 1, vocab[word][1])
            else:
                vocab[word] = (1, 0)
    for tweet in neg_tweets:
        for word in preprocess_tweet(tweet):
            if vocab.get(word):
                vocab[word] = (vocab[word][0], vocab[word][1] + 1)
            else:
                vocab[word] = (0, 1)
    return vocab
vocab = create_vocab_dict(pos_tweets, neg_tweets)

In [197]:
def embed_tweet(tweet):
    """
    """
    x = np.zeros((1, 3))
    x[0,0] = int(1)
    pos = 0
    neg = 0
    for word in tweet:
        if vocab.get(word):
            pos += int(vocab[word][0])
            neg += int(vocab[word][1])
    x[0,1] = int(pos)
    x[0,2] = int(neg)
    return x
tweet = preprocess_tweet(pos_tweets[100])
print(embed_tweet(tweet))

[[1.000e+00 2.271e+03 2.706e+03]]


In [155]:
X = np.zeros((m, 3))
for i in range(len(pos_tweets)):
    tweet = preprocess_tweet(pos_tweets[i])
    X[i, :]= embed_tweet(tweet)
for j in range(len(neg_tweets)):
    tweet = preprocess_tweet(neg_tweets[j])
    X[len(pos_tweets) + j, :] = embed_tweet(tweet)   
y_pos = np.ones((int(m/2), 1))
y_neg = np.zeros((int(m/2), 1))
y = np.append(y_pos, y_neg, axis=0)

In [186]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [157]:
def sigmoid(z):
    """
    """
    return 1/(1 + np.exp(-z))

In [164]:
def gradientDescent(X, y, theta, alpha, num_iter):
    """
    """
    m = X.shape[0]
    for i in range(num_iter):
        z = np.dot(X, theta)
        h = sigmoid(z)
        J = (-1 / m) * (np.dot(y.T, np.log(h)) + np.dot((1 - y.T), np.log(1 - h)) )
        theta = theta - (alpha / m) * (np.dot(X.T, (h - y)))
    J = float(J)
    return J, theta

In [165]:
J, theta = gradientDescent(X_train, y_train, np.zeros((3, 1)), 1e-9, 1500)
print('cost: ', J)
print('theta: ', theta)

cost:  0.6847755515628218
theta:  [[ 1.09696961e-08]
 [ 5.26825749e-05]
 [-9.61885504e-05]]


In [180]:
def step_function(y_pred):
    if y_pred < 0.5:
        return 'positive'
    else:
        return 'negative'

In [188]:
def predict_tweet(tweet, theta):
    '''
    '''
    # extract the features of the tweet and store it into x
    x = embed_tweet(tweet)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x, theta))
    
    return y_pred

### The following code blocks were taken from the NLP Course by Deeplearning.ai I wanted to use them to check my work.

In [189]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( (tweet, predict_tweet(tweet, theta))) 

('I am happy', array([[0.49535578]]))
('I am bad', array([[0.50120451]]))
('this movie should have been great.', array([[0.40863361]]))
('great', array([[0.49210544]]))
('great great', array([[0.48421482]]))
('great great great', array([[0.47633206]]))
('great great great great', array([[0.46846107]]))


In [192]:
def test_logistic_regression(test_x, test_y, theta, predict_tweet=predict_tweet):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    ### START CODE HERE ###
    
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1.0)
        else:
            # append 0 to the list
            y_hat.append(0.0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    accuracy = np.sum(np.array(y_hat) == test_y.flatten(order='C')) / len(y_hat)
    ### END CODE HERE ###
    
    return accuracy

In [193]:
tmp_accuracy = test_logistic_regression(X_test, y_test, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.5028
