<a href="https://colab.research.google.com/github/ANR22/New_repo/blob/main/Sentiment_Analysis_Tweets_LR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

In [40]:
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    yslist = np.squeeze(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


In [42]:

all_positive_tweets = twitter_samples.strings("positive_tweets.json")
all_negative_tweets = twitter_samples.strings("negative_tweets.json")

In [43]:
test_pos=all_positive_tweets[4000:]
train_pos = all_positive_tweets[0:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[0:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg
type(train_x)

list

In [44]:
train_y = np.append(np.ones((len(train_pos),1)) , np.zeros((len(train_neg),1)), axis=0)
test_y = np.append(np.ones((len(test_pos),1)) , np.zeros((len(test_neg),1)), axis=0)

In [45]:
print("train_y.shape=",str(train_y.shape))
print("test_y.shape=",str(test_y.shape))

train_y.shape= (8000, 1)
test_y.shape= (2000, 1)


In [46]:
freqs = build_freqs(train_x,train_y)
print("type(freqs) =",str(type(freqs)))
print("len(frqs) =",str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(frqs) = 11346


In [47]:
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    z=np.multiply(z,-1)
    h = 1/(1+np.exp(z))

    return h

In [48]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''
    m = x.shape[0]
    
    for i in range(0, num_iters):
        z = np.dot(x,theta)
        h = sigmoid(z)
        
        # calculate the cost function
        J = (-1./m)*(np.dot(np.transpose(y),np.log(h)) + np.dot(np.transpose(1-y),np.log(1-h)))
        
        # update the weights theta
        theta = theta-(alpha/m)*(np.dot(np.transpose(x),(h-y)))
        
    
    J = float(J)
    return J, theta

In [49]:
def extract_features(tweet, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    word_l = process_tweet(tweet)
    x = np.zeros((1, 3)) 
    x[0,0] = 1 

    for word in word_l:
        
        if (word,1) in freqs:
            x[0,1] += freqs[(word,1)]
        
        if (word,0) in freqs:
            x[0,2] += freqs[(word,0)]
        
    assert(x.shape == (1, 3))
    return x

In [50]:
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)
    
Y = train_y

# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.24216529.
The resulting vector of weights is [7e-08, 0.0005239, -0.00055517]


In [51]:

def predict_tweet(tweet, freqs, theta):
    '''
    Input: 
        tweet: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a tweet being positive or negative
    '''
    x = extract_features(tweet, freqs)
    

    y_pred = sigmoid(np.dot(x,theta))
    
    
    return y_pred

In [52]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

I am happy -> 0.518580
I am bad -> 0.494339
this movie should have been great. -> 0.515331
great -> 0.515464
great great -> 0.530898
great great great -> 0.546273
great great great great -> 0.561561


In [53]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
    
    y_hat = []
    
    for tweet in test_x:
        
        y_pred = predict_tweet(tweet, freqs, theta)
        
        if y_pred > 0.5:
            y_hat.append(1)
        else:
            y_hat.append(0)

    m=test_y.shape[0]
    y_hat=np.array(y_hat)
    y_hat = np.resize(y_hat,(m,1))
    accuracy = (np.sum(y_hat==test_y))/m

    
    return accuracy

In [54]:
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.9950


In [55]:
my_tweet = '"If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one."Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!'
print(process_tweet(my_tweet))
y_hat = predict_tweet(my_tweet, freqs, theta)
print(y_hat)
if y_hat > 0.5:
    print('Positive sentiment')
else: 
    print('Negative sentiment')

['sever', 'dozen', 'sever', 'hundr', 'contact', 'imagin', 'fun', 'send', 'one', 'one', 'tie', 'charger', 'convers', 'last', '45', 'minutes.major', 'problem']
[[0.49513105]]
Negative sentiment
