In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import nltk,re,string
from nltk.corpus import stopwords, twitter_samples

In [None]:
def process_tweet(tweet):
  """Clean, tokenize and stem a single tweet.

  Input:
    tweet: a string containing the raw tweet text
  Output:
    tweets_clean: a list of stemmed tokens. Stock tickers, a leading "RT",
      hyperlinks and '#' signs are removed before tokenizing; the tokenizer
      lower-cases the text and strips @handles; English stop words and
      punctuation tokens are dropped afterwards.
  """
  stemmer=nltk.PorterStemmer()
  # A set gives constant-time membership tests inside the token filter below.
  stopwords_english=set(stopwords.words('english'))
  tweet = re.sub(r'\$\w*', '', tweet) #'$' followed by word characters (e.g. stock tickers)
  tweet = re.sub(r'^RT[\s]+', '', tweet) #"RT" marker at the very start of the text
  # Remove only the hyperlink itself. The previous pattern used a greedy `.*`,
  # which also deleted every word that followed the link on the same line.
  tweet = re.sub(r'https?://\S+', '', tweet)
  tweet = re.sub(r'#', '', tweet) #drop the '#' sign but keep the hashtag text
  tokenizer=nltk.TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
  tweet_tokens=tokenizer.tokenize(tweet)

  # `word not in string.punctuation` is a substring test against the
  # punctuation string, so multi-character tokens such as ':)' are kept.
  tweets_clean=[
    stemmer.stem(word) #stemming a word
    for word in tweet_tokens
    if word not in stopwords_english and word not in string.punctuation
  ]

  return tweets_clean


In [None]:
def build_freqs(tweets,ys):
  """Count how often each processed word occurs under each sentiment label.

  Input:
    tweets: a list of tweets
    ys: an m x 1 array (or a flat list) with the sentiment label of each tweet as 0 or 1
  Output:
    freqs: a dictionary mapping each (word,sentiment) pair to its frequency
  """
  # ravel (rather than squeeze) always yields a 1-D sequence, so a single
  # tweet/label pair no longer collapses to a bare scalar that zip() cannot iterate.
  yslist=np.ravel(ys).tolist()
  freqs={}
  for y, tweet in zip(yslist,tweets):
    for word in process_tweet(tweet):
      pair=(word,y)
      freqs[pair]=freqs.get(pair,0)+1
  return freqs

In [None]:
# Fetch the NLTK data packages this notebook reads: 'stopwords' (used inside
# process_tweet) and 'twitter_samples' (the positive/negative tweet files loaded later).
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [None]:
# Toy example: run build_freqs on five hand-written tweets with their 0/1 sentiment labels.
tweets=['i am happy','i am tricked','i am sad','i am tired','i am tired']
ys=[1,0,0,0,0]
res=build_freqs(tweets,ys)
# Stop words such as 'i' and 'am' are removed by process_tweet, so only the stemmed last word of each tweet is counted.
print(res)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}


In [None]:
# Load the tweet texts from the two labelled files of the NLTK twitter_samples corpus.
all_positive_tweets=twitter_samples.strings('positive_tweets.json')
all_negative_tweets=twitter_samples.strings('negative_tweets.json')

In [None]:
# Split each class into a training part (the first TRAIN_SIZE tweets) and a
# test part (everything after that).
# Fix: train_neg previously used the slice [4000:], i.e. exactly the same
# tweets as test_neg, so the model was trained on the negative test tweets
# and never saw the first 4000 negative tweets.
# NOTE(review): outputs recorded further down in this notebook were presumably
# produced with the old split - re-run the notebook to refresh them.
TRAIN_SIZE=4000
train_pos=all_positive_tweets[:TRAIN_SIZE]
test_pos=all_positive_tweets[TRAIN_SIZE:]
train_neg=all_negative_tweets[:TRAIN_SIZE]
test_neg=all_negative_tweets[TRAIN_SIZE:]

In [None]:
# Model inputs (raw tweet texts): positive tweets first, then negative tweets.
train_x=train_pos+train_neg #training inputs
test_x=test_pos+test_neg #test inputs, same positive-then-negative order

In [None]:
# Label column vectors matching the order of train_x / test_x:
# a block of ones for the positive tweets followed by a block of zeros for the negative tweets.
train_y=np.concatenate((np.ones((len(train_pos),1)),np.zeros((len(train_neg),1))),axis=0)
test_y=np.concatenate((np.ones((len(test_pos),1)),np.zeros((len(test_neg),1))),axis=0)

In [None]:
freqs=build_freqs(train_x,train_y) #(word, label) -> count dictionary built from the training tweets only

In [None]:
# Quick look at the frequency dictionary: its type and its number of (word, label) keys.
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

type(freqs) = <class 'dict'>
len(freqs) = 8001


In [None]:
#Show one raw training tweet next to the token list process_tweet produces for it
print('This is an example of positive tweet: \n',train_x[22])
print('\n This is an example of processed version of the tweet:\n',process_tweet(train_x[22]))

This is an example of positive tweet: 
 @gculloty87 Yeah I suppose she was lol! Chat in a bit just off out x :))

 This is an example of processed version of the tweet:
 ['yeah', 'suppos', 'lol', 'chat', 'bit', 'x', ':)']


In [None]:
#building the sigmoid function
def sigmoid(z):
  """Return 1/(1+exp(-z)), evaluated element-wise for a scalar or array-like z."""
  denominator=1+np.exp(np.negative(z))
  return 1/denominator

In [None]:
#cost function implementation and gradient descent
def gradientDescent(x,y,theta,alpha,num_iters):
  """Fit logistic-regression weights with batch gradient descent.

  Input:
    x: feature matrix with one row per example (x.shape[0] is the number of examples m)
    y: labels, one row per example (an (m,1) column vector as called in this notebook)
    theta: initial weight vector, multiplied as np.dot(x,theta)
    alpha: learning rate
    num_iters: number of update steps to perform
  Output:
    cost: the cross-entropy cost as a Python float, evaluated with the weights
      used for the last update step (the initial weights when num_iters is 0)
    theta: the weights after num_iters updates
  """
  m=x.shape[0]
  # Predictions for the starting weights; kept as-is when num_iters is 0 so the
  # cost below is always defined (it used to be unbound in that case).
  h=sigmoid(np.dot(x,theta))
  for _ in range(num_iters):
    h=sigmoid(np.dot(x,theta))
    theta=theta-(alpha/m)*np.dot(x.transpose(),(h-y))
  # Only the final cost is returned, so it is computed once after the loop from
  # the last h instead of on every iteration; the returned value is unchanged.
  cost=-1/m*(np.dot(y.transpose(),np.log(h))+np.dot((1-y).transpose(),np.log(1-h)))
  # The dot products leave a 1x1 array; squeeze it first because calling float()
  # on an array with ndim > 0 is deprecated in recent NumPy releases.
  cost=float(np.squeeze(cost))
  return cost,theta

In [None]:
def extract_features(tweet,freqs):
  """Turn one tweet into a (1,3) feature row [bias, positive count sum, negative count sum].

  Input:
    tweet: a string
    freqs: a dictionary mapping (word,label) pairs to counts
  Output:
    x: a (1,3) float array; x[0,0] is the bias 1, x[0,1] sums the counts of the
      tweet's processed words under label 1.0, x[0,2] under label 0.0
  """
  positive_total=0.0
  negative_total=0.0
  for token in process_tweet(tweet):
    # pairs missing from freqs contribute 0
    positive_total=positive_total+freqs.get((token,1.0),0)
    negative_total=negative_total+freqs.get((token,0.0),0)
  x=np.zeros((1,3))
  x[0,:]=[1,positive_total,negative_total] #bias term is set to 1
  assert(x.shape==(1,3))
  return x

In [None]:
#Sanity check: feature row of one training tweet
tmp1=extract_features(train_x[22],freqs)
print(tmp1) #the three numbers printed are [bias, summed positive counts, summed negative counts]
#build_freqs builds a dictionary with (word, label) pairs as keys and the number of times each pair occurs in the training tweets as values
#extract_features sums these values over the tweet's processed words: tmp1[0,1] for label 1 and tmp1[0,2] for label 0

[[1.000e+00 3.006e+03 3.200e+01]]


In [None]:
#Training the model
#Fill the matrix 'x' with one (1,3) feature row per training tweet
x=np.zeros((len(train_x),3))
for i,tweet in enumerate(train_x):
  x[i, :]=extract_features(tweet,freqs)
y=train_y #training labels, row-aligned with x
#Gradient descent from all-zero weights; learning rate 1e-9 and 1500 iterations are fixed, predefined values
J,theta=gradientDescent(x,y,np.zeros((3,1)),1e-9,1500)

In [None]:
def predict_tweet(tweet,freqs,theta):
  """Score a tweet with the logistic-regression weights.

  Input:
    tweet: a string
    freqs: a dictionary mapping each (word,label) tuple to its count
    theta: a (3,1) vector of weights
  Output:
    y_pred: sigmoid of the tweet's feature row times theta, i.e. the model's
      score for the tweet being positive (label 1)
  """
  features=extract_features(tweet,freqs) #(1,3) row: bias, positive sum, negative sum
  return sigmoid(np.dot(features,theta))

In [None]:
def test_logistic_regression(test_x,test_y,freqs,theta):
  """Return the fraction of tweets in test_x whose thresholded prediction equals its label.

  Input:
    test_x: a list of tweets
    test_y: the corresponding labels (squeezed before comparison)
    freqs: a dictionary mapping each (word,label) tuple to its count
    theta: the weight vector passed on to predict_tweet
  Output:
    accuracy: (number of matching predictions) / len(test_x)
  """
  #a score above 0.5 is predicted as 1, anything else as 0
  y_hat=[1 if predict_tweet(tweet,freqs,theta)>0.5 else 0 for tweet in test_x]
  matches=(y_hat==np.squeeze(test_y))
  return matches.sum()/len(test_x)

In [None]:
#Accuracy of the trained weights on test_x / test_y, printed with four decimals
tmp_accuracy=test_logistic_regression(test_x,test_y,freqs,theta)
print(f'Logistic regression models accuracy={tmp_accuracy:.4f}')

Logistic regression models accuracy=0.7780


In [None]:
#predicting with your own tweet
def pre(sentence):
  """Classify the sentiment of a sentence with the notebook-level freqs and theta.

  Input:
    sentence: a string
  Output:
    'positive sentiment' when the model score is above 0.5,
    'neutral sentiment' when it is exactly 0.5 (the sigmoid's midpoint),
    'negative sentiment' otherwise
  """
  #predict_tweet returns a one-element array; reduce it to a plain float before comparing
  yhat=float(np.squeeze(predict_tweet(sentence,freqs,theta)))
  if yhat>0.5:
    return 'positive sentiment'
  #Fix: the neutral point of a sigmoid score is 0.5, not 0 (the old `yhat==0` test
  #could only fire for an extremely negative score)
  elif yhat==0.5:
    return 'neutral sentiment'
  else:
    return 'negative sentiment'

In [None]:
#Try the classifier on a hand-written sentence
my_tweet='It is so hot today but it is the perfect day for a beach party'
res=pre(my_tweet)
print(res)

positive sentiment
