In [285]:
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
from collections import Counter
from sklearn.model_selection import train_test_split

In [286]:
URL = 'https://raw.githubusercontent.com/AliElnafad/Sentiment-Analysis-Task/master/cleaned_dataset%20(1).csv'
data  = pd.read_csv(URL)

In [287]:
data.drop(columns='Unnamed: 0' , axis=1 , inplace = True)

In [288]:
data

Unnamed: 0,text,label
0,always wrote series complete stink fest jim be...,0
1,st watched dir steve purcell typical mary kate...,0
2,movie poorly written directed fell asleep minu...,0
3,interesting thing miryang secret sunshine acto...,1
4,first read berlin meer expect much thought rig...,0
...,...,...
4995,kind picture john lassiter would making today ...,1
4996,must see saw whipped press screening hilarious...,1
4997,nbc ashamed allow child see definitely would t...,0
4998,movie clumsy mishmash various ghost story susp...,0


In [289]:
data.shape

(5000, 2)

In [290]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.20, random_state=42)

In [291]:
print(X_train.shape)
print(X_test.shape)

(4000,)
(1000,)


In [292]:
def sigmoid(z): 
  return 1/( 1 + np.exp(-z))
def sigmoid_prime(z):
  return sigmoid(z) * (1-sigmoid(z))

In [293]:
def get_freq(data):
  posWord =[]
  negWord = []
  for idx in data.index:
    tweet = data.iloc[idx]['text']
    if data.iloc[idx]['label'] == 1:
      posWord.extend(tokenizer.tokenize(tweet))
    else:
      negWord.extend(tokenizer.tokenize(tweet))
  posCounts = Counter(posWord)
  negCounts = Counter(negWord)
  posFreq = {}
  negFreq = {}
  for key , value in posCounts.items():
    posFreq[(key,1.0)] = value
  for key , value in negCounts.items():
    negFreq[(key,0.0)] = value
  posFreq.update(negFreq) #all Freq
  return posFreq

In [294]:
Freqs = get_freq(data)

In [295]:
def extract_features(tweet , Freqs):
  words = tokenizer.tokenize(tweet)
  X = np.zeros((1,3))
  X[0,0] = 1 #bais
  for word in words:
    X[0,1] += Freqs.get((word , 1.0) , 0)
    X[0,2] += Freqs.get((word , 0.0) , 0)
    assert(X.shape == (1, 3))
  return X

In [296]:
emp = []
for tweet in X_train:
  emp.append(extract_features(tweet , Freqs))

In [297]:
train_features = np.array(emp).reshape(len(X_train) , 3)

In [298]:

y_train = np.array(y_train).reshape(len(y_train),1)
y_test = np.array(y_test).reshape(len(y_test),1)

In [299]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    # get 'm', the number of rows in matrix x
    m = x.shape[0]     
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        
        # get the sigmoid of h
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))                                                    

        # update the weights theta
        theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    ### END CODE HERE ###
    J = float(J)
    return J, theta

In [300]:
J, theta = gradientDescent(train_features, y_train,np.zeros((3,1)),  1e-9, 1500)

In [301]:
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 1.19387573.
The resulting vector of weights is [5e-08, 0.00053778, -0.00046854]


In [302]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred

In [303]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    print( '%s -> %f' % (tweet, predict_tweet(tweet, Freqs, theta)))

I am happy -> 0.506528
I am bad -> 0.375015
this movie should have been great. -> 0.510184
great -> 0.612389
great great -> 0.713967
great great great -> 0.797718
great great great great -> 0.861697


In [304]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    
    # the list for storing predictions
    y_hat = []
    
    for tweet in test_x:
        # get the label prediction for the tweet
        y_pred = predict_tweet(tweet, freqs, theta)
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1)
        else:
            # append 0 to the list
            y_hat.append(0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    
    accuracy = (y_hat==np.squeeze(test_y)).sum()/len(test_x)

    return accuracy

In [305]:
tmp_accuracy = test_logistic_regression(X_test, y_test, Freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")

Logistic regression model's accuracy = 0.6070
