In [1]:
# run this cell to import nltk and download the NLTK resources used below
import nltk
from os import getcwd

# 'stopwords' feeds the stop-word filter, 'punkt' / 'punkt_tab' back
# word_tokenize, and 'wordnet' / 'omw-1.4' back WordNetLemmatizer (all three are
# used by preProcessing). Which tokenizer / WordNet package is needed depends on
# the installed NLTK version, so both variants are requested.
for resource in ('twitter_samples', 'stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\teba\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\teba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import numpy as np
import pandas as pd


In [3]:
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

### Prepare the data

In [4]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _preprocessing_resources():
    """Load the English stop-word set and a WordNet lemmatizer once, then reuse them.

    preProcessing is called once per tweet, so building these inside it would
    re-read the stop-word corpus and re-create the lemmatizer on every call.
    """
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preProcessing(text):
    """Turn a raw text string into a list of lemmatized, non-stop-word tokens.

    Steps, in order: lowercase -> remove digits -> remove ASCII punctuation ->
    strip surrounding whitespace -> tokenize -> drop English stop words ->
    lemmatize each remaining token.

    Input:
        text: a string (e.g. one tweet)
    Output:
        a list of token strings
    """
    import re
    import string
    from nltk.tokenize import word_tokenize

    stop_words, lemmatizer = _preprocessing_resources()

    # Convert text to lowercase
    outText = text.lower()

    # Remove numbers
    outText = re.sub(r'\d+', '', outText)

    # Remove punctuation
    outText = outText.translate(str.maketrans("", "", string.punctuation))

    # Remove leading/trailing whitespace
    outText = outText.strip()

    # Tokenize, drop stop words, and lemmatize what is left
    tokens = word_tokenize(outText)
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

### Read the dataset



In [5]:
train_dataset = pd.read_csv("train.csv")  
test_dataset = pd.read_csv("test.csv")  

In [6]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
test_dataset.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [8]:
test_set = test_dataset[["text"]].values

In [9]:
test_set.shape

(3263, 1)

In [10]:
X = train_dataset[["text"]].values
y = train_dataset[["target"]].values

In [11]:
X

array([['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
       ['Forest fire near La Ronge Sask. Canada'],
       ["All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
       ...,
       ['M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'],
       ['Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'],
       ['The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']],
      dtype=object)

In [12]:
y

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [13]:
print((X[1][0]))

Forest fire near La Ronge Sask. Canada


In [14]:
# test the function below
print('This is an example of a sentence: \n', X[0][0])
print('\nThis is an example of the processed version of the tweet: \n', preProcessing(X[0][0]))

This is an example of a sentence: 
 Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all

This is an example of the processed version of the tweet: 
 ['deed', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'u']


## Part 1.1 Implementing your helper functions

To help you train your naive bayes model, you will need to compute a dictionary where the keys are a tuple (word, label) and the values are the corresponding frequency.  Note that the labels we'll use here are 1 for positive and 0 for negative.

You will also implement a lookup helper function that takes in the `freqs` dictionary, a word, and a label (1 or 0) and returns the number of times that word and label tuple appears in the collection of tweets.

For example: given a list of tweets `["i am rather excited", "you are rather happy"]` and the label 1, the function will return a dictionary that contains the following key-value pairs:

{
    ("rather", 1): 2,
    ("happy", 1) : 1, 
    ("excited", 1) : 1
}

- Notice how for each word in the given string, the same label 1 is assigned to each word.
- Notice how the words "i" and "am" are not saved: they are stopwords, so `preProcessing` removes them.
- Notice how the word "rather" appears twice in the list of tweets, and so its count value is 2.

#### Instructions
Create a function `count_sentence` that takes a dictionary, a list of tweets and the list of their labels as input, cleans all the tweets, and returns the updated dictionary.
- The key in the dictionary is a tuple containing the lemmatized word and its class label, e.g. ("happy",1).
- The value is the number of times this word appears in the given collection of tweets (an integer).

In [15]:
def count_sentence(result, tweets, ys):
    '''
    Tally how often each (word, label) pair occurs across the given tweets.

    Input:
        result: the dictionary the counts are accumulated into (updated in place)
        tweets: a list of tweets; each one is cleaned with preProcessing
        ys: a list with the label of each tweet (either 0 or 1)
    Output:
        result: the same dictionary, mapping each (word, label) pair to its frequency
    '''
    for tweet, label in zip(tweets, ys):
        for token in preProcessing(tweet):
            # the key is the (word, label) tuple; an unseen key starts from zero
            key = (token, label)
            result[key] = result.get(key, 0) + 1

    return result

In [16]:
# Testing your function

result = {}
sentence = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_sentence(result, sentence, ys)

{('happy', 1): 1, ('tricked', 0): 1, ('sad', 0): 1, ('tired', 0): 2}

# Part 2: Train your model using Naive Bayes

Naive bayes is an algorithm that could be used for sentiment analysis. It takes a short time to train and also has a short prediction time.

#### So how do you train a Naive Bayes classifier?
- The first part of training a naive bayes classifier is to identify the number of classes that you have.
- You will create a probability for each class.
$P(D_{pos})$ is the probability that the document is positive.
$P(D_{neg})$ is the probability that the document is negative.
Use the formulas as follows and store the values in a dictionary:

$$P(D_{pos}) = \frac{D_{pos}}{D}\tag{1}$$

$$P(D_{neg}) = \frac{D_{neg}}{D}\tag{2}$$

Where $D$ is the total number of documents, or tweets in this case, $D_{pos}$ is the total number of positive tweets and $D_{neg}$ is the total number of negative tweets.

#### Prior and Logprior

The prior probability represents the underlying probability in the target population that a tweet is positive versus negative.  In other words, if we had no specific information and blindly picked a tweet out of the population set, what is the probability that it will be positive versus that it will be negative? That is the "prior".

The prior is the ratio of the probabilities $\frac{P(D_{pos})}{P(D_{neg})}$.
We can take the log of the prior to rescale it, and we'll call this the logprior

$$\text{logprior} = \log \left( \frac{P(D_{pos})}{P(D_{neg})} \right) = \log \left( \frac{D_{pos}}{D_{neg}} \right)$$

Note that $log(\frac{A}{B})$ is the same as $log(A) - log(B)$.  So the logprior can also be calculated as the difference between two logs:

$$\text{logprior} = \log (P(D_{pos})) - \log (P(D_{neg})) = \log (D_{pos}) - \log (D_{neg})\tag{3}$$

#### Log likelihood
To compute the loglikelihood of that very same word, we can implement the following equations:

$$\text{loglikelihood} = \log \left(\frac{P(W_{pos})}{P(W_{neg})} \right)\tag{6}$$

##### Create `freqs` dictionary
- Given your `count_sentence` function, you can compute a dictionary called `freqs` that contains all the frequencies.
- In this `freqs` dictionary, the key is the tuple (word, label)
- The value is the number of times it has appeared.

We will use this dictionary in several parts of this assignment.

In [17]:
y[0][0]

1

In [18]:
def convert_2d_array_to_list(x):
    """Return a flat list holding the first element of every row of `x`."""
    return [x[row][0] for row in range(len(x))]

In [19]:
# Flatten the single-column arrays X and y into plain lists with
# convert_2d_array_to_list instead of repeating its loop here.
new_X = convert_2d_array_to_list(X)
new_y = convert_2d_array_to_list(y)

In [20]:
new_X[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [21]:
new_y[0]

1

In [22]:
# Build the freqs dictionary for later uses
freqs = count_sentence({}, new_X, new_y)

In [23]:
freqs

{('deed', 1): 1,
 ('reason', 1): 8,
 ('earthquake', 1): 47,
 ('may', 1): 50,
 ('allah', 1): 6,
 ('forgive', 1): 1,
 ('u', 1): 107,
 ('forest', 1): 50,
 ('fire', 1): 262,
 ('near', 1): 47,
 ('la', 1): 24,
 ('ronge', 1): 1,
 ('sask', 1): 1,
 ('canada', 1): 10,
 ('resident', 1): 8,
 ('asked', 1): 1,
 ('shelter', 1): 6,
 ('place', 1): 16,
 ('notified', 1): 1,
 ('officer', 1): 31,
 ('evacuation', 1): 42,
 ('order', 1): 21,
 ('expected', 1): 11,
 ('people', 1): 106,
 ('receive', 1): 2,
 ('wildfire', 1): 80,
 ('california', 1): 115,
 ('got', 1): 32,
 ('sent', 1): 4,
 ('photo', 1): 28,
 ('ruby', 1): 1,
 ('alaska', 1): 5,
 ('smoke', 1): 12,
 ('pours', 1): 1,
 ('school', 1): 32,
 ('rockyfire', 1): 4,
 ('update', 1): 33,
 ('hwy', 1): 10,
 ('closed', 1): 17,
 ('direction', 1): 4,
 ('due', 1): 24,
 ('lake', 1): 9,
 ('county', 1): 31,
 ('cafire', 1): 2,
 ('flood', 1): 79,
 ('disaster', 1): 118,
 ('heavy', 1): 18,
 ('rain', 1): 36,
 ('cause', 1): 31,
 ('flash', 1): 17,
 ('flooding', 1): 37,
 ('street

#### Instructions
Given a freqs dictionary, `train_x` (a list of tweets) and a `train_y` (a list of labels for each tweet), implement a naive bayes classifier.

##### Calculate $V$
- You can then compute the number of unique words that appear in the `freqs` dictionary to get your $V$ (you can use the `set` function).

##### Calculate $freq_{pos}$ and $freq_{neg}$
- Using your `freqs` dictionary, you can compute the positive and negative frequency of each word $freq_{pos}$ and $freq_{neg}$.

##### Calculate $N_{pos}$, and $N_{neg}$
- Using `freqs` dictionary, you can also compute the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.

##### Calculate $D$, $D_{pos}$, $D_{neg}$
- Using the `train_y` input list of labels, calculate the number of documents (tweets) $D$, as well as the number of positive documents (tweets) $D_{pos}$ and number of negative documents (tweets) $D_{neg}$.
- Calculate the probability that a document (tweet) is positive $P(D_{pos})$, and the probability that a document (tweet) is negative $P(D_{neg})$

##### Calculate the logprior
- the logprior is $log(D_{pos}) - log(D_{neg})$

##### Calculate log likelihood
- Finally, you can iterate over each word in the vocabulary, use your `lookup` function to get the positive frequencies, $freq_{pos}$, and the negative frequencies, $freq_{neg}$, for that specific word.
- Compute the positive probability of each word $P(W_{pos})$, negative probability of each word $P(W_{neg})$ using equations 4 & 5.

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$

**Note:** We'll use a dictionary to store the log likelihoods for each word.  The key is the word, the value is the log likelihood of that word.

- You can then compute the loglikelihood: $log \left( \frac{P(W_{pos})}{P(W_{neg})} \right)$.

In [24]:
def lookup(freqs , word , label):
    '''
    Input:
        freqs: a dictionary mapping (word, label) tuples to counts
        word: the word to look up
        label: the label paired with the word (0 or 1)
    Output:
        the count stored for (word, label), or 0 if that pair is not in freqs
    '''
    # A direct dictionary lookup replaces scanning every item of freqs.
    return freqs.get((word, label), 0)

In [25]:
# UNQ_C2 GRADED FUNCTION: train_naive_bayes

import math

def train_naive_bayes(freqs, train_x, train_y):
    '''
    Train a two-class Naive Bayes model from (word, label) counts.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not read here; the word counts come from freqs)
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: dictionary from each word to its log likelihood
            (equation 6 above)
    Raises:
        ValueError: if train_y has no positive or no negative label, because
            the log prior is undefined in that case
    '''
    loglikelihood = {}

    # calculate V, the number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # calculate N_pos and N_neg, the total number of word occurrences per class
    # (a label greater than zero counts as positive, anything else as negative)
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # calculate D_pos and D_neg, the number of positive / negative documents
    D_pos = sum(1 for label in train_y if label > 0)
    D_neg = len(train_y) - D_pos

    if D_pos == 0 or D_neg == 0:
        # math.log(0) would fail with an unhelpful "math domain error"
        raise ValueError(
            "train_y must contain at least one positive and one negative label"
        )

    # calculate logprior
    logprior = math.log(D_pos) - math.log(D_neg)

    for word in vocab:
        # positive and negative frequency of the word; direct dictionary lookups
        # keep this loop linear in the vocabulary size
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # probability of the word in each class, with add-one smoothing
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [26]:
# UNQ_C3 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
logprior, loglikelihood = train_naive_bayes(freqs, new_X , new_y)
print(logprior)
print(len(loglikelihood))

-0.28323932289985443
20399


# Part 3: Test your naive bayes

Now that we have the `logprior` and `loglikelihood`, we can test the naive bayes function by making predictions on some tweets!

#### Implement `naive_bayes_predict`
**Instructions**:
Implement the `naive_bayes_predict` function to make predictions on tweets.
* The function takes in the `tweet`, `logprior`, `loglikelihood`.
* It returns a score (the logprior plus the summed loglikelihoods), not a probability: a score above 0 points to the positive class (1), otherwise to the negative class (0).
* For each tweet, sum up loglikelihoods of each word in the tweet.
* Also add the logprior to this sum to get the predicted sentiment of that tweet.

$$ p = logprior + \sum_i^N (loglikelihood_i)$$

#### Note
Note that we calculate the prior from the training data. If the training data were evenly split between positive and negative labels, the ratio of positive to negative would be 1 and the logprior would be 0. Here the classes are not balanced: the logprior computed above is about -0.283, i.e. there are fewer positive than negative tweets.

A logprior of 0.0 would add nothing to the log likelihood. Whenever the data is not perfectly balanced, as here, the logprior is non-zero, so always remember to include it.

In [27]:
# UNQ_C4 GRADED FUNCTION: naive_bayes_predict

def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score a tweet with the trained Naive Bayes parameters.

    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the loglikelihood of every word of the processed tweet
           that is found in the dictionary (a number)
    '''
    # start from zero and add the logprior first
    p = 0
    p += logprior

    # then add the log likelihood of each known word; unknown words add nothing
    for token in preProcessing(tweet):
        if token not in loglikelihood:
            continue
        p += loglikelihood[token]

    return p

In [28]:
# UNQ_C5 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# Experiment with your own tweet.
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -0.28323932289985443


#### Implement test_naive_bayes
**Instructions**:
* Implement `test_naive_bayes` to check the accuracy of your predictions.
* The function takes in your `test_x`, `ids`, `test_y`, `logprior`, and `loglikelihood`.
* It returns the accuracy of your model together with the list of predicted labels.
* First, use the `naive_bayes_predict` function to make predictions for each tweet in `test_x`.

In [95]:
# UNQ_C6 GRADED FUNCTION: test_naive_bayes

def test_naive_bayes(test_x , ids , test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Predict a label for every tweet and score the predictions against test_y.

    Input:
        test_x: A list of tweets
        ids: identifiers of the tweets; accepted for backward compatibility but
            not used by this function
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
        naive_bayes_predict: the scoring function, called as
            naive_bayes_predict(tweet, logprior, loglikelihood)
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
        y_hats: the list of predicted labels (1 or 0), one per tweet in test_x
    Raises:
        ValueError: if test_x is empty, since the accuracy is undefined then
    """
    # the predicted class is 1 when the score is > 0, otherwise 0
    y_hats = [
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ]

    if not y_hats:
        raise ValueError("test_x is empty; cannot compute an accuracy")

    # error is the average of the absolute values of the differences between
    # y_hats and test_y
    total_error = 0
    for i, y_hat in enumerate(y_hats):
        total_error += np.absolute(y_hat - test_y[i])
    error = total_error / len(y_hats)

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy, y_hats

In [30]:
""" Process the test data """
new_test_set = convert_2d_array_to_list(test_set)

In [32]:
sample = pd.read_csv("sample_submission.csv")

In [33]:
sample.shape

(3263, 2)

In [34]:
sample_array = sample.to_numpy()

In [35]:
sample_array[3][0]

9

In [80]:
# first column of every row of the sample submission
ids = [row[0] for row in sample_array]

In [81]:
# second column of every row of the sample submission
new_sample = [row[1] for row in sample_array]

In [86]:
len(ids)

3263

In [96]:
# Predict a label for every test tweet and score the predictions against new_sample.
# NOTE(review): new_sample is the second column of sample_submission.csv, not
# labels known to be correct -- presumably placeholder targets, so this
# "accuracy" should not be read as a real test accuracy; verify before reporting it.
accuracy , y_hats = (test_naive_bayes(new_test_set, ids , new_sample, logprior, loglikelihood))

print("Naive Bayes accuracy = %0.4f" %
      accuracy)

Naive Bayes accuracy = 0.6258


In [98]:
result = np.zeros((len(y_hats) , 2))

In [99]:
result.shape

(3263, 2)

In [100]:
for i in range(len(result)):
    result[i][0] = ids[i]
    result[i][1] = y_hats[i]

In [101]:
result

array([[0.0000e+00, 1.0000e+00],
       [2.0000e+00, 1.0000e+00],
       [3.0000e+00, 1.0000e+00],
       ...,
       [1.0868e+04, 1.0000e+00],
       [1.0874e+04, 1.0000e+00],
       [1.0875e+04, 1.0000e+00]])

In [53]:
y_hats_array = np.array(y_hats)

In [57]:
# -1 lets numpy infer the row count instead of hard-coding the test-set size
new_y_hats = y_hats_array.reshape((-1 , 1))

In [58]:
new_y_hats.shape

(3263, 1)

In [103]:
# Write exactly two integer columns (id, target) and no index column.
# `result` was created with np.zeros, i.e. as floats, so cast to int to avoid
# ids/targets being written as "0.0" / "1.0".
submission = pd.DataFrame(result.astype(int), columns=['id', 'target'])
submission.to_csv('submission.csv', index=False)

In [105]:
print(pd.read_csv('submission.csv'))

         id  target
0         0       1
1         2       1
2         3       1
3         9       1
4        11       1
...     ...     ...
3258  10861       1
3259  10865       1
3260  10868       1
3261  10874       1
3262  10875       1

[3263 rows x 2 columns]
