In [36]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
import matplotlib.pyplot as plt
from helper_funtion import clean_text,process_tweet,build_freqs
import numpy as np    

In [26]:
twitter_samples

<TwitterCorpusReader in 'C:\\Users\\8086f\\AppData\\Roaming\\nltk_data\\corpora\\twitter_samples'>

In [3]:
# select the set of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [4]:
print(len(all_positive_tweets))
print(len(all_negative_tweets))

5000
5000


In [7]:
# split the data into two pieces, one for training and one for testing (validation set)
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [8]:
for i in range(len(train_x)):
  train_x[i]=clean_text(train_x[i])
for i in range(len(test_x)):
  test_x[i]=clean_text(test_x[i])

In [18]:
pro_train_x=process_tweet(train_x)
pro_test_y=process_tweet(test_x)

In [24]:
freqs=build_freqs(pro_train_x, train_y)
first_key = next(iter(freqs))
print(first_key, freqs[first_key])
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

('followfriday', 1.0) 23
type(freqs) = <class 'dict'>
len(freqs) = 17865


In [37]:
def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    '''
    n = 0  # freqs.get((word, label), 0)

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [39]:
def train_naive_bayes(freqs,train_x,train_y):
    '''here first gonna get prob of pos word and neg word'''


    loglikelihood = {}
    logprior = 0
    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos = N_neg = V_pos = V_neg = 0
    for pair in freqs.keys():
        if pair[1]>0:
            # increment the count of unique positive words by 1
            V_pos+=1
            # increment the number of positive words by the count for this (word,label) pair
            N_pos += freqs[pair]
        else:
            # increment the count of unique negative words by 1
            V_neg += 1

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]

    D = len(train_y)

    D_pos = (len(list(filter(lambda x: x > 0, train_y))))
    D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

     # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)
    for word in vocab:

        #   get the positive and negative frequency of the word
        freq_pos = lookup(freqs,word,1)
        freq_neg = lookup(freqs,word,0)
        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    ### END CODE HERE ###

    return logprior, loglikelihood

In [41]:
logprior, loglikelihood = train_naive_bayes(freqs, pro_train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
15647


In [44]:
print(loglikelihood)

{'sooth': 0.6291833667545874, 'halfway': 0.6291833667545874, 'httpstcoldcxqykwi': 0.6291833667545874, 'ohkaibaek': -0.7571109943653032, 'gemma': 0.6291833667545874, 'obeyervin': -0.7571109943653032, 'slip': 0.6291833667545874, 'genad': -0.7571109943653032, 'koreachol': 0.6291833667545874, 'plain': -0.7571109943653032, 'mentionto': 0.6291833667545874, 'carrot': -0.7571109943653032, 'meal': -1.1625761024734675, 'celestinesofi': 0.6291833667545874, 'tttt': -0.7571109943653032, 'black': -0.8524211741696281, 'checkup': 0.6291833667545874, 'nyc': 0.6291833667545874, 'nicolaneyhaul': 0.6291833667545874, 'dumont': 0.6291833667545874, 'april': -0.06396381380535789, 'map': 1.0346484748627518, 'joe': -0.4694289219135223, 'plan': 0.34150129430280646, 'httptcoyukwgfjm': 0.6291833667545874, 'httpstcoyqkccykq': 0.6291833667545874, 'nikki': 1.0346484748627518, 'ding': -0.7571109943653032, 'pod': -0.7571109943653032, 'infrastructureteachersdoctorsnursesparamedicsampoth': -0.7571109943653032, 'spobabbbi

In [58]:
# def test_the_model(tweet,logprior,loglikelihood):
#     word_l = lookup(tweet)
#     # initialize probability to zero
#     p = 0

#     # add the logprior
#     p += logprior

#     for word in word_l:

#         # check if the word exists in the loglikelihood dictionary
#         if word in loglikelihood:
#             # add the log likelihood of that word to the probability
#             p += loglikelihood[word]


#     return p


In [None]:
# tweet='i deeply dissapointed'
# p = test_the_model(tweet, logprior, loglikelihood)
# print('The expected output is', p)