# Import Library

In [1]:
import nltk
import random
import os
import numpy as np
from nltk.corpus import twitter_samples
from matplotlib import pyplot as plt
from nltk.tokenize import TweetTokenizer
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Load Dataset

In [2]:
# DOWNLOAD DATASET

nltk.download('twitter_samples', download_dir= os.getcwd())

[nltk_data] Downloading package twitter_samples to
[nltk_data]     c:\Tensorflow\NLP\NLP Twitter...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
import os

# Print the contents of the directory
print(os.listdir(r'C:\Tensorflow\NLP\NLP Twitter\corpora\twitter_samples'))

['negative_tweets.json', 'positive_tweets.json', 'README.md', 'README.txt', 'tweets.20150430-223406.json']


In [4]:
# LOAD DATASET 

current_directory = os.getcwd()

# MOVE NLTK TO THAT PATH
nltk.data.path.append(r'C:\Tensorflow\NLP\NLP Twitter')

all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

print(f'Positive Tweets has {len(all_positive_tweets)} samples')
print(f'Negative Tweets has {len(all_negative_tweets)} samples')

Positive Tweets has 5000 samples
Negative Tweets has 5000 samples


###  Display 10 Positive Tweets

In [5]:
# DISPLAY 10 SAMPLES FROM POSITIVE TWEETS

for i in range(0,10):
    print(f'Tweet {i+1} : {all_positive_tweets[i]}')

Tweet 1 : #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet 2 : @Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
Tweet 3 : @DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
Tweet 4 : @97sides CONGRATS :)
Tweet 5 : yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days
Tweet 6 : @BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM
Tweet 7 : We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI
Tweet 8 : @Impatientraider On second thought, there‚Äôs just not enough time for a DD :) But new shorts entering system. Sheep must be buying.
Tweet 9 : Jgh , but we have to go to Bayan :D bye
Tweet 10 : As an act of mischi

### Display 10 Negative Tweets

In [6]:
# DISPLAY 10 SAMPLES FROM NEGATIVE TWEETS

i = 0

for tweet in all_negative_tweets:
    if i == 10:
        break

    print(f'Tweet {i+1} : {tweet}')
    i += 1

Tweet 1 : hopeless for tmr :(
Tweet 2 : Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(
Tweet 3 : @Hegelbon That heart sliding into the waste basket. :(
Tweet 4 : ‚Äú@ketchBurning: I hate Japanese call him "bani" :( :(‚Äù

Me too
Tweet 5 : Dang starting next week I have "work" :(
Tweet 6 : oh god, my babies' faces :( https://t.co/9fcwGvaki0
Tweet 7 : @RileyMcDonough make me smile :((
Tweet 8 : @f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln
Tweet 9 : why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"
Tweet 10 : Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #‚Ä¶ http://t.co/dZZdqmf7Cz


# Preprocess Raw Text

## Splitting Data

In [7]:
# SPLIT DATA INTO TRAIN AND TEST

train_data = np.concatenate((all_positive_tweets[:4000], all_negative_tweets[:4000]), axis=0)
test_data  = np.concatenate((all_positive_tweets[4000:5000], all_negative_tweets[4000:5000]), axis=0)

len(train_data) , len(test_data) , train_data[:2] , test_data[:2]

(8000,
 2000,
 array(['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',
        '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!'],
       dtype='<U152'),
 array(['Bro:U wan cut hair anot,ur hair long Liao bo\nMe:since ord liao,take it easy lor treat as save $ leave it longer :)\nBro:LOL Sibei xialan',
        "@heyclaireee is back! thnx God!!! i'm so happy :)"], dtype='<U146'))

## Create a Label for every Tweet

In [8]:
# CREATE LABEL . IF THEY POSITIVE TWEET, GIVE THEM 1  .   AND IF THEY NEGATIVE TWEET , GIVE THEM 0

train_positive = all_positive_tweets[:4000]
train_negative = all_negative_tweets[:4000]

test_positive = all_positive_tweets[4000:5000]
test_negative = all_negative_tweets[4000:5000]

#            FILL VALUE 1        shape=(4000,1)          FILL VALUE 0  shape=(4000,1)
train_label = np.append(np.ones( shape=( len(train_positive),1) ), np.zeros( shape=( len(train_negative),1) ), axis=0)

#           FILL VALUE 1         shape=(1000,1)          FILL VALUE 0  shape=(1000,1)
test_label  = np.append(np.ones( shape=( len(test_positive),1) ) , np.zeros( shape=( len(test_negative),1) ), axis=0)


# DEBUGGING
len(train_label) , len(test_label) , train_label , test_label

(8000,
 2000,
 array([[1.],
        [1.],
        [1.],
        ...,
        [0.],
        [0.],
        [0.]]),
 array([[1.],
        [1.],
        [1.],
        ...,
        [0.],
        [0.],
        [0.]]))

In [None]:
# VISUALIZE DATASET TWEETS USING DATAFRAME 

import pandas as pd

data = {
    'Tweets': train_data,
    'Label' : train_label.reshape(-1)
}

df = pd.DataFrame(data)

df

Unnamed: 0,Tweets,Label
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1.0
1,@Lamb2ja Hey James! How odd :/ Please call our...,1.0
2,@DespiteOfficial we had a listen last night :)...,1.0
3,@97sides CONGRATS :),1.0
4,yeaaaah yippppy!!! my accnt verified rqst has...,1.0
...,...,...
7995,Amelia didnt stalk my twitter :(,0.0
7996,"oh, i missed the broadcast. : (",0.0
7997,i really can't stream on melon i feel useless :-(,0.0
7998,I need to stop looking at old soccer pictures :(,0.0


## Text Cleaning

In [10]:
# CLEANING TWEETS

import re  # USING REGEX TO CLEANING TEXT

def TextCleaning(tweet):

    if not isinstance(tweet, str):
        raise TypeError(f"Expected string, got {type(tweet)}")
    
    # REMOVE HYPERLINK
    cleaning_tweets = re.sub(pattern=r'https?://[^\n\r\s]+', repl= '', string= tweet)

    # REMOVE USENAME
    cleaning_tweets = re.sub(pattern=r'@\w+', repl='', string= cleaning_tweets)

    # REMOVE HASHTAGS
    cleaning_tweets = re.sub(pattern=r'#', repl='', string= cleaning_tweets)

    return cleaning_tweets

#### Display Function

In [None]:
# DISPLAY TWEETS AFTER CLEANED

# BEFORE CLEANING
print('Before Clean :')
for tweet in train_data[:5]:
    print(tweet)

i = 0
cleaned_tweets = []

print('\nAfter Clean :')
for tweet in train_data:
    clean_tweet = TextCleaning(tweet)

    # JUST PRINT THE FIRST 5 TWEET
    if i < 5:
        print(clean_tweet)
    i += 1
    cleaned_tweets.append(clean_tweet)

Before Clean :
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days

After Clean :
FollowFriday    for being top engaged members in my community this week :)
 Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
 we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
 CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


## Tokenization

In [12]:
# TRANSFORM TWEET INTO EACH TOKEN

def Tokenization(tweet):

    # DECLARE TOKENIZER
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    # TRANSFORM TO EVERY TWEETS/STRINGS
    tweet_token= tokenizer.tokenize(tweet)

    return tweet_token

In [13]:
# DISPLAY TWEET AFTER TOKENIZATION

print('Tweet After Tokenization :')

i = 0
tweet_tokens = []

for tweet in cleaned_tweets:
    tweet_token = Tokenization(tweet)

    # JUST PRINT THE FIRST 5 TWEET
    if i < 5:
        print(tweet_token)

    i += 1
    tweet_tokens.append(tweet_token)

Tweet After Tokenization :
['followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


## Remove Stop Words and Punctuation

In [14]:
# GET STOPWORDS AND PUNCTUATION

# DOWNLOAD STOPWORDS
nltk.download(info_or_id='stopwords')

stopwords_english = stopwords.words('english')

print(f'size stopwords : {len(stopwords_english)}')
print(f'Stopwords : {stopwords_english[:20]}')  # DISPLAY 20 STOPWORDS

print(f'Punctuation : {string.punctuation}\n')

size stopwords : 179
Stopwords : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']
Punctuation : !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aliff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# REMOVE STOPWORDS AND PUNCTUATION

def remove_StopWords(tweet):

    cleaned_token = []

    # ITERATE EVERY TOKEN IN A TWEET         
    for word in tweet:   

        # IF TOKEN DOESNT CONTAIN STOPWORDS AND PUNCTUATION
        if (word not in stopwords_english and word not in string.punctuation):
            cleaned_token.append(word) # PUSH THAT TOKEN
                

    return cleaned_token

In [16]:
# DISPLAY TWEET AFTER REMOVED STOP WORDS

i = 0
cleaned_tweets = []

print('After Removed Stopwords and Punctuation :')
for tweet in tweet_tokens:
    tweet_clean = remove_StopWords(tweet)

    # JUST PRINT THE FIRST 5 TWEET
    if i < 5:
        print(tweet_clean)

    i += 1
    cleaned_tweets.append(tweet_clean)

After Removed Stopwords and Punctuation :
['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']
['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', '02392441234', 'able', 'assist', ':)', 'many', 'thanks']
['listen', 'last', 'night', ':)', 'bleed', 'amazing', 'track', 'scotland']
['congrats', ':)']
['yeaaah', 'yipppy', 'accnt', 'verified', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profile', ':)', '15', 'days']


## Stemming

In [17]:
# STEMMING USING PORTER STEMMING ALGORITHM


def stemming(tweet):
    stemmer = PorterStemmer()
    
    tweet_stemmed = []  # TO STORE STEMMED TWEET
    
        # ITERATE EACH WORD/TOKEN IN TWEET
    for word in tweet:
        stem_word = stemmer.stem(word)  # STEMMING
        tweet_stemmed.append(stem_word)  # PUSH STEMMED TOKEN

    return tweet_stemmed

In [18]:
# DISPLAY TWEET/TOKEN AFTER STEMMING

i = 0
tweet_stemmed = []

print('Tweet After Stemming : ')

for tweet in cleaned_tweets:
    stem = stemming(tweet)

    # JUST PRINT THE FIRST 5 TWEET
    if i < 5:
        print(stem)

    i += 1
    tweet_stemmed.append(stem)

Tweet After Stemming : 
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
['hey', 'jame', 'odd', ':/', 'pleas', 'call', 'contact', 'centr', '02392441234', 'abl', 'assist', ':)', 'mani', 'thank']
['listen', 'last', 'night', ':)', 'bleed', 'amaz', 'track', 'scotland']
['congrat', ':)']
['yeaaah', 'yipppi', 'accnt', 'verifi', 'rqst', 'succeed', 'got', 'blue', 'tick', 'mark', 'fb', 'profil', ':)', '15', 'day']


## Frequency Each Token

In [19]:
# FREQUENCY DICT 

frequency_token = {}

# ITERATE EVERY LABEL AND TWEETS
for label , tweets in zip(train_label, tweet_stemmed):
    for word in tweets:
        pair = (word, int(label))

        if pair in frequency_token:
            frequency_token[pair] += 1
        else:
            frequency_token[pair] = 1

# DISPLAY RESULT 
len(frequency_token) , frequency_token

  pair = (word, int(label))


(11420,
 {('followfriday', 1): 23,
  ('top', 1): 30,
  ('engag', 1): 7,
  ('member', 1): 14,
  ('commun', 1): 27,
  ('week', 1): 72,
  (':)', 1): 2960,
  ('hey', 1): 60,
  ('jame', 1): 7,
  ('odd', 1): 2,
  (':/', 1): 5,
  ('pleas', 1): 81,
  ('call', 1): 27,
  ('contact', 1): 4,
  ('centr', 1): 1,
  ('02392441234', 1): 1,
  ('abl', 1): 6,
  ('assist', 1): 1,
  ('mani', 1): 28,
  ('thank', 1): 522,
  ('listen', 1): 15,
  ('last', 1): 39,
  ('night', 1): 55,
  ('bleed', 1): 2,
  ('amaz', 1): 41,
  ('track', 1): 5,
  ('scotland', 1): 2,
  ('congrat', 1): 15,
  ('yeaaah', 1): 1,
  ('yipppi', 1): 1,
  ('accnt', 1): 2,
  ('verifi', 1): 2,
  ('rqst', 1): 1,
  ('succeed', 1): 1,
  ('got', 1): 57,
  ('blue', 1): 8,
  ('tick', 1): 1,
  ('mark', 1): 1,
  ('fb', 1): 4,
  ('profil', 1): 2,
  ('15', 1): 4,
  ('day', 1): 187,
  ('one', 1): 92,
  ('irresist', 1): 2,
  ('flipkartfashionfriday', 1): 16,
  ('like', 1): 187,
  ('keep', 1): 55,
  ('love', 1): 336,
  ('custom', 1): 4,
  ('wait', 1): 55,
  

create  function Pipeline for shortcut cleaning text

In [20]:
# Pipeline Function

def tokenizer(tweet):
    cleaning_tweet = TextCleaning(tweet)
    tweet_token    = Tokenization(cleaning_tweet)
    cleaned_token  = remove_StopWords(tweet_token)
    stemmed_token  = stemming(cleaned_token)

    return stemmed_token

# Train Model

create naive bayes with logarithm formula

The naive Bayes logarithm formula consists of 2 components, namely:

<pre>
1. Logprior : log( P(positif) 
                  ____________ 
                  P(Negatif) )

</pre>

<pre>
2. Loglikelihood = log ( P(token | positif)
                       _____________________
                        P(token | negatif) )
                   

</pre>

<pre>
3. Laplacian Correction (Smoothing) :        P(label_i ) + 1
                                      ____________________________
                                      N_label_i + total unique word


</pre>

In [21]:
def train_naive_bayes(frequency_token, labels):

    # DECLARE VARIABLE FOR LOGPRIOR AND LOGLIKELIHOOD
    logprior      = 0
    loglikelihood = {}

    # SELECT UNIQUE TOKEN/WORD
    vocab              = [word for word , _ in frequency_token.keys()]
    total_unique_word = len(vocab)


    # COMPUTE LOGPRIOR
    total_tweet = len(labels)
    total_positive_tweet = total_negative_tweet = 0

    # COUNT HOW MUCH POSITIVE AND NEGATIVE TWEET IN 1 DOCUMENT (DATASET)
    for label in labels:
        if label == 1:
            total_positive_tweet += 1
        else:
            total_negative_tweet += 1
    
    logprior = np.log(total_positive_tweet / total_tweet) - np.log(total_negative_tweet / total_tweet)

    # AFTER COMPUTE LOGPRIOR , NEXT COMPUTE LOGLIKELIHOOD

    # BEFORE THAT , LETS FIND LAPLACIAN SMOOTHING FIRST
    
    vocab = set([word for word, _ in frequency_token.keys()])   # UNIQUE WORD
    total_word = len(vocab)
    
    N_pos = N_neg = 0
    for pair in frequency_token.keys():
        # IF LABEL IS POSITIVE
        if pair[1] > 0:
            N_pos += frequency_token[pair]
        else:
            N_neg += frequency_token[pair]

    # COMPUTE PROBABILITIES EVERY WORD
    for word in vocab:
        total_positive_word = frequency_token.get((word,1), 0)
        total_negative_word = frequency_token.get((word,0), 0)

        # COMPUTE LAPLACIAN 
        laplacian_positive = (total_positive_word + 1) / (N_pos + total_word)
        laplacian_negative = (total_negative_word + 1) / (N_neg + total_word)

        # COMPUTE LOGLIKELIHOOD FOR EVERY WORD 
        loglikelihood[word] = np.log(laplacian_positive) - np.log(laplacian_negative)

    return logprior , loglikelihood


In [22]:

# CLEANING ENTIRE TRAIN DATA TWEETS

logprior , loglikelihood = train_naive_bayes(frequency_token, train_label)

if logprior > 0:
    print(f'there are more Positive Tweets in the dataset, with value logprior {logprior}')
elif logprior < 0:
    print(f'there are more Negative Tweets in the Dataset, with value of logprior {logprior}')
else:
    print(f'positive and negative tweets are balanced, with value of logprior {logprior}')

positive and negative tweets are balanced, with value of logprior 0.0


In [23]:
# CHECK PROBABILITIES EVERY WORD
loglikelihood

{'score': -0.7040528921548113,
 'thread': -0.7040528921548113,
 'üç§': 1.5985322008392355,
 'tf': -1.3972000727147567,
 'mic': -0.7040528921548113,
 'bought': -0.8582035719820684,
 '65': -0.7040528921548113,
 'orlean': 0.6822414689650795,
 'nuclear': -0.7040528921548113,
 'mbasa': -0.7040528921548113,
 '2013': -0.7040528921548113,
 'logic': 0.6822414689650795,
 'whip': -0.010905711594865863,
 'exam': -0.010905711594864087,
 'wacki': 0.6822414689650795,
 'baat': 0.6822414689650795,
 'slovenia': 1.087706577073245,
 'comeback': -0.4163708197030296,
 '{:': -0.7040528921548113,
 'juja': -0.7040528921548113,
 'himseek': -0.7040528921548113,
 'great': 2.129160451901406,
 'üò¢': -1.8026651808229204,
 'confess': 0.6822414689650795,
 'wast': -1.109518000262975,
 'kunoriforceo': 1.7808537576331886,
 'separ': 0.6822414689650795,
 'ikea': -0.7040528921548113,
 'bae': -0.48090934084060066,
 'hiya': 0.6822414689650795,
 'tempt': -1.109518000262975,
 'detail': 0.8000245046214634,
 'ever': -0.2732699

in the probability above, if the value is positive then it is a positive word.
and if the value is negative, then it is a negative word

ok lets predict some tweet using Naive Bayes Logarithm

In [24]:
def predict_tweet(tweet, logprior, loglikelihood):

    # PREPROCESS TEXT
    cleaned_tweet = tokenizer(tweet)

    probability = 0  # TO STORE PROBABILTY OF TWEET BEING POSITIVE OR NEGATIVE

    probability += logprior  # ADD LOGPRIOR TO PROBABILITY

    for word in cleaned_tweet:
        probability += loglikelihood.get(word, 0)

    return probability


In [25]:
# FUNCTION TO CONVER RAW VALUES TO LABELS (POSITIVE OR NEGATIVE)
def isPositive(numeric):
    if numeric >= 0:
        return 'Positive'
    else:
        return 'Negative'

In [35]:
# PREDICT SOME TWEET

word_1 = 'I hate you guys'
word_2 = "Just wanna let this story die and i'll be alright"
word_3 = 'Die with a Smile'
word_4 = 'Thank you for your helping :)'

probs_1 = predict_tweet(word_1, logprior, loglikelihood)
probs_2 = predict_tweet(word_2,logprior, loglikelihood)
probs_3 = predict_tweet(word_3,logprior, loglikelihood)
probs_4 = predict_tweet(word_4,logprior, loglikelihood)

print(f' {word_1} --> {isPositive(probs_1)}')
print(f' {word_2} --> {isPositive(probs_2)}')
print(f' {word_3} --> {isPositive(probs_3)}')
print(f' {word_4} --> {isPositive(probs_4)}')

 I hate you guys --> Negative
 Just wanna let this story die and i'll be alright --> Negative
 Die with a Smile --> Positive
 Thank you for your helping :) --> Positive


now lets test our model accuracy

In [27]:
# TEST MODEL

def test_model(data, label, logprior, loglikelihood):

    # DECLARE SOME VARIABLE
    accuracy = 0  
    y_preds = []

    # ITERATE EVERY TWEET IN DATASET
    for tweet in data:

        # SET PROBABILITY FOR EACH TWEET
        probability = predict_tweet(tweet, logprior, loglikelihood)

        # IF PROBS MORE THAN 0 THEN ITS CATEGORIZED AS POSITIVE WORD , OTHERWISE NEGATIVE.
        if probability > 0:
            y_pred_i = 1
        else:
            y_pred_i = 0

        y_preds.append(y_pred_i)
        
    # COMPUTE LOSS FUNCTION
    loss_function = sum(abs(y_hat - y) for y_hat, y in zip(y_preds, label)) / len(label)

    accuracy = 1 - loss_function
    
    return loss_function, accuracy


In [28]:
loss_function , accuracy = test_model(test_data, test_label, logprior, loglikelihood)

accuracy , loss_function

(array([0.9955]), array([0.0045]))

show positive and negative ratio for each word

## Ratio Word

In [29]:

def get_ratio(word, frequency_token):

    # CREATE RATIO DICTIONARY
    ratio = {'positive': 0,
             'negative': 0,
             'ratio'   : 0.0}
    
    ratio['positive'] = frequency_token[(word, 1)]
    ratio['negative'] = frequency_token[(word, 0)]

    # CALCULATE THE RATIO OF POSITIVE TO NEGATIVE VALUE
    ratio['ratio']    = (ratio['positive'] + 1) / (ratio['negative'] + 1)  # ADD 1 TO AVOID DIVISION BY ZERO
    return ratio


In [30]:
# DISPLAY RESULT

words = ['hi','love','happi','smell','smile','die']

for word in words:
    ratio = get_ratio(word, frequency_token)
    
    print(f'word : {word} --> positive : {ratio["positive"]} , negative : {ratio["negative"]} , ratio : {ratio["ratio"]}')
    print()

word : hi --> positive : 154 , negative : 28 , ratio : 5.344827586206897

word : love --> positive : 336 , negative : 128 , ratio : 2.612403100775194

word : happi --> positive : 162 , negative : 18 , ratio : 8.578947368421053

word : smell --> positive : 2 , negative : 2 , ratio : 1.0

word : smile --> positive : 47 , negative : 9 , ratio : 4.8

word : die --> positive : 8 , negative : 15 , ratio : 0.5625

