In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv
/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv


In [2]:

import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import TweetTokenizer
from os import getcwd

In [3]:
# Fetch the NLTK corpora used below: 'twitter_samples' provides the labelled
# tweets loaded in the next cell, and 'stopwords' provides the English
# stop-word list that process_tweet filters on.
nltk.download('twitter_samples')
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Load the raw tweet strings for each sentiment class from the twitter_samples corpus.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [5]:
all_positive_tweets[0]

'#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)'

In [6]:
# Splitting the dataset: the first TRAIN_SIZE tweets of each class are used for
# training and the remainder of each class is held out for testing.
TRAIN_SIZE = 4000

train_pos = all_positive_tweets[:TRAIN_SIZE]
test_pos = all_positive_tweets[TRAIN_SIZE:]
train_neg = all_negative_tweets[:TRAIN_SIZE]
test_neg = all_negative_tweets[TRAIN_SIZE:]

# Positives first, then negatives: the label vectors (train_y / test_y) are
# built in the same order, so the two must stay aligned.
train_x = train_pos + train_neg
test_x = test_pos + test_neg


In [7]:
len(train_x)


8000

In [8]:
len(test_x)

2000

In [9]:
# Label vectors aligned with train_x / test_x: a 1.0 for every positive tweet
# followed by a 0.0 for every negative tweet.
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_y = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])


# Processing the data

In [10]:
import re
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from functools import lru_cache


@lru_cache(maxsize=None)
def _tweet_resources():
    """
    Build the tweet tokenizer and the English stop-word set once and reuse them.

    process_tweet runs once per tweet, so constructing these objects inside it
    would repeat the same work on every call. They are created lazily on the
    first call so that defining the function does not require the NLTK
    'stopwords' corpus to be present yet.

    Output:
        (tokenizer, stop_words): a TweetTokenizer and a frozenset of stop words
    """
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    return tokenizer, frozenset(stopwords.words('english'))


def process_tweet(tweet):
    """
    Preprocesses a tweet by:
    - Lowercasing
    - Removing URLs (anything starting with "http" or "www")
    - Removing handles (@username)
    - Removing ASCII punctuation
    - Tokenizing
    - Removing English stopwords
    - Removing short tokens (length < 2)

    Input:
        tweet: a string containing a tweet
    Output:
        cleaned_words: a list of processed words from the tweet
    """
    tokenizer, stop_words = _tweet_resources()

    # Convert to lowercase
    tweet = tweet.lower()

    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+', '', tweet)

    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove punctuation. This happens before tokenization, so '#' signs and
    # ASCII emoticons such as ':)' are stripped here as well.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Tokenize tweet
    words = tokenizer.tokenize(tweet)

    # Remove stopwords and short tokens (single characters, including lone emoji)
    cleaned_words = [word for word in words if word not in stop_words and len(word) > 1]

    return cleaned_words

# Example usage
tweet = "@user I love NLP! 😍 Check out https://nlp.com"
print(process_tweet(tweet))


['love', 'nlp', 'check']


# Count Tweets

In [11]:
def count_tweets(result, tweets, ys):
    """
    Accumulate (word, label) occurrence counts over a collection of tweets.

    Input:
        result: a dict mapping (word, label) pairs to counts; it is updated in place
        tweets: a list of tweets
        ys: the sentiment label of each tweet, in the same order as tweets
    Output:
        result: the same dict that was passed in, with the new counts added
    """
    for label, text in zip(ys, tweets):
        for token in process_tweet(text):
            key = (token, label)
            # A pair seen for the first time starts from zero.
            result[key] = result.get(key, 0) + 1

    return result

# Model training using Naive Bayes

* First part is to identify the number of classes we have
* Create probability for each class

In [12]:
freqs = count_tweets({}, train_x, train_y)

## Training NB
* Given a freqs dictionary, train_x (a list of tweets) and a train_y (a list of labels for each tweet), implement a naive bayes classifier.
* Calculate V
* Calculate freq of pos and negative
* Using freqs dict we can also compute the total number of pos and neg words
* Using train_y we can compute the total number of documents (D), D_pos and D_neg as well

In [14]:
def train_naive_bayes(freqs, train_x, train_y):
    """
    Fit a two-class Naive Bayes model from word/label counts.

    Input:
        freqs: a dict mapping (word, label) pairs to how often the word appears with that label
        train_x: a list of tweets (not used by the computation itself)
        train_y: a list of labels (0 or 1), one per tweet
    Output:
        logprior: log(number of positive tweets) - log(number of negative tweets)
        loglikelihood: a dict mapping each vocabulary word to
            log P(word | positive) - log P(word | negative), with add-one smoothing
    """
    vocabulary = {word for word, _ in freqs}
    vocab_size = len(vocabulary)

    # Total word occurrences per class; any label greater than 0 counts as positive.
    class_totals = {True: 0, False: 0}
    for (_, label), count in freqs.items():
        class_totals[bool(label > 0)] += count
    total_pos, total_neg = class_totals[True], class_totals[False]

    # Document counts per class give the prior.
    n_docs = len(train_y)
    n_pos_docs = sum(train_y)
    n_neg_docs = n_docs - n_pos_docs
    logprior = np.log(n_pos_docs) - np.log(n_neg_docs)

    def log_ratio(word):
        # Add-one smoothing keeps both probabilities non-zero for words
        # that appear in only one of the two classes.
        p_w_pos = (freqs.get((word, 1), 0) + 1) / (total_pos + vocab_size)
        p_w_neg = (freqs.get((word, 0), 0) + 1) / (total_neg + vocab_size)
        return np.log(p_w_pos) - np.log(p_w_neg)

    loglikelihood = {word: log_ratio(word) for word in vocabulary}

    return logprior, loglikelihood
            

In [15]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
10620


# Testing

In [20]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """
    Score a tweet with the trained Naive Bayes model.

    Input:
        tweet: a string containing a tweet
        logprior: the log prior produced by training
        loglikelihood: a dict mapping words to their log likelihood ratio
    Output:
        the log prior plus the log likelihood of every processed word of the
        tweet that is present in loglikelihood
    """
    score = 0 + logprior
    for token in process_tweet(tweet):
        if token not in loglikelihood:
            # Words that were never seen in training contribute nothing.
            continue
        score += loglikelihood[token]
    return score

In [21]:
my_tweet = 'He laughed.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -1.114712842685476


# Test Naive Bayes to check accuracy

In [23]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naieve_bayes_predict=naive_bayes_predict):
    """
    Measure the accuracy of the Naive Bayes classifier on a labelled test set.

    Input:
        test_x: a list of tweets
        test_y: the true labels (0 or 1), one per tweet, in the same order as test_x
        logprior: the log prior produced by training
        loglikelihood: a dict mapping words to their log likelihood ratio
        naieve_bayes_predict: the scoring function to use, called as
            f(tweet, logprior, loglikelihood). The parameter name keeps its
            original spelling so existing keyword callers continue to work.
    Output:
        accuracy: the fraction of tweets whose predicted label equals the true label
    Raises:
        ValueError: if test_x and test_y have different lengths
    """
    if len(test_x) != len(test_y):
        raise ValueError(
            f'test_x and test_y must have the same length, got {len(test_x)} and {len(test_y)}'
        )

    # Classify every tweet: a score above 0 is predicted positive (1), otherwise negative (0).
    y_hats = []
    for tweet in test_x:
        score = naieve_bayes_predict(tweet, logprior, loglikelihood)
        y_hats.append(1 if score > 0 else 0)

    # With 0/1 labels the mean absolute difference is the misclassification rate.
    error = sum(abs(y_hat - y_true) for y_hat, y_true in zip(y_hats, test_y)) / len(test_y)
    accuracy = 1 - error
    return accuracy

In [24]:
# Run this cell to test your function
# Scores above 0 lean positive and scores below 0 lean negative.
sample_tweets = [
    'I am happy',
    'I am bad',
    'this movie should have been great.',
    'great',
    'great great',
    'great great great',
    'great great great great',
]
# A dedicated loop variable is used so this cell does not overwrite the
# notebook-level `tweet` variable.
for sample in sample_tweets:
    sample_score = naive_bayes_predict(sample, logprior, loglikelihood)
    print(f'{sample} -> {sample_score:.2f}')

I am happy -> 2.05
I am bad -> -1.28
this movie should have been great. -> 2.03
great -> 2.19
great great -> 4.38
great great great -> 6.57
great great great great -> 8.75


# Filtering words by ratio of pos and neg counts
* Some words have more pos counts than others -> "more positive"
* Some words have more neg counts than others -> "more negative"
* The relative positivity or negativity of a word can be determined using the loglikelihood calculations

In [28]:
def get_ratio(freqs, word):
    """
    Look up how often a word appears with each label and compare the two counts.

    Input:
        freqs: a dict mapping (word, label) pairs to counts
        word: the word to look up
    Output:
        a dict with keys 'positive' (count with label 1), 'negative'
        (count with label 0) and 'ratio' ((positive + 1) / (negative + 1))
    """
    n_pos = freqs.get((word, 1), 0)
    n_neg = freqs.get((word, 0), 0)

    return {
        'positive': n_pos,
        'negative': n_neg,
        # Adding 1 to both counts avoids dividing by zero when the word
        # never appears with the negative label.
        'ratio': (n_pos + 1) / (n_neg + 1),
    }

# Get words by threshold
* The function extracts words from freqs that meet the given threshold condition.
* It returns a dictionary containing only words that meet a specified threshold condition.

In [30]:
def get_words_by_threshold(freqs,label,threshold,get_ratio=get_ratio):
    """
    Select the words whose positive/negative ratio passes a threshold.

    Input:
        freqs: a dict mapping (word, label) pairs to counts
        label: 1 to keep words with ratio >= threshold,
               0 to keep words with ratio <= threshold
        threshold: the ratio cut-off to compare against
        get_ratio: a function called as get_ratio(freqs, word) that returns a
            dict containing at least the key 'ratio'
    Output:
        word_list: a dict mapping each selected word to the dict returned by
            get_ratio for it; empty when label is neither 0 nor 1
    """
    word_list = {}
    for word, _ in freqs.keys():
        pos_neg_ratio = get_ratio(freqs, word)
        ratio = pos_neg_ratio['ratio']

        if label == 1 and ratio >= threshold:
            word_list[word] = pos_neg_ratio

        elif label == 0 and ratio <= threshold:
            word_list[word] = pos_neg_ratio
    return word_list

# Predict your own tweet!

In [33]:

my_tweet = 'I am happy because I am moving out :)'

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

0.9353829182269733


In [34]:
my_tweet = 'I am sad because I am sick.'

p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)

-5.800834555010549
