In [1]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Sentiment Analysis in Text with NLTK
URL https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

Sentiment analysis is the process of analyzing large volumes of text to determine whether it expresses a positive sentiment, a negative sentiment or a neutral sentiment.
## Load libraries

In [2]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier

import re, string, random

## Define functions

In [3]:
# cleans tokens in tweets from noise
def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    # assigns a part-of-speech tag to each one token using function pos_tag from NLTK
    for token, tag in pos_tag(tweet_tokens):
        # remove URLs
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        # Remove mentions, such as @username
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        # determine part of speech for lemmatization
        if tag.startswith("NN"):
            pos = 'n' #noun
        elif tag.startswith('VB'):
            pos = 'v' #verb
        else:
            pos = 'a' #adjective

        lemmatizer = WordNetLemmatizer() #lemmatization reduces words to their base form
        token = lemmatizer.lemmatize(token, pos)

        # add cleaned tokens to the list if they are not empty, punctuation, or stopwords
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

# creates a generator for all words from cleaned tokens
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

#  convert each token into a dictionary {word: True}
def get_tweets_for_model(cleaned_tokens_list): 
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

  token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\


## Tokenizing the Data

We load the example strings from the dataset "positive_tweets" and "negatice_tweets" to understand the structure of the tweets

In [4]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

print(positive_tweets[0])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


## Normalizing the Data

We remove stop words, tokenize the tweets into individual words, and prepare the data for further processing, such as cleaning tokens.

In [5]:
stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

print(positive_tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


## Remove Noise
We use our prepared function to remove noise from the tokenized tweets, creating cleaned lists of positive and negative tokens.

In [6]:
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
    
print(positive_cleaned_tokens_list[0])

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


## Determining Word Density
​
Word density reflects the diversity of vocabulary in the dataset. A higher density indicates a more diverse vocabulary (less repetition). A lower density indicates more repetition of the same words.

$$
\text{Word Density} = \frac{\text{Number of Unique Words}}{\text{Total Number of Words}}
$$


In [7]:
all_pos_words = list(get_all_words(positive_cleaned_tokens_list))
all_neg_words = list(get_all_words(negative_cleaned_tokens_list))

# Calculate total and unique word counts
total_pos_words = len(all_pos_words)
unique_pos_words = len(set(all_pos_words))

total_neg_words = len(all_neg_words)
unique_neg_words = len(set(all_neg_words))

# Calculate word density
positive_density = unique_pos_words / total_pos_words
negative_density = unique_neg_words / total_neg_words

print(f"Word Density in Positive Tweets: {positive_density:.4f}")
print(f"Word Density in Negative Tweets: {negative_density:.4f}")

Word Density in Positive Tweets: 0.2154
Word Density in Negative Tweets: 0.2082


## The most popular words

To count the most frequently occurring words, we can use FreqDist. It tells us the frequency distribution of each vocabulary item in the text.


In [8]:
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

print(positive_dataset[0])
print(len(positive_dataset))
print(len(negative_dataset))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 332), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
({'#followfriday': True, 'top': True, 'engage': True, 'member': True, 'community': True, 'week': True, ':)': True}, 'Positive')
5000
5000


## Modeling and Classification

We split dataset into the traning and testing sets. We will use NaiveBayesClassifier. The goal of this model is to learn the relationship between words and sentiment labels by calculating the likelihood of words in each class. For example, if the word “love” appears more often in positive texts, the model associates it with a positive tone.

In [9]:
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9956666666666667
Most Informative Features
                      :) = True           Positi : Negati =   1648.1 : 1.0
                follower = True           Positi : Negati =     41.3 : 1.0
                     sad = True           Negati : Positi =     34.1 : 1.0
                  arrive = True           Positi : Negati =     21.2 : 1.0
                     bam = True           Positi : Negati =     20.8 : 1.0
                    glad = True           Positi : Negati =     18.8 : 1.0
                     x15 = True           Negati : Positi =     16.5 : 1.0
               wonderful = True           Positi : Negati =     13.6 : 1.0
                     idk = True           Negati : Positi =     13.1 : 1.0
              definitely = True           Positi : Negati =     12.2 : 1.0
None


## Use the trained model to predict
Now we can use our trained model to predict whether a sentence has a positive or negative tone, based on its words.

In [10]:
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
print(custom_tweet)

custom_tokens = remove_noise(word_tokenize(custom_tweet))
print(custom_tokens)

print(classifier.classify(dict([token, True] for token in custom_tokens)))

I ordered just once from TerribleCo, they screwed up, never used the app again.
['i', 'order', 'just', 'once', 'from', 'terribleco', 'they', 'screw', 'up', 'never', 'use', 'the', 'app', 'again']
Negative
