# Tweet Sentiment Analysis using Natural Language Toolkit (NLTK)

## Step 1: Import libraries and download the data 

In [1]:
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sota/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/sota/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

## Step 2: Tokenize  

In [4]:
from nltk.corpus import twitter_samples

In [5]:
# Raw tweet text from each of the three sample files, loaded in one pass.
positive_tweets, negative_tweets, text = (
    twitter_samples.strings(fileid)
    for fileid in (
        'positive_tweets.json',
        'negative_tweets.json',
        'tweets.20150430-223406.json',
    )
)
# Tokenized form of the positive tweets; the first one is displayed as a sample.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
tweet_tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

## Step 3: Normalize the data

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/sota/nltk_data...


True

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sota/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [11]:
from nltk.tag import pos_tag

In [12]:
pos_tag(tweet_tokens[0])

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

In [13]:
# Normalize words with lemmatization (reduce each word to its dictionary form).
from nltk.stem.wordnet import WordNetLemmatizer

In [14]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/sota/nltk_data...


True

In [20]:
x,y = pos_tag(tweet_tokens[0])[0]

In [21]:
x

'#FollowFriday'

In [22]:
y

'JJ'

In [23]:
import seaborn as sns

In [24]:
df = sns.load_dataset('tips')

In [27]:
male, female = df.groupby('sex')

In [30]:
# Unpacking the two groups into three names fails; catch the error and show
# it so the notebook still runs top to bottom.
try:
    x, male, female = df.groupby('sex')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: not enough values to unpack (expected 3, got 2)

In [28]:
male

('Male',
      total_bill   tip   sex smoker  day    time  size
 1         10.34  1.66  Male     No  Sun  Dinner     3
 2         21.01  3.50  Male     No  Sun  Dinner     3
 3         23.68  3.31  Male     No  Sun  Dinner     2
 5         25.29  4.71  Male     No  Sun  Dinner     4
 6          8.77  2.00  Male     No  Sun  Dinner     2
 ..          ...   ...   ...    ...  ...     ...   ...
 236       12.60  1.00  Male    Yes  Sat  Dinner     2
 237       32.83  1.17  Male    Yes  Sat  Dinner     2
 239       29.03  5.92  Male     No  Sat  Dinner     3
 241       22.67  2.00  Male    Yes  Sat  Dinner     2
 242       17.82  1.75  Male     No  Sat  Dinner     2
 
 [157 rows x 7 columns])

In [29]:
female

('Female',
      total_bill   tip     sex smoker   day    time  size
 0         16.99  1.01  Female     No   Sun  Dinner     2
 4         24.59  3.61  Female     No   Sun  Dinner     4
 11        35.26  5.00  Female     No   Sun  Dinner     4
 14        14.83  3.02  Female     No   Sun  Dinner     2
 16        10.33  1.67  Female     No   Sun  Dinner     3
 ..          ...   ...     ...    ...   ...     ...   ...
 226       10.09  2.00  Female    Yes   Fri   Lunch     2
 229       22.12  2.88  Female    Yes   Sat  Dinner     2
 238       35.83  4.67  Female     No   Sat  Dinner     3
 240       27.18  2.00  Female    Yes   Sat  Dinner     2
 243       18.78  3.00  Female     No  Thur  Dinner     2
 
 [87 rows x 7 columns])

In [None]:
# Each item returned by pos_tag is a (word, tag) pair, so it can be unpacked
# directly in the loop header.
for word, tag in pos_tag(tweet_tokens[0]):
    print(word, tag)

In [16]:
tweet_tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [31]:
def lemmatize_sentence(tokens):
    """Return the WordNet lemma of every token in a tokenized sentence.

    Args:
        tokens: Sequence of token strings; it is passed to ``pos_tag`` to
            obtain a part-of-speech tag for each token.

    Returns:
        list: One lemmatized token per input token, in the original order.
    """
    def wordnet_pos(tag):
        # Noun and verb tags map to their own WordNet category; every other
        # tag is lemmatized as an adjective.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]

In [32]:
print(lemmatize_sentence(tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [33]:
tweet_tokens[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

https://stackoverflow.com/a/67353408

In [13]:
pos_tag(tweet_tokens[0])

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

## Step 4: Removing Noise (Links, Handles, Punctuation and Stop Words)

In [14]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and normalize the tokens of a single tweet.

    Hyperlinks and @-handles are stripped from each token, the remainder is
    lemmatized with WordNet (part of speech chosen from the token's POS tag)
    and lower-cased. A token is dropped when it ends up empty, when it occurs
    inside ``string.punctuation`` (a substring test), or when its lower-cased
    form is one of ``stop_words``.

    Args:
        tweet_tokens: Sequence of token strings for one tweet.
        stop_words: Iterable of lower-case words to discard. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        list: The surviving tokens, lower-cased, in their original order.
    """
    # Raw strings keep the regex escapes \( and \) from being read as
    # (invalid) Python string escapes; the pattern text itself is unchanged.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    handle_pattern = r"(@[A-Za-z0-9_]+)"

    # Built once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # remove hyperlinks
        token = re.sub(url_pattern, '', token)
        # remove twitter handles
        token = re.sub(handle_pattern, '', token)

        # Noun and verb tags get their own WordNet category; everything else
        # is lemmatized as an adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # The length check comes first because '' counts as a substring of
        # any string, including string.punctuation.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vipul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
#remove stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

remove_noise(tweet_tokens[0], stop_words)

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

In [17]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tweet with the same stop-word list. Comprehensions keep the
# loop variable out of the notebook's global namespace.
positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

In [18]:
print(positive_tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [19]:
print(positive_cleaned_tokens_list[0])

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


## Step 5: Determining Word Density

In [20]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten a collection of token lists into a single token stream.

    Args:
        cleaned_tokens_list: Iterable of token sequences, one per tweet.

    Yields:
        Every token of every inner sequence, in the original order.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [21]:
from nltk import FreqDist

In [22]:
# Build the distribution from a fresh generator so this cell can be re-run:
# `all_pos_words` is a one-shot generator and is empty after its first use.
freq_dist_pos = FreqDist(get_all_words(positive_cleaned_tokens_list))
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


## Step 6: Preparing Data for the Model

In [23]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily convert token lists into feature dictionaries.

    Args:
        cleaned_tokens_list: Iterable of token sequences, one per tweet.

    Yields:
        dict: For each tweet, a mapping of every token to ``True``.
    """
    for tokens_of_tweet in cleaned_tokens_list:
        yield dict.fromkeys(tokens_of_tweet, True)

# Materialize the generators into lists: a generator can be consumed only
# once, so re-running a later cell that iterates these would otherwise see
# no tweets at all.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

Splitting the dataset for training and testing the model

In [24]:
import random

# Pair every feature dict with its sentiment label.
positive_dataset = [(tweet_dict, "positive")
                   for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "negative")
                   for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

# Shuffle with a dedicated, seeded generator so the split (and therefore the
# reported accuracy) is the same on every run.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)

# 70:30 train/test split derived from the dataset size. Integer arithmetic
# avoids float rounding; this equals the previous hard-coded 7000 when the
# dataset holds 10,000 tweets.
TRAIN_PERCENT = 70
train_size = len(dataset) * TRAIN_PERCENT // 100
train_data = dataset[:train_size]
test_data = dataset[train_size:]

## Step 7: Building and Testing the Model

### Training

In [27]:
# Train a NaiveBayesClassifier with train() and measure it with classify.accuracy()
from nltk import classify
from nltk import NaiveBayesClassifier

In [28]:
model_nb = NaiveBayesClassifier.train(train_data)

In [33]:
print("Accuracy is ",classify.accuracy(model_nb , test_data))

Accuracy is  0.994


In [34]:
model_nb.show_most_informative_features(n=10)

Most Informative Features
                      :( = True           negati : positi =   2059.5 : 1.0
                      :) = True           positi : negati =   1653.6 : 1.0
                  arrive = True           positi : negati =     35.6 : 1.0
                     sad = True           negati : positi =     22.5 : 1.0
                     bam = True           positi : negati =     19.6 : 1.0
                     x15 = True           negati : positi =     19.1 : 1.0
                 welcome = True           positi : negati =     14.1 : 1.0
              appreciate = True           positi : negati =     13.6 : 1.0
                   great = True           positi : negati =     13.4 : 1.0
                followed = True           negati : positi =     13.0 : 1.0


### Test

<div class="alert alert-info">
To test the model on a custom tweet, write a tweet that expresses a clear sentiment.
    </div>

In [35]:

from nltk import word_tokenize

In [42]:
custom_tweet = "I really happy today, It's not raining ."

In [43]:
# Pass the stop-word list so the custom tweet is cleaned the same way as the
# training tweets.
custom_token = remove_noise(word_tokenize(custom_tweet), stop_words)

In [44]:
model_nb.classify(dict([token, True] for token in custom_token))

'positive'

In [45]:
custom_tweet_2 = "Thank you so much for wishing me happy birthday, I really appreciate you all"

In [None]:
# Pass the stop-word list so the custom tweet is cleaned the same way as the
# training tweets.
custom_token = remove_noise(word_tokenize(custom_tweet_2), stop_words)

In [46]:
model_nb.classify(dict([token, True] for token in custom_token))

'positive'