In [4]:
from nltk.corpus import twitter_samples


In [7]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [54]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence



In [17]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [56]:
# Pick a random tweet (tweet_tokens[PICK A NUMBER]) and explain the 
# difference you see in the output between the lemmatize_sentence function 
# and the remove_noise function

print(lemmatize_sentence(tweet_tokens[0]), "\n")
print(remove_noise(tweet_tokens[0]))

# remove_noise will remove useless information like @ people and hyperlink

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)'] 

['#followfriday', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [21]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [22]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#print(remove_noise(tweet_tokens[0], stop_words))

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [53]:
# List 10 proper nouns that exist in the negative tweets data set.
count = 0
for tokens in negative_cleaned_tokens_list:
    # print(tokens)
    tags = pos_tag(tokens)
    for tag in tags:
        if tag[1] == "NN" and count < 10:
            print(tag[0])
            count += 1
        else:
            break



hopeless
tmr
:(
everything
kid
section
heart
slide
waste
basket


In [57]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

In [24]:
from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [58]:
# What are the top 10 most negative words by frequency?

freq_dist_neg = FreqDist(all_neg_words)
print(freq_dist_neg.most_common(10))

[(':(', 4585), (':-(', 501), ("i'm", 343), ('...', 332), ('get', 325), ('miss', 291), ('go', 275), ('please', 275), ('want', 246), ('like', 218)]


In [63]:
# Is “community” a positive or negative word under informative features?

print(freq_dist_pos["community"])
print(freq_dist_neg["community"])

# community appears in both of the group. But it is much more frequent in possitive group

31
1


In [31]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [33]:
import random

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [34]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2083.3 : 1.0
                      :) = True           Positi : Negati =   1641.3 : 1.0
                     sad = True           Negati : Positi =     55.5 : 1.0
                follower = True           Positi : Negati =     38.0 : 1.0
                    poor = True           Negati : Positi =     19.2 : 1.0
                    glad = True           Positi : Negati =     18.2 : 1.0
                     x15 = True           Negati : Positi =     17.2 : 1.0
                followed = True           Negati : Positi =     16.3 : 1.0
               community = True           Positi : Negati =     16.2 : 1.0
                    blog = True           Positi : Negati =     14.9 : 1.0
None


In [35]:
from nltk.tokenize import word_tokenize

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

custom_tokens = remove_noise(word_tokenize(custom_tweet))

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative


In [65]:
Trump_twit = """If the Dow Joans ever falls more than 1000 ‘points’ 
in a Single Day the sitting president should be 'loaded' into a very big cannon and 
Shot into the sun at TREMENDOUS SPEED! No excuses!"""

Trump_twit_tokens = remove_noise(word_tokenize(Trump_twit))

print(classifier.classify(dict([token, True] for token in Trump_twit_tokens)))

# this twitter about trump fighting against coronavirus is possitive

Negative


# What did you learn from this exercise?
    Natual language processing can be really powerful in industry because it help us to identify the sentiment on each statement. However, sometimes it can be undecisive on vague language. For example, the twit above is Trump telling people to stay calm because he wont let Dow Joans to fall more than 1000 points. (Although it already have). But still, this twit is more possitive to me instead of negative predicted by the model. 