# import nltk
nltk.download('twitter_samples')

In [1]:
from nltk.corpus import twitter_samples

In [2]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [3]:
print([len(positive_tweets), len(negative_tweets)])

[5000, 5000]


In [4]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [5]:
print(tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [6]:
from nltk.tag import pos_tag
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


In [7]:
help(pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None, lang='eng')
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be u

In [8]:
from nltk.stem.wordnet import WordNetLemmatizer

In [9]:
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

print(lemmatize_sentence(tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [10]:
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [11]:
from nltk.corpus import stopwords
# stopwords are meaningless words like 'the' 'a' 'this' etc...
stop_words = stopwords.words('english')

print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [12]:
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

In [13]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [14]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [15]:
#for review_dict in all_pos_words: 
#    print(review_dict)

In [16]:
from nltk import FreqDist

freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [17]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [18]:
for review_dict in negative_tokens_for_model: 
    print(review_dict)

{'hopeless': True, 'tmr': True, ':(': True}
{'everything': True, 'kid': True, 'section': True, 'ikea': True, 'cute': True, 'shame': True, "i'm": True, 'nearly': True, '19': True, '2': True, 'month': True, ':(': True}
{'heart': True, 'slide': True, 'waste': True, 'basket': True, ':(': True}
{'“': True, 'hate': True, 'japanese': True, 'call': True, 'ban': True, ':(': True, '”': True}
{'dang': True, 'start': True, 'next': True, 'week': True, 'work': True, ':(': True}
{'oh': True, 'god': True, 'baby': True, 'face': True, ':(': True}
{'make': True, 'smile': True, ':(': True}
{'work': True, 'neighbour': True, 'motor': True, 'asked': True, 'say': True, 'hat': True, 'update': True, 'search': True, ':(': True}
{':(': True, 'sialan': True}
{'athabasca': True, 'glacier': True, '#1948': True, ':-(': True, '#athabasca': True, '#glacier': True, '#jasper': True, '#jaspernationalpark': True, '#alberta': True, '#explorealberta': True, '…': True}
{'really': True, 'good': True, 'g': True, 'idea': True, "

{'everyone': True, 'go': True, 'womad': True, ':(': True}
{'wut': True, 'hell': True, "can't": True, 'sleep': True, '>:(': True}
{'vicious': True, 'circle': True, ':(': True, 'x': True}
{'surprise': True, ':(': True}
{'win': True, 'ticket': True, 'cody': True, 'simpson': True, 'concert': True, 'singapore': True, 'august': True, '10': True, '#spinorbinmusicxcodysimpson': True, 'pls': True, ':(': True}
{'sorry': True, 'poooo': True, ':(': True}
{'sorry': True, 'na': True, 'bh3s': True, ':(': True}
{'project': True, 'want': True, 'enter': True, '#pitchwars': True, 'may': True, 'work': True, ':(': True, 'first': True, 'chap': True, "mine's": True, 'phone': True, 'call': True, 'transcript': True, '#ugh': True}
{"apma's": True, ':(': True}
{'bad': True, 'shoulder': True, 'start': True, 'hurt': True, 'like': True, 'bitch': True, ':(': True, "we're": True, 'day': True, 'away': True, 'competition': True, 'hope': True, "it'll": True, 'fine': True, 'time': True}
{'okay': True, 'feel': True, 'happ

{'even': True, 'dream': True, 'frank': True, 'ocean': True, 'release': True, 'new': True, 'album': True, 'ffs': True, 'pls': True, ':(': True}
{'snapchat': True, 'lisaherring': True, '19': True, '#snapchat': True, '#kikme': True, '#kikmeboys': True, '#woman': True, '#ebony': True, '#weloveyounamjoon': True, '#sexi': True, ':(': True}
{'know': True, 'give': True, 'white': True, 'dress': True, 'small': True, 'polka': True, 'dot': True, 'away': True, ':(': True, '😭': True, 'sigh': True, 'ndi': True, 'hack': True}
{'miss': True, 'baby': True, ':-(': True}
{"i'm": True, 'already': True, 'bad': True, 'day': True, ':-(': True}
{'yum': True, 'fee': True, '6': True, 'leftover': True, ':(': True}
{'nobody': True, 'tell': True, 'side': True, 'twitter': True, 'cs': True, 'sorry': True, ':(': True}
{'last': True, 'dick': True, 'pic': True, 'get': True, 'awful': True, 'ruin': True, 'walnut': True, 'whip': True, 'life': True, ':-(': True, 'also': True, 'name': True, 'turn': True, 'wife': True, 'follo

{'know': True, '. .': True, "i'm": True, 'sorry': True, 'let': True, 'make': True}
{'getting': True, 'pump': True, ':(': True}
{'capeee': True, ':(': True}
{'money': True, ':(': True}
{'haha': True, 'japan': True, ':(': True, 'sorry': True, 'love': True}
{':(': True}
{'noo': True, "he's": True, 'random': True, 'person': True, 'class': True, ':(': True}
{'young': True, 'outlive': True, '>:(': True}
{'going': True, 'x-ray': True, 'dental': True, 'monday': True, 'check': True, 'spine': True, '...': True, 'nothing': True, 'though': True, 'apart': True, 'pain': True, 'relief': True, ':(': True}
{'popol': True, 'day': True, ':(': True}
{'stomach': True, 'kill': True, ':(': True}
{'aww': True, 'poor': True, 'frog': True, ':(': True}
{'three': True, 'favourite': True, 'picture': True, 'take': True, 'brad': True, ':-(': True, 'miss': True, 'tour': True, 'much': True}
{'looking': True, 'gen.ad': True, 'better': True, 'price': True, 'negotiable': True, ':(': True, 'someone': True, 'help': True, '

{'♛': True, '》': True, 'love': True, 'much': True, 'beli̇eve': True, 'wi̇ll': True, 'follow': True, 'please': True, 'justi̇n': True, ':(': True, 'x15': True, '331': True, 'ｓｅｅ': True, 'ｍｅ': True}
{'looks': True, 'like': True, 'gonna': True, 'long': True, 'night': True, 'good': True, 'thing': True, 'far': True, 'roof': True, ':(': True}
{':(': True, 'get': True, 'u': True, 'nice': True, 'lil': True, 'gf': True}
{'really': True, 'want': True, 'see': True, ':-(': True}
{':(': True, 'wish': True, 'could': True, 'sweet': True}
{'go': True, 'indian': True, 'politics': True, 'every': True, 'one': True, 'blaming': True, '68': True, 'year': True, 'still': True, 'represent': True, 'school': True, 'life': True, ':(': True}
{'corbyn': True, 'must': True, 'understand': True, "labour's": True, 'new': True, 'member': True, 'change': True, "party's": True, 'fortune': True, 'yet': True, 'another': True, 'rant': True, 'woman': True, ':(': True}
{'baby': True, ':(': True}
{'icecream': True, 'leeds': True

{'need': True, 'stop': True, 'hardheaded': True, ':-(': True}
{'lonely': True, 'moment': True, 'text': True, 'message': True, 'get': True, 'day': True, 'cell': True, 'phone': True, 'company': True, ':(': True}
{'call': True, 'baby': True, 'know': True, 'im': True, 'one': True, ':(': True, '—': True, 'eat': True, "zach's": True, 'burger': True, 'xpress': True}
{"that's": True, 'true': True, ':-(': True, 'try': True, 'get': True, 'rid': True, 'katie': True, 'hopkins': True, 'avail': True}
{"i'm": True, 'really': True, ':(': True}
{"can't": True, 'sleep': True, ':(': True}
{'every': True, 'night': True, 'take': True, 'hella': True, 'melatonin': True, 'like': True, '2-4': True, 'hour': True, 'nap': True, "i'm": True, 'wide': True, 'awake': True, 'next': True, 'day': True, ':(': True}
{':(': True, 'determine': True, 'feel': True, 'ill': True, 'tonight': True}
{':(': True}
{'cold': True, 'office': True, 'huhu': True, ':(': True, 'task': True, "i'll": True, '9pm': True, 'hahaah': True}
{'miss

{"i'm": True, 'work': True, 'next': True, 'saturday': True, 'im': True, 'well': True, 'gutted': True, 'want': True, 'go': True, 'pride': True, ':(': True}
{'nope': True, ':(': True, 'hueeeee': True}
{'wrong': True, 'password': True, 'recall': True, 'thingy': True, 'work': True, 'prob': True, ':-(': True}
{'hi': True, 'george': True, "can't": True, 'see': True, 'sell': True, "i'm": True, 'afraid': True, ':(': True, 'sorry': True, 'thanks': True, 'beth': True}
{'chew': True, 'toy': True, 'stella': True, 'come': True, 'go': True, 'lot': True, 'noise': True, 'teeth': True, 'mum': True, 'yell': True, 'throw': True, 'room': True, ':(': True}
{'woke': True, 'bad': True, 'dream': True, 'grabe': True, ':(': True}
{'rejected': True, ':(': True, '#theaccidentalcouple': True, 'ep': True, '15': True}
{':(': True, 'always': True}
{'smooth': True, 'handover': True, ':(': True}
{'srsly': True, 'order': True, 'time': True, ':-(': True}
{'im': True, '24': True, 'hour': True, 'sleep': True, 'sunday': Tru

{'weird': True, ':(': True}
{'sr': True, 'financial': True, 'analyst': True, 'expedia': True, 'inc': True, '#bellevue': True, 'wa': True, '#finance': True, '#expediajobs': True, '#job': True, '#jobs': True, '#hiring': True}
{'seriously': True, ':(': True}
{'tried': True, 'make': True, 'prison': True, 'style': True, 'alcohol': True, 'fail': True, 'badly': True, 'end': True, 'massive': True, 'dose': True, 'shit': True, '...': True, ':(': True}
{'fair': True, 'still': True, 'look': True, 'cute': True, ':(': True}
{'hi': True, 'love': True, 'u': True, ':(': True}
{'overly': True, 'attach': True, ':(': True}
{"m's": True, 'voice': True, ':(': True, 'lord': True, 'help': True}
{'want': True, 'bandana': True, 'bottle': True, 'set': True, 'huhuh': True, ':(': True}
{"can't": True, 'sleep': True, 'bc': True, 'heartburn': True, ':-(': True}
{'meet': True, 'steven': True, 'kid': True, 'listen': True, ':(': True}
{"i'm": True, 'fuck': True, 'jealous': True, 'everyone': True, 'holiday': True, ':(':

In [19]:
import random

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]#

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [21]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0
Most Informative Features
None


In [22]:
from nltk.tokenize import word_tokenize

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
custom_tweet2 = 'Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'

custom_tokens = remove_noise(word_tokenize(custom_tweet))
custom_tokens2 = remove_noise(word_tokenize(custom_tweet2))


print(classifier.classify(dict([token, True] for token in custom_tokens)))
print(classifier.classify(dict([token, True] for token in custom_tokens2)))

Positive
Positive
