In [1]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import re, string, random

In [2]:
def remove_noise(twitterTokens, stop_words = ()):
    cleaned_tokens = []
    for token, tag in pos_tag(twitterTokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [3]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

In [4]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

In [5]:
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

In [6]:
stop_words = stopwords.words('english')
positiveTokens = twitter_samples.tokenized('positive_tweets.json')
negativeTokens = twitter_samples.tokenized('negative_tweets.json')

In [7]:
positiveCleanedTokens = []
negativeCleanedTokens = []
for tokens in positiveTokens:
    positiveCleanedTokens.append(remove_noise(tokens, stop_words))
for tokens in negativeTokens:
    negativeCleanedTokens.append(remove_noise(tokens, stop_words))

In [8]:
positiveTokensModel = get_tweets_for_model(positiveCleanedTokens)
negativeTokensModel = get_tweets_for_model(negativeCleanedTokens)

In [9]:
positiveDataset = [(tweet_dict, "Positive") for tweet_dict in positiveTokensModel]
negativeDataset = [(tweet_dict, "Negative") for tweet_dict in negativeTokensModel]

In [10]:
dataset = positiveDataset + negativeDataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy is: 0.9973333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2066.4 : 1.0
                      :) = True           Positi : Negati =    982.4 : 1.0
                follower = True           Positi : Negati =     40.2 : 1.0
                     sad = True           Negati : Positi =     32.3 : 1.0
                 awesome = True           Positi : Negati =     20.3 : 1.0
                    poor = True           Negati : Positi =     19.7 : 1.0
                     bam = True           Positi : Negati =     18.3 : 1.0
               community = True           Positi : Negati =     16.3 : 1.0
                     x15 = True           Negati : Positi =     15.7 : 1.0
              appreciate = True           Positi : Negati =     15.0 : 1.0
None


In [11]:
customTweet = "Worst stock I ever purchased" #add any sentence to check for positive or negative
customTokens = remove_noise(word_tokenize(customTweet))
print(customTweet)
print(" Analysis:",classifier.classify(dict([token, True] for token in customTokens)))

Worst stock I ever purchased
 Analysis: Negative
