## First, download necessary staff.

In [1]:
import nltk
nltk.download('twitter_samples')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/pasha/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/pasha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/pasha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/pasha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Import necessary modules

In [2]:
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re, string
from nltk import FreqDist
import random
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

## Get the actual string content of those tweets

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

## Tokenize tweets

In [4]:
positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')

## Let's write a function that will preprocess our tweets.

In [5]:
def preprocessing_tweets(text_tokens, del_hashtags_=True, lematize_text_=True, del_stop_words=True):
    if del_hashtags_:
            text_tokens = del_hashtags(text_tokens)
    if lematize_text:
        text_tokens = lematize_text(text_tokens)
    if del_stop_words:
        text_tokens = del_stopwords(text_tokens)
        
    return text_tokens


def del_hashtags(text_tokens):
    tokens = []
    for token_ in text_tokens:
        if "#" not in token_:
            tokens.append(token_)
    return tokens


def lematize_text(text_tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(text_tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence


def del_stopwords(text_tokens):
    cleaned_tokens = []
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    
    for token, tag in pos_tag(text_tokens):
        if (re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', token) or 
            re.search(r'(@[A-Za-z0-9_]+)', token)):
            continue
        
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens


## Let's test `preprocessing_tweets`

In [6]:
print("Before:", positive_tweets_tokens[50])
print("After:", preprocessing_tweets(positive_tweets_tokens[50]))

Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


## Run `preprocessing_tweets` on all positive/negative tokens.

In [7]:
positive_cleaned_tokens_list = [preprocessing_tweets(tokens) for tokens in positive_tweets_tokens]
negative_cleaned_tokens_list = [preprocessing_tweets(tokens) for tokens in negative_tweets_tokens]

## Let's see how did the processing go.

In [8]:
print(positive_tweets_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', ':d']


## Let's see what is most common there. Add a helper function `get_all_words`:

In [9]:
def get_all_words(cleaned_tokens_list):
    return [w for tokens in cleaned_tokens_list for w in tokens]

In [10]:
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_pos_words[:20]

['top',
 'engage',
 'member',
 'community',
 'week',
 ':)',
 'hey',
 'james',
 'odd',
 ':/',
 'please',
 'call',
 'contact',
 'centre',
 '02392441234',
 'able',
 'assist',
 ':)',
 'many',
 'thanks']

## Perform frequency analysis using `FreqDist`:

In [11]:
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_pos.most_common(10)

[(':)', 3691),
 (':-)', 701),
 (':d', 658),
 ('thanks', 388),
 ('follow', 357),
 ('love', 333),
 ('...', 290),
 ('good', 283),
 ('get', 264),
 ('thank', 253)]

## Fine. Now we'll convert these to a data structure usable for NLTK's naive Bayes classifier ([docs here](https://www.nltk.org/_modules/nltk/classify/naivebayes.html)):

In [12]:
def get_token_dict(tokens):
    return dict([token, True] for token in tokens)
    
def get_tweets_for_model(cleaned_tokens_list):   
    return [get_token_dict(tweet_tokens) for tweet_tokens in cleaned_tokens_list]

In [13]:
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

## Create two datasets for positive and negative tweets. Use 7000/3000 split for train and test data.

In [14]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

random.shuffle(dataset)

train_data = dataset[:7000]
test_data = dataset[7000:]

## Finally we use the nltk's NaiveBayesClassifier on the training data we've just created:

In [15]:
classifier_naive = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier_naive, test_data))

print(classifier_naive.show_most_informative_features(10))

Accuracy is: 0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2069.4 : 1.0
                     sad = True           Negati : Positi =     25.3 : 1.0
                followed = True           Negati : Positi =     23.7 : 1.0
                follower = True           Positi : Negati =     21.8 : 1.0
                     bam = True           Positi : Negati =     21.6 : 1.0
                 welcome = True           Positi : Negati =     20.6 : 1.0
                  arrive = True           Positi : Negati =     19.8 : 1.0
                     x15 = True           Negati : Positi =     15.7 : 1.0
                   didnt = True           Negati : Positi =     14.3 : 1.0
                    glad = True           Positi : Negati =     12.6 : 1.0
None


## Let's check some test phrase. First, download punkt sentence tokenizer ([docs here](https://www.nltk.org/api/nltk.tokenize.punkt.html))

In [18]:
def get_sentiment(text):
    custom_tokens = preprocessing_tweets(word_tokenize(text))
    return classifier_naive.classify(get_token_dict(custom_tokens))

In [19]:
texts = ["bad", "service is bad", "service is really bad", "service is so terrible", "great service", "they stole my money"]
for t in texts:
    print(t, ": ", get_sentiment(t))

bad :  Negative
service is bad :  Negative
service is really bad :  Negative
service is so terrible :  Negative
great service :  Positive
they stole my money :  Negative


## Try to use Logistic Regression classifier instead

In [20]:
classifier_log = SklearnClassifier(LogisticRegression()).train(train_data)

print("LogisticRegression\n\nAccuracy is:", classify.accuracy(classifier_log, test_data))

LogisticRegression

Accuracy is: 0.9943333333333333


## Try to use Decision Tree classifier instead

In [21]:
classifier_tree = SklearnClassifier(tree.DecisionTreeClassifier(max_depth=5)).train(train_data)

print("DecisionTreeClassifier\n\nAccuracy is:", classify.accuracy(classifier_tree, test_data))

DecisionTreeClassifier

Accuracy is: 0.9946666666666667


## Try to use Random Forest classifier instead

In [22]:
classifier_forest = SklearnClassifier(RandomForestClassifier()).train(train_data)

print("DecisionTreeClassifier\n\nAccuracy is:", classify.accuracy(classifier_forest, test_data))

DecisionTreeClassifier

Accuracy is: 0.9953333333333333
