In [2]:
import nltk

nltk.download([
     "names",
     "stopwords",
     "state_union",
     "twitter_samples",
     "movie_reviews",
     "averaged_perceptron_tagger",
     "vader_lexicon",
     "punkt",
])

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\state_union.zip.
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alexm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nl

True

In [3]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [4]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [16]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
)]
negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
)]

In [17]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

In [20]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = np.mean(compound_scores) + 1
    features["mean_positive"] = np.mean(positive_scores)
    features["wordcount"] = wordcount

    return features

In [21]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [22]:
type(features)

list

In [23]:
features[:100]

[({'mean_compound': 1.0119444444444445,
   'mean_positive': 0.11548148148148146,
   'wordcount': 3},
  'pos'),
 ({'mean_compound': 1.0148181818181818,
   'mean_positive': 0.07952272727272727,
   'wordcount': 0},
  'pos'),
 ({'mean_compound': 1.2126947368421053,
   'mean_positive': 0.16726315789473684,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.090659090909091,
   'mean_positive': 0.09295454545454546,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.0626703703703704,
   'mean_positive': 0.09337037037037037,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.1488941176470588,
   'mean_positive': 0.10123529411764705,
   'wordcount': 18},
  'pos'),
 ({'mean_compound': 1.0525875, 'mean_positive': 0.112625, 'wordcount': 0},
  'pos'),
 ({'mean_compound': 1.175565625,
   'mean_positive': 0.13515624999999998,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.029923076923077,
   'mean_positive': 0.11538461538461539,
   'wordcount': 10},
  'pos'),
 ({'mean_compound': 0.864885714285

In [24]:
train_count = len(features)//4
print(train_count)
features[:train_count]

500


[({'mean_compound': 1.0119444444444445,
   'mean_positive': 0.11548148148148146,
   'wordcount': 3},
  'pos'),
 ({'mean_compound': 1.0148181818181818,
   'mean_positive': 0.07952272727272727,
   'wordcount': 0},
  'pos'),
 ({'mean_compound': 1.2126947368421053,
   'mean_positive': 0.16726315789473684,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.090659090909091,
   'mean_positive': 0.09295454545454546,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.0626703703703704,
   'mean_positive': 0.09337037037037037,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.1488941176470588,
   'mean_positive': 0.10123529411764705,
   'wordcount': 18},
  'pos'),
 ({'mean_compound': 1.0525875, 'mean_positive': 0.112625, 'wordcount': 0},
  'pos'),
 ({'mean_compound': 1.175565625,
   'mean_positive': 0.13515624999999998,
   'wordcount': 1},
  'pos'),
 ({'mean_compound': 1.029923076923077,
   'mean_positive': 0.11538461538461539,
   'wordcount': 10},
  'pos'),
 ({'mean_compound': 0.864885714285