In [1]:
# %pip install nltk
# import nltk
# nltk.download()

Read data from txt

In [2]:
positive = open("positive.txt", "r").read()
negative = open("negative.txt", "r").read()

Tokenizing

In [3]:
from nltk.tokenize import word_tokenize

list_words = word_tokenize(positive) + word_tokenize(negative)

Stopwords, Stemming, POS Tagging

In [4]:
from nltk.corpus import stopwords
from string import punctuation

eng_stopwords = stopwords.words('english')

list_words = [word for word in list_words if word not in eng_stopwords]
list_words = [word for word in list_words if word not in punctuation]
list_words = [word for word in list_words if word.isalpha()]

POS Tagging for NER

In [5]:
from nltk.tag import pos_tag
tagged = pos_tag(list_words)

Lemmatizing

In [6]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

list_words = [wordnet_lemmatizer.lemmatize(word) for word in list_words]

Named Entity Recognition (NER) Visualization

In [7]:
from nltk.chunk import ne_chunk
ner = ne_chunk(tagged)
# ner.draw()

Load Corpora

In [8]:
from nltk.corpus import gutenberg
emma = gutenberg.raw('austen-emma.txt')

Wordnet

In [9]:
from nltk.corpus import wordnet

for word in list_words:
    synsets = wordnet.synsets(word)
    if synsets:
        print(f"{word}: {synsets[0].definition()}")

Apple: fruit with red or yellow or green skin and sweet to tart crisp whitish flesh
work: activity directed toward making or doing something
really: in accordance with truth or fact or reality
well: a deep hole or shaft dug or drilled to obtain water or oil or gas or brine
easy: posing no difficulty; requiring little effort
use: the act of using
Galaxy: a splendid assemblage (especially of famous people)
phone: electronic equipment that converts sound into electrical signals that can be transmitted over distances and then converts received signals back into sounds
great: a person who has achieved distinction and honor in some field
screen: a white or silvered surface where pictures can be projected for viewing
good: benefit
camera: equipment for taking photographs (usually consisting of a lightproof box with a lens at one end and light-sensitive film at the other)
phone: electronic equipment that converts sound into electrical signals that can be transmitted over distances and then con

Frequency Distribution to reduce range

In [10]:
from nltk.probability import FreqDist
fdist = FreqDist(list_words)

list_words = [word for word, count in fdist.most_common(1000)]

Labeling

In [11]:
labeled_sentences = [] 
for sentence in positive.split("\n"):
    labeled_sentences.append((sentence, "positive"))

for sentence in negative.split("\n"):
    if len(sentence) == 1:
        continue
    labeled_sentences.append((sentence, "negative"))

Making dataset

In [12]:
dataset = []
for sentence, label in labeled_sentences:
    dict = {}
    words = word_tokenize(sentence)
    for feature in list_words:
        key = feature
        value = feature in words
        dict[key] = value
    dataset.append((dict, label))

import random
random.shuffle(dataset)

training_data = dataset[:int(len(dataset) * 0.7)]
testing_data = dataset[int(len(dataset) * 0.3):]

Import Naive Bayes Algorithms

In [13]:
from nltk.classify import NaiveBayesClassifier, accuracy
classifier = NaiveBayesClassifier.train(training_data)
accuracy(classifier, testing_data)

0.9618768328445748

Making Model

In [14]:
import pickle
file = open("model.pickle", "wb")
pickle.dump(classifier, file)

Using Model

In [15]:
file = open("model.pickle", "rb")
classifier = pickle.load(file)

file.close()

Show most informative features

In [16]:
classifier.show_most_informative_features(5)

Most Informative Features
                   price = True           positi : negati =      3.7 : 1.0
                 durable = True           positi : negati =      3.4 : 1.0
                    many = True           positi : negati =      3.0 : 1.0
                   offer = True           positi : negati =      3.0 : 1.0
                    user = True           negati : positi =      3.0 : 1.0


Testing Data

In [17]:
review = input()
words = word_tokenize(review)
print(classifier.classify(FreqDist(words)))