In [2]:
import nltk, pickle, pandas as pd
from string import punctuation
from random import shuffle
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier,accuracy
from nltk.tag import pos_tag
from nltk.probability import FreqDist

In [3]:
stems = PorterStemmer()
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [4]:
def preprocess(words):
    tokens = word_tokenize(words)
    
    #no sym no num
    words = [word.lower() for word in tokens if word.isalpha()]

    #no stop
    words = [word for word in words if word not in stopW]
    
    #stem & lem
    words = [stems.stem(word)for word in words]
    words = [lemms.lemmatize(word)for word in words]
    
    #return
    return words

In [5]:
dataset = pd.read_csv("./dataset.csv",encoding="utf-8-sig")

all_w = []
for sent in dataset["text"]:
    for word in preprocess(sent):
        all_w.append(word)

In [6]:
fd = FreqDist(all_w)
common_w = {word for word in fd.most_common(1000)}

In [7]:
feature_set = []
for text,category in zip(dataset["text"],dataset["label"]):
    feature = {}
    
    processed_feature = preprocess(text)
    
    for word in processed_feature:
        feature[word] = (word in common_w)
    feature_set.append((feature,category))

In [8]:
shuffle(feature_set)

split_idx = int(len(feature_set)*0.7)
trains = feature_set[:split_idx]
tests = feature_set[split_idx:]

In [9]:
classifier = NaiveBayesClassifier.train(trains)
classifier.show_most_informative_features(5)

Most Informative Features
                  someon = False          negati : positi =     12.5 : 1.0
                 terribl = False          negati : positi =     11.9 : 1.0
                 perfect = False          positi : negati =     10.7 : 1.0
                 horribl = False          negati : positi =      9.3 : 1.0
                    bill = False          negati : positi =      6.8 : 1.0


In [10]:
print (accuracy(classifier,tests))

0.6727272727272727


In [11]:
file = open("modelt3.pickle","wb")
pickle.dump(classifier,file)
file.close()

In [12]:
def synanto(words):
    SYN = []
    ANT = []
    
    for syns in wordnet.synsets(words):
        for lemmas in syns.lemmas():
            SYN.append(lemmas.name())
            for ants in lemmas.antonyms():
                ANT.append(ants.name())
    return SYN,ANT

def tagger(words):
    tokens = word_tokenize(words)
    tags = pos_tag(tokens)
    return tags

In [13]:
rev = "the book is good"

tok_rev = word_tokenize(rev)
for words in tok_rev:
    syn,ant = synanto(words)
    print(words)
    print(f"{", ".join(syn) if syn else "-"}")
    print(f"{", ".join(ant) if ant else "-"}")
    print("="*10)

the
-
-
book
book, book, volume, record, record_book, book, script, book, playscript, ledger, leger, account_book, book_of_account, book, book, book, rule_book, Koran, Quran, al-Qur'an, Book, Bible, Christian_Bible, Book, Good_Book, Holy_Scripture, Holy_Writ, Scripture, Word_of_God, Word, book, book, book, reserve, hold, book, book, book
-
is
be, be, be, exist, be, be, equal, be, constitute, represent, make_up, comprise, be, be, follow, embody, be, personify, be, be, live, be, cost, be
differ
good
good, good, goodness, good, goodness, commodity, trade_good, good, good, full, good, good, estimable, good, honorable, respectable, beneficial, good, good, good, just, upright, adept, expert, good, practiced, proficient, skillful, skilful, good, dear, good, near, dependable, good, safe, secure, good, right, ripe, good, well, effective, good, in_effect, in_force, good, good, serious, good, sound, good, salutary, good, honest, good, undecomposed, unspoiled, unspoilt, good, well, good, thoroughl

In [14]:
rev = "i like to read books"

tag_rev = tagger(rev)

for i,word in enumerate(tag_rev):
    print(f"{i+1}. {word[0]} => {word[1]}")


1. i => NNS
2. like => VBP
3. to => TO
4. read => VB
5. books => NNS


In [15]:
# w count for input

rev = ""

while True:
    rev = input("Enter input ")
    if(len(rev.split()))>2:
        break
    else:
        print("need more words")

In [16]:
def analyze(words):
    cleaned = preprocess(words)
    
    file = open("./modelt3.pickle","rb")
    classer = pickle.load(file)
    file.close()
    
    result = classer.classify(FreqDist(cleaned)) 
    
    print(result)


rev = "The food is so terrible"
analyze(rev)

positive
