In [1]:
import nltk, pickle, pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier,accuracy
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer,WordNetLemmatizer

from string import punctuation
from random import shuffle

In [2]:
stems = PorterStemmer()
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [3]:
def preproc(words):
    tokens = word_tokenize(words)
    
    words = [word.lower() for word in tokens if word.isalpha()]
    words = [word for word in words if word not in stopW]
    
    words = [stems.stem(word) for word in words]
    words = [lemms.lemmatize(word) for word in words]
    
    return words

In [4]:
ds = pd.read_csv("./dataset.csv",encoding="utf-8-sig")

all_w = []
for sent in ds["text"]:
    for word in preproc(sent):
        all_w.append(word)

In [5]:
fd = FreqDist(all_w)
commonW = {word for word in fd.most_common(1000)}

In [9]:
feature_set = []

for txt,lbl in zip(ds["text"],ds["label"]):
    feature = {}
    
    proc_txt = preproc(txt)
    
    for word in proc_txt:
        feature[word] = (word in commonW)
        
    feature_set.append((feature,lbl))

In [10]:
shuffle(feature_set)

idx = int(len(feature_set)*0.7)
trains = feature_set[:idx]
tests = feature_set[idx:]

In [11]:
clf = NaiveBayesClassifier.train(trains)
clf.show_most_informative_features(5)

Most Informative Features
                 terribl = False          negati : positi =     10.1 : 1.0
                    hope = False          negati : positi =      8.2 : 1.0
                 airport = False          positi : negati =      6.9 : 1.0
                 fantast = False          positi : negati =      6.9 : 1.0
                  immedi = False          negati : positi =      6.4 : 1.0


In [12]:
def synanto(words):
    SYN = []
    ANT = []
    
    for syns in wordnet.synsets(words):
        for lemmas in syns.lemmas():
            SYN.append(lemmas.name())
            for ants in lemmas.antonyms():
                ANT.append(ants.name())
    return SYN,ANT

In [35]:
rev = "the book is good"

tok_rev = word_tokenize(rev)
for words in tok_rev:
    syn,ant = synanto(words)
    print(words)
    print(f"{", ".join(syn) if syn else "-"}")
    print(f"{", ".join(ant) if ant else "-"}")
    
    # print(ant)
    print("="*10)

the
-
-
book
book, book, volume, record, record_book, book, script, book, playscript, ledger, leger, account_book, book_of_account, book, book, book, rule_book, Koran, Quran, al-Qur'an, Book, Bible, Christian_Bible, Book, Good_Book, Holy_Scripture, Holy_Writ, Scripture, Word_of_God, Word, book, book, book, reserve, hold, book, book, book
-
is
be, be, be, exist, be, be, equal, be, constitute, represent, make_up, comprise, be, be, follow, embody, be, personify, be, be, live, be, cost, be
differ
good
good, good, goodness, good, goodness, commodity, trade_good, good, good, full, good, good, estimable, good, honorable, respectable, beneficial, good, good, good, just, upright, adept, expert, good, practiced, proficient, skillful, skilful, good, dear, good, near, dependable, good, safe, secure, good, right, ripe, good, well, effective, good, in_effect, in_force, good, good, serious, good, sound, good, salutary, good, honest, good, undecomposed, unspoiled, unspoilt, good, well, good, thoroughl

In [15]:
def tagger(words):
    tok_rev = word_tokenize(words)
    tags = pos_tag(tok_rev)
    return tags

In [16]:
rev = "i like to read books"

tag_rev = tagger(rev)

for i,word in enumerate(tag_rev):
    print(f"{i+1}. {word[0]} => {word[1]}")


1. i => NNS
2. like => VBP
3. to => TO
4. read => VB
5. books => NNS


In [19]:
def analyze(words):
    file = open("./modelt3.pickle","rb")
    classer = pickle.load(file)
    file.close()
    
    cleaned = preproc(words)
    
    res = classer.classify(FreqDist(cleaned))
    
    print(res)


rev = "i am an envoy of god who shall bring destruction upon your lands"
analyze(rev)

positive


In [34]:
test = ds[ds["text"] == "Friendly staff, same starbucks fair you get anywhere else.  Sometimes the lines can get long."]

for lbl in test["label"]:
    print(lbl)

positive
