In [1]:
import nltk,pickle,pandas as pd
from string import punctuation
from random import shuffle

from nltk.corpus import stopwords,wordnet
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier,accuracy
from nltk.tag import pos_tag

In [2]:
stems = PorterStemmer() 
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [4]:
def preproc(words):
    tokens = word_tokenize(words)
    words = [word.lower() for word in tokens if word.isalpha()]
    words = [word for word in words if word not in stopW]
    words = [stems.stem(word) for word in words]
    words = [lemms.lemmatize(word) for word in words]
    
    return words

In [6]:
ds = pd.read_csv("./dataset.csv", encoding="utf-8-sig")

allW = []
for sent in ds["text"]:
    for word in preproc(sent):
        allW.append(word)

In [7]:
fd =FreqDist(allW)
commonW = {word for word in fd.most_common(1000)}

In [8]:
feature_set = []

for txt,lbl in zip(ds["text"],ds["label"]):
    feature = {}
    
    clean_txt = preproc(txt)
    
    for word in clean_txt:
        feature[word] = (word in commonW)
    
    feature_set.append((feature,lbl))

In [9]:
shuffle(feature_set)

idx = int(len(feature_set)*0.7)
trains = feature_set[:idx]
tests = feature_set[idx:]

In [11]:
clf = NaiveBayesClassifier.train(trains)
clf.show_most_informative_features(5)

Most Informative Features
                 perfect = False          positi : negati =     11.4 : 1.0
                 terribl = False          negati : positi =     10.0 : 1.0
                 horribl = False          negati : positi =      9.4 : 1.0
                    rude = False          negati : positi =      8.2 : 1.0
                   south = False          positi : negati =      7.7 : 1.0


In [13]:
def synanto(words):
    SYN = []
    ANT = []
    
    for syns in wordnet.synsets(words):
        for lemmas in syns.lemmas():
            SYN.append(lemmas.name())
            for ants in lemmas.antonyms():
                ANT.append(ants.name())
    return SYN, ANT

In [15]:
rev = "the book is good"

tok_rev = word_tokenize(rev)
for words in tok_rev:
    syn,ant = synanto(words)
    print(words)
    print(f"{", ".join(syn) if syn else "-"}")
    print(f"{", ".join(ant) if ant else "-"}")
    
    # print(ant)
    print("="*10)

the
-
-
book
book, book, volume, record, record_book, book, script, book, playscript, ledger, leger, account_book, book_of_account, book, book, book, rule_book, Koran, Quran, al-Qur'an, Book, Bible, Christian_Bible, Book, Good_Book, Holy_Scripture, Holy_Writ, Scripture, Word_of_God, Word, book, book, book, reserve, hold, book, book, book
-
is
be, be, be, exist, be, be, equal, be, constitute, represent, make_up, comprise, be, be, follow, embody, be, personify, be, be, live, be, cost, be
differ
good
good, good, goodness, good, goodness, commodity, trade_good, good, good, full, good, good, estimable, good, honorable, respectable, beneficial, good, good, good, just, upright, adept, expert, good, practiced, proficient, skillful, skilful, good, dear, good, near, dependable, good, safe, secure, good, right, ripe, good, well, effective, good, in_effect, in_force, good, good, serious, good, sound, good, salutary, good, honest, good, undecomposed, unspoiled, unspoilt, good, well, good, thoroughl

In [22]:
def tagger(words):
    tok_rev = word_tokenize(words)
    tags = pos_tag(tok_rev)
    return tags

In [23]:
rev = "i love your voice"

tags = tagger(rev)

for i, word in enumerate(tags):
    print(f"{i+1}. {word[0]} --> {word[1]}")

1. i --> JJ
2. love --> VBP
3. your --> PRP$
4. voice --> NN


In [26]:
def analyze(words):
    cleaned = preproc(words)
    res = clf.classify(FreqDist(cleaned))
    
    print(res)


rev = "i wanna please you"    
analyze(rev)

positive
