In [1]:
import nltk, pandas as pd, pickle
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier,accuracy
from nltk.corpus import stopwords, wordnet
from string import punctuation
from random import shuffle

In [2]:
stems = PorterStemmer()
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [3]:
def preproc(words):
    tokens = word_tokenize(words)
    words = [word.lower() for word in tokens if word.isalpha()] 
    words = [word for word in words if word not in stopW]
    words = [stems.stem(word) for word in words]
    words = [lemms.lemmatize(word) for word in words]
    
    return words

In [5]:
ds = pd.read_csv("./dataset.csv",encoding = "utf-8-sig")

all_w = []

for sent in ds["text"]:
    for word in preproc(sent):
        all_w.append(word)

In [6]:
fd = FreqDist(all_w)
common = {word for word in fd.most_common(1000)}

In [7]:
feature_set = []

for txt,lbl in zip(ds["text"],ds["label"]):
    feature = {}
    
    procd = preproc(txt)
    
    for word in procd:
        feature[word] = (word in common)
    
    feature_set.append((feature,lbl))

In [8]:
shuffle(feature_set)

idx = int(len(feature_set)*0.7)
trains = feature_set[:idx]
tests = feature_set[idx:]

In [10]:
clf = NaiveBayesClassifier.train(trains)
clf.show_most_informative_features(5)

Most Informative Features
                 terribl = False          negati : positi =      9.6 : 1.0
                    cold = False          negati : positi =      9.0 : 1.0
                 horribl = False          negati : positi =      8.3 : 1.0
                    okay = False          negati : positi =      8.3 : 1.0
                 perfect = False          positi : negati =      7.8 : 1.0


In [22]:
print(f"{accuracy(clf,tests)*100:.2f} %")

69.09 %


0.69


In [11]:
def synanto(words):
    SYN = []
    ANT = []
    
    for syns in wordnet.synsets(words):
        for lemmas in syns.lemmas():
            SYN.append(lemmas.name())
            for ants in lemmas.antonyms():
                ANT.append(ants.name())
                
    return SYN,ANT

In [13]:
rev = "the book is good"

tok_rev = word_tokenize(rev)

for words in tok_rev:
    syn,ant  = synanto(words)
    print(syn if syn else "-")
    print(syn if syn else "-")
    print("="*10)

-
-
['book', 'book', 'volume', 'record', 'record_book', 'book', 'script', 'book', 'playscript', 'ledger', 'leger', 'account_book', 'book_of_account', 'book', 'book', 'book', 'rule_book', 'Koran', 'Quran', "al-Qur'an", 'Book', 'Bible', 'Christian_Bible', 'Book', 'Good_Book', 'Holy_Scripture', 'Holy_Writ', 'Scripture', 'Word_of_God', 'Word', 'book', 'book', 'book', 'reserve', 'hold', 'book', 'book', 'book']
['book', 'book', 'volume', 'record', 'record_book', 'book', 'script', 'book', 'playscript', 'ledger', 'leger', 'account_book', 'book_of_account', 'book', 'book', 'book', 'rule_book', 'Koran', 'Quran', "al-Qur'an", 'Book', 'Bible', 'Christian_Bible', 'Book', 'Good_Book', 'Holy_Scripture', 'Holy_Writ', 'Scripture', 'Word_of_God', 'Word', 'book', 'book', 'book', 'reserve', 'hold', 'book', 'book', 'book']
['be', 'be', 'be', 'exist', 'be', 'be', 'equal', 'be', 'constitute', 'represent', 'make_up', 'comprise', 'be', 'be', 'follow', 'embody', 'be', 'personify', 'be', 'be', 'live', 'be', 'cos

In [14]:
def tagger(words):
    tokens = word_tokenize(words)
    tags = pos_tag(tokens)
    return tags

In [16]:
tags = tagger(rev)

for i,words in enumerate(tags):
    print(f"{i+1}. {words[0]} {words[1]}")

1. the DT
2. book NN
3. is VBZ
4. good JJ


In [29]:
def analyze(words):
    cleaned = preproc(words)
    
    res = clf.classify(FreqDist(cleaned))
    print(res)

analyze("blud")

negative
