In [1]:
import nltk, pickle, pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy 
from nltk.stem import PorterStemmer, WordNetLemmatizer
from random import shuffle

porter = PorterStemmer()
lem = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [2]:
def preprocess(word):
    token = word_tokenize(word)
    filter = [word.lower() for word in token if word.isalpha()]
    
    nostop = [word for word in filter if word not in stop_words]
    
    res = [porter.stem(word) for word in nostop]
    res = [lem.lemmatize(word) for word in res]
    
    return res 

In [3]:
ds = pd.read_csv("dataset.csv", encoding = "utf-8-sig")

all_words = []
for sentence in ds["text"]:
    for word in preprocess(sentence):
        all_words.append(word)
        
fd = FreqDist(all_words)

In [4]:
common_words = {word for word in fd.most_common(100)}
feature_set = []

for text, category in zip(ds["text"], ds["label"]):
    feature = {}
    
    processed_feature = preprocess(text)
    
    for word in processed_feature:
        feature[word] = (word in common_words)
    feature_set.append((feature, category))
    

In [5]:
shuffle(feature_set)

split = int(len(feature_set)*0.8)
train = feature_set[:split]
test = feature_set[split:]

clf = NaiveBayesClassifier.train(train)
clf.show_most_informative_features(5)

Most Informative Features
                 perfect = False          positi : negati =     13.3 : 1.0
                 terribl = False          negati : positi =     10.6 : 1.0
                    sign = False          negati : positi =      9.9 : 1.0
                 horribl = False          negati : positi =      8.6 : 1.0
                 possibl = False          negati : positi =      8.6 : 1.0


In [6]:
print("Accuracy = ", accuracy(clf, test)*100, "%")

file = open("model.pickle", "wb")
pickle.dump(clf, file)
file.close()

Accuracy =  75.45454545454545 %


In [7]:
def showSynAnt(word):
    SYN = []
    ANT = []
    
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            SYN.append(lemma.name())
            if lemma.antonyms():
                ANT.append(lemma.antonyms()[0].name())
                
    return SYN, ANT
                
def tagger(word):
    token = word_tokenize(word)
    tag = pos_tag(token)
    return tag 

In [8]:
review = []

def wReview():
    rev = input("Insert the review : ") 
    if len(rev.split()) >= 2:
        review.append(rev)
        print("append success")
    else:
        print("invalid")
        
def analyze():
    if not review:
        print("No review exist")
        
    REV = review[-1]
    REV_TAG = tagger(REV)  
   
    for word, pos in REV_TAG:
        synonyms, antonyms = showSynAnt(word)
        print(f"Word : {word}, POS : {pos}")
        print(f"Syn : {', '.join(synonyms) if synonyms else 'None'}")
        print(f"Ant : {', '.join(antonyms) if antonyms else 'None'}")

    print("Category : ")
    file = open("model.pickle", "rb")
    clf = pickle.load(file)
    file.close()
    token = word_tokenize(REV)
    res = clf.classify(FreqDist(token))    
    print(res)

In [9]:
wReview()
analyze()

invalid
No review exist


IndexError: list index out of range