In [40]:
import nltk, pickle, pandas as pd
from nltk.corpus import wordnet, stopwords
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from string import punctuation

from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier, accuracy
from nltk.tokenize import word_tokenize
from random import shuffle

In [41]:
stems = PorterStemmer()
lemms = WordNetLemmatizer()
stopW = stopwords.words("english")

In [42]:
def preprocess(words):
    # tokenizing, removing stopwords, symbols, and numbers, and
    # lemmatizing/stemming the words
    
    tokens = word_tokenize(words)
    # no num no sym
    words = [word.lower() for word in tokens if word.isalpha]
    
    # no stop
    words = [word for word in words if word not in stopW]
    
    #lem & stem
    words = [stems.stem(word) for word in words]
    words = [lemms.lemmatize(word)for word in words]
    
    return words

In [43]:
dataset = pd.read_csv("./dataset.csv",encoding="utf-8-sig")

all_words = []
for sent in dataset["text"]:
    for word in preprocess(sent):
        all_words.append(word)
        


In [56]:
fd = FreqDist(all_words)
common_words = {word for word in fd.most_common(1000)}

In [57]:
feature_set = []
for text,category in zip(dataset["text"],dataset["label"]):
    feature = {}
    
    processed_feature = preprocess(text)
    
    for word in processed_feature:
        feature[word] = (word in common_words)
    feature_set.append((feature,category))

In [58]:
shuffle(feature_set)

split_idx = int(len(feature_set)*0.7)
train_set = feature_set[:split_idx]
test_set = feature_set[split_idx:]

In [59]:
classifier = NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(5)

print(accuracy(classifier,test_set))

Most Informative Features
               positive5 = False          negati : positi =     12.5 : 1.0
                 terribl = False          negati : positi =     11.9 : 1.0
                  someon = False          negati : positi =     11.2 : 1.0
                 perfect = False          positi : negati =     10.0 : 1.0
                   anyth = False          negati : positi =      8.3 : 1.0
0.7333333333333333


In [90]:
file = open("./modelT2.pickle","wb")
pickle.dump(classifier,file)
file.close()

In [78]:
def synanto(words):
    syn = []
    ant = []
    
    for syns in wordnet.synsets(words):
        for lemma in syns.lemmas():
            syn.append(lemma.name())
            for ants in lemma.antonyms():
                ant.append(ants.name())
                
    return syn[:5],ant[:5]

def tagger(words):
    tokens = word_tokenize(words)
    tags = pos_tag(tokens)
    
    return tags

In [95]:
review = "i hate the food it sucks so bad"

def writeRev():
    rev = input("Insert input txt")
    if len(rev.split()) >= 2:
        print("success")
    else:
        print("try again")


def analyze():
    if review == "":
        print("no reviews")
        return
    rev_tags = tagger(review)
    
    for i,word in enumerate(rev_tags):
        print(f"{i+1}. {word[0]} =  {word[1]}")
    
    tokens = word_tokenize(review.lower())
    
    for word in tokens:
        syno, anto = synanto(word)
        print("="*10)
        print("Word =",word)
        print(f"Synos: {', '.join(syno) if syno else '-'}")
        print(f"Antos: {', '.join(anto) if anto else '-'}")
    
    
    file = open("./modelT2.pickle","rb")
    classer = pickle.load(file)
    file.close()
    
    cleaned_tokens = preprocess(review)
    
    result = classer.classify(FreqDist(cleaned_tokens))
    print("Res =",result)
    
    classer.show_most_informative_features(100)
    
analyze()


1. i =  NN
2. hate =  VBP
3. the =  DT
4. food =  NN
5. it =  PRP
6. sucks =  VBZ
7. so =  RB
8. bad =  JJ
Word = i
Synos: iodine, iodin, I, atomic_number_53, one
Antos: -
Word = hate
Synos: hate, hatred, hate, detest
Antos: love, love
Word = the
Synos: -
Antos: -
Word = food
Synos: food, nutrient, food, solid_food, food
Antos: -
Word = it
Synos: information_technology, IT
Antos: -
Word = sucks
Synos: sucking, suck, suction, suck, suck
Antos: bottlefeed
Word = so
Synos: sol, soh, so, so, so
Antos: -
Word = bad
Synos: bad, badness, bad, bad, big
Antos: good, goodness, good, unregretful
Res = positive
Most Informative Features
               positive5 = False          negati : positi =     12.5 : 1.0
                 terribl = False          negati : positi =     11.9 : 1.0
                  someon = False          negati : positi =     11.2 : 1.0
                 perfect = False          positi : negati =     10.0 : 1.0
                   anyth = False          negati : positi =      8.