In [1]:
import nltk
import string
import pickle
import pandas as pd
from random import shuffle

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from collections import defaultdict

In [2]:
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
stopw = stopwords.words("english")

dataset = pd.read_csv("./updated_dataset.csv")

In [3]:
def preproc(document):
    words = word_tokenize(document.lower())
    words = [wnl.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    
    return {word: True for word in words if word not in stopw and word.isalpha()}

In [5]:
def trainmodel():
    fset = [(preproc(txt),lbl) for txt,lbl in zip(dataset["text"],dataset["label"])]
    shuffle(fset)
    
    split_idx = int(len(fset)*0.85)
    trains,tests = fset[:split_idx],fset[split_idx:]
    
    clf = nltk.NaiveBayesClassifier.train(trains)
    
    acc = nltk.classify.accuracy(clf,tests)
    print(f"acc > {acc}")
    
    clf.show_most_informative_features(5)
    
    file = open("model.pickle","wb")
    pickle.dump(clf,file)
    file.close()
    
    return clf

In [14]:
def readmodel():
    try:
        file = open("model.pickle","rb")
        clf = pickle.load(file)
        file.close()
        clf.show_most_informative_features(5)
    except:
        clf = trainmodel()
    
    return clf

In [10]:
def writerev():
    while True:
        review = input("input >= 2 words")
        
        if len(review.split()) >1:
            return review
        else:
            print("too short")

In [49]:
def analyze(review,clf):
    words = word_tokenize(review.lower())
    words = FreqDist([word for word in words if word.isalpha and word not in string.punctuation])
    
    tags = pos_tag(words)
    for i,word in enumerate(tags):
        print(f"{i+1}. {word[0]} > {word[1]}")
    print("")
    
    for word in words:
        synset = wordnet.synsets(word)
        synos = []
        antos = []
        
        for syns in synset:
            for lemms in syns.lemmas():
                synos.append(lemms.name())
                for ants in lemms.antonyms():
                    antos.append(ants.name())
                    
        synos = list(set(synos))
        antos = list(set(antos))
        
        print(f"{word} >")
        print(f"Synonyms > {", ".join(synos[:5]) if synos else "none"}")
        print(f"Antonyms > {", ".join(antos[:5]) if antos else "none"}")
        print("")
     
    cleaned_rev = [word for word in word_tokenize(review) if word not in stopw and word not in string.punctuation]
    cleaned_rev = [wnl.lemmatize(stemmer.stem(word)) for word in cleaned_rev]
       
    result = clf.classify(FreqDist(cleaned_rev))
    print(f"Review > {review} > {result}")
    
    return result

In [50]:
# classer = readmodel()
# analyze("i am him", classer)

Most Informative Features
                 terribl = True           negati : positi =     10.8 : 1.0
                 horribl = True           negati : positi =      9.5 : 1.0
                 perfect = True           positi : negati =      8.4 : 1.0
                  overpr = True           negati : positi =      8.2 : 1.0
                    rude = True           negati : positi =      8.2 : 1.0
1. i > NN
2. am > VBP
3. him > PRP

i >
Synonyms > i, single, iodin, unity, I
Antonyms > none

am >
Synonyms > AM, constitute, cost, personify, be
Antonyms > differ

him >
Synonyms > none
Antonyms > none

Review > i am him > negative


'negative'

In [46]:
def recomm(review):
    corpus = dataset["text"]
    target = dataset["restaurant"]
    
    tfidf = TfidfVectorizer()
    tfidf_mtr = tfidf.fit_transform(corpus)
    query = tfidf.transform([review])
    
    simil = cosine_similarity(tfidf_mtr,query)
    
    df = pd.DataFrame({
        'doc':corpus,
        'target':target,
        'simil':simil.flatten()
    })
    
    recomms = df[df["simil"]>0.15]
    top_recomms = recomms.sort_values(by="simil",ascending=False).head()
    
    
    if recomms.empty:
        print("No recomms")
    else:
        for i, word in top_recomms.iterrows():
            print(f"{i}. {word["target"]} > {word["simil"]}")

# recomm("Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry.")

In [36]:
nlp = spacy.load("en_core_web_sm")
ner_lab = nlp.get_pipe("ner").labels
# print(ner_lab)

def ner(review):
    categorized_ents = defaultdict(set)
    doc = nlp(review)
    
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LANGUAGE", "ORG"]:
            categorized_ents[ent.label_].add(ent.text)
    
    for cat,ent in categorized_ents.items():
        print(f"{cat} > {", ".join(sorted(ent))}")
        
ner("He works at the microsoft english branch of jakarta")

ORG > microsoft
LANGUAGE > english
GPE > jakarta


In [51]:
clf = readmodel()
review = ""
sent = ""

while True:
    print("Menu blablabla")
    print("review :",review if review else "no reviews")
    print("sent :",sent if sent else "no sent")
    
    print("input >>",end="",flush=True)
    choice = input("enter input")
    print(f"( {choice} )")
    
    if choice == '1':
        review = writerev()
    elif choice == '2':
        if review:
            sent = analyze(review,clf)
        else:
            print("!!review empty")
    elif choice == '3':
        if review:
            recomm(review)
        else:
            print("!!review empty")
    elif choice == '4':
        if review:
            ner(review)
        else:
            print("!!review empty")
    elif choice == '5':
        print("!!exiting")
        break
    else:
        print("!!input out of range")
        
    print("")

Most Informative Features
                 terribl = True           negati : positi =     10.8 : 1.0
                 horribl = True           negati : positi =      9.5 : 1.0
                 perfect = True           positi : negati =      8.4 : 1.0
                  overpr = True           negati : positi =      8.2 : 1.0
                    rude = True           negati : positi =      8.2 : 1.0
Menu blablabla
review : no reviews
sent : no sent
input >>( 2 )
!!review empty

Menu blablabla
review : no reviews
sent : no sent
input >>( 1 )

Menu blablabla
review : the service of the english employees of Microsoft at Jakarta is terrible
sent : no sent
input >>( 2 )
1. the > DT
2. of > IN
3. service > NN
4. english > JJ
5. employees > NNS
6. microsoft > VBP
7. at > IN
8. jakarta > NN
9. is > VBZ
10. terrible > JJ

the >
Synonyms > none
Antonyms > none

of >
Synonyms > none
Antonyms > none

service >
Synonyms > Robert_William_Service, serving, service_of_process, religious_service, table_s