In [59]:
import nltk
import string
import pickle
import pandas as pd
from random import shuffle

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from collections import defaultdict

In [60]:
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
stopw = stopwords.words("english")

dataset = pd.read_csv("./updated_dataset.csv")

In [61]:
def preproc(document):
    words = word_tokenize(document.lower())
    
    words = [wnl.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    
    return {word:True for word in words if word not in stopw and word.isalpha()}

In [62]:
def trainmodel():
    fset = [(preproc(txt),label) for txt,label in zip(dataset["text"],dataset["label"])]
    shuffle(fset)
    
    split_idx = int(len(fset)*0.85)
    trains,tests = fset[:split_idx],fset[split_idx:]
    
    clf = nltk.NaiveBayesClassifier.train(trains)
    
    acc = nltk.classify.accuracy(clf,tests)
    print(f"acc > {acc}")
    
    clf.show_most_informative_features(5)
    
    
    file = open("model.pickle","wb")
    pickle.dump(clf,file)
    file.close()
    
    return clf

In [63]:
def readmodel():
    try:
        file = open("model.pickle","rb")
        clf = pickle.load(file)
        clf.show_most_informative_features(5)
        file.close()
    except:
        clf = trainmodel()
        
    return clf

In [64]:
def writerev():
    while True:
        review = input("Input must be >=2 words")
        
        if len(review.split())>1:
            return review
        else:
            print("Too short")

In [65]:
classifier = readmodel()

Most Informative Features
                 terribl = True           negati : positi =     13.8 : 1.0
                 perfect = True           positi : negati =     13.5 : 1.0
                 horribl = True           negati : positi =      9.8 : 1.0
                   anyon = True           negati : positi =      9.1 : 1.0
                  receiv = True           negati : positi =      8.4 : 1.0


In [66]:
def analyze(review:str,clf):
    words = word_tokenize(review.lower())
    words = [word for word in words if word not in string.punctuation and word.isalpha()]
    
    tags = pos_tag(words)
    for i,word in enumerate(tags):
        print(f"{i+1}. {word[0]} {word[1]}")
    print("")    
    
    for word in words:
        print("word > ",word)
        synos = []
        antos = []
        
        synset = wordnet.synsets(word)
        
        for syns in synset:
            for lemm in syns.lemmas():
                synos.append(lemm.name())
                for anto in lemm.antonyms():
                    antos.append(anto.name())
        
        
        #use to remove dupes         
        synos = list(set(synos))
        antos = list(set(antos))
                    
        print(", ".join(synos) if synos else "no syns")
        print(", ".join(antos) if antos else "no ants")
        print("===")
    
    
    
    print("")
    
    cleaned_rev = [word for word in word_tokenize(review) if word not in string.punctuation and word not in stopw]
    cleaned_rev = [wnl.lemmatize(stemmer.stem(word)) for word in cleaned_rev]
    
    result = clf.classify(FreqDist(cleaned_rev))
    print(f"{review} > {result}")
    
    return result

# analyze("owen loves little children", classifier)

1. owen JJ
2. loves VBZ
3. little JJ
4. children NNS

word >  owen
Sir_Richard_Owen, Robert_Owen, Owen
no ants
===
word >  loves
bang, have_a_go_at_it, passion, lovemaking, make_out, love, do_it, sleep_with, making_love, sleep_together, have_it_off, get_it_on, get_laid, have_intercourse, dearest, enjoy, honey, eff, know, erotic_love, fuck, bonk, have_sex, bed, be_intimate, hump, have_it_away, roll_in_the_hay, jazz, sexual_love, lie_with, dear, make_love, beloved, screw, love_life
hate
===
word >  little
lilliputian, little, small, trivial, fiddling, petty, footling, niggling, short, picayune, minuscule, piffling, piddling, slight
big, large, much, tall
===
word >  children
youngster, tiddler, minor, child, shaver, tike, baby, nipper, fry, small_fry, kid, nestling, tyke
parent
===

owen loves little children > positive


'positive'

In [67]:
def recomm(review):
    corpus = dataset["text"]
    target = dataset["restaurant"]
    
    tfidf = TfidfVectorizer()
    tfidf_mtr = tfidf.fit_transform(corpus)
    query = tfidf.transform([review])
    
    simil = cosine_similarity(tfidf_mtr,query)
    
    df = pd.DataFrame({
        'doc':corpus,
        'target':target,
        'simil':simil.flatten()
    })
    
    recomms = df[df["simil"]>0.15]
    
    top_recomms = recomms.sort_values(by='simil',ascending=False).head(10)
    
    # top_recomms = top_recomms.reset_index(drop=True)
    
    return top_recomms[['target','simil']]


recommends = recomm("My orders have come out bad pretty much every time I have ordered from here.")

if recommends.empty:
    print("no recomms")
else:
    for idx, row in recommends.iterrows():
        print(f"{idx} > {row['target']} {row['simil']}")

17 > Restaurant C 0.4629912053109412
387 > Restaurant C 0.16381587462932606
195 > Restaurant A 0.15969550308016198
170 > Restaurant A 0.15228221911813097
305 > Restaurant A 0.15161707817897763


In [68]:
nlp = spacy.load("en_core_web_sm")
ner_label = nlp.get_pipe("ner").labels

# print(ner_label)

def ner(review):
    categorized_data = defaultdict(set)
    doc = nlp(review)
    
    for ent in doc.ents:
        if ent.label_ in ["GPE","ORG","LANGUAGE"]:
            categorized_data[ent.label_].add(ent.text)
            
    for cat,ent in categorized_data.items():
        print(f"{cat} > {", ".join(sorted(ent))}")
        
ner("Indonesia seeks to ban microsoft in favor of Apple inc")

GPE > Indonesia
ORG > Apple inc, microsoft


In [69]:
classifier = readmodel()
review =""
sent = ""

while True:
    print("Menu, select command")
    print(f"rev > {review if review else "no review"}")
    print(f"sent > {sent if sent else "no sent"}")
    
    print(">>",flush=True, end = "")
    choice = input("enter 1-5")
    print(choice)
    
    
    if choice == '1':
        review = writerev()
    if choice == '2':
        if review:
            sent = analyze(review, classifier)
            print(f"Updated sentiment: {sent}")  # Debugging line
        else:
            print("no rev!")
    elif choice == '3':
        pass
    elif choice == '4':
        pass
    elif choice == '5':
        break
    else:
        print("try again")


Most Informative Features
                 terribl = True           negati : positi =     13.8 : 1.0
                 perfect = True           positi : negati =     13.5 : 1.0
                 horribl = True           negati : positi =      9.8 : 1.0
                   anyon = True           negati : positi =      9.1 : 1.0
                  receiv = True           negati : positi =      8.4 : 1.0
Menu, select command
rev > no review
sent > no sent
>>

1
try again
Menu, select command
rev > this is perfect
sent > no sent
>>2
1. this DT
2. is VBZ
3. perfect JJ

word >  this
no syns
no ants
===
word >  is
cost, exist, follow, live, comprise, embody, represent, constitute, make_up, equal, personify, be
differ
===
word >  perfect
complete, unadulterated, perfective_tense, staring, everlasting, arrant, hone, pure, consummate, perfective, perfect, sodding, stark, thoroughgoing, double-dyed, utter, gross, perfect_tense
imperfect
===

this is perfect > positive
Updated sentiment: positive
Menu, select command
rev > this is perfect
sent > positive
>>5
