In [58]:
# 5base 5recom #2recom #2ner

In [59]:
import nltk
import pickle
import string
import pandas as pd
from random import shuffle

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from collections import defaultdict

In [60]:
# Shared NLP helpers, created once here and reused by the functions below.
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
# NLTK's English stopword list (a plain list); preproc and analyze use it
# for `in` membership checks.
stopw = stopwords.words("english")

# Review data. Later cells read its "text", "label" and "restaurant" columns.
dataset = pd.read_csv("./updated_dataset.csv")

In [61]:
def preproc(document):
    """Turn a raw text document into an NLTK-style featureset.

    The text is lower-cased and tokenized, stopwords are removed, and every
    remaining token is lemmatized and then stemmed.

    Args:
        document: The raw text to convert.

    Returns:
        A dict mapping each kept, normalized token to ``True``.
    """
    # Filter stopwords on the surface form first. The stopword list holds whole
    # words, so checking only after stemming lets stopwords whose stem differs
    # from the word (e.g. "this" -> "thi", "very" -> "veri") leak into the
    # features.
    tokens = [tok for tok in word_tokenize(document.lower()) if tok not in stopw]

    normalized = (stemmer.stem(wnl.lemmatize(tok)) for tok in tokens)

    # Keep the post-normalization check too, so a stem that is itself a
    # stopword is still excluded; isalpha() drops punctuation and numbers.
    return {tok: True for tok in normalized if tok not in stopw and tok.isalpha()}

In [62]:
def trainmodel():
    """Train a Naive Bayes classifier on ``dataset`` and cache it on disk.

    Builds one featureset per row with ``preproc``, shuffles them, trains on
    the first 85% and evaluates on the rest. Prints the held-out accuracy and
    the five most informative features, then pickles the classifier to
    ``model.pickle`` in the working directory.

    Returns:
        The trained ``nltk.NaiveBayesClassifier``.
    """
    fset = [(preproc(txt), label) for txt, label in zip(dataset["text"], dataset["label"])]
    # The shuffle is unseeded, so the split (and the printed accuracy) varies
    # from run to run.
    shuffle(fset)

    split = int(len(fset) * 0.85)
    trains, tests = fset[:split], fset[split:]

    clf = nltk.NaiveBayesClassifier.train(trains)

    acc = nltk.classify.accuracy(clf, tests)
    print(acc)

    clf.show_most_informative_features(5)

    # The context manager closes the file even if pickling raises.
    with open("model.pickle", "wb") as file:
        pickle.dump(clf, file)

    return clf

In [63]:
def readmodel():
    """Return the cached classifier from ``model.pickle``, retraining if needed.

    If the pickle can be loaded, its five most informative features are
    printed. If the file is missing, or loading/inspecting it fails, a new
    model is trained with ``trainmodel`` (which rewrites the cache).

    Returns:
        The loaded or freshly trained classifier.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code. Only load a
        # model.pickle that this notebook produced itself.
        with open("model.pickle", "rb") as file:
            clf = pickle.load(file)
        clf.show_most_informative_features(5)
    except FileNotFoundError:
        # No cached model yet: the normal first-run path.
        clf = trainmodel()
    except Exception as err:
        # Corrupt or incompatible cache. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt/SystemExit, and the reason is reported.
        print(f"could not load model.pickle ({err!r}); retraining")
        clf = trainmodel()

    return clf

In [64]:
def writerev():
    """Prompt on stdin until the user enters a review of at least two words.

    Words are whitespace-separated. Each rejected entry prints "too short"
    and the prompt is shown again.

    Returns:
        The accepted review text, exactly as typed.
    """
    while True:
        # The prompt states the rule the check below actually enforces
        # (two or more words).
        review = input("Enter a review (at least 2 words): ")

        if len(review.split()) > 1:
            return review

        print("too short")

In [65]:
# Load the cached classifier, or train a new one if it cannot be loaded.
classifier = readmodel()

0.8072289156626506
Most Informative Features
                 terribl = True           negati : positi =     12.8 : 1.0
                    rude = True           negati : positi =     11.4 : 1.0
                    okay = True           negati : positi =     10.0 : 1.0
                 fantast = True           positi : negati =      9.3 : 1.0
                    card = True           negati : positi =      8.0 : 1.0


In [66]:
def analyze(review:str,clf):
    """Print a POS tag for each distinct word of ``review`` and its predicted label.

    Args:
        review: Raw review text.
        clf: Classifier exposing ``classify(featureset)``; expected to have
            been trained on featuresets produced by ``preproc``.
    """
    # Distinct alphabetic tokens in first-seen order. isalpha() already rules
    # out punctuation, so no separate punctuation test is needed.
    tokens = word_tokenize(review.lower())
    unique_words = list(dict.fromkeys(tok for tok in tokens if tok.isalpha()))

    for i, (word, tag) in enumerate(pos_tag(unique_words), start=1):
        print(f"{i}. {word} > {tag}")

    # Build the features with the same pipeline used for training. The previous
    # ad-hoc version skipped lower-casing, stemmed before lemmatizing, and
    # passed word counts instead of True flags, so the classifier could see
    # feature names and values it was never trained on.
    result = clf.classify(preproc(review))

    print(f"{review} > {result}")

# analyze("you are terrible", classifier)
    

1. you > PRP
2. are > VBP
3. terrible > JJ
you are terrible > negative


In [72]:
def recomm(review, min_simil=0.15, top_n=5):
    """Recommend restaurants whose reviews are most similar to ``review``.

    Fits a TF-IDF model on ``dataset["text"]``, scores every row against
    ``review`` with cosine similarity, and keeps the best matches.

    Args:
        review: Query text.
        min_simil: Rows must score strictly above this similarity to be kept.
        top_n: Maximum number of rows returned.

    Returns:
        A DataFrame with columns ``target`` (from ``dataset["restaurant"]``)
        and ``simil``, sorted by descending similarity and keeping the original
        row index of ``dataset``. Empty when nothing exceeds ``min_simil``.
    """
    corpus = dataset["text"]
    target = dataset["restaurant"]

    # The vectorizer is refit on every call; simple, but not cached.
    tfidf = TfidfVectorizer()
    tfidf_mtr = tfidf.fit_transform(corpus)
    query = tfidf.transform([review])

    # One similarity column (n_rows x 1), flattened to one score per row.
    simil = cosine_similarity(tfidf_mtr, query)

    df = pd.DataFrame({
        'doc': corpus,
        'target': target,
        'simil': simil.flatten()
    })

    recomms = df[df['simil'] > min_simil]

    top_recomms = recomms.sort_values(by='simil', ascending=False).head(top_n)

    return top_recomms[['target', 'simil']]


# x = recomm("I have zero complaints")

# if x.empty:
#     print("empty")
# else:
#     for idx, row in x.iterrows():
#                 print(f"{idx+1} > {row['target']} > {row['simil']}")  

264 > Restaurant D > 0.28072732862084204
1 > Restaurant A > 0.19631737999176635
168 > Restaurant C > 0.1897028741200557
19 > Restaurant D > 0.18668342947623362


In [78]:
# Load spaCy's small English pipeline once; ner() below reuses this object.
nlp = spacy.load("en_core_web_sm")
# Entity labels exposed by the pipeline's "ner" component.
ner_label = nlp.get_pipe("ner").labels

print(ner_label)

def ner(review):
    """Print the GPE, LANGUAGE and ORG entities found in ``review``.

    Entities are grouped by label. Each label is printed once, in the order it
    is first encountered, followed by its distinct entity texts sorted
    alphabetically.

    Args:
        review: Text to run through the spaCy pipeline ``nlp``.
    """
    categorized_ents = defaultdict(set)
    doc = nlp(review)

    for ent in doc.ents:
        if ent.label_ in ("GPE", "LANGUAGE", "ORG"):
            categorized_ents[ent.label_].add(ent.text)

    for cat, texts in categorized_ents.items():
        # Single quotes inside the replacement field: reusing the outer double
        # quote only parses on Python 3.12+.
        print(f"{cat} > {', '.join(sorted(texts))}")

# Demo call: prints the GPE/LANGUAGE/ORG entities found in the sentence.
ner("He is fluent in English like most workers in the Apple Inc HQ of Jakarta")

('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
LANGUAGE > English
ORG > the Apple Inc HQ
GPE > Jakarta


In [95]:
# Scratch exploration: list every WordNet lemma of "good" and its antonyms.
synsets = wordnet.synsets("good")

for synset in synsets:
    for lemma in synset.lemmas():
        # One line per lemma name; the same name repeats across synsets.
        print(lemma.name())
        for antonym in lemma.antonyms():
            # Antonyms are printed under their lemma, marked with ">>>>".
            print(">>>>",antonym.name())

good
good
>>>> evil
goodness
>>>> evilness
good
>>>> bad
goodness
>>>> badness
commodity
trade_good
good
good
>>>> bad
full
good
good
>>>> evil
estimable
good
honorable
respectable
beneficial
good
good
good
just
upright
adept
expert
good
practiced
proficient
skillful
skilful
good
dear
good
near
dependable
good
safe
secure
good
right
ripe
good
well
effective
good
in_effect
in_force
good
good
serious
good
sound
good
salutary
good
honest
good
undecomposed
unspoiled
unspoilt
good
well
>>>> ill
good
thoroughly
soundly
good


In [102]:
# Scratch check: an empty list is falsy, a non-empty list is truthy.
xyy = []

print("not empty" if xyy else "empty")

xyy.extend([1, 1, 1])

print("not empty" if xyy else "empty")

# Show the first two elements, comma-separated.
print(*xyy[:2], sep=", ")

empty
not empty
1, 1
