In [1]:
import nltk
import string
import pickle
import pandas as pd
from random import shuffle

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy
from collections import defaultdict

In [2]:
stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
stopw = stopwords.words("english")

dataset = pd.read_csv("./updated_dataset.csv")

In [3]:
def preproc(document):
    words = word_tokenize(document.lower())
    
    words = [wnl.lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]
    
    return {word:True for word in words if word.isalpha() and word not in stopw}

In [4]:
def trainmodel():
    fset = [(preproc(txt),lbl) for txt,lbl in zip(dataset["text"],dataset["label"])]
    
    shuffle(fset)
    
    split_idx = int(len(fset)*0.85)
    trains,tests = fset[:split_idx],fset[split_idx:]
    
    clf = nltk.NaiveBayesClassifier.train(trains)
    
    acc = nltk.classify.accuracy(clf,tests)
    print(f"acc > {acc*100:.2f} %")
    
    clf.show_most_informative_features(5)
    
    file = open("model.pickle","wb")
    pickle.dump(clf,file)
    file.close()

In [5]:
def readmodel():
    try:
        file = open("model.pickle","rb")
        clf = pickle.load(file)
        clf.show_most_informative_features(5)
        file.close()
    except:
        clf = trainmodel()
        
    return clf

In [6]:
def write_rev():
    while True:
        review = input("input >= 2 words")
        
        if len(review.split())>1:
            return review
        else:
            print("too short")

In [7]:
def analyze(review,clf):
    words = word_tokenize(review.lower())
    words = FreqDist([word for word in words if word.isalpha() and word not in string.punctuation])
    
    tags = pos_tag(words)
    for i,word in enumerate(tags):
        print(f"{i+1}. {word[0]} {word[1]}")
    
    for word in words:
        synset = wordnet.synsets(word)
        
        print(word)
        print("=======")
        synos = []
        antos = []
        
        for syns in synset:
            for lemm in syns.lemmas():
                synos.append(lemm.name())
                for anto in lemm.antonyms():
                    antos.append(anto.name())
                    
        synos = list(set(synos))
        antos = list(set(antos))
        
        print(f"syns > {", ".join(synos) if synos else "none"}")
        print(f"antos > {", ".join(antos) if antos else "none"}")
        print("")
        
    cleaned_rev = [word for word in word_tokenize(review) if word not in stopw and word not in string.punctuation]
    cleaned_rev = [wnl.lemmatize(stemmer.stem(word)) for word in cleaned_rev]
    
    result = clf.classify(FreqDist(cleaned_rev))
    print(f"{review} >> ({result})")

    return result
# analyze("i am him", classifier)

In [8]:
# classifier = readmodel()
# analyze("i am him", classifier)

In [9]:
def recomm(review):
    corpus = dataset["text"]
    target = dataset["restaurant"]
    
    tfidf = TfidfVectorizer()
    tfidf_mtr = tfidf.fit_transform(corpus)
    query = tfidf.transform([review])
    
    simil = cosine_similarity(tfidf_mtr,query)
    
    df = pd.DataFrame({
        'doc':corpus,
        'target':target,
        'simil':simil.flatten()
    })
    
    recomms = df[df["simil"]>0.15]
    
    top_recomms = recomms.sort_values(by="simil",ascending=False).head(10)
    
    return top_recomms[["simil","target"]]

In [10]:
# recommends = recomm("Contrary to other reviews, I have zero complaints about the service")

# if recommends.empty:
#     print("no recomms")
# else:
#     for idx, word in recommends.iterrows():
#         print(f"{idx} > {word["target"]} {word["simil"]}")

In [11]:
nlp = spacy.load("en_core_web_sm")
ner_label = nlp.get_pipe("ner").labels

# print(ner_label)

def ner(review):
    categorized_ents = defaultdict(set)
    doc = nlp(review)
    
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LANGUAGE", "ORG"]:
            categorized_ents[ent.label_].add(ent.text)
    
    for cat,ent in categorized_ents.items():
        print(f"{cat} > {", ".join(sorted(ent))}")

ner("Microsoft Nigeria branch is preparing to teach english in Addis Abbaba")

ORG > Microsoft Nigeria
LANGUAGE > english
GPE > Addis Abbaba


In [17]:
classifier = readmodel()
review = ""
sent = ""

while True:
    print("Menu > pick option(1-5)")
    print("review > ", review if review else "no review yet")
    print("sent > ", sent if sent else "no sent yet")
    
    print("Enter opt >> ",flush="True",end="")
    choice = input("from no 1-5")
    print(f"(  {choice}  )")
    
    if choice == '1':
        review = write_rev()
    elif choice == '2':
        if review:
            sent = analyze(review, classifier)
        else:
            print("!!no review!!")
            
    elif choice == '3':
        if review:
            recommends = recomm(review)
            if recommends.empty:
                print("no recomms")
            else:
                for idx, word in recommends.iterrows():
                    print(f"{idx} > {word["target"]} {word["simil"]}")
        else:
            print("!!no review!!")
        
    elif choice == '4':
        if review:
            ner(review)
        else:
            print("!!no review!!")
            
    elif choice == '5':
        print("!!exiting!!")
        break
    else:
        print("!!out of range!!")
    
    print("")
        
        

Most Informative Features
                 terribl = True           negati : positi =     13.7 : 1.0
                 horribl = True           negati : positi =      9.9 : 1.0
                 fantast = True           positi : negati =      8.7 : 1.0
                    bill = True           negati : positi =      8.6 : 1.0
                 possibl = True           negati : positi =      7.9 : 1.0
Menu > pick option(1-5)
review >  no review yet
sent >  no sent yet
Enter opt >> (  2  )
!!no review!!

Menu > pick option(1-5)
review >  no review yet
sent >  no sent yet
Enter opt >> (  1  )

Menu > pick option(1-5)
review >  Contrary to other reviews, I have zero complaints about the service of the english employees of Microsoft at Jakarta 
sent >  no sent yet
Enter opt >> (  2  )
1. the DT
2. of IN
3. contrary JJ
4. to TO
5. other JJ
6. reviews NNS
7. i NNS
8. have VBP
9. zero VBN
10. complaints NNS
11. about IN
12. service NN
13. english JJ
14. employees NNS
15. microsoft VBP
16. at IN
1