In [22]:
import os
import re
import datetime
import time
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn import svm

from imblearn.over_sampling import SMOTE

import pickle

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk import tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/dat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the object has been read.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point this at model files produced by a trusted training run.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Previously trained classifiers, one per (algorithm, feature type) pair.
lstm_model = _load_pickle('model/lstm_model.sav')
naive_bow_model = _load_pickle('model/naive_bow.sav')
naive_tfidf_model = _load_pickle('model/naive_tfidf.sav')
svm_bow_model = _load_pickle('model/svm_clf-bow.sav')
svm_tfidf_model = _load_pickle('model/svm_clf_tfidf.sav')

In [6]:
def get_run_time(t1, t2):
    """Format the elapsed time between two timestamps given in seconds.

    Args:
        t1: start time in seconds.
        t2: end time in seconds.

    Returns:
        A string of the form "<M> mins and <S> seconds", where M is the
        number of whole minutes and S the remaining seconds rounded to
        three decimal places.
    """
    elapsed = t2 - t1
    whole_minutes = int(elapsed / 60)
    leftover_seconds = round(elapsed % 60, 3)
    return f"{whole_minutes} mins and {leftover_seconds} seconds"

def clean_str(sentence):
    """Strip HTML markup and unwanted characters from a piece of text.

    Args:
        sentence: raw text, possibly containing HTML.

    Returns:
        The text content with every character removed except ASCII letters,
        whitespace and the literal ``+``, and with leading/trailing
        whitespace stripped.
    """
    # Let BeautifulSoup parse the markup and keep only the text nodes.
    review_text = BeautifulSoup(sentence, features="html.parser").text
    # Raw string so that ``\s`` reaches the regex engine without triggering an
    # invalid-escape-sequence warning. Inside a character class ``+`` is a
    # literal, so plus signs survive along with letters and whitespace.
    return re.sub(r"[^a-zA-Z\s+]", "", review_text).strip()



def get_data(file_name):
    """Load a local CSV file into a DataFrame.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(file_name)``.

    Raises:
        FileNotFoundError: if ``file_name`` does not exist, instead of
            falling through to a ``return`` of an unbound variable.
    """
    if not os.path.exists(file_name):
        raise FileNotFoundError(file_name + " not found locally")
    print("-- " + file_name + " found locally")
    return pd.read_csv(file_name)

def review_to_words(review):
    """Lower-case a review and drop English stop words.

    Args:
        review: the review text.

    Returns:
        The remaining words, in their original order, joined by single
        spaces.
    """
    # Lower-case first so the comparison against the stop-word list is
    # case-insensitive, then split on runs of whitespace.
    tokens = review.lower().split()

    # NLTK's English stop-word list, as a set for fast membership tests.
    english_stops = set(stopwords.words("english"))

    kept_tokens = filter(lambda token: token not in english_stops, tokens)
    return " ".join(kept_tokens)


In [7]:
# Read the first 20,000 rows of the review file.
reviews = pd.read_csv("clean_train_reviews.csv", nrows=20000)
# Ignore all 3* reviews. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# original (which would raise pandas' SettingWithCopyWarning).
reviews = reviews[reviews["score"] != 3].copy()
# Positive sentiment = 4* or 5* reviews (sentiment = True).
reviews["sentiment"] = reviews["score"] >= 4

# Review text and boolean sentiment label.
X = reviews['text']
y = reviews['sentiment']

In [23]:
# Bag-of-words and TF-IDF vectorizers limited to 1000 features each.
# NOTE(review): these are re-created here rather than loaded from disk; the
# pickled classifiers presumably expect the same settings (and fitted
# vocabulary) that were used at training time -- confirm.
vect_bow = CountVectorizer(analyzer="word",
                            preprocessor=None,
                            stop_words=None,
                            max_features=1000)

vect_tfidf = TfidfVectorizer(analyzer="word",
                                preprocessor=None,
                                stop_words=None,
                                max_features=1000)

# `num_words` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_words` is only accepted through a deprecation shim.
tokenizer = Tokenizer(num_words=20000)


In [42]:
# Convert the review column to a unicode array once and reuse it for all three
# fits; astype('U') also turns missing values into the string 'nan' so the
# vectorizers never see a float NaN.
train_texts = X.values.astype('U')

vect_bow.fit(train_texts)
vect_tfidf.fit(train_texts)
tokenizer.fit_on_texts(train_texts)

In [209]:
# Three hand-picked sample reviews of a non-alcoholic gin alternative, used
# below as a manual smoke test for every loaded model. The third review is
# explicitly critical (its author mentions giving 2 stars).
senten = [
        "I think anyone who expected this to be exactly like gin was fooling themselves. Just as the description says it is like comparing a veggie burger to beef. The veggie can be good too if used correctly, but it will never be beef. I made an offhand virgin negroni using this, blutul sweet vermouth and Italian san bitter. Turned out freaking awesome. Made a gimlet with fresh squeezed lime juice, simple syrup and ritual. It also turned out awesome. Use it for mixing and it is amazing",
        "Every year I take a month off of drinking to prepare for or recover from holiday indulgences. During these months, I have often felt that what I missed was not beer, wine, or spirits, per se, but the rituals surrounding those drinks - the mixing of the cocktail after work, or sharing a concoction with friends and family. And so, Ritual piqued my interest. I’ve used the Ritual Gin Alternative in virgin gimlets and g&ts, and find it a nice substitute. It’s worth thinking of it as Ritual would ask you to think about it: as a veggie burger is to a hamburger, Ritual is to gin. It has the same floral character of gin, though in place of the burn of the alcohol there is what I think is a mild capsaicin burn. The flavor and burn give this a complexity that is often missing in mocktails. I won’t be giving up gin on a permanent basis. It’s too delicious. But for Sober October, or for days when I need to wake up early to work or work out, I’ll happily sub this into my gin cocktail of choice",
        "I've been a gin drinker for 40 years. I'll guessing I've tried 3-4 dozen of different gins from all over the world.. And yes gin does have a slight 'pine' taste, and aroma to it. Unfortunately this non alcoholic gin taste and smells like Pine Floor/Surface cleaner. Not that you would want to drink a Pine scented Cleaner, but I would imagine this is what it would taste like. They WAAAAY over did the pine 'essence' in this 'gin'.Even mixing it with tonic water and a generous squirt of lime doesn't hide the over the top pine flavoring. Maybe they over did the Juniper in this, IDK. I even made a G and T with half this, and half Tanqueray, but nope, even doing that couldn't save the drink.. I really wanted to find an alternative to regular gin. This isn't it though. Too bad. I'll give them 2 stars though, at least they tried and from reading other reviews some people seem to like it, not me though. Years ago I had a small batch 'craft' gin made in Massachusetts, this reminded me of that gin. Very 'piney', maybe this Ritual gin is is trying to copy small small batch distillers. Anyway taste is subjective."
    ]



In [210]:
# Run every sample review through the cleaning pipeline: strip HTML and
# unwanted characters first, then lower-case and drop stop words.
sentences = [review_to_words(clean_str(raw_review)) for raw_review in senten]

# Dense feature matrices for the bag-of-words and TF-IDF classifiers.
check_lst_bow = vect_bow.transform(sentences).toarray()
check_lst_tfidf = vect_tfidf.transform(sentences).toarray()

In [211]:
# Classify the sample reviews with the model loaded from model/naive_bow.sav,
# using the bag-of-words features.
# NOTE(review): the recorded output shows integer labels here while the SVM
# cells show booleans -- presumably 1 means positive; confirm the encoding.
naive_bow_model.predict(check_lst_bow)

array([1, 1, 1])

In [212]:
# Classify the sample reviews with the model loaded from model/naive_tfidf.sav,
# using the TF-IDF features.
naive_tfidf_model.predict(check_lst_tfidf)

array([1, 1, 1])

In [213]:
# Classify the sample reviews with the model loaded from model/svm_clf-bow.sav,
# using the bag-of-words features. The recorded output holds boolean labels.
svm_bow_model.predict(check_lst_bow)

array([False,  True, False])

In [214]:
# Classify the sample reviews with the model loaded from
# model/svm_clf_tfidf.sav, using the TF-IDF features. The recorded output holds
# boolean labels.
svm_tfidf_model.predict(check_lst_tfidf)

array([False,  True,  True])

In [215]:

# Number of token positions per review in the matrix fed to the LSTM.
# NOTE(review): presumably this equals the sequence length the LSTM was
# trained with -- confirm against the training notebook.
MAX_REVIEW_LEN = 600

# Keras only considers word ids below `num_words` when it encodes text itself,
# so apply the same cap here; None/0 means "no cap".
vocab_limit = getattr(tokenizer, "num_words", None)

# One row per review; 0 is left in place for padding and for words without a
# usable id (Keras word ids start at 1, so 0 never collides with a real word).
# NOTE(review): tokens are written left-aligned with zeros after them; verify
# that this matches the padding convention used during training.
data = np.zeros((len(sentences), MAX_REVIEW_LEN), dtype='int32')
for i, sen in enumerate(sentences):
    # Truncate explicitly instead of relying on an IndexError past the end.
    for j, word in enumerate(sen.split()[:MAX_REVIEW_LEN]):
        word_id = tokenizer.word_index.get(word)
        if word_id is None:
            # Word was never seen by the tokenizer: keep the 0 placeholder.
            continue
        if vocab_limit and word_id >= vocab_limit:
            # Too rare to be inside the tokenizer's vocabulary cap.
            continue
        data[i, j] = word_id
        

In [216]:
# Score the encoded sample reviews with the model loaded from
# model/lstm_model.sav; the recorded output has one float per review.
lstm_model.predict(data)

array([[1.],
       [1.],
       [1.]], dtype=float32)