In [22]:
import os
import re
import datetime
import time
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn import svm

from imblearn.over_sampling import SMOTE

import pickle

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk import tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/dat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Pre-trained classifiers serialized to disk by an earlier training run.
# NOTE(review): 'svm_clf-bow.sav' uses a hyphen where the other names use
# underscores — confirm this matches the file actually on disk.
lstm_model = _load_pickle('lstm_model.sav')
naive_bow_model = _load_pickle('naive_bow.sav')
naive_tfidf_model = _load_pickle('naive_tfidf.sav')
svm_bow_model = _load_pickle('svm_clf-bow.sav')
svm_tfidf_model = _load_pickle('svm_clf_tfidf.sav')

In [6]:
def get_run_time(t1, t2):
    """Describe the time elapsed between two timestamps expressed in seconds.

    Args:
        t1: Start timestamp, in seconds.
        t2: End timestamp, in seconds.

    Returns:
        A string of the form "<M> mins and <S> seconds", where M is the number
        of whole minutes elapsed and S is the remainder in seconds rounded to
        three decimal places.
    """
    elapsed = t2 - t1
    whole_minutes = int(elapsed / 60)
    leftover_seconds = round(elapsed % 60, 3)
    return f"{whole_minutes!s} mins and {leftover_seconds!s} seconds"

def clean_str(sentence):
    """Strip HTML markup and every character that is not an ASCII letter or whitespace.

    Args:
        sentence: Raw review text, possibly containing HTML.

    Returns:
        The text content of ``sentence`` with only the characters a-z, A-Z and
        whitespace kept, and leading/trailing whitespace removed.
    """
    # Remove HTML: keep only the text content of the parsed markup.
    review_text = BeautifulSoup(sentence, features="html.parser").text
    # Remove non-letters. A raw string is used so the whitespace escape is a
    # regex escape rather than an invalid Python string escape. The previous
    # character class also listed a literal "+", which let plus signs through
    # despite the "non-letters" intent; it is dropped here.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", review_text).strip()
    return letters_only



def get_data(file_name):
    """Load a CSV file from the local file system into a DataFrame.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` parsed from ``file_name``.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist. Previously this
            case fell through to ``return df`` with ``df`` unbound and failed
            with an unhelpful ``UnboundLocalError``.
    """
    if not os.path.exists(file_name):
        raise FileNotFoundError("-- " + file_name + " not found locally")
    print("-- " + file_name + " found locally")
    return pd.read_csv(file_name)

def review_to_words(review):
    """Lower-case a review and drop English stop words.

    Args:
        review: Review text as a string.

    Returns:
        The remaining words, in their original order, joined by single spaces.
    """
    # Lower-case first so the comparison against the stop-word list matches,
    # then split on any run of whitespace.
    tokens = review.lower().split()

    # NLTK's English stop-word list, as a set for fast membership tests.
    english_stops = set(stopwords.words("english"))

    kept_tokens = (token for token in tokens if token not in english_stops)
    return " ".join(kept_tokens)


In [7]:
# Read the first 20,000 rows of the training reviews from disk.
reviews = pd.read_csv("clean_train_reviews.csv", nrows=20000)
# Ignore all 3* reviews. Take an explicit copy so the column assignment below
# writes to an independent frame instead of a slice of the one just read
# (assigning on the slice triggers pandas' SettingWithCopyWarning).
reviews = reviews[reviews["score"] != 3].copy()
# Positive sentiment = 4* or 5* reviews (sentiment = True)
reviews["sentiment"] = reviews["score"] >= 4

# X stays a Series here; later cells convert it with .values.astype('U')
# before fitting the vectorizers.
X = reviews['text']
y = reviews['sentiment']

In [23]:
# Bag-of-words features: word-level term counts, vocabulary capped at 1000 terms.
vect_bow = CountVectorizer(analyzer="word",
                           preprocessor=None,
                           stop_words=None,
                           max_features=1000)

# TF-IDF features with the same word-level analysis and vocabulary cap.
vect_tfidf = TfidfVectorizer(analyzer="word",
                             preprocessor=None,
                             stop_words=None,
                             max_features=1000)

# `nb_words` is the legacy Keras 1 spelling; the argument was renamed `num_words`.
tokenizer = Tokenizer(num_words=20000)


In [42]:
# Convert the review column to a unicode string array once and reuse it for
# all three fits (the conversion was previously repeated per call).
corpus_unicode = X.values.astype('U')

# NOTE(review): the classifiers used later are loaded from pickles, while these
# vectorizers are fitted here. Their vocabularies must match what the models
# were trained with — confirm the training run used the same data and settings.
vect_bow.fit(corpus_unicode)
vect_tfidf.fit(corpus_unicode)
tokenizer.fit_on_texts(corpus_unicode)

In [110]:
# Raw review text(s) to classify. The trailing line break is written as an
# explicit "\n" escape: a plain double-quoted string cannot span physical lines.
senten = [
    "I'll start by saying I love exploring non-alcoholic options, so I wasn't expecting it to taste like gin, just to offer an interesting botanical profile and fun mixing options (I'm an athlete and I just don't like to drink most of the time). That said, this one is really vile. I see what they're trying to do, but it tastes like pine floor cleaner with a strong hot pepper burst for that real kick-you-when-you're-down painful finish. I think they're trying to mimic the 'burn' of alcohol? But it's not necessary. The flavors are also not balanced well, for all their claims of thoughtful botanicals and extracts - the strongest flavor is pine, a second note of must, and a punch of hot pepper, and then I don't taste much else. It just tastes really bad, and I've had lots of other really tasty non-alcoholic mixers and substitutes that I enjoy, so just skip this one. They also don't offer any return policy, so I'm out $25 - I guess I'll use it to clean my floors.\n",
]



In [111]:
# Strip HTML/non-letters from every raw review, then drop stop words.
sentences = [review_to_words(clean_str(raw_review)) for raw_review in senten]

# Dense feature matrices for the classical models, one row per review.
check_lst_bow = vect_bow.transform(sentences).toarray()
check_lst_tfidf = vect_tfidf.transform(sentences).toarray()

In [112]:
# Classify the bag-of-words features with the model loaded from 'naive_bow.sav'.
# NOTE(review): the recorded output below lists six predictions, but `senten`
# holds a single review — the output looks stale; re-run the cell to confirm.
naive_bow_model.predict(check_lst_bow)

array([0, 0, 0, 1, 0, 1])

In [113]:
# Classify the TF-IDF features with the model loaded from 'naive_tfidf.sav'.
# NOTE(review): the recorded output below lists six predictions, but `senten`
# holds a single review — the output looks stale; re-run the cell to confirm.
naive_tfidf_model.predict(check_lst_tfidf)

array([1, 0, 1, 0, 1, 0])

In [114]:
# Classify the bag-of-words features with the model loaded from 'svm_clf-bow.sav'.
# NOTE(review): the recorded output below lists six predictions, but `senten`
# holds a single review — the output looks stale; re-run the cell to confirm.
svm_bow_model.predict(check_lst_bow)

array([ True,  True,  True,  True,  True, False])

In [115]:
# Classify the TF-IDF features with the model loaded from 'svm_clf_tfidf.sav'.
# NOTE(review): the recorded output below lists six predictions, but `senten`
# holds a single review — the output looks stale; re-run the cell to confirm.
svm_tfidf_model.predict(check_lst_tfidf)

array([ True, False,  True,  True,  True, False])

In [116]:

# Encode each cleaned review as a fixed-width row of word indices for the LSTM.
MAX_SEQ_LEN = 600    # row width; presumably the input length the LSTM was trained with — TODO confirm
MAX_SENTENCES = 11   # the original loop stopped after index 10; later rows stay all zeros
# NOTE(review): the sentence cap looks like a leftover debugging limit — rows
# past it are still sent to the model as all-zero input. Confirm it is intended.

data = np.zeros((len(sentences), MAX_SEQ_LEN), dtype='int32')
for i, sen in enumerate(sentences[:MAX_SENTENCES]):
    # Truncate to the row width up front: the old `else` branch wrote to
    # data[i, j] with j >= 600 and raised IndexError on long reviews.
    for j, word in enumerate(sen.split()[:MAX_SEQ_LEN]):
        # .get() maps words the tokenizer never saw to 0; plain indexing
        # raised KeyError, so the old `is not None` check could never help.
        # NOTE(review): word_index is read directly, so the tokenizer's
        # vocabulary-size cap is not applied here — confirm the model accepts
        # every index this can produce.
        data[i, j] = tokenizer.word_index.get(word) or 0

In [117]:
# Run the model loaded from 'lstm_model.sav' on the index-encoded reviews.
# NOTE(review): the recorded output below has six rows, but `senten` holds a
# single review — the output looks stale; re-run the cell to confirm.
lstm_model.predict(data)

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)