In [1]:
import os
import re
import datetime
import time
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn import svm

from imblearn.over_sampling import SMOTE

import pickle

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence



import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/dat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_run_time(t1, t2):
    """Format the time elapsed between two timestamps as minutes and seconds.

    Args:
        t1: Start time in seconds (e.g. a ``time.time()`` value).
        t2: End time in seconds.

    Returns:
        A string of the form ``"<m> mins and <s> seconds"`` where the seconds
        part is rounded to three decimal places.
    """
    elapsed = t2 - t1
    whole_minutes = int(elapsed / 60)
    leftover_seconds = round(elapsed % 60, 3)
    return "{} mins and {} seconds".format(whole_minutes, leftover_seconds)

def clean_str(sentence):
    """Strip HTML markup and every character that is not an ASCII letter or whitespace.

    Args:
        sentence: Raw text, possibly containing HTML tags or entities.

    Returns:
        The text with markup removed, only ``a-z`` / ``A-Z`` and whitespace
        kept, and leading/trailing whitespace stripped.
    """
    # Remove HTML
    review_text = BeautifulSoup(sentence, features="html.parser").text
    # Remove non-letters. A raw string keeps "\s" from being read as an (invalid)
    # string escape. The previous class "[^a-zA-Z\s\s+]" also listed a literal
    # "+", which let plus signs survive the clean-up; it is dropped here.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", review_text).strip()
    return letters_only

def convert_plain_to_csv(text_file, csv_file):
    """Convert a plain-text review dump into a four-column CSV file.

    The input is consumed in chunks of 9 lines, one chunk per review. From each
    chunk the ``product/productId``, ``review/score``, ``review/summary`` and
    ``review/text`` fields are extracted and written as one CSV row under the
    header ``productId,score,summary,text``. Summary and text are passed through
    ``clean_str`` before being written.

    Args:
        text_file: Path of the plain-text input file.
        csv_file: Path of the CSV file to create (overwritten if it exists).
    """
    t0 = time.time()
    with open(text_file, "r") as src, open(csv_file, "w") as dst:
        dst.write("productId,score,summary,text\n")
        converted = 0
        while True:
            record = list(islice(src, 9))  # one review occupies 9 lines
            if not record:
                break

            output_line = ""
            for line in record:
                # Split on the FIRST colon only: summaries and review texts often
                # contain colons themselves, and splitting on every colon would
                # silently drop everything after the first one in the value.
                field, _, value = line.partition(":")
                value = value.strip()
                # Match on the field name only, so a review text that happens to
                # mention another field name cannot be misclassified.
                if "product/productId" in field:
                    output_line += value + ","
                elif "review/score" in field:
                    output_line += value + ","
                elif "review/summary" in field:
                    output_line += clean_str(value) + ","
                elif "review/text" in field:
                    output_line += clean_str(value) + "\n"

            dst.write(output_line)

            # print status
            converted += 1
            if converted % 10000 == 0:
                print(converted, "reviews converted...")

    print(datetime.datetime.now(), "- Converting completed in", get_run_time(t0, time.time()))

def get_data(file_name):
    """Load a CSV file into a DataFrame.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The file contents as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``file_name`` does not exist. (Previously this
            case fell through to ``return df`` with ``df`` never assigned and
            failed with an unrelated ``UnboundLocalError``.)
    """
    if not os.path.exists(file_name):
        raise FileNotFoundError("-- " + file_name + " not found locally")
    print("-- " + file_name + " found locally")
    return pd.read_csv(file_name)

def review_to_words(review):
    """Lower-case a review and drop English stop words.

    Args:
        review: The review text as a string.

    Returns:
        The remaining words joined by single spaces.
    """
    # Building the stop-word set means loading the NLTK word list; this function
    # runs once per review, so the set is built on first use and kept on the
    # function object instead of being rebuilt on every call.
    stops = getattr(review_to_words, "_english_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        review_to_words._english_stops = stops

    # Convert to lower case and split into individual words
    words = review.lower().split()

    # Keep only the words that are not stop words
    return " ".join(w for w in words if w not in stops)


def cleaning_data(dataset, file_name):
    """Remove stop words from every review text and write the result as CSV.

    Args:
        dataset: DataFrame with the columns ``productId``, ``score``,
            ``summary`` and ``text``.
        file_name: Path of the CSV file to create (overwritten if it exists).
            It gets the header ``productId,score,summary,text`` and one line
            per review.
    """
    t0 = time.time()
    num_reviews = dataset["text"].size
    clean_train_reviews = []

    # Walk the four columns in parallel. Iterating by position (rather than
    # looking rows up as dataset[col][i]) also works when the frame's index is
    # not 0..n-1, e.g. after rows have been filtered out.
    rows = zip(dataset["productId"], dataset["score"], dataset["summary"], dataset["text"])
    for count, (product_id, score, summary, text) in enumerate(rows, start=1):
        # Report progress every 10000 reviews
        if count % 10000 == 0:
            print("Review", count, "of", num_reviews, "\n")

        clean_train_reviews.append(
            ",".join([str(product_id), str(score), str(summary), review_to_words(str(text))])
        )

    print("Writing clean train reviews...")
    with open(file_name, "w") as f:
        f.write("productId,score,summary,text\n")
        for review in clean_train_reviews:
            # Exactly one newline per row; the row used to carry its own "\n"
            # and was written with another, leaving a blank line after each one.
            f.write(review + "\n")

    print(datetime.datetime.now(), "- Write file completed in", get_run_time(t0, time.time()))

In [None]:
# """
# Pre-processing
# """
# convert_plain_to_csv("finefoods.txt", "foods.csv")

# # Reading the Data
# train = get_data("foods.csv")
# print("Data dimensions:", train.shape)
# print("List features:", train.columns.values)

# cleaning_data(train, "clean_train_reviews.csv")

In [3]:
# Read the first 20000 cleaned reviews from file
reviews = pd.read_csv("clean_train_reviews.csv", nrows=20000)
# Ignore all 3* reviews. The explicit .copy() makes the filtered frame independent
# of the one read above, so adding a column below is a plain assignment and does
# not trigger pandas' SettingWithCopyWarning.
reviews = reviews[reviews["score"] != 3].copy()
# Positive sentiment = 4* or 5* reviews (sentiment = True)
reviews["sentiment"] = reviews["score"] >= 4

# Features are the cleaned review texts, the target is the boolean sentiment flag
X = reviews['text']
y = reviews['sentiment']

In [4]:
# One untrained classifier per (algorithm, feature representation) pair:
# Multinomial Naive Bayes and a linear-kernel SVM, each for bag-of-words and TF-IDF.
naive_bow, naive_tfidf = MultinomialNB(), MultinomialNB()
svm_clf_bow, svm_clf_tfidf = (svm.SVC(kernel='linear', C=1) for _ in range(2))

In [5]:
# Evaluate Multinomial Naive Bayes on bag-of-words features over 10 random 80/20
# splits. SMOTE oversampling is applied to the training part of each split only.
# Fixed seeds make the splits, the synthetic samples and therefore the reported
# averages reproducible from run to run.
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
sm = SMOTE(random_state=42)
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = CountVectorizer(analyzer="word",
                       preprocessor=None,
                       stop_words=None,
                       max_features=1000)

for train_index, test_index in ss.split(X):
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Learn the label -> integer mapping from the training labels only and reuse
    # it for the test labels; re-fitting on the test labels could yield a
    # different mapping if a class were missing from the test part.
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.transform(y_test)

    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    # Oversample
    # NOTE(review): newer imbalanced-learn releases name this method
    # fit_resample - switch if the installed version no longer offers fit_sample.
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)

    # Fit Naive Bayes on the vectorized X with y train labels,
    # then predict new y labels using X test
    naive_bow.fit(X_train_res, y_train_res)
    y_pred = naive_bow.predict(X_test_vect)

    # Determine the test set metrics on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

# Save the model (as fitted on the last split) to disk. Writing once after the
# loop leaves the same file as dumping on every fold, and the context manager
# makes sure the file handle is closed.
filename = 'naive_bow.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(naive_bow, model_file)

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 85.18%

Average F1 score across folds: 90.90%

Average Precision score across folds: 94.03%

Average Recall score across folds: 87.97%

Average Confusion Matrix across folds: 
 [[ 409.4  172.4]
 [ 371.7 2717.5]]


In [6]:
# Evaluate Multinomial Naive Bayes on TF-IDF features over 10 random 80/20
# splits. SMOTE oversampling is applied to the training part of each split only.
# Fixed seeds make the splits, the synthetic samples and therefore the reported
# averages reproducible from run to run.
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
sm = SMOTE(random_state=42)
accs = []
f1s = []
cms = []
pres = []
recs = []
vect = TfidfVectorizer(analyzer="word",
                       preprocessor=None,
                       stop_words=None,
                       max_features=1000)

for train_index, test_index in ss.split(X):
    X_train, X_test = X.iloc[train_index].values.astype('U'), X.iloc[test_index].values.astype('U')
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Learn the label -> integer mapping from the training labels only and reuse
    # it for the test labels; re-fitting on the test labels could yield a
    # different mapping if a class were missing from the test part.
    Encoder = LabelEncoder()
    y_train = Encoder.fit_transform(y_train)
    y_test = Encoder.transform(y_test)

    # Fit vectorizer and transform X train, then transform X test
    X_train_vect = vect.fit_transform(X_train)
    X_test_vect = vect.transform(X_test)

    # Oversample
    # NOTE(review): newer imbalanced-learn releases name this method
    # fit_resample - switch if the installed version no longer offers fit_sample.
    X_train_res, y_train_res = sm.fit_sample(X_train_vect, y_train)

    # Fit Naive Bayes on the vectorized X with y train labels,
    # then predict new y labels using X test
    naive_tfidf.fit(X_train_res, y_train_res)
    y_pred = naive_tfidf.predict(X_test_vect)

    # Determine the test set metrics on this fold using the true y labels and predicted y labels
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))
    cms.append(confusion_matrix(y_test, y_pred))
    pres.append(precision_score(y_test, y_pred))
    recs.append(recall_score(y_test, y_pred))

# Save the model (as fitted on the last split) to disk. Writing once after the
# loop leaves the same file as dumping on every fold, and the context manager
# makes sure the file handle is closed.
filename = 'naive_tfidf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(naive_tfidf, model_file)

print("\nAverage accuracy across folds: {:.2f}%".format(sum(accs) / len(accs) * 100))
print("\nAverage F1 score across folds: {:.2f}%".format(sum(f1s) / len(f1s) * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(sum(pres) / len(pres) * 100))
print("\nAverage Recall score across folds: {:.2f}%".format(sum(recs) / len(recs) * 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(sum(cms) / len(cms)))


Average accuracy across folds: 83.49%

Average F1 score across folds: 89.53%

Average Precision score across folds: 95.85%

Average Recall score across folds: 83.99%

Average Confusion Matrix across folds: 
 [[ 473.9  112.2]
 [ 494.  2590.9]]


In [8]:
# Train and evaluate the linear SVM on bag-of-words features using a single,
# seeded 80/20 hold-out split.
vect = CountVectorizer(analyzer="word",
                       preprocessor=None,
                       stop_words=None,
                       max_features=1000)
X_train, X_test, y_train, y_test = train_test_split(X.values.astype('U'), y, test_size=0.2, random_state=42)

# Fit the vocabulary on the training texts only, then reuse it for the test texts
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
svm_clf_bow.fit(X_train, y_train)

# Save the model to disk; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
filename = 'svm_clf-bow.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_clf_bow, model_file)

y_pred = svm_clf_bow.predict(X_test)

accs = accuracy_score(y_test, y_pred)
f1s = f1_score(y_test, y_pred)
cms = confusion_matrix(y_test, y_pred)
pres = precision_score(y_test, y_pred)
recs = recall_score(y_test, y_pred)

# NOTE: the labels say "Average ... across folds", but every figure below comes
# from the single hold-out split above.
print("\nAverage accuracy across folds: {:.2f}%".format(accs* 100))
print("\nAverage F1 score across folds: {:.2f}%".format(f1s * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(pres* 100))
print("\nAverage Recall score across folds: {:.2f}%".format(recs* 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(cms))


Average accuracy across folds: 88.67%

Average F1 score across folds: 93.37%

Average Precision score across folds: 91.04%

Average Recall score across folds: 95.81%

Average Confusion Matrix across folds: 
 [[ 328  288]
 [ 128 2927]]


In [9]:


# Train and evaluate the linear SVM on TF-IDF features using a single,
# seeded 80/20 hold-out split.
vect = TfidfVectorizer(analyzer="word",
                       preprocessor=None,
                       stop_words=None,
                       max_features=1000)
X_train, X_test, y_train, y_test = train_test_split(X.values.astype('U'), y, test_size=0.2, random_state=42)

# Fit the vocabulary/IDF weights on the training texts only, then reuse them for the test texts
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)
svm_clf_tfidf.fit(X_train, y_train)

# Save the model to disk; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
filename = 'svm_clf_tfidf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svm_clf_tfidf, model_file)

y_pred = svm_clf_tfidf.predict(X_test)

accs = accuracy_score(y_test, y_pred)
f1s = f1_score(y_test, y_pred)
cms = confusion_matrix(y_test, y_pred)
pres = precision_score(y_test, y_pred)
recs = recall_score(y_test, y_pred)

# NOTE: the labels say "Average ... across folds", but every figure below comes
# from the single hold-out split above.
print("\nAverage accuracy across folds: {:.2f}%".format(accs* 100))
print("\nAverage F1 score across folds: {:.2f}%".format(f1s * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(pres* 100))
print("\nAverage Recall score across folds: {:.2f}%".format(recs* 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(cms))


Average accuracy across folds: 89.70%

Average F1 score across folds: 94.05%

Average Precision score across folds: 90.55%

Average Recall score across folds: 97.84%

Average Confusion Matrix across folds: 
 [[ 304  312]
 [  66 2989]]


In [None]:
# Sanity-check a trained model on a handful of raw, hand-picked reviews.
senten = [
        'Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".',
        'This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis\' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.',
        'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.',
        'It is good',
        'It is bad',
        'My cats have been happily eating Felidae Platinum for more than two years. I just got a new bag and the shape of the food is different. They tried the new food when I first put it in their bowls and now the bowls sit full and the kitties will not touch the food. I\'ve noticed similar reviews related to formula changes in the past. Unfortunately, I now need to find a new food that my cats will eat.',
        'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
        'If you are looking for the secret ingredient in Robitussin I believe I have found it.  I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda.  The flavor is very medicinal.',
        'this is a nice',
        'it is tasty'
    ]

# Put the raw sentences through the same two-step cleaning the training texts
# went through: strip markup/non-letters, then drop stop words.
# (The cell used to call clean_sentence, which is not defined in this notebook.)
sentences = [review_to_words(clean_str(x)) for x in senten]

# The cell used to reference `vectorizer` and `clf`, neither of which exists in
# this notebook. `vect` is the most recently fitted vectorizer (TF-IDF) and
# svm_clf_tfidf is the classifier trained on its output, so the pair matches.
# NOTE(review): confirm this is the model the check was meant for.
check_lst = vect.transform(sentences).toarray()
result = svm_clf_tfidf.predict(check_lst)
print(result)