In [1]:
# Basic Tools
import pandas as pd
import numpy as np
import itertools
from collections import Counter
import numpy as np
from matplotlib import pyplot as plt

# For text preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer  
from nltk.corpus import words, wordnet
from nltk.text import TextCollection
from copy import deepcopy
from bs4 import BeautifulSoup
import urllib3
import contractions
from nltk.stem.snowball import SnowballStemmer

# Sklearn Tools
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn import svm
from sklearn.metrics import classification_report , accuracy_score
from sklearn.decomposition import PCA

# Tensorflow framework
import tensorflow as tf
from tensorflow.keras import Sequential,optimizers
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout,Activation
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers
import tensorflow_hub as hub

# self-designed tools
from QingHao import SDcorpus



In [2]:
# Read the movie-review CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="movie_data.csv")
data.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def binary_transform(sentiment):
    """Encode a sentiment label as a binary integer.

    Parameters
    ----------
    sentiment : object
        Label to encode; it is compared against the string "positive".

    Returns
    -------
    int
        1 when ``sentiment == "positive"``, otherwise 0.
    """
    return 1 if sentiment == "positive" else 0

In [4]:
# Encode every sentiment label as 0/1 and keep the underlying array.
y = data["sentiment"].apply(binary_transform).values

# Positional split: rows [0, 10000) -> test, rows [10000, 15000) -> validation,
# rows from 15000 onward -> training.
y_test, y_val, y_train = y[:10000], y[10000:15000], y[15000:]

In [5]:
# Slice the raw review text at row boundaries 10000 and 15000:
# rows [0, 10000) -> test, rows [10000, 15000) -> validation, the rest -> training.
raw_test, raw_val, raw_train = (
    data["review"][:10000],
    data["review"][10000:15000],
    data["review"][15000:],
)

In [6]:
# TF-IDF features for the reviews. The vocabulary and IDF weights are learned
# from the training split only, so no information from the validation or test
# reviews leaks into the features.
#   min_df=5           -> drop terms seen in fewer than 5 training documents
#   max_features=50000 -> cap the vocabulary size
#   stop_words=None    -> keep stop words
vectorizer = TfidfVectorizer(min_df=5, max_features=50000, stop_words=None)

# fit_transform learns the vocabulary and vectorizes the training reviews in a
# single pass; calling fit() and then transform() on the same corpus tokenizes
# it twice for the same result.
# NOTE(review): .toarray() materializes dense matrices, which can be very large
# for this many rows and features -- confirm the kernel has enough memory.
train_feature = vectorizer.fit_transform(raw_train).toarray()

# Validation and test reviews are only transformed with the fitted vocabulary.
test_feature = vectorizer.transform(raw_test).toarray()
val_feature = vectorizer.transform(raw_val).toarray()

In [24]:
# Feed-forward binary classifier over the TF-IDF features:
# dropout on the inputs -> one 32-unit ReLU hidden layer -> one sigmoid unit
# that outputs a score in [0, 1].
model = Sequential([
    Dropout(rate=0.2, input_shape=train_feature.shape[1:]),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
# min_lr must be BELOW the optimizer's starting rate: Adam defaults to 1e-3, so
# the previous min_lr=0.001 made this callback a no-op (the rate could never be
# lowered).
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=3, min_lr=1e-5)

# Stop once val_loss has not improved for 5 epochs. The patience here has to be
# larger than reduce_lr's patience; with the previous value of 2, training would
# stop before a learning-rate reduction could ever take place.
es = EarlyStopping(monitor='val_loss',
                   min_delta=0, patience=5, verbose=0, mode='auto',
                   baseline=None, restore_best_weights=False)

In [25]:
# Train on the training split, evaluating on the validation split after the
# epoch; the returned History object is kept in `history`.
# NOTE(review): with epochs=1 the patience-based callbacks have no chance to
# act -- raise `epochs` if LR reduction / early stopping are meant to be used.
history = model.fit(
    train_feature,
    y_train,
    validation_data=(val_feature, y_val),
    epochs=1,
    batch_size=32,
    shuffle=True,
    callbacks=[reduce_lr, es],
    verbose=1,
)



In [9]:
# Print the layer-by-layer architecture and parameter counts of `model`.
# NOTE(review): the saved output of this cell lists Dense layers with 64, 128
# and 1 units, which does not match the Dropout -> Dense(32) -> Dense(1) model
# built in this notebook; the cell was executed against an earlier model
# (execution counts are out of order). Re-run after building the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 31690)             0         
_________________________________________________________________
dense (Dense)                (None, 64)                2028224   
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 2,036,673
Trainable params: 2,036,673
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Tune the decision threshold on the validation set: for each candidate cut-off,
# turn the model's scores into 0/1 labels and count the misclassified reviews.
prediction = model.predict(val_feature)

# Flatten the scores to 1-D so the comparison with y_val is element-wise
# (an (n, 1) array compared with an (n,) array would broadcast to (n, n)).
val_scores = np.asarray(prediction).ravel()

# Candidate cut-offs 0.40, 0.43, ..., 0.67. linspace computes each value
# directly, so no rounding error accumulates the way repeated `+= 0.03` did,
# and the number of candidates does not depend on a float loop bound.
thresholds = np.linspace(0.40, 0.67, num=10).tolist()

best_threshold, best_error_count = None, None
for threshold in thresholds:
    # Scores at or below the cut-off become class 0, everything else class 1.
    prediction_0 = np.where(val_scores <= threshold, 0, 1)
    error_count = int(np.count_nonzero(prediction_0 != y_val))

    print("threshold = {:.2f}".format(threshold))
    print("Error Count: {}".format(error_count))
    print("Accuracy: {:.2f}%".format((len(y_val) - error_count) / len(y_val) * 100))

    # Remember the first cut-off that achieves the fewest validation errors.
    if best_error_count is None or error_count < best_error_count:
        best_threshold, best_error_count = threshold, error_count

print("Best threshold: {:.2f} ({} errors)".format(best_threshold, best_error_count))
    

threshold =  0.4
Error Count: 550
Accuracy: 89.0%
threshold =  0.43000000000000005
Error Count: 531
Accuracy: 89.38000000000001%
threshold =  0.4600000000000001
Error Count: 510
Accuracy: 89.8%
threshold =  0.4900000000000001
Error Count: 487
Accuracy: 90.25999999999999%
threshold =  0.5200000000000001
Error Count: 483
Accuracy: 90.34%
threshold =  0.5500000000000002
Error Count: 471
Accuracy: 90.58%
threshold =  0.5800000000000002
Error Count: 468
Accuracy: 90.64%
threshold =  0.6100000000000002
Error Count: 487
Accuracy: 90.25999999999999%
threshold =  0.6400000000000002
Error Count: 502
Accuracy: 89.96%
threshold =  0.6700000000000003
Error Count: 506
Accuracy: 89.88000000000001%


In [32]:
# Final evaluation on the held-out test split.
prediction = model.predict(test_feature)

# Decision cut-off applied to the model's scores. 0.58 is the value with the
# fewest validation errors in the saved threshold-sweep output of this notebook.
DECISION_THRESHOLD = 0.58

# Vectorized thresholding: scores at or below the cut-off become 0, the rest 1.
# Shape and dtype are preserved, so `prediction` ends up holding 0/1 values
# exactly as the element-by-element loop left it.
prediction = np.where(prediction <= DECISION_THRESHOLD, 0, 1).astype(prediction.dtype)

# Flatten before comparing so the check against y_test is element-wise
# (an (n, 1) array compared with an (n,) array would broadcast to (n, n)).
error_count = int(np.count_nonzero(prediction.ravel() != y_test))

print("Error Count: {}".format(error_count))
print("Accuracy: {:.2f}%".format((len(y_test) - error_count) / len(y_test) * 100))

Error Count: 954
Accuracy: 90.46%
