Ready-to-use pre-trained embedding

In [1]:
from gensim.models import Word2Vec
import os
import pandas as pd

# Build the paths with os.path.join instead of a hard-coded backslash
# separator, so the notebook also runs on non-Windows systems.
# `df` holds the review dataset (later cells read its 'review' and
# 'sentiment' columns); `w2v_model` is the saved gensim Word2Vec model.
df = pd.read_csv(os.path.join('Datasets', 'processedAnimeReviews.csv'))
w2v_model = Word2Vec.load(os.path.join('Models', 'w2vmodel.bin'))

In [2]:
# Report the embedding's vocabulary size, then its vector dimensionality,
# one value per line.
print(len(w2v_model.wv.vocab), w2v_model.vector_size, sep='\n')

170072
100


In [3]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Token -> number of occurrences, accumulated over every review in `df`.
vocab = Counter()


def count_words(text: str):
    """Tokenize ``text`` with NLTK and add its tokens to the global ``vocab``.

    Args:
        text: A single review as a plain string.

    Returns:
        None. The module-level ``vocab`` counter is updated in place.
    """
    # Counter.update adds one count per occurrence, which replaces the
    # manual "for token: vocab[token] += 1" loop.
    vocab.update(word_tokenize(text))


# A plain loop makes the side effect explicit; Series.apply would build and
# then discard a Series of None values.
for review_text in df['review']:
    count_words(review_text)

# Show the 20 most frequent tokens together with their counts.
for value, count in vocab.most_common(20):
    print(value, count)

character 363540
anime 342406
story 224260
show 213578
like 207145
one 194067
really 159586
episode 146563
series 143603
good 131607
time 125444
well 115288
first 109700
get 107283
much 102853
make 101059
even 100143
would 93321
also 90920
thing 87132


In [4]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_vectors = w2v_model.wv
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQ_LEN = 300

# Id 0 is reserved for padding and out-of-vocabulary tokens, so real words get
# ids starting at 1. Only the MAX_NB_WORDS - 1 most frequent tokens are indexed
# so that every id stays strictly below MAX_NB_WORDS, i.e. inside an embedding
# table with MAX_NB_WORDS rows (ids 0 .. MAX_NB_WORDS - 1).
word_index = {
    token: idx
    for idx, (token, _) in enumerate(vocab.most_common(MAX_NB_WORDS - 1), start=1)
}

reviews = list(df['review'].values)
sentiment = list(df['sentiment'].values)

# 60/40 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    reviews, sentiment, test_size=.4, random_state=42)


def encode_review(review):
    """Map one raw review string to the list of integer ids of its tokens.

    Tokens missing from ``word_index`` are mapped to 0.
    """
    # Tokenize the same way the vocabulary was counted. Iterating over the
    # string directly would yield single characters instead of words.
    return [word_index.get(token, 0) for token in word_tokenize(review)]


X_train = [encode_review(review) for review in X_train]
X_test = [encode_review(review) for review in X_test]

# padding: cut or zero-fill every sequence at the end to MAX_SEQ_LEN ids
X_train = pad_sequences(X_train, maxlen=MAX_SEQ_LEN, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=MAX_SEQ_LEN, padding="post", truncating="post")

print(X_train.shape)
print(X_test.shape)

(81108, 300)
(54073, 300)


In [5]:
import numpy as np

# Take the dimensionality from the loaded model instead of hard-coding it.
wv_dim = w2v_model.vector_size
number_words = len(word_vectors.vocab)

# Start from small uniform random values in [-0.1, 0.1); a seeded generator
# keeps the initialisation identical across re-runs.
rng = np.random.RandomState(42)
word_vector_matrix = (rng.rand(number_words, wv_dim) - .5) / 5.0

for word, i in word_index.items():
    # Ids that do not fit into the matrix have no row to fill.
    if i >= number_words:
        continue
    # Copy the pre-trained vector when the embedding knows the word. Rows of
    # unknown words (and row 0, which no word is mapped to) keep their random
    # initial values -- they are NOT zeroed.
    if word in word_vectors.vocab:
        word_vector_matrix[i] = word_vectors[word]

In [6]:
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional, BatchNormalization, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.backend import mean

In [7]:
# Frozen embedding layer initialised from the pre-trained word-vector matrix.
wv_layer = Embedding(number_words, wv_dim, mask_zero=False, weights=[word_vector_matrix], input_length=MAX_SEQ_LEN, trainable=False)
# Mean over axis 1. Applied to the (batch, 1) output of the final Dense layer
# below, this only collapses that size-1 axis, so the model emits shape (batch,).
averaging_layer = Lambda(lambda x: mean(x, axis=1))

# Inputs: one row of MAX_SEQ_LEN integer word ids per review
review_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32')
embedded_sequences = wv_layer(review_input)

# Bidirectional LSTM encoder (only the final state of each direction is kept)
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

# Output: a single sigmoid unit for binary classification
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(1, activation='sigmoid')(x)
preds = averaging_layer(preds)

# build the model
model = Model(inputs=[review_input], outputs=preds)
# `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=['acc'])

In [8]:
# Keras expects array-like targets, so convert the label list to an ndarray.
nd_y_train = np.asarray(y_train)

# Sanity check: features and labels must have the same number of rows.
print(X_train.shape, nd_y_train.shape, sep='\n')

# Train for 3 epochs, holding out 10% of the training data for validation.
hist = model.fit(
    X_train,
    nd_y_train,
    batch_size=256,
    epochs=3,
    validation_split=0.1,
    shuffle=True,
)

(81108, 300)
(81108,)
Train on 72997 samples, validate on 8111 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 100)          17007200  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 300, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               84480     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
batch_normalization (BatchNo (None, 128)               512       
_________________________________________________________________
dense (Dense)                (None, 1)                 129   

In [11]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Sigmoid outputs of the model for the held-out reviews.
yhat_probabilities = model.predict(X_test, verbose=0)

# Turn the probabilities into hard labels by thresholding at 0.5.
# NOTE(review): assumes the 'sentiment' labels are encoded as 0/1 -- confirm.
yhat_classes = (np.ravel(yhat_probabilities) > 0.5).astype(int)

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, yhat_classes)
print(f'Accuracy: {accuracy}')
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print(f'Precision: {precision}')
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print(f'Recall: {recall}')