In [1]:
import bz2
import gc
import re

import numpy as np
import pandas as pd
from keras.layers import *
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import shuffle
from tqdm import tqdm


Using TensorFlow backend.


In [2]:
def splitReviewsLabels(lines, max_chars=512):
    """Split raw labelled lines into cleaned review texts and one-hot labels.

    Each line is passed to ``reviewToX`` for the text and ``reviewToY`` for
    the label. A progress bar is shown while iterating.

    Args:
        lines: Iterable of decoded ``"<label> <text>"`` strings.
        max_chars: Maximum number of characters kept from each cleaned
            review. Defaults to 512 (the previously hard-coded value);
            pass ``None`` to keep the full text.

    Returns:
        A ``(reviews, labels)`` tuple of two lists in input order.
    """
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        # Truncate by characters (not tokens) to bound the later tokenisation work.
        reviews.append(rev[:max_chars])
        labels.append(label)
    return reviews, labels

In [3]:
def reviewToY(review):
    """Return the one-hot label for one raw line.

    The first space-separated token is the label: ``'__label__1'`` maps to
    ``[1, 0]``; anything else maps to ``[0, 1]``.
    """
    label_token = review.split(' ')[0]
    if label_token == '__label__1':
        return [1, 0]
    return [0, 1]

In [4]:
def reviewToX(review):
    """Extract and normalise the review text from one ``"<label> <text>"`` line.

    The label token (everything up to the first space) is dropped, the text
    is lower-cased, and every digit is replaced by ``0``. If the text looks
    like it contains a web address, each whitespace-free token ending in a
    dot followed by three lowercase letters is replaced by ``<url>``.

    Args:
        review: One decoded line, optionally terminated by a newline.

    Returns:
        The normalised review text without the trailing newline.

    Raises:
        IndexError: If ``review`` contains no space, i.e. no text follows
            the label.
    """
    review = review.split(' ', 1)[1]
    # Only strip a real trailing newline; an unconditional [:-1] would eat the
    # last character of a line that has none (e.g. the final line of a file).
    if review.endswith('\n'):
        review = review[:-1]
    review = review.lower()
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    review = re.sub(r'\d', '0', review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [5]:
# Open the bz2-compressed train/test dumps for reading (binary mode).
train_file = bz2.BZ2File(filename='../input/train.ft.txt.bz2', mode='r')
test_file = bz2.BZ2File(filename='../input/test.ft.txt.bz2', mode='r')

In [6]:
# Read every line into memory, then close both compressed files so the
# handles are not leaked for the rest of the kernel session.
with train_file, test_file:
    train_lines = train_file.readlines()
    test_lines = test_file.readlines()

In [7]:
# Sanity check: number of raw lines read from each split.
len(train_lines), len(test_lines)

(3600000, 400000)

In [8]:
# The files are read as bytes; convert each line to text.
train_lines = [raw_line.decode(encoding='utf-8') for raw_line in train_lines]
test_lines = [raw_line.decode(encoding='utf-8') for raw_line in test_lines]

In [9]:
# Turn the decoded lines of each split into (cleaned texts, one-hot labels).
(reviews_train, y_train), (reviews_test, y_test) = (
    splitReviewsLabels(split_lines) for split_lines in (train_lines, test_lines)
)

100%|██████████| 3600000/3600000 [01:00<00:00, 59424.56it/s]
100%|██████████| 400000/400000 [00:06<00:00, 57916.21it/s]


In [10]:
# Shuffle texts and labels together. A fixed random_state makes the order
# reproducible across kernel restarts, which also fixes which samples end
# up in the tail that `validation_split` uses during training.
reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)

In [11]:
# Convert the label lists into NumPy arrays for Keras.
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [12]:
# Model/text hyper-parameters:
#   max_features - vocabulary size passed to the tokenizer and the embedding
#   maxlen       - sequence length after padding
#   embed_size   - embedding vector dimension
max_features, maxlen, embed_size = 8192, 128, 64

In [13]:
# Vocabulary is capped via num_words=max_features, the same value used as
# the Embedding input size in the model below.
tokenizer = Tokenizer(num_words=max_features)

In [14]:
# Build the vocabulary from the training reviews only (the test set is not used here).
tokenizer.fit_on_texts(reviews_train)

In [15]:
# Map each review to its sequence of integer word indices.
token_train, token_test = (
    tokenizer.texts_to_sequences(texts) for texts in (reviews_train, reviews_test)
)

In [16]:
# Bring every sequence to exactly `maxlen` entries, padding at the end.
# NOTE(review): `truncating` is left at its library default - confirm that is
# the side you want to cut long reviews from.
x_train, x_test = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (token_train, token_test)
)

In [17]:
# Peek at the first ten cleaned (and shuffled) test reviews.
reviews_test[:10]

['lizzie put it best: re: audio tape version. i love follett - when he gives his best effort that is - but this novel smells like quick paycheck. forgettable characters and predictable action leave the reader with the same response that heroine lizzie has to her wedding night: is that all there is? the weak title should have been a clue that this would be a second-rate effort. for a much more compelling period piece from follett, try the excellent "a dangerous fortune".',
 "gratuitous sex and gore which holds no relevence.: this book is written on pure shock value. there is no plot. how was this book ever published? all this book does is disturb the reader. the subject matter of a lost generation is overdone and thus cliche'd. this book holds no relevancy as a moral work or for any genre of literature for that matter. the characters are like those of a stephen king novel: vacant. i would give the author merit for turning a first person narrative into a third person narrative b",
 'very

In [18]:
# Release the raw files, text lists and token lists; the cells below only
# use the padded arrays (x_*), the labels (y_*) and the model.
del train_file, test_file, train_lines, test_lines
del reviews_train, reviews_test
del token_train, token_test
gc.collect()

0

In [19]:
# 1-D CNN classifier: embedding -> four same-padded conv layers -> 1x1 conv
# giving two scores per position -> average over positions -> softmax.
# The input tensor is named `inputs` so the builtin `input()` is not shadowed.
inputs = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(inputs)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
# NOTE(review): the original built one more BatchNormalization here but bound
# its output to an unused name (`net1`), so it never became part of `model`.
# It is dropped to keep the trained architecture unchanged; add
# `net = BatchNormalization()(net)` if normalising before the head was intended.

net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 64)           524288    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 64)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 64)           256       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 32)           14368     
_________________________________________________________________
batch_normalization_2

In [20]:
# Single epoch for a quick run; raise `epochs` (e.g. to 5) for a longer one.
# 10% of the training data is held out for validation.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=2048,
    epochs=1,
    validation_split=0.1,
)

Instructions for updating:
Use tf.cast instead.
Train on 3240000 samples, validate on 360000 samples
Epoch 1/1


<keras.callbacks.History at 0x7f61261ce940>

In [21]:
# Loss and accuracy ('acc') on the held-out test set.
model.evaluate(x=x_test, y=y_test)



[0.16877406593553723, 0.9369025]

In [22]:
# TODO: load external texts here and run predictions on them.

In [23]:
# Spot-check: predicted class probabilities next to the one-hot ground truth
# for the first ten test samples.
y_predict = model.predict(x_test[:10])
for i, probabilities in enumerate(y_predict):
    print(i, probabilities, y_test[i])

0 [0.9939435  0.00605653] [1 0]
1 [9.9940443e-01 5.9552334e-04] [1 0]
2 [0.9978258  0.00217421] [1 0]
3 [0.33279505 0.6672049 ] [0 1]
4 [0.9578514  0.04214856] [1 0]
5 [9.9961358e-01 3.8646258e-04] [1 0]
6 [0.99851745 0.00148254] [1 0]
7 [9.9969995e-01 3.0002507e-04] [1 0]
8 [0.6924317  0.30756825] [0 1]
9 [0.9652173  0.03478274] [1 0]


In [24]:
# Class-balance check on the test labels: column 0 of the one-hot vector is
# 1 for `__label__1` reviews and 0 otherwise.
y_test_pd = pd.DataFrame(y_test).loc[:, 0]
y_test_pd.value_counts()

1    200000
0    200000
Name: 0, dtype: int64