In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import time
from keras import metrics
import os
import numpy as np
import keras
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN,Activation
from keras.layers.embeddings import Embedding
from datetime import datetime
from gensim.models import word2vec
import pandas as pd

Using TensorFlow backend.


In [5]:
# get texts data
TRAINING_PATH = './data/training/'
TESTING_PATH = './data/testing/'

categories = [dirname for dirname in os.listdir(TRAINING_PATH) if dirname[-4:] != '_cut']
# print(len(categories), str(categories))

category2idx = {'Japan_Travel': 0, 'KR_ENTERTAIN': 1, 'Makeup': 2, 'Tech_Job':  3, 'WomenTalk': 4,
                  'babymother': 5, 'e-shopping': 6, 'graduate': 7, 'joke': 8, 'movie': 9}

train_pickle_df = pd.read_pickle('train.pkl')
train_texts = train_pickle_df["text"].values
train_labels = train_pickle_df["category"]
# print(len(train_texts), train_texts[0], train_labels[0])   

test_pickle_df = pd.read_pickle('test.pkl')
test_texts = test_pickle_df["text"].values
# print(len(test_texts), test_texts[0]) 

In [31]:
# # process some data
train_labels_list = list(train_labels)
# print(type(train_labels_list), len(train_labels_list), train_labels_list[0])
embedding_matrix_len = len(train_labels_list)

label_id = 0
label_list = np.zeros((embedding_matrix_len, 10))
for label_val in train_labels_list:
    label_list[label_id][label_val] = 1
    label_id += 1
# print(len(label_list), label_list[0])

In [7]:
# get word embedding vector
answer = word2vec.Word2Vec.load("word2vec1.model")
# print(type(answer))
word_vectors = answer.wv
wvv = word_vectors.vocab
wvv_keys = wvv.keys()
wvv_keys_list = list(wvv_keys)
# print(wvv_keys_list[:10]) #['櫻花林', '好比', '考科', '床上', '一點現', '記住', '寶寶的', '柔嫩', '不規則', '朴智妍']

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)
max_doc_word_length = max(len(l) for l in train_texts)
sequences1 = pad_sequences(sequences, maxlen=max_doc_word_length, padding='post')
word_index = tokenizer.word_index
print("Found %s unique tokens" % len(word_index))
# data = pad_sequences(sequences)
# print("Shape of data tensor:" , data.shape)

Found 232236 unique tokens


In [10]:
vocab_size = len(word_index) + 1
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 250))
for word, i in word_index.items():
    if word in wvv_keys_list:
        embedding_vector = answer[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [11]:
# input_dim = 400
# output_dim = 100
# del answer
embedding_layer = Embedding(vocab_size, 250, 
                            weights=[embedding_matrix], 
                            input_length= max_doc_word_length, 
                            trainable=False)
model = Sequential()
model.add(embedding_layer)

model.add(SimpleRNN(input_dim = 250, output_dim = 50, unroll=True))
nb_classes = 10
model.add(Dense( nb_classes, input_dim = 3971))
model.add(Activation('softmax'))
# early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=0, verbose=0, mode='auto')
model.compile(optimizer='adam',loss='categorical_crossentropy',  metrics=["acc"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3971, 250)         58059250  
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 50)                15050     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
_________________________________________________________________
activation_1 (Activation)    (None, 10)                0         
Total params: 58,074,810
Trainable params: 15,560
Non-trainable params: 58,059,250
_________________________________________________________________


In [56]:
history = model.fit(x = sequences1, y = label_list, 
                    validation_split=0.1, 
                    batch_size=150,
                    epochs = 50, verbose = 1)

Train on 8100 samples, validate on 900 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [57]:
# loss_func_name = 'categorical_crossentropy'
# for value in history.history["categorical_crossentropy"]:
#     print(history)

In [58]:
# evaluate the model
loss_accuracy = model.evaluate(sequences1[0:100], label_list[0:100], verbose=1)
print(type(loss_accuracy), loss_accuracy)

<class 'list'> [2.1565294647216797, 0.0]


In [59]:
test_tokenizer = Tokenizer()
test_tokenizer.fit_on_texts(test_texts)
test_sequences = test_tokenizer.texts_to_sequences(test_texts)
test_sequences1 = pad_sequences(test_sequences, maxlen=max_doc_word_length, padding='post')

In [60]:
# print(len(test_sequences1))
# print(len(test_sequences1[0]))
# print(len(label_list))
# print(len(label_list[0]))
# old_label = label_list[0]
# print(type(label_list[0]))
# print(type(old_label))
# for label1 in label_list:
#     if not np.array_equal(label1 , old_label):
#         print(label1)
#         old_label = label1

In [61]:
predict_res = model.predict(test_sequences1, batch_size= 32, verbose=0)
print(len(predict_res), predict_res)

final_res = []
for pre_res in predict_res:
    final_res.append(np.argmax(pre_res))
print(final_res)

1000 [[1.1621815e-01 1.9188400e-01 9.2450708e-02 ... 1.1011779e-04
  8.8705562e-02 1.3003682e-01]
 [7.6237507e-02 9.5963083e-02 1.3309668e-01 ... 7.0879512e-05
  1.4548464e-01 9.6880250e-02]
 [7.6237500e-02 9.5963083e-02 1.3309668e-01 ... 7.0879534e-05
  1.4548463e-01 9.6880257e-02]
 ...
 [1.1621809e-01 1.9188400e-01 9.2450693e-02 ... 1.1011779e-04
  8.8705540e-02 1.3003682e-01]
 [7.6237485e-02 9.5963076e-02 1.3309671e-01 ... 7.0879440e-05
  1.4548464e-01 9.6880265e-02]
 [1.1621810e-01 1.9188401e-01 9.2450701e-02 ... 1.1011769e-04
  8.8705540e-02 1.3003680e-01]]
[1, 8, 8, 0, 8, 1, 6, 8, 6, 8, 1, 1, 6, 6, 6, 6, 8, 8, 8, 6, 0, 1, 1, 0, 1, 6, 8, 6, 1, 0, 8, 6, 8, 6, 0, 8, 6, 0, 8, 8, 8, 6, 8, 8, 6, 8, 8, 6, 1, 6, 0, 0, 0, 6, 8, 6, 8, 0, 8, 8, 8, 1, 0, 6, 8, 0, 0, 8, 8, 1, 1, 0, 0, 1, 6, 6, 1, 0, 8, 8, 6, 6, 8, 8, 6, 6, 6, 6, 8, 6, 1, 0, 0, 1, 6, 6, 6, 6, 6, 6, 0, 0, 8, 8, 8, 8, 1, 8, 0, 6, 8, 6, 1, 8, 8, 8, 6, 1, 0, 0, 8, 8, 1, 8, 8, 8, 8, 6, 6, 6, 6, 8, 6, 6, 8, 1, 8, 6, 8, 1, 8, 6, 8, 1

In [62]:
# result_txt = "result" + str(datetime.now()).split()[1] + ".txt"
print(len(final_res))
result_txt = "result001" + ".txt"
ids = 0
with open(result_txt, 'w') as out:
    out.write("id,category" + '\n')
    for value in final_res:
        out.write(str(ids) + "," + str(value) + '\n')
        ids += 1

1000
