In [81]:
#import all required packages and modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.layers import Embedding, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras import layers

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer

from simpletransformers.classification import ClassificationModel
from transformers import AutoTokenizer, AutoModel

import tensorflow as tf
import transformers #huggingface transformers library
from sklearn.metrics import confusion_matrix

In [82]:
#load train and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data

Unnamed: 0,Id,Category,Title
0,0,sport,Roddick in talks over new coach
1,1,entertainment,Prodigy join V Festival line-up
2,2,entertainment,Sundance to honour foreign films
3,3,sport,Dunne keen to commit to Man City
4,4,politics,Row over 'police' power for CSOs
...,...,...,...
1775,1775,business,Lufthansa may sue over Bush visit
1776,1776,tech,Rolling out next generation's net
1777,1777,sport,Mirza makes Indian tennis history
1778,1778,tech,GTA sequel is criminally good


In [83]:
X = train_data['Title']
test_X = test_data['Title']
Y = train_data['Category']

In [84]:
#用label encoder將label轉成(0 1 2 3)
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)
print(Y)

[3 1 1 ... 3 4 4]


In [85]:
# for i in range(len(X)):
#     #轉成小寫
#     X[i] = X[i].lower()
# for i in range(len(test_X)):
#     #轉成小寫
#     test_X[i] = test_X[i].lower()

In [86]:
train_text = np.array(X)
test_text = np.array(test_X)
train_text_dic = np.concatenate([train_text, test_text])
MAX_SEQUENCE_LENGTH = 10
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_dic)
train_sequence = tokenizer.texts_to_sequences(train_text)
word_index = tokenizer.word_index
train_text_data = pad_sequences(train_sequence, maxlen = MAX_SEQUENCE_LENGTH)
train_text_labels = to_categorical(np.asarray(Y))
print('Shape of data tensor:', train_text_data.shape)
print('Shape of label tensor:', train_text_labels.shape)
print(len(word_index))
print(train_text_data)

test_text = np.array(test_X)
test_sequence = tokenizer.texts_to_sequences(test_text)
#word_index = tokenizer.word_index
test_text_data = pad_sequences(test_sequence, maxlen = MAX_SEQUENCE_LENGTH)

Shape of data tensor: (1780, 10)
Shape of label tensor: (1780, 5)
4081
[[   0    0    0 ...    7    8  249]
 [   0    0    0 ...  250  307   13]
 [   0    0    0 ...  140  684  685]
 ...
 [   0    0    0 ...  285  608  892]
 [   0    0    0 ...   59 3622 1208]
 [   0    0    0 ...    8  478  501]]


In [91]:
class TransformerBlock(layers.Layer):
    def __init__(self, EMBEDDING_DIM, attention, feed_forward_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.attention = layers.MultiHeadAttention(num_heads=attention, key_dim=EMBEDDING_DIM)
        self.feed_forward = Sequential([layers.Dense(feed_forward_dim, activation="relu"), layers.Dense(EMBEDDING_DIM),])
        self.normalization_1 = layers.LayerNormalization(epsilon=1e-6)
        self.normalization_2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attention_output = self.attention(inputs, inputs)
        attention_output = self.dropout1(attention_output, training=training)
        out1 = self.normalization_1(inputs + attention_output)
        feed_forward_output = self.feed_forward(out1)
        feed_forward_output = self.dropout2(feed_forward_output, training=training)
        return self.normalization_2(out1 + feed_forward_output)

In [88]:
EMBEDDING_DIM = 500
embeddings_index = {}
with open('enwiki_20180420_'+str(EMBEDDING_DIM)+'d.txt', encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs        
#Create glove embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
QQ=0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word.lower())
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        QQ = QQ+1
print('Found %s word vectors.' % len(embeddings_index))
#Keras Glove Embedding layer
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length = MAX_SEQUENCE_LENGTH, trainable=True)

  coefs = np.fromstring(coefs, 'f', sep=' ')


Found 4529833 word vectors.


In [95]:
#attention head的數量
num_heads = 2  
#transformer中feed forward network的Hidden layer size 
feed_forward_dim = 32  
inputs = layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(inputs)
x = TransformerBlock(EMBEDDING_DIM, num_heads, feed_forward_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
model = Model(inputs=inputs, outputs = layers.Dense(5, activation="softmax")(x))
model.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 10)]              0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 10, 500)           2041000   
_________________________________________________________________
transformer_block_9 (Transfo (None, 10, 500)           2038032   
_________________________________________________________________
global_average_pooling1d_9 ( (None, 500)               0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 5)                 2505      
Total params: 4,081,537
Trainable params: 4,081,537
Non-trainable params: 0
_________________________________________________

In [93]:
X_train, X_test, y_train, y_test = train_test_split(train_text_data, train_text_labels, test_size=0.2, random_state=1, shuffle = False)
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [94]:
predictions = model.predict(test_text_data)
submit = []
result = np.zeros(len(predictions))
count = 0
print(predictions[1])
result = np.array([np.argmax(y, axis=None, out=None) for y in predictions])
for pred in predictions:
    p = pred.argmax()
    submit.append(le.inverse_transform([p])[0])
    count = count+1
print(submit)
submission = pd.DataFrame({'Id':test_data['Id'], 'Category':submit})
submission.to_csv('309706019_submission.csv', index=False)

[9.9999964e-01 1.8038696e-09 1.2611831e-10 1.0605586e-11 3.8500909e-07]
['business', 'business', 'politics', 'tech', 'sport', 'business', 'politics', 'politics', 'tech', 'sport', 'sport', 'politics', 'entertainment', 'entertainment', 'politics', 'business', 'politics', 'entertainment', 'sport', 'tech', 'politics', 'business', 'sport', 'sport', 'tech', 'business', 'tech', 'sport', 'entertainment', 'sport', 'entertainment', 'tech', 'entertainment', 'tech', 'sport', 'business', 'entertainment', 'sport', 'politics', 'business', 'entertainment', 'sport', 'politics', 'business', 'business', 'politics', 'business', 'politics', 'politics', 'sport', 'tech', 'entertainment', 'sport', 'entertainment', 'sport', 'business', 'entertainment', 'politics', 'sport', 'tech', 'politics', 'sport', 'sport', 'politics', 'entertainment', 'sport', 'tech', 'politics', 'business', 'politics', 'politics', 'entertainment', 'business', 'tech', 'sport', 'tech', 'politics', 'business', 'sport', 'politics', 'tech', 'e