In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('changed_train.csv')
test_data = pd.read_csv('changed_test.csv')

In [3]:
train_y = train_data.MaterialType

In [4]:
train_X = train_data[['Title','Subjects']]

In [5]:
test_X = test_data[['Title','Subjects']]

In [6]:
train_X = train_X.fillna('unknown')

In [7]:
test_X = test_X.fillna('unknown')

In [34]:
num_labels = 8
vocab_size = 5000
batch_size = 64

In [35]:
train_X_arr = train_X.Title+" "+train_X.Subjects
test_X_arr = test_X.Title+" "+test_X.Subjects
train_X_arr = train_X_arr.values
test_X_arr = test_X_arr.values

In [36]:
seq = np.hstack((train_X_arr, test_X_arr))

In [37]:
tokn = Tokenizer(num_words=vocab_size)
tokn.fit_on_texts(seq)

In [38]:
max_len = 40
cnn_texts_seq = tokn.texts_to_sequences(train_X_arr)
print(cnn_texts_seq[0])
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)
print(cnn_texts_mat[0])
print(cnn_texts_mat.shape)

[2943, 4, 8]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0 2943    4    8]
(31653, 40)


In [39]:
max_len = 40
cnn_texts_seqo = tokn.texts_to_sequences(test_X_arr)
print(cnn_texts_seq[0])
cnn_texts_mato = sequence.pad_sequences(cnn_texts_seqo,maxlen=max_len)
print(cnn_texts_mato[0])
print(cnn_texts_mato.shape)

[2943, 4, 8]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  125    2 2471   44    1  313    3  124    1]
(21102, 40)


In [40]:
encoder = LabelBinarizer()
encoder.fit(train_y)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [41]:
y_train = encoder.transform(train_y)

In [42]:
def f2_score(y_true, y_pred):
    y_true = tf.cast(y_true, "int32")
    y_pred = tf.cast(tf.round(y_pred), "int32") # implicit 0.5 threshold via tf.round
    y_correct = y_true * y_pred
    sum_true = tf.reduce_sum(y_true, axis=1)
    sum_pred = tf.reduce_sum(y_pred, axis=1)
    sum_correct = tf.reduce_sum(y_correct, axis=1)
    precision = sum_correct / sum_pred
    recall = sum_correct / sum_true
    f_score = 5 * precision * recall / (4 * precision + recall)
    f_score = tf.where(tf.is_nan(f_score), tf.zeros_like(f_score), f_score)
    return tf.reduce_mean(f_score)

In [None]:
def cnn_model():
    model = Sequential()
    model.add(Embedding(5000,50,input_length=max_len))
    model.add(Dropout(0.4))
    model.add(Conv1D(256,3,padding='valid',activation='relu',strides=1))
    model.add(Conv1D(128,3,padding='valid',activation='relu',strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(8))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[f2_score])
    return model

def check_model(model,x,y):
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,restore_best_weights=True,patience=10)
    model.fit(x,y,batch_size=32,epochs=20,verbose=1,validation_split=0.2,callbacks=[es])


m = cnn_model()
check_model(m,cnn_texts_mat,y_train)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 40, 50)            250000    
_________________________________________________________________
dropout_22 (Dropout)         (None, 40, 50)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 38, 256)           38656     
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 36, 128)           98432     
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 128)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 128)               16512     
_________________________________________________________________
dropout_23 (Dropout)         (None, 128)               0         
__________

In [41]:
def old_model():
    model = Sequential()
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[f2_score])

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=5,
                        verbose=1,
                        validation_split=0.1)

In [31]:
y_pred = m.predict(cnn_texts_mato)

In [32]:
new_y_pred = np.argmax(y_pred, axis=1)

In [33]:
t_E = encoder.classes_
c = t_E[new_y_pred]
test_set = pd.read_csv('test_file.csv')
new_df = pd.DataFrame({"ID":test_set["ID"],"MaterialType":c})
new_df=new_df.set_index("ID")
new_df.to_csv("submission_18.csv")