In [1]:
import pandas as pd
import numpy as np

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import Input, Conv1D, GlobalMaxPool1D 
from keras.layers import BatchNormalization, concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import CategoricalCrossentropy, MeanSquaredError
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras import optimizers

from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from pathlib import Path
from tqdm import tqdm
import re

In [2]:
ROOT = Path.cwd().parent
INPUT = ROOT/'input'
DATA = INPUT/'feedback-prize-english-language-learning'
WORK = ROOT/'working'
VECS = INPUT/'vectors'

In [3]:
target_col = ['cohesion', 'syntax', 'vocabulary',
              'phraseology', 'grammar', 'conventions']
max_len = 1440 #200
batch_size = 32
epochs = 10
max_words =30000
num_classes = 9

In [4]:
def decontractions(phrase):
    phrase = re.sub(r"wan\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", "not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def preprocess(text):
    preprocessed = []
    for sentence in tqdm(text.values):
        sentence = str(sentence)
        sent = sentence.replace('\n\n', ' ')
        sent = decontractions(sent)
        preprocessed.append(sent.lower().strip())
    return preprocessed

In [5]:
def model_init(max_len, max_words, embedding_matrix):
    input_1 = Input(shape=(max_len,))
    embed = Embedding(input_dim=max_words,
                      output_dim=128,
                      input_length=max_len,
                      weights=[embedding_matrix],
                      trainable=False)(input_1)

    branches = []
    x = Dropout(0.2)(embed)
    for i in range(2, 6):
        branch = Conv1D(128, i,
                        padding = 'valid',
                        activation = 'relu')(x)
        branch = GlobalMaxPool1D()(branch)
        branches.append(branch)

    x = concatenate(branches, axis=1)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(9)(x)
    output = Activation('softmax')(x)

    model = Model(inputs = [input_1],
                  outputs = [output])
    
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])
    return model

In [6]:
df_train = pd.read_csv(DATA/'train.csv')
df_train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [7]:
df_train.full_text = preprocess(df_train.full_text)

token = Tokenizer()
token.fit_on_texts(df_train.full_text)
df_train['token'] = token.texts_to_sequences(df_train.full_text)
x_train = pad_sequences(df_train.token, maxlen=max_len, padding='post')

100%|██████████| 3911/3911 [00:00<00:00, 37286.13it/s]


In [8]:
#lines = list(token.word_index.keys())
#with open(WORK/'token.txt', 'w') as file:
#    for line in lines:
#        file.write(line + '\n')
        

In [9]:
embedding_matrix = np.load(VECS/'embedding.npy')
vocab_size = len(token.word_index) + 1

In [10]:
for col in tqdm(target_col):
    y_train = to_categorical(df_train[col]*2-2, num_classes)
    model_save = WORK/f'model_{col}.h5'
    checkpoint = ModelCheckpoint(model_save,
                                 monitor = 'val_accuracy',
                                 save_best_only = True,
                                 verbose = 1)
    model = model_init(max_len, vocab_size, embedding_matrix)
    history = model.fit(x_train, y_train,
                        batch_size = batch_size,
                        epochs = epochs,
                        validation_split = 0.1,
                        callbacks = [checkpoint],
                        verbose = 1)
    model.load_weights(model_save)
    model.layers[1].trainable = True
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])
    history_2 = model.fit(x_train, y_train,
                          batch_size = batch_size,
                          epochs = epochs,
                          validation_split = 0.2,
                          callbacks = [checkpoint],
                          verbose = 1)

  0%|          | 0/6 [00:00<?, ?it/s]2022-10-03 23:12:23.051169: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-10-03 23:12:23.369531: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.26020, saving model to /kaggle/working/model_cohesion.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.26020 to 0.27296, saving model to /kaggle/working/model_cohesion.h5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.27296 to 0.28316, saving model to /kaggle/working/model_cohesion.h5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.28316
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.28316
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.28316 to 0.30612, saving model to /kaggle/working/model_cohesion.h5
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.30612
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.30612
Epoch 9/10

Epoch 00009: val_accuracy improved from 0.30612 to 0.31633, saving model to /kaggle/working/model_cohesion.h5
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.31633
Epoch 1/10

Epoch 00001: val_accuracy did not improve fro

 17%|█▋        | 1/6 [26:17<2:11:29, 1577.80s/it]


Epoch 00010: val_accuracy did not improve from 0.34866
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.32398, saving model to /kaggle/working/model_syntax.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.32398
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.32398
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.32398
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.32398
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.32398 to 0.33929, saving model to /kaggle/working/model_syntax.h5
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.33929
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.33929
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.33929
Epoch 10/10

Epoch 00010: val_accuracy improved from 0.33929 to 0.35459, saving model to /kaggle/working/model_syntax.h5
Epoch 1/10

Epoch 00001: val_accuracy did not improve from 0.35459
Epoch 2/10

Epoch 00002: val_accuracy did not impr

 33%|███▎      | 2/6 [52:37<1:45:14, 1578.67s/it]


Epoch 00010: val_accuracy did not improve from 0.37548
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.39286, saving model to /kaggle/working/model_vocabulary.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.39286
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.39286
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.39286
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.39286
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.39286
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.39286
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.39286
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.39286
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.39286
Epoch 1/10

Epoch 00001: val_accuracy improved from 0.39286 to 0.39847, saving model to /kaggle/working/model_vocabulary.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.39847
Epoch 3/10

Epoch 00003: val

 50%|█████     | 3/6 [1:19:04<1:19:08, 1582.75s/it]


Epoch 00010: val_accuracy did not improve from 0.39847
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.21684, saving model to /kaggle/working/model_phraseology.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.21684
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.21684 to 0.29337, saving model to /kaggle/working/model_phraseology.h5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.29337
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.29337
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.29337 to 0.30357, saving model to /kaggle/working/model_phraseology.h5
Epoch 7/10

Epoch 00007: val_accuracy improved from 0.30357 to 0.34949, saving model to /kaggle/working/model_phraseology.h5
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.34949
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.34949
Epoch 10/10

Epoch 00010: val_accuracy improved from 0.34949 to 0.35204, saving model to /kaggle/working/model_p

 67%|██████▋   | 4/6 [1:45:36<52:52, 1586.22s/it]  


Epoch 00010: val_accuracy did not improve from 0.35204
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.24745, saving model to /kaggle/working/model_grammar.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.24745 to 0.25510, saving model to /kaggle/working/model_grammar.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.25510
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.25510 to 0.26276, saving model to /kaggle/working/model_grammar.h5
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.26276
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.26276 to 0.28571, saving model to /kaggle/working/model_grammar.h5
Epoch 7/10

Epoch 00007: val_accuracy improved from 0.28571 to 0.30612, saving model to /kaggle/working/model_grammar.h5
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.30612
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.30612
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.30612
Epoch 

 83%|████████▎ | 5/6 [2:12:05<26:27, 1587.42s/it]


Epoch 00010: val_accuracy improved from 0.31673 to 0.31801, saving model to /kaggle/working/model_grammar.h5
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.29592, saving model to /kaggle/working/model_conventions.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.29592
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.29592
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.29592
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.29592
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.29592
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.29592
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.29592
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.29592
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.29592
Epoch 1/10

Epoch 00001: val_accuracy improved from 0.29592 to 0.29757, saving model to /kaggle/working/model_conventions.h5
Epoch 2/10

Epoch 00002: val_accuracy d

100%|██████████| 6/6 [2:38:13<00:00, 1582.25s/it]


Epoch 00010: val_accuracy did not improve from 0.31162





In [11]:
test = pd.read_csv(DATA/'test.csv')
sample = pd.read_csv(DATA/'sample_submission.csv')
sample.text_id = test.text_id
test.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [12]:
test.full_text = preprocess(test.full_text)
x_test = token.texts_to_sequences(test.full_text)
x_test = pad_sequences(x_test, padding='post', maxlen=max_len)

100%|██████████| 3/3 [00:00<00:00, 11554.56it/s]


In [13]:
def label_transform(pred):
    labels = []
    for i in range(len(pred)):
        max_p = max(pred[i])
        for j in range(num_classes):
            if max_p == pred[i][j]:
                ind = (j + 2) / 2
                break
        labels.append(ind)
    return labels

for col in tqdm(target_col):
    model.load_weights(model_save)
    test_pred = model.predict(x_test,
                              batch_size = batch_size,
                              verbose = 1)
    sample[col] = label_transform(test_pred)

  0%|          | 0/6 [00:00<?, ?it/s]



 17%|█▋        | 1/6 [00:00<00:01,  4.57it/s]



 50%|█████     | 3/6 [00:00<00:00,  7.94it/s]



 67%|██████▋   | 4/6 [00:00<00:00,  8.51it/s]



100%|██████████| 6/6 [00:00<00:00,  8.78it/s]


In [14]:
sample.to_csv('submission.csv', index=False)