In [1]:
import pandas as pd
import numpy as np

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import Input, Conv1D, GlobalMaxPool1D 
from keras.layers import BatchNormalization, concatenate
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import CategoricalCrossentropy, MeanSquaredError
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras import optimizers

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

from pathlib import Path
from tqdm import tqdm
import re

In [2]:
ROOT = Path.cwd().parent
INPUT = ROOT/'input'
DATA = INPUT/'feedback-prize-english-language-learning'
WORK = ROOT/'working'
VECS = INPUT/'vectors'

In [3]:
target_col = ['cohesion', 'syntax', 'vocabulary',
              'phraseology', 'grammar', 'conventions']
max_len = 1440 #200
batch_size = 32
epochs = 10
max_words =30000
num_classes = 9

In [4]:
def decontractions(phrase):
    phrase = re.sub(r"wan\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", "not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def preprocess(text):
    preprocessed = []
    for sentence in tqdm(text.values):
        sentence = str(sentence)
        sent = sentence.replace('\n\n', ' ')
        sent = decontractions(sent)
        preprocessed.append(sent.lower().strip())
    return preprocessed

In [5]:
def mcrmse(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true- y_pred), axis=0)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=0)

In [6]:
def model_init(max_len, max_words, embedding_matrix):
    input_1 = Input(shape=(max_len,))
    embed = Embedding(input_dim=max_words,
                      output_dim=128,
                      input_length=max_len,
                      weights=[embedding_matrix],
                      trainable=False)(input_1)

    branches = []
    x = Dropout(0.2)(embed)
    for i in range(2, 6):
        branch = Conv1D(128, i,
                        padding = 'valid',
                        activation = 'relu')(x)
        branch = GlobalMaxPool1D()(branch)
        branches.append(branch)

    x = concatenate(branches, axis=1)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(9)(x)
    output = Activation('softmax')(x)

    model = Model(inputs = [input_1],
                  outputs = [output])
    
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam',
                  metrics = mcrmse)
    return model

In [7]:
df_train = pd.read_csv(DATA/'train.csv')
df_train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [8]:
df_train.full_text = preprocess(df_train.full_text)

token = Tokenizer()
token.fit_on_texts(df_train.full_text)
df_train['token'] = token.texts_to_sequences(df_train.full_text)
x_train = pad_sequences(df_train.token, maxlen=max_len, padding='post')

def extract_vectors(x):
    vecs = vec.transform(x)
    return vecs.toarray().flatten()

vec = TfidfVectorizer(max_features=5000, smooth_idf=True)
vec.fit(df_train.full_text)
df_train['vec'] = df_train.full_text.apply(lambda x: extract_vectors([x]))

100%|██████████| 3911/3911 [00:00<00:00, 30236.31it/s]


In [9]:
embedding_matrix = np.load(VECS/'embedding.npy')
vocab_size = len(token.word_index) + 1

In [10]:
for col in target_col:
    y_train = to_categorical(df_train[col]*2-2, num_classes)
    model_save = WORK/f'model_{col}.h5'
    checkpoint = ModelCheckpoint(model_save,
                                 monitor = 'val_mcrmse',
                                 save_best_only = True,
                                 verbose = 1)
    model = model_init(max_len, vocab_size, embedding_matrix)
    history = model.fit(x_train, y_train,
                        batch_size = batch_size,
                        epochs = epochs,
                        validation_split = 0.1,
                        callbacks = [checkpoint],
                        verbose = 1)
    model.load_weights(model_save)
    model.layers[1].trainable = True
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam',
                  metrics = mcrmse)
    history_2 = model.fit(x_train, y_train,
                          batch_size = batch_size,
                          epochs = epochs,
                          validation_split = 0.2,
                          callbacks = [checkpoint],
                          verbose = 1)

2022-10-19 09:57:28.316181: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-10-19 09:57:28.584452: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10

Epoch 00001: val_mcrmse improved from inf to 0.23406, saving model to /kaggle/working/model_cohesion.h5
Epoch 2/10

Epoch 00002: val_mcrmse did not improve from 0.23406
Epoch 3/10

Epoch 00003: val_mcrmse improved from 0.23406 to 0.23251, saving model to /kaggle/working/model_cohesion.h5
Epoch 4/10

Epoch 00004: val_mcrmse did not improve from 0.23251
Epoch 5/10

Epoch 00005: val_mcrmse did not improve from 0.23251
Epoch 6/10

Epoch 00006: val_mcrmse did not improve from 0.23251
Epoch 7/10

Epoch 00007: val_mcrmse did not improve from 0.23251
Epoch 8/10

Epoch 00008: val_mcrmse improved from 0.23251 to 0.23236, saving model to /kaggle/working/model_cohesion.h5
Epoch 9/10

Epoch 00009: val_mcrmse did not improve from 0.23236
Epoch 10/10

Epoch 00010: val_mcrmse did not improve from 0.23236
Epoch 1/10

Epoch 00001: val_mcrmse did not improve from 0.23236
Epoch 2/10

Epoch 00002: val_mcrmse did not improve from 0.23236
Epoch 3/10

Epoch 00003: val_mcrmse did not improve from 0

In [11]:
feats = []
y_cols = []
for i, row in tqdm(df_train.iterrows()):
    feats.append(row.vec)
    y_cols.append(row[target_col].astype(float))
feats = np.array(feats)
y_cols = np.array(y_cols)

3911it [00:02, 1371.02it/s]


In [12]:
test = pd.read_csv(DATA/'test.csv')
sample = pd.read_csv(DATA/'sample_submission.csv')
sample.text_id = test.text_id
test.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [13]:
test.full_text = preprocess(test.full_text)
x_test = token.texts_to_sequences(test.full_text)
x_test = pad_sequences(x_test, padding='post', maxlen=max_len)

test['vec'] = test.full_text.apply(lambda x: extract_vectors([x]))
test_f = []
for i, row in tqdm(test.iterrows()):
    test_f.append(row.vec)
test_f = np.array(test_f)

100%|██████████| 3/3 [00:00<00:00, 7561.85it/s]
3it [00:00, 2062.43it/s]


In [14]:
#model = model_init(max_len, vocab_size, embedding_matrix)

In [15]:
def label_transform(pred):
    labels = []
    for i in range(len(pred)):
        max_p = max(pred[i])
        for j in range(num_classes):
            if max_p == pred[i][j]:
                ind = (j + 2) / 2
                break
        labels.append(ind)
    return labels

for col in target_col:
  #  model_save = VECS/f'model_{col}.h5'
    model.load_weights(model_save)
    test_pred = model.predict(x_test,
                              batch_size = batch_size,
                              verbose = 1)
    test[f'{col}_cnn'] = label_transform(test_pred)



In [16]:
for i in tqdm(range(6)):
    svr = SVR(C=1)
    svr.fit(feats, y_cols[:,i])
    pred_s = svr.predict(test_f)
    test[f'{target_col[i]}_svr'] = pred_s

    ridge = Ridge(alpha=0.1)
    ridge.fit(feats, y_cols[:,i])
    pred_r = ridge.predict(test_f)
    test[f'{target_col[i]}_ridge'] = pred_r

100%|██████████| 6/6 [04:51<00:00, 48.66s/it]


In [17]:
for col in target_col:
    sample[col] = (test[f'{col}_cnn'] +
                   test[f'{col}_svr'] +
                   test[f'{col}_ridge']) / 3
sample.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.893385,2.717177,2.98352,2.868692,2.687239,2.806138
1,000BAD50D026,2.859122,2.872375,2.864013,2.744721,2.639232,3.107684
2,00367BB2546B,3.420841,3.365519,3.551246,3.445748,3.476263,3.412227


In [18]:
sample.to_csv('submission.csv', index=False)