In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

This notebook is a followup on the approach on the baseline model submitted for the competition. Necessary EDA work has already been done on [that notebook](https://www.kaggle.com/code/adewoleakorede/feedback-prize-baseline1), so this and any future ones will simply go to making more sophisticated models.

In [2]:
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_dataset(dataset, train = True):
    """Utility function to preprocess dataset to expected tensors by the model"""
    inputs = dataset['full_text'].values.tolist()
    
    num_words = 20000
    tokenizer = Tokenizer(num_words = num_words)
    tokenizer.fit_on_texts(inputs)
    word_index = tokenizer.word_index # required for embedding
    
    sequences = tokenizer.texts_to_sequences(inputs)
    maxlen = 512
    inputs = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    inputs = np.array(inputs)
    
    if train:
        targets = dataset[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']].values.tolist()
        targets = np.array(targets)
        return inputs, targets, word_index
    else:
        return inputs # test does not have targets

In [4]:
inputs, targets, word_index = preprocess_dataset(train)

In [5]:
inputs.shape, targets.shape

((3911, 512), (3911, 6))

In [6]:
test = preprocess_dataset(test, train=False)

## ML Models

The approach of this notebook is to use the glove pretrained embeddings on the task for a higher score.

In [7]:
num_words = 20000

In [8]:
model = models.Sequential(name='pretrained')
model.add(layers.Embedding(input_dim=num_words, output_dim=200, input_length=512))
model.add(layers.LSTM(64, dropout=0.2, return_sequences=False))
model.add(layers.Dense(6))

2022-09-01 20:38:42.321385: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-01 20:38:42.437250: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-01 20:38:42.438034: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-01 20:38:42.439236: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [9]:
model.summary()

Model: "pretrained"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 200)          4000000   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                67840     
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
Total params: 4,068,230
Trainable params: 4,068,230
Non-trainable params: 0
_________________________________________________________________


In [10]:
path_to_embedding = '../input/glove-embeddings/glove.6B.200d.txt'

embedding_index = {} # create a dictionary to store the index
f = open(path_to_embedding)

for line in f:
    values = line.split()
    words = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embedding_index[words] = coefs
f.close()  
print(len(embedding_index))

400000


In [11]:
embedding_dim = 200
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, index in word_index.items():
    if index < num_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [12]:
embedding_matrix.shape

(20000, 200)

In [13]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [14]:
model.summary()

Model: "pretrained"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 200)          4000000   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                67840     
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
Total params: 4,068,230
Trainable params: 68,230
Non-trainable params: 4,000,000
_________________________________________________________________


In [15]:
model.compile(loss='mse', optimizer='adam', metrics='acc')
history = model.fit(x=inputs, y=targets, epochs=20)

Epoch 1/20


2022-09-01 20:39:03.235997: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-09-01 20:39:05.466030: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
# evaluate

preds = model.predict(test)
predictions = preds.tolist()

In [17]:
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')

In [18]:
submission = pd.DataFrame(predictions, columns=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'])
submission = pd.concat([test['text_id'], submission], axis=1)
submission

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.970589,2.750278,3.039046,2.731361,2.644364,2.960699
1,000BAD50D026,3.158048,3.109138,3.244158,3.164521,3.161529,3.186002
2,00367BB2546B,3.482982,3.441391,3.539438,3.520582,3.522839,3.571836


In [19]:
submission.to_csv('submission.csv', index=False)