In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import tensorflow as tf
from nltk import word_tokenize, RegexpTokenizer
import gensim.downloader as api

In [None]:
data_train = pd.read_csv('./data/train.csv')

In [None]:
import pandas as pd
import numpy as np
import os
import string
import tensorflow as tf
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

In [4]:
full_text = data_train['full_text']
cohesion = data_train['cohesion']
syntax = data_train['syntax']
vocabulary = data_train['vocabulary']
phraseology = data_train['phraseology']
grammar = data_train['grammar']
conventions = data_train['conventions']

In [5]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/arpan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def CleanText(sentences):
    sentences = sentences.apply(lambda sequence : 
                                [ltrs.lower() for ltrs in sequence if ltrs not in string.punctuation])
    sentences = sentences.apply(lambda wrd: ''.join(wrd))
    sentences = sentences.apply(lambda sequence: 
                                [word for word in sequence.split() if word not in stop_words])
    sentences = sentences.apply(lambda wrd: ' '.join(wrd))
    return sentences

def CleanFeatures(sentences):
    sentences = sentences.apply(lambda sequence:
                                            [ltrs.lower() for ltrs in sequence if ltrs not in string.punctuation])
    sentences = sentences.apply(lambda wrd: ''.join(wrd))
    sentences = sentences.apply(lambda sequence:
                                            [word for word in sequence.split() if word not in stop_words])
    sentences = sentences.apply(lambda wrd: ' '.join(wrd))
    return sentences


In [7]:
clean_text = CleanFeatures(full_text)
clean_text

0       think students would benefit learning homebeca...
1       problem change let best matter happening chang...
2       dear principal u change school policy grade b ...
3       best time life become agree greatest accomplis...
4       small act kindness impact people change people...
                              ...                        
3906    believe using cellphones class education us go...
3907    working alone students argue decission proyect...
3908    problem chance best think quote cant best ever...
3909    many people disagree albert schweitzers quote ...
3910    think failure main thing people consist goals ...
Name: full_text, Length: 3911, dtype: object

In [8]:
from transformers import AutoTokenizer

In [9]:
list_words = [len(text.split()) for text in clean_text]
#seq_len = max(list_words)
seq_len = 512
seq_len

512

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [11]:
input_ids = []
attention_mask = []
for text in clean_text:
    tokens = tokenizer.encode_plus(text, max_length=seq_len, padding='max_length',
                         truncation=True, return_token_type_ids= True,
                         return_tensors = 'np')
    input_ids.append(tokens['input_ids'])
    attention_mask.append(tokens['attention_mask'])

In [12]:
input_ids = np.asarray(input_ids)
attention_mask = np.asarray(attention_mask)
input_ids.shape

(3911, 1, 512)

In [13]:
input_ids = np.reshape(input_ids, (input_ids.shape[0], input_ids.shape[2]))
attention_mask = np.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[2]))
print(input_ids.shape)
print(attention_mask.shape)

(3911, 512)
(3911, 512)


In [14]:
syntax = np.asarray(syntax)
cohesion = np.asarray(cohesion)
vocabulary = np.asarray(vocabulary)
phraseology = np.asarray(phraseology)
grammar = np.asarray(grammar)
conventions = np.asarray(conventions)

In [15]:
from transformers import TFBertModel
bert = TFBertModel.from_pretrained('bert-base-cased')
from keras.layers import LSTM, Lambda
from keras.layers import add, maximum, subtract, minimum

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
def resnet(inputs, units):
    x = tf.keras.layers.LSTM(units,return_sequences = True, dropout=0.15)(inputs)
    return x

In [17]:
units = 32
input_ids_m = tf.keras.layers.Input(shape = (seq_len, ), dtype = 'int32')
attention_mask_n = tf.keras.layers.Input(shape = (seq_len, ),  dtype = 'int32')
bert_m = bert(input_ids_m, attention_mask = attention_mask_n)[0]
x = resnet(bert_m, units)
for stack in range(3):
    for block in range(1):
        y = resnet(x, units)
        if stack > 0 and block == 0:
            x = tf.keras.layers.LSTM(units, return_sequences = True, dropout=0.15, recurrent_dropout=0.15)(x)
        x = minimum([x, y])
    units *=2
x1 = tf.keras.layers.GlobalAveragePooling1D()(x)
x2 = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.concatenate([x1, x2], name="our_param")
y = tf.keras.layers.Dense(1,  name='syntax')(x)
y2 = tf.keras.layers.Dense(1, name='cohesion')(x)
y3 = tf.keras.layers.Dense(1, name='vocabulary')(x)
test_model = tf.keras.models.Model(inputs = [input_ids_m, attention_mask_n], outputs = [y, y2, y3])

In [18]:
test_model.layers[2].trainable = False

In [19]:
test_model.load_weights('bert_model_weights.h5')

In [20]:
test_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [33]:
inputs = test_model.input
x = test_model.get_layer('our_param')(inputs)
outputs = tf.keras.layers.Dense(6)(x)

model = tf.keras.models.Model(inputs = inputs, outputs = outputs)

In [41]:
model.compile(loss="categorical_crossentropy", optimizer= "adam")
model.layers[0].trainable = False
model.layers[1].trainable = False
model.layers[2].trainable = False


In [44]:
train_size = 3300
x_train = [input_ids[:train_size], attention_mask[:train_size]]
y_train = [syntax[:train_size], vocabulary[:train_size], cohesion[:train_size], 
           phraseology[:train_size], grammar[:train_size], conventions[:train_size]]
x_test = [input_ids[train_size:], attention_mask[train_size:]]
y_test = [syntax[train_size:], vocabulary[train_size:], cohesion[train_size:], 
           phraseology[train_size:], grammar[train_size:], conventions[train_size:]]
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 our_param (Concatenate)        multiple             0           ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 dense_1 (Dense)                (None, 6)            6150        ['our_param[1][0]']        

In [45]:
history = model.fit(x_train,y_train)

ValueError: in user code:

    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/opt/miniconda3/lib/python3.9/site-packages/keras/backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 6) are incompatible


In [38]:
attention_mask.shape

(3911, 512)