In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import tensorflow as tf
from nltk import word_tokenize, RegexpTokenizer
import gensim.downloader as api

In [2]:
data_train = pd.read_csv('./data/train.csv')

In [3]:
import pandas as pd
import numpy as np
import os
import string
import tensorflow as tf
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords

In [4]:
data_train.describe()

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
count,3911.0,3911.0,3911.0,3911.0,3911.0,3911.0
mean,3.127077,3.028254,3.235745,3.11685,3.032856,3.081053
std,0.662542,0.644399,0.583148,0.655997,0.699841,0.67145
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.5,2.5,3.0,2.5,2.5,2.5
50%,3.0,3.0,3.0,3.0,3.0,3.0
75%,3.5,3.5,3.5,3.5,3.5,3.5
max,5.0,5.0,5.0,5.0,5.0,5.0


In [5]:
full_text = data_train['full_text']
cohesion = data_train['cohesion']
syntax = data_train['syntax']
vocabulary = data_train['vocabulary']
phraseology = data_train['phraseology']
grammar = data_train['grammar']
conventions = data_train['conventions']

In [6]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/arpan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def CleanText(sentences):
    sentences = sentences.apply(lambda sequence : 
                                [ltrs.lower() for ltrs in sequence if ltrs not in string.punctuation])
    sentences = sentences.apply(lambda wrd: ''.join(wrd))
    sentences = sentences.apply(lambda sequence: 
                                [word for word in sequence.split() if word not in stop_words])
    sentences = sentences.apply(lambda wrd: ' '.join(wrd))
    return sentences

def CleanFeatures(sentences):
    sentences = sentences.apply(lambda sequence:
                                            [ltrs.lower() for ltrs in sequence if ltrs not in string.punctuation])
    sentences = sentences.apply(lambda wrd: ''.join(wrd))
    sentences = sentences.apply(lambda sequence:
                                            [word for word in sequence.split() if word not in stop_words])
    sentences = sentences.apply(lambda wrd: ' '.join(wrd))
    return sentences


In [8]:
clean_text = CleanFeatures(full_text)
clean_text

0       think students would benefit learning homebeca...
1       problem change let best matter happening chang...
2       dear principal u change school policy grade b ...
3       best time life become agree greatest accomplis...
4       small act kindness impact people change people...
                              ...                        
3906    believe using cellphones class education us go...
3907    working alone students argue decission proyect...
3908    problem chance best think quote cant best ever...
3909    many people disagree albert schweitzers quote ...
3910    think failure main thing people consist goals ...
Name: full_text, Length: 3911, dtype: object

In [9]:
from transformers import AutoTokenizer

In [10]:
list_words = [len(text.split()) for text in clean_text]
#seq_len = max(list_words)
seq_len = 512
seq_len

512

In [11]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [12]:
input_ids = []
attention_mask = []
for text in clean_text:
    tokens = tokenizer.encode_plus(text, max_length=seq_len, padding='max_length',
                         truncation=True, return_token_type_ids= True,
                         return_tensors = 'np')
    input_ids.append(tokens['input_ids'])
    attention_mask.append(tokens['attention_mask'])

In [13]:
input_ids = np.asarray(input_ids)
attention_mask = np.asarray(attention_mask)
input_ids.shape

(3911, 1, 512)

In [14]:
input_ids = np.reshape(input_ids, (input_ids.shape[0], input_ids.shape[2]))
attention_mask = np.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[2]))
print(input_ids.shape)
print(attention_mask.shape)

(3911, 512)
(3911, 512)


In [15]:
syntax = np.asarray(syntax)
cohesion = np.asarray(cohesion)
vocabulary = np.asarray(vocabulary)
phraseology = np.asarray(phraseology)
grammar = np.asarray(grammar)
conventions = np.asarray(conventions)

## Bert Model

In [16]:
from transformers import TFBertModel
bert = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [17]:
from keras.layers import LSTM, Lambda
from keras.layers import add, maximum, subtract, minimum

In [21]:
def resnet(inputs, units):
    x = tf.keras.layers.LSTM(units,return_sequences = True, dropout=0.15)(inputs)
    return x

In [22]:
units = 32
input_ids_m = tf.keras.layers.Input(shape = (seq_len, ), dtype = 'int32')
attention_mask_n = tf.keras.layers.Input(shape = (seq_len, ),  dtype = 'int32')
bert_m = bert(input_ids_m, attention_mask = attention_mask_n)[0]
x = resnet(bert_m, units)
for stack in range(3):
    for block in range(1):
        y = resnet(x, units)
        if stack > 0 and block == 0:
            x = tf.keras.layers.LSTM(units, return_sequences = True, dropout=0.15, recurrent_dropout=0.15)(x)
        x = minimum([x, y])
    units *=2
x1 = tf.keras.layers.GlobalAveragePooling1D()(x)
x2 = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.concatenate([x1, x2], name="our_param")
y = tf.keras.layers.Dense(1,  name='syntax')(x)
y2 = tf.keras.layers.Dense(1, name='cohesion')(x)
y3 = tf.keras.layers.Dense(1, name='vocabulary')(x)
m = tf.keras.models.Model(inputs = [input_ids_m, attention_mask_n], outputs = [y, y2, y3])

In [23]:
m.layers[2].trainable = False

In [24]:
m.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_3[0][0]',                
                                thPoolingAndCrossAt               'input_4[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

In [25]:
tf.keras.utils.plot_model(m, show_shapes=True, 
                          show_dtype=False, 
                          show_layer_names=True, 
                          expand_nested=True,
                          show_layer_activations=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [26]:
m.compile(loss="mse", optimizer= "adam")

In [27]:
history_training = m.fit([input_ids, attention_mask],
                         y = [syntax, cohesion, vocabulary],
                         batch_size= 16,
                         validation_split = 0.05, 
                         epochs= 6)

Epoch 1/6


2022-11-06 17:28:04.981457: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [28]:
m.save('bert_model.h5')

In [37]:
m.save_weights('bert_model_weights.h5')

In [38]:
units = 32
input_ids_m = tf.keras.layers.Input(shape = (seq_len, ), dtype = 'int32')
attention_mask_n = tf.keras.layers.Input(shape = (seq_len, ),  dtype = 'int32')
bert_m = bert(input_ids_m, attention_mask = attention_mask_n)[0]
x = resnet(bert_m, units)
for stack in range(3):
    for block in range(1):
        y = resnet(x, units)
        if stack > 0 and block == 0:
            x = tf.keras.layers.LSTM(units, return_sequences = True, dropout=0.15, recurrent_dropout=0.15)(x)
        x = minimum([x, y])
    units *=2
x1 = tf.keras.layers.GlobalAveragePooling1D()(x)
x2 = tf.keras.layers.GlobalMaxPool1D()(x)
x = tf.keras.layers.concatenate([x1, x2], name="our_param")
y = tf.keras.layers.Dense(1,  name='syntax')(x)
y2 = tf.keras.layers.Dense(1, name='cohesion')(x)
y3 = tf.keras.layers.Dense(1, name='vocabulary')(x)
test_model = tf.keras.models.Model(inputs = [input_ids_m, attention_mask_n], outputs = [y, y2, y3])

In [39]:
test_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_5[0][0]',                
                                thPoolingAndCrossAt               'input_6[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                         

In [40]:
test_model.load_weights('bert_model_weights.h5')

In [41]:
test_model

<keras.engine.functional.Functional at 0x291a35d30>