In [None]:
# Install Hugging Face Transformers into the notebook runtime.
# The captured output below shows this resolved to transformers 4.24.0.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 5.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 47.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 6.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
# NOTE(review): the transformers install above already pulled in `tokenizers` as a
# dependency, so this cell is presumably redundant — confirm before removing.
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
# import tensorflow_text as text
import transformers

import matplotlib.pyplot as plt

# tf.get_logger().setLevel('ERROR')


In [None]:
# Load the essay dataset: one row per essay with its id, the full text and six
# rubric scores (cohesion, syntax, vocabulary, phraseology, grammar, conventions).
# NOTE(review): the path is hard-coded to the Colab runtime's /content directory.
data = pd.read_csv('/content/sample_data/dataset.csv')
data

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


In [None]:
# Names of the six score columns the model regresses on; their order is the
# order of the model's outputs and of the prediction columns built later.
col = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# Label matrix as float32: one row per essay in `data`, one column per name in `col`.
targets = data[col].to_numpy(dtype="float32")

In [None]:
MAX_LEN = 512  # every essay is truncated/padded to this many tokens


def bert_encode(texts, tokenizer, max_len):
    """Tokenize each text to a fixed length and stack the results.

    Args:
        texts: iterable of strings; each element is tokenized on its own.
        tokenizer: callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', add_special_tokens=True)``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays, with one row per
        text in the order the texts were given.
    """
    encodings = [
        tokenizer(
            text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
        )
        for text in texts
    ]

    ids = np.array([enc['input_ids'] for enc in encodings])
    masks = np.array([enc['attention_mask'] for enc in encodings])
    return ids, masks

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel

# Hugging Face checkpoint used for both the tokenizer and the encoder.
ROBERTA_MODEL = "roberta-base"

# Despite the `bert_` prefix, this is the RoBERTa tokenizer.
# Its files are also written to ./tokenizer/.
bert_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_MODEL)
bert_tokenizer.save_pretrained('./tokenizer/')

# Fetch the checkpoint's configuration ...
cfg = transformers.AutoConfig.from_pretrained(ROBERTA_MODEL)

# ... and store it next to the tokenizer files.
# NOTE(review): presumably so both can be reloaded without network access — confirm.
cfg.save_pretrained('./tokenizer/')


# Pretrained RoBERTa encoder (TensorFlow weights) that build_model wraps.
bert_model = TFRobertaModel.from_pretrained(ROBERTA_MODEL )

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
# Tokenize only the first 2000 essays for training.
# NOTE(review): `targets` was built from every row of `data`, so it is longer than
# these inputs — make sure the labels passed to `fit` cover the same rows.
train_input_ids,train_attention_masks = bert_encode(data['full_text'][:2000], bert_tokenizer, MAX_LEN)

In [None]:
# Mean absolute error is used both as the training loss and as the reported metric.
loss = tf.keras.losses.mae
metrics = tf.keras.metrics.mae


def build_model(model_layer, learning_rate, dense_dim = 6):
    """Wrap a pretrained transformer encoder in a Keras score-regression model.

    Args:
        model_layer: encoder called as ``model_layer([input_ids, attention_masks])``;
            element 1 of its output is fed to the regression head.
        learning_rate: learning rate for the Adam optimizer.
        dense_dim: number of scores to predict (width of the output layer).

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_masks]``
        (each of length ``MAX_LEN``) and returning ``dense_dim`` values per
        example in the range 1.0-5.0.
    """
    token_shape = (MAX_LEN,)

    # Two integer inputs: token ids and the matching attention mask.
    input_ids = tf.keras.Input(shape=token_shape, dtype='int64')
    attention_masks = tf.keras.Input(shape=token_shape, dtype='int64')

    # Run the pretrained encoder on both inputs.
    encoder_outputs = model_layer([input_ids, attention_masks])

    # Index 1 of the encoder output is the pooled sequence representation
    # (pooler_output); index 0 would be the per-token last_hidden_state.
    pooled = encoder_outputs[1]

    # Sigmoid keeps each score in (0, 1); rescaling by 4 and shifting by 1 maps
    # that onto the 1.0-5.0 rubric scale.
    unit_scores = tf.keras.layers.Dense(dense_dim, activation='sigmoid')(pooled)
    output = tf.keras.layers.Rescaling(scale=4.0, offset=1.0)(unit_scores)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    model.compile(optimizer, loss=loss, metrics=metrics)
    return model

In [None]:
# Build the regression model around the pretrained encoder loaded above.
# The variable is named BERT, but the wrapped encoder is the TFRobertaModel in `bert_model`.
BERT= build_model(bert_model,learning_rate=1e-5)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
BERT.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  124645632  ['input_1[0][0]',                
 el)                            thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [None]:
# `targets` holds one row per row of `data`, while the encoded inputs cover only
# the first essays. Slice the labels to the same rows so x and y are aligned by
# construction instead of depending on how `fit` treats mismatched lengths.
train_targets = targets[:len(train_input_ids)]

# Fine-tune for 2 epochs with small batches; the last 20% of the training rows
# are held out by Keras for validation.
history = BERT.fit(
    (train_input_ids, train_attention_masks),
    train_targets,
    batch_size=4,
    epochs=2,
    validation_split=0.2,
)

Epoch 1/2
Epoch 2/2


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Export the trained model in SavedModel format to the runtime-local
# directory `roberta-base-essay`.
tf.saved_model.save(BERT, 'roberta-base-essay')



In [None]:
# Reload the export that was just written.
# NOTE(review): `saved_model` is not referenced by the cells visible here; the
# predictions further down call `BERT` directly.
saved_model = tf.saved_model.load('roberta-base-essay')

In [None]:
# Hold out the last 50 rows of `data` as an evaluation set; these rows lie
# outside the first 2000 essays that were tokenized for training.
test_df = data[-50:]

In [None]:
# Renumber the held-out rows from 0 and discard the old row labels, so the frame
# lines up positionally with the prediction array built later.
test_df = test_df.reset_index(drop=True)

In [None]:
# Display the held-out rows with their reference scores.
test_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,FE3F2F729D98,To whom read it.\n\nFirst impressions are poss...,2.5,2.5,2.5,2.0,2.5,2.5
1,FE459F8CF4CE,"""A problem is a chance for you to do your best...",3.0,2.0,4.0,3.0,3.0,3.0
2,FE4E3BB2DAE1,People really work on this this? or People kno...,2.5,2.0,3.0,3.0,2.0,3.5
3,FE60E597467E,What is positive attitude? Positive attitude i...,4.0,4.0,4.0,4.5,4.5,5.0
4,FE69FBDD5681,Is learning online a gift? Students should att...,3.5,3.5,3.5,3.5,4.0,3.5
5,FE6D9B200002,The school ends at 4 o'clock and most of the s...,3.0,2.5,2.5,2.0,2.5,2.0
6,FE6EC11C8877,Should you relied on yourself or other about l...,3.5,3.5,3.0,3.0,2.5,3.5
7,FE700406E7D4,I think the people have to be guidance for the...,3.0,2.5,2.0,2.5,2.5,3.0
8,FE722240CBB7,Occupations through internships and shadowing ...,3.5,3.0,3.0,3.0,2.5,3.5
9,FE7408E48786,Students get their longest break on summer whe...,4.0,4.0,3.5,4.0,3.5,3.5


In [None]:
# Tokenize the held-out essays with the same tokenizer and length used for training.
test_ids,test_masks = bert_encode(test_df['full_text'], bert_tokenizer, MAX_LEN)

In [None]:
# Predict the six scores for every held-out essay (one row per essay).
pred = BERT.predict([test_ids,test_masks])



In [None]:
# Put the essay ids next to the raw predictions. The prediction columns are
# labelled with `col`, and rows line up because both frames use a 0..n-1 index.
sub_df = pd.concat([test_df[['text_id']], pd.DataFrame(pred, columns=col)], axis=1)
sub_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,FE3F2F729D98,2.908597,2.580027,2.904452,2.659817,2.486815,2.856315
1,FE459F8CF4CE,3.756635,3.582624,3.641634,3.738405,3.649222,3.65561
2,FE4E3BB2DAE1,3.18861,2.823373,3.141181,3.022999,2.841998,3.132362
3,FE60E597467E,4.349122,4.262636,4.194976,4.403225,4.147082,4.107693
4,FE69FBDD5681,4.144053,3.988244,3.668585,4.042815,3.952818,3.97248
5,FE6D9B200002,3.054974,2.755029,2.927143,2.788334,2.656858,2.744283
6,FE6EC11C8877,3.290707,3.008377,3.143275,3.173102,3.08332,2.935103
7,FE700406E7D4,2.992458,2.609624,2.886422,2.570939,2.492879,2.940192
8,FE722240CBB7,3.533162,3.360369,3.378481,3.445864,3.340234,3.464787
9,FE7408E48786,4.134415,4.032102,3.871269,4.087649,3.847613,3.522392


In [None]:
# Work on a copy: plain assignment would only alias `sub_df`, so rounding `y`
# in place would silently overwrite the raw predictions held in `sub_df` too.
y = sub_df.copy()

In [None]:
# Snap every predicted score to the nearest half point, matching the 0.5-step
# grid of the reference scores: double, round to an integer, halve.
# This is done for all score columns at once instead of cell by cell. Ties are
# rounded half-to-even, the same rule the built-in round() applies.
y[col] = (y[col] * 2).round() / 2.0

In [None]:
# Scratch check of the half-point rounding rule on a single value: 3.7 -> 3.5.
doubled = 3.7 * 2
y1 = round(doubled) / 2.0
y1

3.5

In [None]:
# Show the reference scores again (same frame as displayed earlier).
test_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,FE3F2F729D98,To whom read it.\n\nFirst impressions are poss...,2.5,2.5,2.5,2.0,2.5,2.5
1,FE459F8CF4CE,"""A problem is a chance for you to do your best...",3.0,2.0,4.0,3.0,3.0,3.0
2,FE4E3BB2DAE1,People really work on this this? or People kno...,2.5,2.0,3.0,3.0,2.0,3.5
3,FE60E597467E,What is positive attitude? Positive attitude i...,4.0,4.0,4.0,4.5,4.5,5.0
4,FE69FBDD5681,Is learning online a gift? Students should att...,3.5,3.5,3.5,3.5,4.0,3.5
5,FE6D9B200002,The school ends at 4 o'clock and most of the s...,3.0,2.5,2.5,2.0,2.5,2.0
6,FE6EC11C8877,Should you relied on yourself or other about l...,3.5,3.5,3.0,3.0,2.5,3.5
7,FE700406E7D4,I think the people have to be guidance for the...,3.0,2.5,2.0,2.5,2.5,3.0
8,FE722240CBB7,Occupations through internships and shadowing ...,3.5,3.0,3.0,3.0,2.5,3.5
9,FE7408E48786,Students get their longest break on summer whe...,4.0,4.0,3.5,4.0,3.5,3.5


In [None]:
# Exact-match accuracy on the cohesion score: the fraction of held-out essays
# whose rounded prediction equals the reference score.
# Both series are cast to strings first — presumably so accuracy_score treats
# the half-point scores as class labels rather than continuous values.
y_pred_cohesion = pd.Series(y['cohesion'], dtype="string")
y_cohesion = pd.Series(test_df['cohesion'], dtype="string")

from sklearn.metrics import accuracy_score

print(accuracy_score(y_cohesion,y_pred_cohesion))

0.38


In [None]:
from sklearn.metrics import mean_squared_error
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) holding the
            reference scores.
        y_preds: 2-D array-like of the same shape holding the predicted scores.

    Returns:
        Tuple ``(mcrmse_score, scores)``: ``scores`` is a list with the RMSE of
        each target column, in column order, and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: if either input is not 2-D or the two shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype="float64")
    y_preds = np.asarray(y_preds, dtype="float64")
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            "MCRMSE expects two 2-D inputs of identical shape, got "
            f"{y_trues.shape} and {y_preds.shape}"
        )

    # Per-column RMSE computed directly with NumPy; this avoids relying on the
    # `squared=False` flag of sklearn's mean_squared_error, which newer
    # scikit-learn releases deprecate.
    scores = list(np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores

In [None]:
# Score the held-out predictions against the reference scores.
# Columns are picked by name (`col`) rather than by positional offset, and the
# arguments follow the MCRMSE(y_trues, y_preds) signature order.
# NOTE(review): `y` holds the predictions after rounding to half points, so this
# measures the rounded values, not the raw model output.
y_pred = y[col].values
y_true = test_df[col].values
MCRMSE(y_true, y_pred)

(0.5400548578691723,
 [0.6855654600401044,
  0.5958187643906492,
  0.4301162633521313,
  0.4949747468305833,
  0.5,
  0.5338539126015656])

In [None]:
# Export the trained model again, this time to the mounted Google Drive folder.
tf.saved_model.save(BERT, '/content/drive/MyDrive/nlp-project-models/roberta-base-essay')



In [None]:
# Reload the Drive export; this rebinds `saved_model` from the earlier local load.
saved_model = tf.saved_model.load('/content/drive/MyDrive/nlp-project-models/roberta-base-essay')

In [None]:
str1 = "Learning something new can be a scary experience. One of the hardest things I've ever had to do was learn how to swim. I was always afraid of the water, but I decided that swimming was an important  skill  that  I  should  learn.  I  also  thought  it  would  be  good  exercise  and  help  me  to become physically stronger. What I didn't realize was that learning to swim would also make me a more confident person. New  situations  always  make  me  a  bit  nervous,  and  my  first  swimming  lesson  was  no exception. After I changed into my bathing suit in the locker room, I stood timidly by the side of the  pool  waiting  for  the  teacher  and  other  students  to  show  up.  After  a  couple  of  minutes  the teacher  came  over.  She  smiled  and  introduced  herself,  and  two  more  students  joined  us. Although they were both older than me, they didn't seem to be embarrassed about not knowing how to swim. I began to feel more at ease."

# Tokenize the single sample essay. This rebinds `test_ids`/`test_masks`, which
# previously held the encodings of the 50 held-out essays.
test_ids,test_masks = bert_encode([str1], bert_tokenizer, MAX_LEN)

In [None]:
# Score the single sample essay and snap each predicted score to the nearest
# half point (double, round, halve).
pred = BERT.predict([test_ids,test_masks])
lst = [round(score * 2) / 2.0 for score in pred[0]]

lst

