In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

# Load the labelled feedback sentences: one row per sentence, with the text in
# `Input` and the class label in `Sentiment`.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive.
df_train = pd.read_csv('/content/drive/MyDrive/training.csv', sep=',')

# A bare last expression renders the preview as a rich table instead of plain text.
df_train.head()

                                               Input Sentiment
0          I can't understand the method of teaching  Negative
1  The instructor was interested in the students ...  Positive
2  The instructor don't use any examples for expl...  Negative
3          Teaching is good but always late to class   Neutral
4                               Explaination is poor  Negative


In [None]:
# Class balance check: number of sentences per sentiment label.
df_train.Sentiment.value_counts()

Positive    576
Negative    493
Neutral     233
Name: Sentiment, dtype: int64

In [None]:
# True if at least one row has a missing sentiment label.
df_train['Sentiment'].isna().any()

False

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# Integer class id for each sentiment label.
encoded_dict = {'Neutral': 0, 'Positive': 1, 'Negative': 2}

# Values that are not keys of the mapping (in particular, ids produced by an
# earlier run of this cell) pass through unchanged. Re-running the cell is
# therefore a no-op; a plain `.map(encoded_dict)` would turn the whole column
# into NaN on its second run.
_encoded_sentiment = df_train['Sentiment'].map(
    lambda label: encoded_dict.get(label, label)
)

# Fail loudly on anything that is not a known class id (misspelt labels, NaN, ...)
# instead of letting it leak into the training targets.
_unknown_labels = set(_encoded_sentiment.unique()) - set(encoded_dict.values())
if _unknown_labels:
    raise ValueError(
        f"Unexpected values in 'Sentiment' column: {sorted(map(str, _unknown_labels))}"
    )

df_train['Sentiment'] = _encoded_sentiment.astype(int)

In [None]:
# Inspect the frame after label encoding (`Sentiment` now holds the mapped class ids).
df_train

Unnamed: 0,Input,Sentiment
0,I can't understand the method of teaching,2
1,The instructor was interested in the students ...,1
2,The instructor don't use any examples for expl...,2
3,Teaching is good but always late to class,0
4,Explaination is poor,2
...,...,...
1297,All is there except indoor games,0
1298,All is well except fitness equipments,0
1299,Everything is fine except ground condition,0
1300,Ground is in best condition but no gaming equi...,0


In [None]:
# One-hot encode the integer class ids. `num_classes` is pinned to the size of
# the label mapping so the target width always equals the number of classes,
# even if the highest class id happens to be absent from the data (by default
# the width is inferred from the largest id present).
y_train = to_categorical(df_train.Sentiment, num_classes=len(encoded_dict))

In [None]:
# Peek at the one-hot target matrix built from the encoded `Sentiment` column.
y_train

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [None]:
from transformers import AutoTokenizer,TFBertModel
# Single source of truth for the pretrained checkpoint, so the tokenizer's
# vocabulary and the encoder's weights can never come from different models.
BERT_CHECKPOINT = 'bert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Tokenise every training sentence in one batch and return TensorFlow tensors.
# Sequences are truncated at 70 tokens; `padding=True` pads the batch to a
# common length. Only `input_ids` and `attention_mask` are requested
# (token type ids are switched off).
x_train = tokenizer(
    text=df_train['Input'].tolist(),
    max_length=70,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
    verbose=True,
)

In [None]:
# Convenience handles on the two tensors produced by the tokenizer.
input_ids, attention_mask = x_train['input_ids'], x_train['attention_mask']

In [None]:
# Inspect the token-id tensor; its second dimension is the padded sequence length.
x_train['input_ids']

<tf.Tensor: shape=(1302, 24), dtype=int32, numpy=
array([[  101,   146,  1169, ...,     0,     0,     0],
       [  101,  1109, 10332, ...,     0,     0,     0],
       [  101,  1109, 10332, ...,     0,     0,     0],
       ...,
       [  101,  5268,  1110, ...,     0,     0,     0],
       [  101,  8149,  1110, ...,     0,     0,     0],
       [  101,  1302,  4778, ...,     0,     0,     0]], dtype=int32)>

In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [None]:
# Sequence length the network is built for. It is read from the tokenised
# training batch instead of being hard-coded, so the input layers cannot drift
# out of sync with `x_train` when the data or tokenizer settings change.
max_len = int(x_train['input_ids'].shape[1])

# Symbolic inputs; the layer names match the dict keys used to feed the model.
# NOTE: `input_ids` is rebound here from the token-id tensor to a Keras Input.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Fine-tune the BERT encoder together with the classification head. Setting the
# flag on the layer object itself avoids depending on its position in `model.layers`.
bert.trainable = True

# Element 0 of the BERT output is the per-token hidden state sequence.
embeddings = bert(input_ids, attention_mask=input_mask)[0]

# Classification head: max-pool over the token axis, then two dense layers.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32, activation='relu')(out)

# One softmax unit per sentiment class, i.e. the model outputs probabilities.
y = Dense(len(encoded_dict), activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)

NameError: ignored

In [None]:
# Print the layer-by-layer architecture and parameter counts of the assembled model.
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 24)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 24)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 24,                                          

In [None]:
# Optimizer for fine-tuning.
# NOTE(review): `decay` is Keras' legacy learning-rate decay argument, not weight
# decay, and newer Keras optimizers may reject it. Confirm the intended
# TensorFlow version and whether weight decay was actually meant.
optimizer = Adam(
    learning_rate=5e-05,  # this learning rate is for bert model , taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model's final layer applies softmax, so the loss receives probabilities,
# not logits. `from_logits=True` would apply a second softmax inside the loss
# and Keras warns about that mismatch.
loss = CategoricalCrossentropy(from_logits=False)

# Plain categorical accuracy, named for what it is (it is not class-balanced).
# A real list replaces the one-element tuple a trailing comma used to create.
metric = [CategoricalAccuracy('categorical_accuracy')]

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

In [None]:
# Fine-tune for one epoch on the full training set, feeding the tokenizer
# outputs by input-layer name; the returned History object is kept for inspection.
train_history = model.fit(
    x={
        'input_ids': x_train['input_ids'],
        'attention_mask': x_train['attention_mask'],
    },
    y=y_train,
    batch_size=36,
    epochs=1,
)

  return dispatch_target(*args, **kwargs)




In [None]:
# Classify one sentence typed by the user and print a score for every class.
texts = input('input the text')

# Pad/truncate to exactly the sequence length the model's input layers were
# built with (`max_len`), rather than repeating that number as a literal.
x_val = tokenizer(
    text=texts,
    add_special_tokens=True,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

# The model outputs softmax probabilities; scale them to percentages.
validation = model.predict(
    {'input_ids': x_val['input_ids'], 'attention_mask': x_val['attention_mask']}
) * 100

# Look each score up by its class id, so the label/score pairing does not depend
# on the insertion order of `encoded_dict` matching the id order.
for key, class_id in encoded_dict.items():
    print(key, validation[0][class_id])