In [62]:
import sys
import datetime as dt
import pandas as pd
from transformers import AutoTokenizer,TFBertModel
# Importing Libraries for the Machine Learning Model
from xgboost import XGBClassifier
from lightgbm import LGBMModel,LGBMClassifier, plot_importance
from sklearn.metrics import confusion_matrix, accuracy_score,balanced_accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE, RandomOverSampler
from scipy import sparse
import tensorflow as tf
pd.set_option("display.max_rows",500)
pd.set_option("display.max_columns",500)
tf.config.run_functions_eagerly(True)



In [65]:
print(tf.__version__)

2.4.1


In [66]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [63]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


# Bayer NLP task - BERT Evaluation
Evaluation of the BERT Transformer architecture in NLP Task.
(c) Yijie Xu

In [2]:
sys.path.insert(0, '..') # Set root directory
sys.path.insert(0, '/home/yijie/Documents/Bayer/BayerNLP/scripts')# Set root directory

In [3]:
sys.path

['/home/yijie/Documents/Bayer/BayerNLP/scripts',
 '..',
 '/home/yijie/Documents/Bayer/BayerNLP/notebooks',
 '/home/yijie/anaconda3/lib/python38.zip',
 '/home/yijie/anaconda3/lib/python3.8',
 '/home/yijie/anaconda3/lib/python3.8/lib-dynload',
 '',
 '/home/yijie/anaconda3/lib/python3.8/site-packages']

In [4]:
data =pd.read_csv("../data/sentences_with_sentiment.csv",sep=",")

In [5]:
data

Unnamed: 0,ID,Sentence,Positive,Negative,Neutral
0,1,The results in 2nd line treatment show an ORR ...,1,0,0
1,2,The long duration of response and high durable...,1,0,0
2,3,The median OS time in the updated results exce...,0,0,1
3,4,"Therefore, the clinical benefit in 2nd line tr...",1,0,0
4,5,"The data provided in 1st line, although prelim...",1,0,0
5,6,Taking into account the intrinsic limitation o...,1,0,0
6,7,This medicinal product has been authorised und...,0,0,1
7,8,This means that further evidence on this medic...,0,1,0
8,9,The European Medicines Agency will review new ...,0,0,1
9,10,The CHMP considers the following measures nece...,0,1,0


# Data Inspection and Analysis
Research Questions

• Is the dataset balanced?

• Is the amount of data sufficient for allowing a hold-out dataset?

• Do you have enough data to consider deep neural architectures or
might good feature engineering with more shallow models suffice?

• During the data collection process, for some sentences multiple experts
disagreed on the sentiment of a given sentence, how could you capture
such an ambiguity in your model and potentially notify users about
such unclear instances?

• How does your model come to a specific conclusion, what about model
interpretability?

• Think beyond the pure sentiment analysis of sentences, e.g. how would
you automatically extract relevant sentences from EPARs and ensure
that the analysis is only applied to specific sections? It is worth to
explore some EPARs on the EMA we

In [6]:
print(len(data)) # 266 Rows of data

266


In [7]:
#Classes
data["Positive"].value_counts()

1    160
0    106
Name: Positive, dtype: int64

In [8]:
data['Negative'].value_counts()

0    230
1     36
Name: Negative, dtype: int64

In [9]:
data['Neutral'].value_counts()

0    196
1     70
Name: Neutral, dtype: int64

In [10]:
pos=data["Positive"].value_counts()[1]
neg=data["Negative"].value_counts()[1]
neu= data["Neutral"].value_counts()[1]

In [11]:
#Check that no overlap occurs per row by summing in across columns

data["check"] =data[["Neutral","Negative","Positive"]].sum(axis=1)
data.head()

Unnamed: 0,ID,Sentence,Positive,Negative,Neutral,check
0,1,The results in 2nd line treatment show an ORR ...,1,0,0,1
1,2,The long duration of response and high durable...,1,0,0,1
2,3,The median OS time in the updated results exce...,0,0,1,1
3,4,"Therefore, the clinical benefit in 2nd line tr...",1,0,0,1
4,5,"The data provided in 1st line, although prelim...",1,0,0,1


# Data Cleaning and Processing
Perform a Null check and proceed with statistical features and other cleaning.
Some basic NLP processes, such as stemming and polarity, also done.

In [12]:
data.isnull().values.any()

False

In [12]:
#Lets combine all of the sentiments into a single polarity feature 
# Three classes here, 1,0, -1
def convert_sentiment(row):
    if row['Positive'] == 1:
        val = 1
    elif row['Negative'] ==1:
        val=-1
    else:
        val=0
    return val


In [13]:
data["sentiment"]=data.apply(lambda row: convert_sentiment(row), axis=1)

In [14]:
data.head()

Unnamed: 0,ID,Sentence,Positive,Negative,Neutral,check,sentiment
0,1,The results in 2nd line treatment show an ORR ...,1,0,0,1,1
1,2,The long duration of response and high durable...,1,0,0,1,1
2,3,The median OS time in the updated results exce...,0,0,1,1,0
3,4,"Therefore, the clinical benefit in 2nd line tr...",1,0,0,1,1
4,5,"The data provided in 1st line, although prelim...",1,0,0,1,1


In [15]:
#Data cleaning and standardization
from utils import processing

[nltk_data] Downloading package stopwords to /home/yijie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
data_processed = processing(data)
data=data_processed

In [17]:
#Fetch max length for tokenization process
print("max len of sentence",max([len(x.split()) for x in data.Sentence]))

max_length = 63


max len of sentence 63


# Model Evaluation with BERT

Tokenizer and Tensorflow-based pretrained BERT model. Oversampling performed with SVMSMOTE

Note that for BERT, raw text also works for tokenization, not necessary to stem as tokenization handles it auto.

In [20]:
# Apply BERT Tokenization and load pretrained model, 
# note TF model size 1.5GB
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
bert = TFBertModel.from_pretrained('bert-large-uncased')

Downloading (…)"tf_model.h5";:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

2023-02-10 12:31:35.481888: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-10 12:31:35.482410: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-02-10 12:31:35.521929: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 12:31:35.522198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 2080 Super with Max-Q Design computeCapability: 7.5
coreClock: 1.08GHz coreCount: 48 deviceMemorySize: 7.79GiB deviceMemoryBandwidth: 322.46GiB/s
2023-02-10 12:31:35.522218: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2023-02-10 12:31:35.531078: I tensorflow/stream_exec

In [42]:
target = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = 0.2, random_state = 42,stratify=target)
print ("The Train set size ", X_train.shape)
print ("The Test set size ", X_test.shape)

The Train set size  (212, 18)
The Test set size  (54, 18)


In [31]:
X_train_tok = tokenizer(
    text=X_train.Sentence.tolist(),
    add_special_tokens=True,
    max_length=63,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)



In [32]:
X_test_tok = tokenizer(
    text=X_test.Sentence.tolist(),
    add_special_tokens=True,
    max_length=63,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [43]:
y_train_tok = y_train.values
y_test_tok = y_test.values

In [59]:
from tensorflow.keras.utils import to_categorical
y_train_final = to_categorical(y_train_tok, 3)
y_test_final = to_categorical(y_test_tok, 3)

In [39]:
#Sanity check
X_train_tok['input_ids'].shape

TensorShape([212, 63])

In [40]:
X_train_tok['attention_mask'].shape

TensorShape([212, 63])

In [46]:
from tensorflow.keras.layers import Input, Dense
# Build model

input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids,attention_mask = input_mask)[0]


embeddings = bert(input_ids,attention_mask = input_mask)[1] #(0 is the last hidden states,1 means pooler_output)
# out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = tf.keras.layers.Dropout(0.1)(embeddings)

out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)

y = Dense(3,activation = 'softmax')(out)

# define model and set trainability
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True


In [47]:
model.summary() # Not sure if we should freeze the model


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 63)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 63)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 335141888   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dropout_73 (Dropout)            (None, 1024)         0           tf_bert_model[0][1]          

In [52]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
optimizer = Adam(
    learning_rate=6e-06, # this learning rate is for bert model.
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics
loss = CategoricalCrossentropy(from_logits = True)
metric = BinaryAccuracy('accuracy'),
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = ['accuracy'])

In [58]:
y_train_tok

array([ 0,  0,  0,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  0,  0,  1,  1,
        0,  0,  1,  1,  1,  1,  0,  0,  0,  1,  1,  0,  1,  1,  0,  1,  1,
       -1, -1,  0,  1,  1,  1,  1,  0,  0,  1,  0,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  1, -1,
        0,  0,  1,  1,  1,  1,  0,  1,  0,  1,  0,  1,  1,  0,  1,  1,  1,
        1,  1,  0,  1,  1,  1, -1,  1,  0, -1, -1,  1,  0, -1,  0,  0,  0,
       -1,  0, -1,  1,  1,  0,  1, -1,  0,  1,  0,  1,  0, -1,  1,  1,  1,
        1, -1,  1,  1,  0, -1,  1,  0,  0,  1,  1,  1,  1,  0,  1, -1, -1,
        1,  0,  1,  1,  1,  1,  1, -1,  0,  1,  1, -1,  1,  0, -1,  0,  0,
       -1,  1,  1,  0,  1, -1,  1,  0,  1,  1, -1, -1,  0,  1,  0,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  0, -1,  1,  0,  0,  1,  1,  1,
        1,  0,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,  1,
        1,  1,  1,  0,  1,  1,  1,  1])

In [64]:
final = model.fit(
    x ={'input_ids':X_train_tok['input_ids'],'attention_mask':X_train_tok['attention_mask']} ,
    y = y_train_final,
#   validation_split = 0.1,
  epochs=9,
    batch_size=10
)

Epoch 1/9

KeyboardInterrupt: 

# BERT