# Model Training

In [1]:
import pandas as pd

from training.model import Model

#### 1. Creating dataset

In [2]:
# Location of the preprocessed COVID dataset, given relative to the working
# directory instead of one user's home folder, so the notebook can run on
# other machines. This follows the same relative-path convention as the
# "model/covid/..." output paths used further down.
# NOTE(review): assumes the kernel is started from the project root — confirm.
COVID_PREPROCESSED_DATA_PATH = "data/preprocessed/covid"

In [3]:
# Read the preprocessed file as line-delimited JSON (one record per line).
data_json = pd.read_json(
    COVID_PREPROCESSED_DATA_PATH,
    lines=True,
    orient="records",
)

##### 1.1 balanced dataset

In [4]:
# Keep the first 476 rows of each class so both labels contribute equally.
# NOTE(review): 476 is presumably the size of the smaller class — confirm.
dataset_1 = data_json[data_json['label'].eq(1)].iloc[:476]
dataset_2 = data_json[data_json['label'].eq(0)].iloc[:476]

# Stack both halves, then shuffle every row with a fixed seed so the order
# is reproducible between runs.
dataset = pd.concat([dataset_1, dataset_2]).sample(frac=1, random_state=10)
dataset

Unnamed: 0,text,label,hashtags,emojis,polarity,subjectivity,sentiment
187,The ESP tested less than 5 e of the inhabitant...,0,[],[],-0.166667,0.066667,negative
3110,Non immigrant like HOB visa pivotal to Post CO...,1,[],[],0.500000,0.800000,positive
3016,Google launches COVID-9 page and search portal...,1,[],[],0.500000,0.500000,positive
44,A picture allegedly showing a crowd in Berlin ...,0,[],[],-0.050000,0.050000,negative
3066,Rs 3 lake for COVID treatment e Delhi hospital...,1,[],[],0.000000,0.000000,neutral
...,...,...,...,...,...,...,...
3195,Maharashtra Reports 1st COVID-9 Cases e 344 De...,1,[],[],0.000000,0.000000,neutral
3146,Google Helps Place Ads on Sites Amplifying COV...,1,[],[],0.000000,0.000000,neutral
53,A picture allegedly showing a large crowd gath...,0,[],[],0.057143,0.264286,positive
2951,e How to make COVID vaccine at home e among th...,1,[],[],0.500000,0.500000,positive


##### 1.2 full dataset

In [10]:
# Same split by label as the balanced variant, but without a per-class cap.
dataset_1_full = data_json[data_json['label'].eq(1)]
dataset_2_full = data_json[data_json['label'].eq(0)]

# Stack both classes, then shuffle every row with a fixed seed so the order
# is reproducible between runs.
dataset_full = pd.concat([dataset_1_full, dataset_2_full]).sample(frac=1, random_state=10)
dataset_full

Unnamed: 0,text,label,hashtags,emojis,polarity,subjectivity,sentiment
9626,The label on Detox disinfectant spray claims i...,0,[],[],0.136364,0.454545,positive
4199,The largest hole in the ozone layer over the A...,0,[],[],-0.140278,0.331944,negative
2849,The flu din e t kill any Americans this year e,0,[],[],0.000000,0.000000,neutral
9952,e Cocaine kills coronavirus e scientists are s...,0,[],[],-0.700000,0.800000,negative
9715,Photos show coronavirus is found in broiler ch...,0,[],[],-0.600000,0.950000,negative
...,...,...,...,...,...,...,...
9371,Exactly every 100 years e a viral epidemic bre...,0,[],[],0.250000,0.250000,positive
7276,Washington admits it is the responsible for CO...,0,[],[],0.200000,0.550000,positive
870,Photo of a man trying to resurrect his grandmo...,0,[],[],0.000000,0.000000,neutral
7278,George Sores is one of the people responsible ...,0,[],[],0.168182,0.502273,positive


#### 2. bert-base-multilingual-uncased (v1)

##### 2.1 balanced dataset

In [6]:
# Build the project wrapper around the multilingual BERT checkpoint. The
# recorded log shows a TFBertForSequenceClassification whose 'classifier'
# layer is newly initialised, so it must be trained before use.
model_v1 = Model(
    model_path="bert-base-multilingual-uncased",
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Turn the balanced frame into the two datasets used for training and
# evaluation. The recorded run mapped 760 and 190 examples (presumably the
# train and test parts respectively).
(tf_train_v1, tf_test_v1) = model_v1.prepare_train_test_data(dataset)



INFO:tensorflow:Assets written to: ram://474abe84-be6c-4e49-8013-be633f3bd8e7/assets


INFO:tensorflow:Assets written to: ram://474abe84-be6c-4e49-8013-be633f3bd8e7/assets


Map:   0%|          | 0/760 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


INFO:tensorflow:Assets written to: ram://db3ca6e3-d0d7-46ad-a950-17e61bb3b075/assets


INFO:tensorflow:Assets written to: ram://db3ca6e3-d0d7-46ad-a950-17e61bb3b075/assets


Map:   0%|          | 0/190 [00:00<?, ? examples/s]

In [8]:
# Compile with no arguments (the wrapper's own defaults), then fine-tune for
# 5 epochs, passing the test split as validation data.
model_v1.compile()
# fit() returns a Keras History (see the recorded output); keep it in a
# variable so the per-epoch metrics stay available for inspection instead of
# being shown as a bare object repr and then lost.
history_v1 = model_v1.fit(train_data=tf_train_v1, epochs=5, validation_data=tf_test_v1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x28e04b57c70>

In [9]:
# Probe sentence for a manual sanity check; annotated by the author as class 0.
text = (
    "WhatsApp censors the messages that circulate on its platform if it believes that they are hoaxes with the help of the media that verify false information in Spain."
)

In [11]:
# Classify the probe sentence with the balanced v1 model. In the recorded run
# the call also printed the predicted class and the probability distribution.
prediction_1 = model_v1.classify_text(
    text=text,
)

Predicted class: 0
Probability distribution: [9.9925196e-01 7.4803270e-04]


In [12]:
# Second probe sentence; annotated by the author as class 1.
text = (
    "Coronavirus: New Covid-19 tracing tool appears on smartphones"
)

In [13]:
# Classify the second probe sentence with the balanced v1 model. In the
# recorded run the call also printed the class and probability distribution.
prediction_2 = model_v1.classify_text(
    text=text,
)

Predicted class: 1
Probability distribution: [0.00115485 0.99884516]


In [14]:
# Output location for the balanced v1 model, relative to the working directory.
COVID_MODEL_V1_PATH: str = "model/covid/model_v1.h5py"

In [15]:
# Save the fine-tuned balanced v1 model through the project's
# Model.save_model, targeting COVID_MODEL_V1_PATH.
model_v1.save_model(COVID_MODEL_V1_PATH)

##### 2.2 full dataset

In [4]:
# Fresh wrapper around the multilingual BERT checkpoint for the full-dataset
# run. The recorded log again reports a newly initialised 'classifier' layer.
model_full_v1 = Model(
    model_path="bert-base-multilingual-uncased",
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Turn the full frame into the two datasets used for training and
# evaluation. The recorded run mapped 8160 and 2041 examples (presumably the
# train and test parts respectively).
(tf_train_full_v1, tf_test_full_v1) = model_full_v1.prepare_train_test_data(dataset_full)



INFO:tensorflow:Assets written to: ram://58238f8d-8c0a-45d9-af23-d7a0f3daae6e/assets


INFO:tensorflow:Assets written to: ram://58238f8d-8c0a-45d9-af23-d7a0f3daae6e/assets


Map:   0%|          | 0/8160 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


INFO:tensorflow:Assets written to: ram://f0355d65-f0b7-4e18-8fea-8a7e3ebfd436/assets


INFO:tensorflow:Assets written to: ram://f0355d65-f0b7-4e18-8fea-8a7e3ebfd436/assets


Map:   0%|          | 0/2041 [00:00<?, ? examples/s]

In [None]:
# Compile with no arguments (the wrapper's own defaults), then fine-tune on
# the full dataset for 5 epochs, passing the test split as validation data.
model_full_v1.compile()
# Keep the object returned by fit() so the per-epoch metrics stay available
# for inspection instead of being lost as a bare cell output. The executed
# training cells in this notebook show that fit() returns a Keras History.
history_full_v1 = model_full_v1.fit(train_data=tf_train_full_v1, epochs=5, validation_data=tf_test_full_v1)

In [None]:
# Probe sentence for a manual sanity check; annotated by the author as class 0.
text = (
    "WhatsApp censors the messages that circulate on its platform if it believes that they are hoaxes with the help of the media that verify false information in Spain."
)

In [None]:
# Classify the probe sentence with the model trained on the full dataset.
prediction_1 = model_full_v1.classify_text(
    text=text,
)

In [None]:
# Second probe sentence; annotated by the author as class 1.
text = (
    "Coronavirus: New Covid-19 tracing tool appears on smartphones"
)

In [None]:
# Classify the second probe sentence with the model trained on the full dataset.
prediction_2 = model_full_v1.classify_text(
    text=text,
)

In [None]:
# Output location for the full-dataset v1 model, relative to the working directory.
COVID_MODEL_FULL_V1_PATH: str = "model/covid/model_full_v1.h5py"

In [None]:
# Save the full-dataset v1 model through the project's Model.save_model,
# targeting COVID_MODEL_FULL_V1_PATH.
model_full_v1.save_model(COVID_MODEL_FULL_V1_PATH)

#### 3. xlm-roberta-base (v2)

##### 3.1 balanced dataset

In [4]:
# Build the project wrapper around the XLM-RoBERTa base checkpoint. The
# recorded log shows a TFXLMRobertaForSequenceClassification whose
# 'classifier' layer is newly initialised, so it must be trained before use.
model_v2 = Model(
    model_path="xlm-roberta-base",
)

2023-05-20 14:59:05.512778: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFXLMRobertaForSequenceClassification.

Some layers of TFXLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Turn the balanced frame into the two datasets used for training and
# evaluation. The recorded run mapped 760 and 190 examples (presumably the
# train and test parts respectively).
(tf_train_v2, tf_test_v2) = model_v2.prepare_train_test_data(dataset)



INFO:tensorflow:Assets written to: ram://afb82f8b-54d2-4415-b6dc-1193c42baa2f/assets


INFO:tensorflow:Assets written to: ram://afb82f8b-54d2-4415-b6dc-1193c42baa2f/assets


Map:   0%|          | 0/760 [00:00<?, ? examples/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


INFO:tensorflow:Assets written to: ram://d454cb68-4b86-4253-a17c-c88acf9dbc52/assets


INFO:tensorflow:Assets written to: ram://d454cb68-4b86-4253-a17c-c88acf9dbc52/assets


Map:   0%|          | 0/190 [00:00<?, ? examples/s]

In [6]:
# Compile with no arguments (the wrapper's own defaults), then fine-tune for
# 5 epochs, passing the test split as validation data.
model_v2.compile()
# fit() returns a Keras History (see the recorded output); keep it in a
# variable so the per-epoch metrics stay available for inspection instead of
# being shown as a bare object repr and then lost.
history_v2 = model_v2.fit(train_data=tf_train_v2, epochs=5, validation_data=tf_test_v2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc0215b7940>

In [7]:
# Probe sentence for a manual sanity check; annotated by the author as class 0.
text = (
    "WhatsApp censors the messages that circulate on its platform if it believes that they are hoaxes with the help of the media that verify false information in Spain."
)

In [8]:
# Classify the probe sentence with the balanced v2 model. In the recorded run
# the call also printed the predicted class and the probability distribution.
prediction_1 = model_v2.classify_text(
    text=text,
)

Predicted class: 0
Probability distribution: [0.99733955 0.00266042]


In [9]:
# Second probe sentence; annotated by the author as class 1.
text = (
    "Coronavirus: New Covid-19 tracing tool appears on smartphones"
)

In [10]:
# Classify the second probe sentence with the balanced v2 model. In the
# recorded run the call also printed the class and probability distribution.
prediction_2 = model_v2.classify_text(
    text=text,
)

Predicted class: 1
Probability distribution: [0.00473887 0.99526113]


In [11]:
# Output location for the balanced v2 model, relative to the working directory.
COVID_MODEL_V2_PATH: str = "model/covid/model_v2.h5py"

In [12]:
# Save the fine-tuned balanced v2 model through the project's
# Model.save_model, targeting COVID_MODEL_V2_PATH.
model_v2.save_model(COVID_MODEL_V2_PATH)

#### 4. distilbert-base-multilingual-cased (v3)

##### 4.1 balanced dataset

In [5]:
# Build the project wrapper around the multilingual DistilBERT checkpoint.
# The recorded log shows a TFDistilBertForSequenceClassification that leaves
# the checkpoint's vocabulary-projection layers unused.
model_v3 = Model(
    model_path="distilbert-base-multilingual-cased",
)

2023-05-25 16:59:33.775289: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

In [6]:
# Turn the balanced frame into the two datasets used for training and
# evaluation. The recorded run mapped 760 and 190 examples (presumably the
# train and test parts respectively).
(tf_train_v3, tf_test_v3) = model_v3.prepare_train_test_data(dataset)





INFO:tensorflow:Assets written to: ram://ebd2a3e9-c9a9-436a-80d5-7a7ccf1f80a4/assets


INFO:tensorflow:Assets written to: ram://ebd2a3e9-c9a9-436a-80d5-7a7ccf1f80a4/assets


Map:   0%|          | 0/760 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


























INFO:tensorflow:Assets written to: ram://bfbe08b8-0721-4eef-af7f-3e2f3181c3b6/assets


INFO:tensorflow:Assets written to: ram://bfbe08b8-0721-4eef-af7f-3e2f3181c3b6/assets


Map:   0%|          | 0/190 [00:00<?, ? examples/s]

In [7]:
# Compile with no arguments (the wrapper's own defaults), then fine-tune for
# 5 epochs, passing the test split as validation data.
model_v3.compile()
# fit() returns a Keras History (see the recorded output); keep it in a
# variable so the per-epoch metrics stay available for inspection instead of
# being shown as a bare object repr and then lost.
history_v3 = model_v3.fit(train_data=tf_train_v3, epochs=5, validation_data=tf_test_v3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fc827defbb0>

In [8]:
# Probe sentence for a manual sanity check; annotated by the author as class 0.
text = (
    "WhatsApp censors the messages that circulate on its platform if it believes that they are hoaxes with the help of the media that verify false information in Spain."
)

In [9]:
# Classify the probe sentence with the balanced v3 model. In the recorded run
# the call also printed the predicted class and the probability distribution.
prediction_1 = model_v3.classify_text(
    text=text,
)

Predicted class: 0
Probability distribution: [0.9959774  0.00402259]


In [10]:
# Second probe sentence; annotated by the author as class 1.
text = (
    "Coronavirus: New Covid-19 tracing tool appears on smartphones"
)

In [11]:
# Classify the second probe sentence with the balanced v3 model. In the
# recorded run the call also printed the class and probability distribution.
prediction_2 = model_v3.classify_text(
    text=text,
)

Predicted class: 1
Probability distribution: [0.04872704 0.9512729 ]


In [12]:
# Output location for the balanced v3 model. It is given relative to the
# working directory, like the v1/v2 model paths ("model/covid/..."), instead
# of an absolute path inside one user's home folder, so the notebook can run
# on other machines.
# NOTE(review): assumes the kernel is started from the project root — confirm.
COVID_MODEL_V3_PATH = "model/covid/model_v3.h5py"

In [13]:
# Save the fine-tuned balanced v3 model through the project's
# Model.save_model, targeting COVID_MODEL_V3_PATH.
model_v3.save_model(COVID_MODEL_V3_PATH)