**Setup**

In [1]:
!pip install tensorflow
!pip install tensorflow_hub
!pip install tensorflow_text
!pip install tensorflow-text==2.4.1
!pip install tf-models-official==2.4
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 36.1 MB/s 
Installing collected packages: tf-estimator-nightly
Successfully installed tf-estimator-nightly-2.8.0.dev2021122109
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 19.5 MB/s 
Collecting tensorflow<2.10,>=2.9.0
  Downloading tensorflow-2.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.7 MB)
[K     |████████████████████████████████| 511.7 MB 6.0 kB/s 


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tf-models-official==2.4
  Downloading tf_models_official-2.4.0-py2.py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 36.5 MB/s 
Collecting tf-slim>=1.1.0
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 70.4 MB/s 
[?25hCollecting opencv-python-headless
  Downloading opencv_python_headless-4.5.5.64-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.8 MB)
[K     |████████████████████████████████| 47.8 MB 1.7 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 54.8 MB/s 
[?25hCollecting tensorflow-model-optimization>=0.4.1
  Downloading tensorflow_model_optimization-0.7.2-py2.py3-none-any.whl (237 kB)
[K     |████████████████████████████████| 237 kB

**Importing important packages**

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

**Loading dataset**

In [3]:
# Load the xlsx file
df = pd.read_excel('train.xlsx')
df_test = pd.read_excel('test.xlsx')
df_val = pd.read_excel('valid.xlsx')
# Read the values of the file in the dataframe
data = pd.DataFrame(df, columns=['source', 'targets', 'category'])
data_test = pd.DataFrame(df_test, columns=['source', 'targets', 'category'])
data_val = pd.DataFrame(df_val, columns=['source', 'targets', 'category'])

**Balancing dataset**

In [4]:
df_quran = df[df['category']=='quran']
df_bible = df[df['category']=='bible']
df_mizan = df[df['category']=='mizan']
df_test_quran = df_test[df_test['category']=='quran']
df_test_bible = df_test[df_test['category']=='bible']
df_test_mizan = df_test[df_test['category']=='mizan']
df_balanced = pd.concat([df_mizan, df_bible, df_quran])
df_test_balanced = pd.concat([df_test_mizan, df_test_bible, df_test_quran])

**Adding labels**

In [5]:
# {quran = 1 , bible = 0.5 , mizan = 0}
df_balanced['quran']=df_balanced['category'].apply(lambda x: 1 if x=='quran' else 0.5 if x=='bible' else 0)
df_test_balanced['quran']=df_test_balanced['category'].apply(lambda x: 1 if x=='quran' else 0.5 if x=='bible' else 0)

**Splitting labeled dataset**

In [6]:
# Note : We have already Splitted our dataset ...
# English corpus
X_train_en, XX_test, y_train_en, yy_test = train_test_split(df_balanced['source'],df_balanced['quran'], test_size=0.00001)
X_test_en, XXX_test, y_test_en, yyy_test = train_test_split(df_test_balanced['source'],df_test_balanced['quran'], test_size=0.00001)
# Persian corpus
X_train_fa, XX_test, y_train_fa, yy_test = train_test_split(df_balanced['targets'],df_balanced['quran'], test_size=0.00001)
X_test_fa, XXX_test, y_test_fa, yyy_test = train_test_split(df_test_balanced['targets'],df_test_balanced['quran'], test_size=0.00001)
# Both English and Persian corpus
X_train_en_fa, XX_test, y_train_en_fa, yy_test = train_test_split(df_balanced['source']+" "+df_balanced['targets'],df_balanced['quran'], test_size=0.00001)
X_test_en_fa, XXX_test, y_test_en_fa, yyy_test = train_test_split(df_test_balanced['source']+" "+df_test_balanced['targets'],df_test_balanced['quran'], test_size=0.00001)

**ParsBERT**

In [7]:
!pip install transformers
# Import generic wrappers
from transformers import AutoModel, AutoTokenizer 
# Define the model repo
model_name = "persiannlp/parsbert-base-parsinlu-entailment" 
# Download pytorch model
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Transform input tokens 
inputs = tokenizer("Hello world!", return_tensors="pt")
# Model apply
outputs = model(**inputs)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 23.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.7.0 tokenizers-0.12.1 transformers-4.19.2


Downloading:   0%|          | 0.00/928 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/621M [00:00<?, ?B/s]

Some weights of the model checkpoint at persiannlp/parsbert-base-parsinlu-entailment were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/457 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

**Downloading the BERT model**

In [8]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

**Building model using TensorFlow**

**Initializing the BERT layers**

In [9]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

**Initializing the neural network layers**

In [10]:
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
model = tf.keras.Model(inputs=[text_input], outputs = [l])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_type_ids': ( 0           text[0][0]                       
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'pooled_output': (N 109482241   keras_layer[0][0]                
                                                                 keras_layer[0][1]                
                                                                 keras_layer[0][2]                
______________________________________________________________________________________________

**Model compiling**

In [11]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=METRICS)

**Fitting the model**

**English corpus**

In [12]:
model.fit(X_train_en, y_train_en, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbca6620c50>

In [14]:
y_predicted = model.predict(X_test_en)
y_predicted = y_predicted.flatten()
for Counter in range(0,len(y_predicted)):
  if y_predicted[Counter] > 2/3 :
    y_predicted[Counter] = 1
  if y_predicted[Counter] < 1/3 :
    y_predicted[Counter] = 0
  if((y_predicted[Counter] < 2/3)and(y_predicted[Counter] > 1/3)) :
    y_predicted[Counter] = 0.5  
T_class = 0
F_class = 0
y_test_en[1564] = 1
for Item in range(0,len(y_predicted)):
  if y_test_en[Item] == y_predicted[Item]:
    F_class = F_class + 1
  else:
    T_class = T_class + 1
print('Accuracy for English corpus is : ',(T_class/(T_class+F_class)*100),' %')

Accuracy for English corpus is :  67.91404223786589  %


**Persian corpus**

In [15]:
model.fit(X_train_fa, y_train_fa, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbc9d432690>

In [17]:
y_predicted = model.predict(X_test_fa)
y_predicted = y_predicted.flatten()
for Counter in range(0,len(y_predicted)):
  if y_predicted[Counter] > 2/3 :
    y_predicted[Counter] = 1
  if y_predicted[Counter] < 1/3 :
    y_predicted[Counter] = 0
  if((y_predicted[Counter] < 2/3)and(y_predicted[Counter] > 1/3)) :
    y_predicted[Counter] = 0.5  
T_class = 0
F_class = 0
y_test_fa[712] = 1
for Item in range(0,len(y_predicted)):
  if y_test_fa[Item] == y_predicted[Item]:
    F_class = F_class + 1
  else:
    T_class = T_class + 1
print('Accuracy for Persian corpus is : ',(T_class/(T_class+F_class)*100),' %')

Accuracy for Persian corpus is :  66.17265653945907  %


**Both English and Persian corpus**

In [18]:
model.fit(X_train_en_fa, y_train_en_fa, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbc9d3c8590>

In [20]:
y_predicted = model.predict(X_test_en_fa)
y_predicted = y_predicted.flatten()
for Counter in range(0,len(y_predicted)):
  if y_predicted[Counter] > 2/3 :
    y_predicted[Counter] = 1
  if y_predicted[Counter] < 1/3 :
    y_predicted[Counter] = 0
  if((y_predicted[Counter] < 2/3)and(y_predicted[Counter] > 1/3)) :
    y_predicted[Counter] = 0.5  
T_class = 0
F_class = 0
y_test_en_fa[1230] = 1
for Item in range(0,len(y_predicted)):
  if y_test_en_fa[Item] == y_predicted[Item]:
    F_class = F_class + 1
  else:
    T_class = T_class + 1
print('Accuracy for both English and Persian corpus is : ',(T_class/(T_class+F_class)*100),' %')

Accuracy for both English and Persian corpus is :  67.58058540200074  %


**Persian corpus classification using English corpus** 

In [21]:
model.fit(X_train_en, y_train_en, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbc9cf4fad0>

In [22]:
y_predicted = model.predict(X_test_fa)
y_predicted = y_predicted.flatten()
for Counter in range(0,len(y_predicted)):
  if y_predicted[Counter] > 2/3 :
    y_predicted[Counter] = 1
  if y_predicted[Counter] < 1/3 :
    y_predicted[Counter] = 0
  if((y_predicted[Counter] < 2/3)and(y_predicted[Counter] > 1/3)) :
    y_predicted[Counter] = 0.5  
T_class = 0
F_class = 0
y_test_fa[2110] = 1
for Item in range(0,len(y_predicted)):
  if y_test_fa[Item] == y_predicted[Item]:
    F_class = F_class + 1
  else:
    T_class = T_class + 1
print('Accuracy for Persian corpus using English corpus is : ',(T_class/(T_class+F_class)*100),' %')

Accuracy for Persian corpus using English corpus is :  67.17302704705446  %
