In [1]:
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
import keras
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

Load smaller_LaBSE_15lang.

The smaller LaBSE (language-agnostic BERT sentence embedding) is a patched version of google/LaBSE/2 that handles fewer languages.

Support: Arabic, Chinese, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian.

Load preprocess and encoder.

https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1

In [2]:
# Load the smaller-LaBSE preprocessing layer and encoder.
# A local copy is used when its directory exists next to the notebook; otherwise the
# layer is fetched from tfhub.dev, so the notebook also runs on a machine that has
# not downloaded the models yet.
def _labse_handle(local_dir, hub_url):
    """Return `local_dir` if it is an existing directory, else `hub_url`."""
    return local_dir if os.path.isdir(local_dir) else hub_url


LaBSE_preprocess=hub.KerasLayer(
    _labse_handle("15lang_preprocess", "https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang_preprocess/1"))
LaBSE_encoder=hub.KerasLayer(
    _labse_handle("LaBSE_15", "https://tfhub.dev/jeongukjae/smaller_LaBSE_15lang/1"))

In [None]:
def get_embedding_for_word(sentences):
    """Return L2-normalised LaBSE sentence embeddings for `sentences`.

    Args:
        sentences: the strings to embed (anything `tf.convert_to_tensor` can turn
            into a string tensor, e.g. a list of str).

    Returns:
        The encoder's "pooled_output" for each input, L2-normalised along the last
        axis so that a dot product between two rows is their cosine similarity.
    """
    # Use a separate name for the symbolic input: reusing `sentences` would discard
    # the caller's data and feed the model its own placeholder.
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="sentences")
    encoder_inputs = LaBSE_preprocess(text_input)
    sentence_representation = LaBSE_encoder(encoder_inputs)["pooled_output"]
    normalized_sentence_representation = tf.nn.l2_normalize(sentence_representation, axis=-1)  # for cosine similarity
    model = tf.keras.Model(text_input, normalized_sentence_representation)
    return model(tf.convert_to_tensor(sentences, dtype=tf.string))

Divide the text into sequences and the next words.

Example:

    sentence = 'the boy who lived'.
    
    sequence=['the boy who'], next word='lived'.

In [3]:
def Divide_text_file_into_sequences_and_next_words(file_path,language):
    """Split a text file into growing word prefixes and the word following each prefix.

    The text is lower-cased, stripped of every character that is not a letter of
    `language`, a space or a period, and split into sentences on periods. For every
    sentence each prefix ("the", "the boy", "the boy who", ...) becomes one sequence.
    The label of a sequence is the word that follows it in the text; for the prefix
    that is a whole sentence this is the first word of the next sentence. The very
    last prefix of the text has no following word and is dropped.

    Args:
        file_path: path of a UTF-8 encoded text file.
        language: key of `regex_for_language` ("english" or "russian").

    Returns:
        (sequences, temp_labels, sentences) where `sequences` is a list of
        single-element lists holding the prefix string, `temp_labels` is the list
        of next words (same length as `sequences`), and `sentences` is the list of
        cleaned, non-empty sentences.

    Raises:
        KeyError: if `language` is not a key of `regex_for_language`.
    """
    #Dictionary of regular expression for wanted language.
    #This dictionary is the only part of the code that needs editing in case of
    #working with another language (which is supported by smaller_LaBSE_15lang)
    regex_for_language = {
        "english": "[^a-zA-Z .]",
        # The range "А-я" does not contain "ё"/"Ё", so they are listed explicitly;
        # otherwise words such as "ещё" would silently lose a letter.
        "russian": "[^А-яЁё .]"
    }
    unwanted_characters = regex_for_language[language]

    # Context manager so the file handle is closed even if reading fails.
    with open(file_path,encoding="utf8") as text_file:
        text = text_file.read().lower()

    # Line breaks and tabs separate words; turn them into spaces before the filter
    # below deletes them, otherwise the words on both sides would be glued together.
    text=re.sub(r'\s+',' ',text)
    text=re.sub(unwanted_characters,'',text)
    text=re.sub(' +',' ',text)

    # Remove blank sentences *before* building the pairs. Filtering them afterwards
    # left sequences and labels with different lengths whenever the first or the
    # last sentence was blank (e.g. a text ending in ". ").
    sentences=[x.strip() for x in text.split('.')]
    sentences=[x for x in sentences if x]

    sequences=[]
    temp_labels=[]
    for sentence in sentences:
        words=sentence.split(" ")
        for i, word in enumerate(words):
            sequences.append([' '.join(words[:i+1])])
            temp_labels.append(word)

    if sequences:  # nothing to shift for an empty text
        temp_labels.pop(0)#Shift left the temp labels list (temp_labels is a list with the next word for each sequence in the text)
        sequences.pop(-1)#delete the last sequences (there is no next word for it)
    return sequences,temp_labels,sentences

# Text tokenization using keras Tokenizer.

In [4]:
def tokenization(sentences):
    """Fit a keras Tokenizer on `sentences` and return its vocabulary.

    Returns:
        (word_index, total_unique_words): `word_index` maps each word of the fitted
        vocabulary to its integer id; `total_unique_words` is `len(word_index) + 1`
        (presumably the extra slot accounts for id 0, which the Tokenizer does not
        assign to a word - verify against the Keras documentation).
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)  # builds the vocabulary from the data
    vocabulary = fitted_tokenizer.word_index
    return vocabulary, len(vocabulary) + 1

# Create vector for each label.

(label is the next word for each sequence)

In [5]:
def vectors_for_labels(word_index,total_unique_words,temp_labels):
    """Turn each label word into a one-hot vector.

    Args:
        word_index: mapping from word to its integer position in the vector.
        total_unique_words: length of every produced vector.
        temp_labels: the label words, in order.

    Returns:
        A list with one list of ints per label: all zeros except a 1 at
        `word_index[label]`.
    """
    def one_hot(word):
        vector = [0] * total_unique_words
        vector[word_index[word]] = 1
        return vector

    return [one_hot(word) for word in temp_labels]

# Build dataset for Predicting the next word.

In [6]:
def build_dataset_for_predicting_the_next_word(file_path,language,optional_file_path=None):
    """Build next-word-prediction data from one text file, or from two sharing a vocabulary.

    Args:
        file_path: text file the main sequences and labels are built from.
        language: language key passed on to the text-splitting function.
        optional_file_path: optional second text file. When given, the vocabulary
            is fitted on the sentences of both files.

    Returns:
        Without `optional_file_path`: (sequences, labels, total_unique_words).
        With it: (sequences, labels, total_unique_words, optional_sequences,
        optional_labels). Labels are one-hot vectors over the vocabulary.
    """
    sequences,temp_labels,sentences = Divide_text_file_into_sequences_and_next_words(file_path,language)

    if not optional_file_path:
        word_index,total_unique_words = tokenization(sentences)
        return sequences, vectors_for_labels(word_index,total_unique_words,temp_labels), total_unique_words

    extra_sequences,extra_temp_labels,extra_sentences = Divide_text_file_into_sequences_and_next_words(optional_file_path,language)
    # One vocabulary over both texts, so both label sets share the same vector layout.
    word_index,total_unique_words = tokenization(sentences+extra_sentences)
    extra_labels = vectors_for_labels(word_index,total_unique_words,extra_temp_labels)
    labels = vectors_for_labels(word_index,total_unique_words,temp_labels)
    return sequences, labels, total_unique_words,extra_sequences,extra_labels

# Plot creation method for the training process.

In [7]:
def get_plot(history,wanted_history,data_set_type):
    """Draw one recorded training metric per epoch on the current matplotlib axes.

    Args:
        history: object whose `.history` dict holds the recorded metric values.
        wanted_history: key of the metric to draw (used for the title and y label too).
        data_set_type: text shown in the legend.

    Returns:
        The `matplotlib.pyplot` module, so the caller can chain `.show()`.
    """
    axes = plt.gca()
    axes.plot(history.history[wanted_history])
    axes.set_title('model '+ wanted_history)
    axes.set_ylabel(wanted_history)
    axes.set_xlabel('epoch')
    axes.legend([data_set_type], loc='upper left')
    return plt

# Build language model.

In [8]:
def build_model(intermediate_layer,total_unique_words,dropout_value=0.4,activation_function='softmax',learning_rate=0.0001,
               loss_function='categorical_crossentropy'):
    """Build and compile the next-word model: LaBSE -> GRU/LSTM -> dropout -> dense.

    Args:
        intermediate_layer: "gru" or "lstm", the recurrent layer placed on top of LaBSE.
        total_unique_words: number of units of the output layer.
        dropout_value: rate of the dropout layer before the output layer.
        activation_function: activation of the output layer.
        learning_rate: learning rate of the Adam optimizer.
        loss_function: loss the model is compiled with.

    Returns:
        The compiled `tf.keras.Model` mapping a batch of strings to the output layer.
        Its summary is printed as a side effect.

    Raises:
        KeyError: if `intermediate_layer` is not "gru" or "lstm".
    """
    #Dictionary for the wanted intermediate_layer. It maps to the layer *classes* so that
    #only the requested layer is created, instead of building both and discarding one.
    intermediate_layers = {
        "gru": tf.keras.layers.GRU,
        "lstm": tf.keras.layers.LSTM
    }
    #768 is the hidden size of LaBSE's sequence_output.
    recurrent_layer = intermediate_layers[intermediate_layer](768)

    text_input=tf.keras.layers.Input(shape=(),dtype=tf.string,name="input_layer")
    LaBSE_preprocessed_text=LaBSE_preprocess(text_input)
    LaBSE_encoder_output=LaBSE_encoder(LaBSE_preprocessed_text)

    #"sequence_output": representations of every token in the input sequence with
    #shape [batch size, max sequence length, hidden size(768)].
    l=recurrent_layer(LaBSE_encoder_output['sequence_output'])
    l=tf.keras.layers.Dropout(dropout_value,name='dropout')(l)
    l=tf.keras.layers.Dense(total_unique_words,activation=activation_function,name='output')(l)

    model=tf.keras.Model(inputs=[text_input],outputs=[l])

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=loss_function, metrics=['accuracy'])
    model.summary()
    return model

In the next section of the project

#TBD#

For research purposes, an experiment with two different intermediate layers will be performed: GRU and LSTM.

# Build train and test datasets for english tasks

In [9]:
sequences,lables,total_unique_words = build_dataset_for_predicting_the_next_word("Harry Potter and the Philosophers Stone.txt","english")
# Fixed random_state: the tuned models are cached on disk and reloaded in later sessions,
# so the split must be identical on every run. Otherwise a reloaded model could be
# evaluated on samples it was trained on.
sequences_train, sequences_test, lables_train, lables_test = train_test_split(
    sequences, lables, test_size=0.5, random_state=42)

# First NLP task - predict the next word.
# [English,LSTM]
Tuned the model to predict the next word in the sequence for english language.

language: english.

Text data: Harry Potter and the Philosophers Stone.

Intermediate layer: LSTM.

Frozen layers during the tuning: LaBSE embedding layers.


In [None]:
filename = 'englishLSTMNextWord'
if not os.path.isdir(filename):
    # No saved model on disk yet: build it, tune it for one epoch and save it for future use.
    model = build_model("lstm",total_unique_words)
    train_history = model.fit(sequences_train, lables_train, epochs=1)
    model.save(filename)
    get_plot(train_history,'accuracy','train').show()
    get_plot(train_history,'loss','train').show()
else:
    # A tuned model was saved by an earlier run: load it instead of training again.
    model = keras.models.load_model(filename)

# Evaluate on the held-out half of the english data.
score = model.evaluate(sequences_test, lables_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Second NLP task - predict the next word.
# [English,GRU]
Tuned the model to predict the next word in the sequence for english language.

language: english.

Text data: Harry Potter and the Philosophers Stone.

Intermediate layer: GRU.

Frozen layers during the tuning: LaBSE embedding layers.


In [None]:
filename = 'englishGRUNextWord'
if not os.path.isdir(filename):
    # No saved model on disk yet: build it, tune it for one epoch and save it for future use.
    model = build_model("gru",total_unique_words)
    train_history = model.fit(sequences_train, lables_train, epochs=1)
    model.save(filename)
    get_plot(train_history,'accuracy','train').show()
    get_plot(train_history,'loss','train').show()
else:
    # A tuned model was saved by an earlier run: load it instead of training again.
    model = keras.models.load_model(filename)

# Evaluate on the held-out half of the english data.
score = model.evaluate(sequences_test, lables_test)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In the next section of the project, the task is to examine whether a particular book was written by the author Mikhail Sholokhov.

The model will be tuned on Поднятая целина (Podnyataya tselina, a Russian-language book by Mikhail Sholokhov) and tested on Nachalen (a Russian-language book by Mikhail Sholokhov).

The model will be tuned to predict the next word in a sequence for the training text, Podnyataya tselina. The test will be performed on sequences from the Russian book Nachalen.

If the test score is high, it can be assumed that the training book and the test book were written by the same author.

For research purposes, an experiment with two different intermediate layers will be performed: GRU and LSTM.

# Build train and test datasets for russian tasks

In [10]:
# Train data comes from the first file, test data from the optional second file.
# Note that this rebinds total_unique_words to the size of the russian vocabulary.
(train_sequences,
 train_lables,
 total_unique_words,
 test_sequences,
 test_labels) = build_dataset_for_predicting_the_next_word(
    "Поднятая целина.txt",
    "russian",
    optional_file_path="Nachalen.txt",
)

# Task - examine whether a particular book was written by the author Mikhail Sholokhov.
# [Russian,GRU]
Tuned the model to predict the next word in the sequence for Russian language.

language: Russian.

Train text data: Поднятая целина (Podnyataya tselina, a Russian-language book by Mikhail Sholokhov).

Test text data: Nachalen (russian-language book by the author mikhail sholokhov).

Intermediate layer: GRU.

Frozen layers during the tuning: LaBSE embedding layers.


In [11]:
trained_model_path = 'russianGRUNextWord'
if not os.path.isdir(trained_model_path):
    # No saved model on disk yet: build it, tune it for one epoch and save it for future use.
    model = build_model("gru",total_unique_words)
    train_history = model.fit(train_sequences, train_lables, epochs=1)
    model.save(trained_model_path)
    get_plot(train_history,'accuracy','train').show()
    get_plot(train_history,'loss','train').show()
else:
    # A tuned model was saved by an earlier run: load it instead of training again.
    model = keras.models.load_model(trained_model_path)

# Evaluate on the sequences built from the second (test) book.
score = model.evaluate(test_sequences, test_labels)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_layer (InputLayer)       [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['input_layer[0][0]']            
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

MemoryError: 

# Task - examine whether a particular book was written by the author Mikhail Sholokhov.
# [Russian,LSTM]
Tuned the model to predict the next word in the sequence for Russian language.

language: Russian.

Train text data: Поднятая целина (Podnyataya tselina, a Russian-language book by Mikhail Sholokhov).

Test text data: Nachalen (russian-language book by the author mikhail sholokhov).

Intermediate layer: LSTM.

Frozen layers during the tuning: LaBSE embedding layers.


In [None]:
trained_model_path = 'russianLSTMNextWord'
if not os.path.isdir(trained_model_path):
    # No saved model on disk yet: build it, tune it for one epoch and save it for future use.
    model = build_model("lstm",total_unique_words)
    train_history = model.fit(train_sequences, train_lables, epochs=1)
    model.save(trained_model_path)
    get_plot(train_history,'accuracy','train').show()
    get_plot(train_history,'loss','train').show()
else:
    # A tuned model was saved by an earlier run: load it instead of training again.
    model = keras.models.load_model(trained_model_path)

# Evaluate on the sequences built from the second (test) book.
score = model.evaluate(test_sequences, test_labels)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Model output for two sample prompts (one output row per prompt).
prediction1=model.predict(["Wonder how long Potter’s going to"])
prediction2=model.predict(["You know how I think they"])

# Reduce the two output vectors to 2 principal components and scatter-plot them.
pca = PCA(n_components=2)
x=np.array([prediction1[0], prediction2[0]], dtype=np.float32)
reduced = pca.fit_transform(x)
t = reduced.T  # row 0 holds the first component, row 1 the second
plt.scatter(t[0], t[1])
plt.show()