# Bantu Language Modeling



## Data Loading and preprocessing

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

path_sw = r"/content/sw-train.txt"
path_kw = r"/content/cwe-train.txt"


In [2]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

class Preprocessing:

    def __init__(self, data:list = None) -> None:
        self.data = data
        self.vocab = None
        self.encoder = None
        self.decoder = None


    def clean_data(self):
      self.data[0] = self.data[0].replace("\n", " ")


    def read_file(self, path):

      '''
      https://colab.research.google.com/drive/1pFL6pHFsG6QAl0th99mIKsPaUJ84w6gN?usp=sharing#scrollTo=XM3TiINWVhyf      
      '''

      self.data = open(path).readlines()
      

    def set_vocab_encoder_decoder(self):

      '''
      set the vocab and encoder/decoder for the dataset
      https://colab.research.google.com/drive/16_OYcypTu0OtA1eOT6Zjg65atzXFX9GH#scrollTo=DCf7rn8bX3Ws      
      '''


      self.vocab = sorted(list(set(self.data[0])))
      self.encoder = dict((c,i) for i,c in enumerate(self.vocab))
      self.decoder = dict((i,c) for i,c in enumerate(self.vocab))

    def set_X_y(self, maxlen=40, steps = 2, sample_size:tuple = tuple()):

      '''
      cut the text in semi-redundant sequences of maxlen characters. 
      if sample_size specified, choose subset of that size.

      '''
      

      print(sample_size[0], sample_size[1])
      if len(sample_size) == 1:
        self.data[0] = self.data[0][:sample_size[0]]
      elif len(sample_size) == 2:
        self.data[0] = self.data[0][sample_size[0]:sample_size[1]]
      else:
        self.data[0] = self.data[0]

      maxlen = 40
      steps = 2

      sentences = []
      next_chars = []

      # Append sentences to X
      # Append the next character the sentence precedes as the target value

      for i in range(0, len(self.data[0]) - maxlen, steps):
          sentences.append(self.data[0][i : i + maxlen])
          next_chars.append(self.data[0][i + maxlen])

      print("Number of sequences:", len(sentences))

      x = np.zeros((len(sentences), maxlen, len(self.vocab)), dtype=float)
      y = np.zeros((len(sentences), len(self.vocab)), dtype=float)

      # encode the values and features      
      for i, sentence in enumerate(sentences):
          for t, char in enumerate(sentence):
              x[i, t, self.encoder[char]] = 1

          y[i, self.encoder[next_chars[i]]] = 1

      print(x.shape, y.shape)
        
      return x, y



## Model Definitions

In [3]:
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout, \
                                    LSTM, Bidirectional
from tensorflow.keras import Model,Input
from tensorflow.keras.optimizers import Adam, RMSprop
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import save_model, load_model
import time

class Model:
  
  def __init__(self, X_train, X_test, y_train, y_test, input_dim, output_dim, input_length):
    self.model = None
    self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
    self.y_pred = None
    self.input_dim, self.output_dim, self.input_length = input_dim, output_dim, input_length
    self.batch_size = 64
    self.epochs = 50
    self.history = None
    self.model_path = None

  def build_model_variant1(self, shape=()):

    '''
    A simple Bidirectional LSTM model
    '''

    self.model = Sequential([
        
      Input(shape=(self.input_length, self.input_dim)),
      Bidirectional(LSTM(64,return_sequences=False)),
      Dense(self.input_dim, activation="softmax"),

    ])

    optimizer = RMSprop(learning_rate=1e-2, decay=1e-6)
    self.model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    self.model.summary() 

  def build_model_variant2(self, shape=()):

    '''
    A single LSTM layer model
    '''

    self.model = Sequential([
        
      Input(shape=(self.input_length, self.input_dim)),
      LSTM(128,return_sequences=False),
      Dense(self.input_dim, activation="softmax"),

    ])

    optimizer = RMSprop(learning_rate=1e-2, decay=1e-6)

    self.model.compile(loss = 'categorical_crossentropy', optimizer = optimizer)

    self.model.summary()  

  def model_fit(self, batch_size=None, epochs = None, name=""):

    if batch_size is not None:
      self.batch_size = batch_size

    if epochs is not None:
      self.epochs = epochs

    # save model history locally
    # https://stackoverflow.com/questions/47843265/how-can-i-get-a-keras-models-history-after-loading-it-from-a-file-in-python
    from keras.callbacks import CSVLogger
    tagger = str(time.time())[0:10]
    csv_logger = CSVLogger(name + "_" + tagger + '_training.log', separator=',', append=False)

    self.history = self.model.fit(self.X_train, self.y_train, \
                                  batch_size=self.batch_size, \
                                  epochs=self.epochs, \
                                  validation_split=0.15, \
                                  callbacks=[csv_logger],
                                  verbose=2)
    # save model
    path = os.getcwd() + "/" + name + "_" + tagger + "_loss_" + \
          str(self.history.history["val_loss"][-1])[0:6].replace(".", "_") + \
          "_model.h5"
    self.model_path = path
    save_model(self.model, path)


  def model_predict(self, X_test = None, y_test = None):
    if X_test is not None:
      self.X_test = X_test

    if y_test is not None:
      self.y_test = y_test

    self.y_pred = self.model.predict(self.X_test)

    for y in self.y_pred:
      for ind, val in enumerate(y):
        if val == max(y):
          y[ind] = 1.0
        else:
          y[ind] = 0.0


  def model_evaluate(self):
    
    for y in self.y_pred:
      for ind, val in enumerate(y):
        if val == max(y):
          y[ind] = 1.0
        else:
          y[ind] = 0.0

    return accuracy_score(self.y_test, self.y_pred)


  def load_model(self, path):
    self.model = load_model(path)

## Evaluate

### Swahili

In [4]:
from time import sleep

# SWAHILI
s_p = Preprocessing()
s_p.read_file(path_sw)
s_p.clean_data()
s_p.set_vocab_encoder_decoder()

input_dim = len(s_p.vocab)
inputlen = emb_dim = 40

data_len = len(s_p.data[0])

counter = 0

# Train in stages to prevent session crashing for a total of 20 epochs
while counter < 20:
  sleep(60)
  start = int((data_len / 160) * counter)
  end = int(((data_len / 160) * (counter+1)) - 1)

  s_p.read_file(path_sw)
  s_p.clean_data()

  sw_X_train, sw_X_test = s_p.set_X_y(sample_size = (start, end))

  input_dim = len(s_p.vocab)
  inputlen = emb_dim = 40

  sw_X_train, sw_X_test, sw_y_train, sw_y_test = train_test_split(sw_X_train, sw_X_test, test_size=0.1, random_state = 42)

  if counter == 0:
    s_m = Model(sw_X_train, sw_X_test, sw_y_train, sw_y_test, input_dim, emb_dim, inputlen)
    s_m.build_model_variant2()
    print('Counter ', counter)

  else:
    s_m.load_model(model_path)

  counter += 1
  optimizer = RMSprop(learning_rate=1e-2, decay=1e-6)
  s_m.model.compile(loss = 'categorical_crossentropy', optimizer = optimizer)

  s_m.model_fit(batch_size = 256, epochs = 1, name="sw")
  model_path = s_m.model_path
  print(model_path)



# # sm = make_model(path_sw)
# # km = make_model(path_kw)


0 245380
Number of sequences: 122670
(122670, 40, 48) (122670, 48)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               90624     
                                                                 
 dense (Dense)               (None, 48)                6192      
                                                                 
Total params: 96,816
Trainable params: 96,816
Non-trainable params: 0
_________________________________________________________________
Counter  0
367/367 - 10s - loss: 2.3028 - val_loss: 2.0554 - 10s/epoch - 27ms/step
/content/sw_1667446371_loss_2_0554_model.h5
245381 490762
Number of sequences: 122671
(122671, 40, 48) (122671, 48)
367/367 - 5s - loss: 1.9699 - val_loss: 1.9198 - 5s/epoch - 14ms/step
/content/sw_1667446446_loss_1_9197_model.h5
490763 736143
Number of sequences: 122670
(122670, 40, 48) (122670, 

### Kwere

In [5]:
# KWERE

k_p = Preprocessing()
k_p.read_file(path_kw)
k_p.clean_data()
k_p.set_vocab_encoder_decoder()

print('Data ingested')

kw_X, kw_y = k_p.set_X_y(sample_size = (0,600000))

print('X, y set')

input_dim = len(k_p.vocab)
inputlen = emb_dim = 40

kw_X_train, kw_X_test, kw_y_train, kw_y_test = train_test_split(kw_X, kw_y, test_size=0.2, random_state = 42)
k_m = Model(kw_X_train, kw_X_test, kw_y_train, kw_y_test, input_dim, emb_dim, inputlen)


Data ingested
Number of sequences: 301696
(301696, 40, 31) (301696, 31)
X, y set


In [6]:
k_m.build_model_variant2()
k_m.model_fit(batch_size = 512, epochs = 15, name="kw")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               81920     
                                                                 
 dense (Dense)               (None, 31)                3999      
                                                                 
Total params: 85,919
Trainable params: 85,919
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
401/401 - 11s - loss: 1.7854 - val_loss: 1.4575 - 11s/epoch - 29ms/step
Epoch 2/15
401/401 - 3s - loss: 1.3314 - val_loss: 1.2814 - 3s/epoch - 8ms/step
Epoch 3/15
401/401 - 3s - loss: 1.2069 - val_loss: 1.2229 - 3s/epoch - 8ms/step
Epoch 4/15
401/401 - 3s - loss: 1.1437 - val_loss: 1.1833 - 3s/epoch - 8ms/step
Epoch 5/15
401/401 - 3s - loss: 1.1027 - val_loss: 1.1627 - 3s/epoch - 8ms/step
Epoch 6/15
401/401 - 3s - loss: 1.0713 - val_

In [7]:
k_m.model_predict()
k_m.model_evaluate()



0.6457739476300961

## Cross-Val Loss Evaluation

In [5]:
from math import log2

def evaluate_one(lang, model, vocab, encoder):

  '''
  calculate cross-entropy loss
  iteratively read the test file's characters, calculate loss of each
  and subtract from total loss
  '''

  testfile = open(lang+'-test.txt', 'r')
  
  max_history = 30 #max sentence length
  history = []
  loss_anything_goes = 0
  count = 0

  while True:
    c = testfile.read(1)

    if c=='\n':
      continue

    if not c:
      break

    count += 1

    loss_anything_goes -= log2(predict_next_proba(c, model, vocab, encoder, history))

    if len(history) == max_history:
      history.pop(0)

    history.append(c)

  return [loss_anything_goes/count]


def predict_next_proba(c, model, vocab, encoder, history):
  '''
  returns the probability of the expected character
  '''

  inputlen = 40
  x_test = np.zeros((1, inputlen, len(vocab)),dtype=float)

  # create a matrix of current sequence of characters,
  for ind,val in enumerate(history[:-1]):   
      x_test[0, ind, encoder[val]] = 1

  # pass our x into the model and return a prediction matrix
  y_pred = model.predict(x_test, verbose=0)[0]

  # return the computed probability of our character
  proba = y_pred[encoder[c]]

  return proba


In [None]:
swahili_loss = evaluate_one("sw", s_m.model,s_p.vocab, s_p.encoder)

In [None]:
print("Swahili Cross Entropy Loss: ", swahili_loss)

In [10]:
kwere_loss = evaluate_one("cwe", k_m.model,k_p.vocab, k_p.encoder)

In [11]:
print("Kwere Cross Entropy Loss: ", kwere_loss)

Kwere Cross Entropy Loss:  [4.701911114505756]
