In [34]:

from google.colab import drive
drive.mount('/content/gdrive/') 
 
!pip install morfessor
import morfessor
import os
import time
import math
import statistics
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from io import open
import numpy as np
import random

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


## Model 1: Morfessor-Baseline

In [35]:
#Model 1: Morfessor-Baseline
class baseline:


    def __init__(self):
        self.info = ''
        self.scores = None

#Trainer: trains from data in inFile and outputs binary trained model to modelFile
    def trainModel(self, inFile, modelFile):
        io = morfessor.MorfessorIO()
        train_data = list(io.read_corpus_file(inFile))

        print('Training now...')
        model = morfessor.BaselineModel()
        model.load_data(train_data)
        model.train_batch()

        io.write_binary_model_file(modelFile, model)
        
#Evaluater: test trained model from modelFile on data at inFile and outputs results to outFile on a given language
    def evaluateModel(self, inFile, outFile, modelFile, language, evaluate = True):
        io = morfessor.MorfessorIO()
        model = io.read_binary_model_file(modelFile)

        segmentations = []
        f = open(inFile, 'r')
        for lines in f:
            segmentations.append(model.viterbi_segment(lines.split()[0])[0])

        if evaluate == True:
            util = utilities()
            self.scores = util.evalMorphSegments(segmentations, util.loadEvaluationData(inFile)[1])
            f = open(outFile, 'a')
            self.info = ('\n' + language + ':\nTokens: ' + str(util.loadEvaluationData(inFile)[0]) + '\nActual segmentations: ' + 
                        str(util.loadEvaluationData(inFile)[1]) + '\nPredicted segmentations: ' + 
                        str(segmentations) + '\nScores: Precsion: ' + str(self.scores[0]) + 
                        ' Recall: ' + str(self.scores[1]) + 
                        ' F-score: ' + str(self.scores[2]) + '\n')
            f.write(self.info)
            print(self.info)

        return segmentations
       

    def getScores(self):
        return self.scores



## Useful methods for LM models

In [36]:
#Useful methods for LM1 and LM2
class utilities:

    #Loads data from corpus
    def loadEvaluationData(self, file):
        g = []
        h = []
        f = open(file,'r')
        for line in f:
            tokens = line.split()
            g.append(tokens[0])
            temp = []
            for j in range(1, len(tokens)):
                temp.append(tokens[j])
            h.append(temp)
        return [g,h]    


    #reverses elements in a 2D array with the option to exculde last element per sub array from this reversal
    def reverseListElem(self, list, exceptLastElem = False):
        revList = []
        if exceptLastElem:  
            temp = len(list) 
            for i in range(temp):
                revList.append(list[i][::-1][1:])
                revList[i].append(list[i][-1]) 
        else:
            temp = len(list) 
            for i in range(temp):
                revList.append(list[i][::-1]) 
                    
        return revList
            

    #Determines score for a list of segmentations
    def evalMorphSegments(self, predicted, target):
        correct = 0.0
        for pred, targ in zip(predicted, target):
            for p in pred:
                if p in targ:
                    correct += 1

        predicted_length = sum([len(pred) for pred in predicted])
        target_length = sum([len(targ) for targ in target])
        precision, recall = correct/predicted_length, correct/target_length
        f_score = 2/(1/precision + 1/recall)
        return (precision, recall, f_score)


    #computes entropy for a probabilty distribution
    def entropy(self, distribution):
        if len(distribution) < 1:
            return 0

        entropy_acc = 0
        for probability in distribution:
            entropy_acc += probability * math.log(probability,2)
        return -entropy_acc 


    #Segments tokens according to specified objective function
    def segmenter(self, tokens, lentropies, rentropies, language, objectiveFunction):
        count = 0
        morphList = []
        for i in range(len(lentropies)):
            if objectiveFunction == 1:
                segmentList = self.objectiveFunction3(lentropies[i], rentropies[i], language)
            elif objectiveFunction == 2:
                segmentList = self.objectiveFunction2(lentropies[i], rentropies[i], language)
            elif objectiveFunction == 3:
                segmentList = self.objectiveFunction3(lentropies[i], rentropies[i], language)
            elif objectiveFunction == 4:
                segmentList = self.objectiveFunction4(lentropies[i], rentropies[i], language)
            elif objectiveFunction == 5:
                segmentList = self.objectiveFunction5(lentropies[i], rentropies[i], language)
            else: 
                segmentList = self.objectiveFunction6(lentropies[i], rentropies[i], language)
            
            s = list(zip(segmentList, tokens[i]))
            temp = []
            tempString = ''
            for j in range(len(s)):
                if s[j][0] == 0:
                    tempString+=s[j][1]
                else:
                    tempString+=s[j][1]
                    temp.append(tempString)
                    tempString = ''

            morphList.append(temp)
        return morphList


    #Segment words at random locations
    def objectiveFunction1(self, lwordentropies, rwordentropies, language):
        segmentArray = []
        if len(lwordentropies) > 1:
            for i in range(len(lwordentropies)-1):
                segmentArray.append(random.randint(0, 1))
        segmentArray.append(1)
        return segmentArray


    #Segments based on experimentally determined constants based on language
    def objectiveFunction2(self, lwordentropies, rwordentropies, language):
        segmentArray = []
        if language == 'xhosa':
            val = 4
        elif language == 'zulu':
            val = 3
        elif language == 'swati':
            val = 12
        else:
            val = 2.5
        for i in range(0, len(lwordentropies)-1):
            if lwordentropies[i] + rwordentropies[i] > val:
                segmentArray.append(1)
            else:
                segmentArray.append(0)
        segmentArray.append(1)
        return segmentArray


    #Segments using right and left entropy and mean with standard deviation 
    def objectiveFunction3(self, lwordentropies, rwordentropies, language):
        if len(lwordentropies) == 1:
            return [1]
        elif len(lwordentropies) == 2:
            return [0, 1]
        elif len(lwordentropies) > 2:
            segmentArray = []
            numListl = []
            numListr = []
            for j in range(len(lwordentropies)-1):
                numListl.append(lwordentropies[j].item())
                numListr.append(rwordentropies[j].item())
            meanl = statistics.mean(numListl)
            meanr = statistics.mean(numListr)
            stdevl = statistics.stdev(numListl)
            stdevr = statistics.stdev(numListr)
            mean, stdev = (meanl + meanr)/2, (stdevl + stdevr)/2
            for i in range(len(numListr)):
                if (numListl[i] + numListr[i])/2 <= (mean + 1*stdev):
                    segmentArray.append(0)
                else:
                    segmentArray.append(1)
            segmentArray.append(1)
            return segmentArray


    #Uses entropy at neighbouring positions at a each index to determine a segmentation
    def objectiveFunction4(self, lwordentropies, rwordentropies, language):
        if len(lwordentropies) == 1:
            return [1]
        elif len(lwordentropies) == 2:
            return [0, 1]
        elif len(lwordentropies) > 2:
            segmentArray = []
            x2 = lwordentropies[1]
            x1 = lwordentropies[0]
            y2 = rwordentropies[1]
            y1 = rwordentropies[0]
            if x1 > x2 and y1 > y2:
                segmentArray.append(1)
            else:
                segmentArray.append(0)
            
            if len(lwordentropies) > 3:
                for i in range(1, len(lwordentropies)-2):
                    if lwordentropies[i] > lwordentropies[i-1] and lwordentropies[i] > lwordentropies[i+1] or rwordentropies[i] > rwordentropies[i-1] and rwordentropies[i] > rwordentropies[i+1]:
                        segmentArray.append(1)
                    else:
                        segmentArray.append(0)

            x2 = lwordentropies[len(lwordentropies)-2]
            x1 = lwordentropies[len(lwordentropies)-3]
            y2 = rwordentropies[len(lwordentropies)-2]
            y1 = rwordentropies[len(lwordentropies)-3]
            if x1 > x2 and y1 > y2:
                segmentArray.append(1)
            else:
                segmentArray.append(0)
            
            segmentArray.append(1)
            return segmentArray


    #Segmentation is based on the mean and standard deviation of left entropy 
    def objectiveFunction5(self, lwordentropies, rwordentropies, language):
        if len(lwordentropies) == 1:
            return [1]
        elif len(lwordentropies) == 2:
            return [0, 1]
        elif len(lwordentropies) > 2:
            segmentArray = []
            numListl = []
            numListr = []
            for j in range(len(lwordentropies)-1):
                numListl.append(lwordentropies[j].item())
                numListr.append(rwordentropies[j].item())
            meanl = statistics.mean(numListl)
            meanr = statistics.mean(numListr)
            stdevl = statistics.stdev(numListl)
            stdevr = statistics.stdev(numListr)
            mean, stdev = (meanl + meanr)/2, (stdevl + stdevr)/2
            for i in range(len(numListr)):
                if numListl[i] <= (mean + 1*stdev):
                    segmentArray.append(0)
                else:
                    segmentArray.append(1)
            segmentArray.append(1)
            return segmentArray
    

    #Segmentation based on neigbouring positions in a taoken using left entropy only 
    def objectiveFunction6(self, lwordentropies, rwordentropies, language):
        if len(lwordentropies) == 1:
            return [1]
        elif len(lwordentropies) == 2:
            return [0, 1]
        elif len(lwordentropies) > 2:
            segmentArray = []
            x2 = lwordentropies[1]
            x1 = lwordentropies[0]
            y2 = rwordentropies[1]
            y1 = rwordentropies[0]
            if x1 > x2 and y1 > y2:
                segmentArray.append(1)
            else:
                segmentArray.append(0)
            
            if len(lwordentropies) > 3:
                for i in range(1, len(lwordentropies)-2):
                    if lwordentropies[i] > lwordentropies[i-1] and lwordentropies[i] > lwordentropies[i+1]:
                        segmentArray.append(1)
                    else:
                        segmentArray.append(0)

            x2 = lwordentropies[len(lwordentropies)-2]
            x1 = lwordentropies[len(lwordentropies)-3]
            y2 = rwordentropies[len(lwordentropies)-2]
            y1 = rwordentropies[len(lwordentropies)-3]
            if x1 > x2 and y1 > y2:
                segmentArray.append(1)
            else:
                segmentArray.append(0)
            
            segmentArray.append(1)
            return segmentArray
    

    #word segmentation
    def objectiveFunction7(self, lwordentropies, rwordentropies, language):
        segmentArray = []
        for i in range(0, len(lwordentropies)-1):
                segmentArray.append(0)
        segmentArray.append(1)
        return segmentArray

## Model 2: LM1

In [41]:
#Model 2: LM1
# Dictionary and Corpus does character indexing for LM1
class Dictionary(object):


    def __init__(self):
        self.char2idx = {}
        self.idx2char = []
 

    def add_char(self, char):
        if char not in self.char2idx:
            self.idx2char.append(char)
            self.char2idx[char] = len(self.idx2char) - 1
        return self.char2idx[char]
 

    def __len__(self):
        return len(self.idx2char)
 
 
class Corpus(object):


    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(path + 'train.txt')
        self.valid = self.tokenize(path + 'valid.txt')
        self.test = self.tokenize(path + 'test.txt')
 

    def tokenize(self, path):
        '''Tokenizes a text file.'''
        assert os.path.exists(path)
        # Add chars to the dictionary
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                chars = line.split() 
                for char in chars:
                    self.dictionary.add_char(char)
                    self.dictionary.add_char('%') 
 
 
        # Tokenize file content
        with open(path, 'r', encoding='utf8') as f:
            idss = []
            for line in f:
                chars = line.split()
                ids = []
                for char in chars:
                    ids.append(self.dictionary.char2idx[char])
                ids.append(self.dictionary.char2idx['%'])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)
 
        return ids



#LM1 LSTM 
class RNNModel(nn.Module):
    '''Container module with an encoder, a recurrent module, and a decoder.'''


    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( '''An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']''')
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers


    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)


    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden


    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)



#LM1 trainer: contains methods necessary for LM1 training 
class trainModel:


    def __init__(self, datasets, save, model ='LSTM', emsize =200, nhid =200, nlayers =2,
                lr =20.0, clip =0.25, epochs =1, batch_size =20, bptt =35, dropout =0.2, tied =False, 
                seed =1111, cuda =False, log_interval =200,  nhead =2,
               dry_run =False):

        torch.manual_seed(seed)
        if torch.cuda.is_available():
            if not cuda:
                print('WARNING: You have a CUDA device, so you should probably run with --cuda')
        device = torch.device('cuda' if cuda else 'cpu')

        #Load data
        corpus = Corpus(datasets)
        eval_batch_size = 10
        train_data = self.batchify(corpus.train, batch_size, device)
        val_data = self.batchify(corpus.valid, eval_batch_size, device)
        test_data = self.batchify(corpus.test, eval_batch_size, device)

        # Build the model
        ntokens = len(corpus.dictionary)
        model = RNNModel(model, ntokens, emsize, nhid, nlayers, dropout, tied).to(device)
        criterion = nn.NLLLoss()

        # Training code
        best_val_loss = None
        try:
            for epoch in range(1, epochs+1):
                epoch_start_time = time.time()
                self.train(model, corpus, batch_size, train_data, bptt, criterion, clip, lr, log_interval, dry_run, epoch)
                val_loss = self.evaluate(val_data, model, corpus, eval_batch_size, criterion, bptt)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                   val_loss, math.exp(val_loss)))
                print('-' * 89)
                if not best_val_loss or val_loss < best_val_loss:
                    with open(save, 'wb') as f:
                        torch.save(model, f)
                    best_val_loss = val_loss
                else:
                    lr /= 4.0
        except KeyboardInterrupt:
            print('-' * 89)
            print('Exiting from training early')

        # Load the best saved model.
        with open(save, 'rb') as f:
            model = torch.load(f)
            if model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
                model.rnn.flatten_parameters()

        # Run on test data.
        test_loss = self.evaluate(test_data, model, corpus, eval_batch_size, criterion, bptt)
        print('=' * 89)
        print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
            test_loss, math.exp(test_loss)))
        print('=' * 89)


    def batchify(self, data, bsz, device):
        nbatch = data.size(0) // bsz
        data = data.narrow(0, 0, nbatch * bsz)
        data = data.view(bsz, -1).t().contiguous()
        return data.to(device)


    def repackage_hidden(self, h):
        if isinstance(h, torch.Tensor):
            return h.detach()
        else:
            return tuple(self.repackage_hidden(v) for v in h)


    def get_batch(self, source, i, bptt):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i+seq_len]
        target = source[i+1:i+1+seq_len].view(-1)
        return data, target


    def evaluate(self, data_source, model, corpus, eval_batch_size, criterion, bptt):
        model.eval()
        total_loss = 0.
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, bptt):
                data, targets = self.get_batch(data_source, i, bptt)
                output, hidden = model(data, hidden)
                hidden = self.repackage_hidden(hidden)
                total_loss += len(data) * criterion(output, targets).item()
        return total_loss / (len(data_source) - 1)


    def train(self, model, corpus, batch_size, train_data, bptt, criterion, clip, lr, log_interval, dry_run, epoch):
        model.train()
        total_loss = 0.
        start_time = time.time()
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(batch_size)
        for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
            data, targets = self.get_batch(train_data, i, bptt)
            model.zero_grad()
            hidden = self.repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = criterion(output, targets)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            for p in model.parameters():
                p.data.add_(p.grad, alpha=-lr)

            total_loss += loss.item()

            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                        'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, lr,
                    elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
            if dry_run:
                break


#LM1 evaluater: uses a given trained and test data to produce and evaluate segmentations
class evaluateModel:	


    def __init__(self, datasets, testdata, output, lmodel, rmodel, language, objectiveFunction, evaluate = True):
        self.datasets = datasets
        self.testdata = testdata
        self.lmodel = lmodel
        self.rmodel = rmodel
        self.util = utilities()
        self.tokens = self.util.loadEvaluationData(testdata)[0]
        self.actual = self.util.loadEvaluationData(testdata)[1]
        self.entropy = self.getEntropy(self.tokens, datasets, lmodel, rmodel)
        self.prediction = self.util.segmenter(self.tokens, self.entropy[0], self.entropy[1], language, objectiveFunction) 
        if evaluate == True:
            self.evaluation = self.util.evalMorphSegments(self.prediction, self.actual)
            self.info = (language + ':\nTestdata: ' + str(self.testdata) + '\nTokens: ' + str(self.tokens) + 
                '\nReversed tokens: ' + str(self.util.reverseListElem(self.tokens)) + '\nActual segmentation: ' + 
                str(self.actual) + '\nPrediction: ' + str(self.prediction) + '\nLeft entropy: ' + 
                str(self.entropy[0]) + '\nRight entropy: ' + str(self.entropy[1]) + '\nScores: '+ 
                'Precision: ' + str(self.evaluation[0]) + ' Recall: ' + str(self.evaluation[1]) + 
                ' F-Score: ' + str(self.evaluation[2]) + '\n' + '-'*10000 + '\n')
            f = open(output, 'a')
            f.write(self.info)


    def getEntropy(self, inf, datasets, lcheckpoint, rcheckpoint, cuda = False, seed = 1111, temperature = 1.0):
        # Set the random seed manually for reproducibility.
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            if not cuda:
                print('WARNING: You have a CUDA device, so you should probably run with --cuda')
        device = torch.device('cuda' if cuda else 'cpu')
        if temperature < 1e-3:
            print('--temperature has to be greater or equal 1e-3')
            sys.exit()
        
        f = open(lcheckpoint, 'rb')
        leftModel = torch.load(f).to(device)
        leftModel.eval()

        f = open(rcheckpoint, 'rb')
        rightModel = torch.load(f).to(device)
        rightModel.eval()
    
        corpus = Corpus(datasets)
        ntokens = len(corpus.dictionary)

        leftEntropy = []
        rightEntropy = []
        i = 0

        with torch.no_grad():  # no tracking history
            for words in inf:
                words = words.strip()           
                leftEntropy.append([])
                rightEntropy.append([])
                lefthidden = leftModel.init_hidden(1)
                righthidden = rightModel.init_hidden(1)
                leftinput = torch.tensor([[corpus.dictionary.char2idx[words[0]]]], dtype=torch.long).to(device) 
                rightinput = torch.tensor([[corpus.dictionary.char2idx[words[-1]]]], dtype=torch.long).to(device) 

                for j in range(len(words)-1):
                    #left model
                    output, lefthidden = leftModel(leftinput, lefthidden)
                    char_weights = output.squeeze().div(temperature).exp().cpu()
                    leftEntropy[i].append(self.util.entropy(char_weights[:]))
                    leftinput.fill_(corpus.dictionary.char2idx[words[j]])

                    #right model
                    output, righthidden = rightModel(rightinput, righthidden)
                    char_weights = output.squeeze().div(temperature).exp().cpu()
                    rightEntropy[i].append(self.util.entropy(char_weights[:]))
                    rightinput.fill_(corpus.dictionary.char2idx[words[len(words)-j-1]])

                leftEntropy[i].append('$')
                rightEntropy[i].append('$')
                i += 1

        return (leftEntropy, self.util.reverseListElem(rightEntropy, True))

    
    def getScores(self):
        return self.evaluation


    def getInfo(self):
        return self.info

    def getPredictedSegmentations(self):
        return self.prediction


## Model 3: LM2

In [48]:
#Model 3: LM2 
#Converts sequences to vectors that the LSTM can intepret 
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot


#LM2 LSTM
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_steps =100, n_hidden =512, n_layers =2, drop_prob =0.5, lr =0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # Creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## Define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## Define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## Define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
        # Initialize the weights
        self.init_weights()
      
    
    def forward(self, x, hc):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hc`. '''
        
        ## Get x, and the new hidden state (h, c) from the lstm
        x, (h, c) = self.lstm(x, hc)
        
        ## Ppass x through the dropout layer
        x = self.dropout(x)
        
        # Stack up LSTM outputs using view
        x = x.contiguous().view(-1, self.n_hidden)
        ## Put x through the fully-connected layer
        x = self.fc(x)
        
        # Return x and the hidden state (h, c)
        return x, (h, c)
    
    
    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Given a character, predict the next character.
        
            Returns the predicted character and the hidden state.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        
        inputs = torch.from_numpy(x)
        
        if cuda:
            inputs = inputs.cuda()
        
        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)
 
        p = F.softmax(out, dim=1).data
        
        if cuda:
            p = p.cpu()
        
        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        
        char = np.random.choice(top_ch, p=p/p.sum())
            
        return p, h
    
 
    def init_weights(self):
        ''' Initialize weights for fully connected layer '''
        initrange = 0.1
        
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-1, 1)
        
 
    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())
        


#LM2 trainer: trains LM2 with given data and a output path
class trainModel_:

    def __init__(self, dataPath, savePath, n_hidden =514, nlayers =2,
            lr =0.001, clip =5, epochs =1, n_seqs =128, n_steps =100, dropout =0.5,
                cuda =False):
    
        with open(dataPath, 'r', encoding='utf8') as f:
            text = f.read()

        chars = tuple(set(text))
        int2char = dict(enumerate(chars))
        char2int = {ch: ii for ii, ch in int2char.items()}
        encoded = np.array([char2int[ch] for ch in text])
        batches = self.get_batches(encoded, n_seqs, n_steps)
        x, y = next(batches)
        net = CharRNN(chars, n_steps, n_hidden, nlayers, dropout, lr)
        print(net)
        
        self.train(net, encoded, epochs, n_seqs, n_steps, lr, clip, cuda=False, print_every=1)
        
        checkpoint = {'n_hidden': net.n_hidden,
                    'n_layers': net.n_layers,
                    'state_dict': net.state_dict(),
                    'tokens': net.chars}
        with open(savePath, 'wb') as f:
            torch.save(checkpoint, f)
 
 
 
    def train(self, net, data, epochs, n_seqs, n_steps, lr, clip, cuda, print_every, val_frac=0.2):
        ''' Training a network 
        
            Arguments
            ---------
            
            net: CharRNN network
            data: text data to train the network
            epochs: Number of epochs to train
            n_seqs: Number of mini-sequences per mini-batch, aka batch size
            n_steps: Number of character steps per mini-batch
            lr: learning rate
            clip: gradient clipping
            val_frac: Fraction of data to hold out for validation
            cuda: Train with CUDA on a GPU
            print_every: Number of steps for printing training and validation loss
        
        '''
        
        net.train()
        
        opt = torch.optim.Adam(net.parameters(), lr=lr)
        
        criterion = nn.CrossEntropyLoss()
        
        # create training and validation data
        val_idx = int(len(data)*(1-val_frac))
        data, val_data = data[:val_idx], data[val_idx:]
        
        if cuda:
            net.cuda()
        
        counter = 0
        n_chars = len(net.chars)
        
        for e in range(epochs):
            
            h = net.init_hidden(n_seqs)
            
            for x, y in self.get_batches(data, n_seqs, n_steps):
                
                counter += 1
                
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_chars)
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                
                if cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
    
                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])
    
                net.zero_grad()
                
                output, h = net.forward(inputs, h)
                
                loss = criterion(output, targets.view(n_seqs*n_steps).type(torch.LongTensor))
    
                loss.backward()
                
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(net.parameters(), clip)
    
                opt.step()
                
                if counter % print_every == 0:
                    
                    # Get validation loss
                    val_h = net.init_hidden(n_seqs)
                    val_losses = []
                    
                    for x, y in self.get_batches(val_data, n_seqs, n_steps):
                        
                        # One-hot encode our data and make them Torch tensors
                        x = one_hot_encode(x, n_chars)
                        x, y = torch.from_numpy(x), torch.from_numpy(y)
                        
                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                        val_h = tuple([each.data for each in val_h])
                        
                        inputs, targets = x, y
                        if cuda:
                            inputs, targets = inputs.cuda(), targets.cuda()
    
                        output, val_h = net.forward(inputs, val_h)
                        val_loss = criterion(output, targets.view(n_seqs*n_steps).type(torch.LongTensor))
                    
                        val_losses.append(val_loss.item())
                    
                    print("Epoch: {}/{}...".format(e+1, epochs),
                        "Step: {}...".format(counter),
                        "Loss: {:.4f}...".format(loss.item()),
                        "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    

    def get_batches(self, arr, n_seqs, n_steps):
        '''Create a generator that returns batches of size
        n_seqs x n_steps from arr.
        
        Arguments
        ---------
        arr: Array you want to make batches from
        n_seqs: Batch size, the number of sequences per batch
        n_steps: Number of sequence steps per batch
        '''
        
        batch_size = n_seqs * n_steps
        n_batches = len(arr)//batch_size
        
        # Keep only enough characters to make full batches
        arr = arr[:n_batches * batch_size]
        
        # Reshape into n_seqs rows
        arr = arr.reshape((n_seqs, -1))
        
        for n in range(0, arr.shape[1], n_steps):
            
            # The features
            x = arr[:, n:n+n_steps]
            
            # The targets, shifted by one
            y = np.zeros_like(x)
            
            try:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_steps]
            except IndexError:
                y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
            yield x, y



#LM2 evaluater: generates and evaluates segmentations given a trained model and test data
class evaluateModel_:

    def __init__(self, testData, output, lmodel, rmodel, language, objectiveFunction, evaluate = True):
        
        with open(lmodel, 'rb') as f:
            checkpoint = torch.load(f)
        self.lloaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
        self.lloaded.load_state_dict(checkpoint['state_dict'])
        
        with open(rmodel, 'rb') as g:
            checkpoint = torch.load(g)
        self.rloaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
        self.rloaded.load_state_dict(checkpoint['state_dict'])

        lentropies = []
        rentropies = []
        util = utilities()
        evalData = util.loadEvaluationData(testData)

        for word in evalData[0]:
            left = []
            right = []
            for i in range(len(word)-1):

                lprobabilityDist = self.lloaded.predict(word[i])[0]
                lcharEntropy = util.entropy(lprobabilityDist)
                left.append(lcharEntropy)
                rprobabilityDist = self.rloaded.predict(word[len(word)-1-i])[0]
                rcharEntropy = util.entropy(rprobabilityDist)
                right.append(rcharEntropy)

            left.append('$')
            right.append('$')
            lentropies.append(left)
            rentropies.append(right)

        rentropies = util.reverseListElem(rentropies, True)
        
        self.prediction = util.segmenter(evalData[0], lentropies, rentropies, language, objectiveFunction)
        if evaluate == True:
            self.evaluation = util.evalMorphSegments(self.prediction, evalData[1])
            self.info = (language + ':\nTest data: ' + str(testData) + '\nTokens: ' + str(evalData[0]) + 
                '\nReversed tokens: ' + str(util.reverseListElem(evalData[0])) + '\nActual segmentation: ' + 
                str(evalData[1]) + '\nPrediction: ' + str(self.prediction) + '\nLeft entropy: ' + 
                str(lentropies) + '\nRight entropy: ' + str(rentropies) + '\nScores: '+ 
                'Precision: ' + str(self.evaluation[0]) + ' Recall: ' + str(self.evaluation[1]) + 
                ' F-Score: ' + str(self.evaluation[2]) + '\n' + '-'*10000 + '\n')
            with open(output, 'a') as g:
                g.write(self.info)
                    
                   
    def getScores(self):
        return self.evaluation


    def getInfo(self):
        return self.info


    def getPredictedSegmentations(self):
        return self.prediction


## Driver functions

In [77]:
def trainMorfessor():  
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    for language in languages:
        path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/morfessor_data/'
        m = baseline()
        m.trainModel(path + language + '.train.txt', path + 'saved_models/' + language)


def evalMorfessor():  
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    precision, recall, f_score = 0, 0, 0
    for language in languages:
        path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/morfessor_data/'
        m = baseline()
        s = m.evaluateModel(path + language + '.test.txt', path + 'results.txt', path + 'saved_models/' + language, language)
        precision+=m.getScores()[0]
        recall+=m.getScores()[1]
        f_score+=m.getScores()[2]
        print('\nScores for ', language, ': Precsion: ', m.getScores()[0], ' Recall: ', m.getScores()[1], ' F-score: ', m.getScores()[2])
    av = '\nAverages: Precision: ' + str(precision/4) + ' Recall: ' + str(recall/4) + ' F-Score: ' + str(f_score/4)
    f = open(path + 'results.txt','a')
    f.write(av)
    print(av)
    

def trainLM1():
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/entropy_data/'
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    for language in languages:
        trainModel(path + language + '_data/left/' + language + '.', path + 'LM1_models/' + language + '_left.pt')
        trainModel(path + language + '_data/right/' + language + '.', path + 'LM1_models/' + language + '_right.pt')


def evalLM1(objectiveFunction):
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/entropy_data/'
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    precision, recall, f_score = 0, 0, 0
    open(path + 'LM1_models/results.txt', 'w')
    for language in languages:
        s = evaluateModel(path + language + '_data/left/' + language + '.', path + language + '_data/evaluate.txt', path + 'LM1_models/results.txt', path + 'LM1_models/' + language + '_left.pt', path + 'LM1_models/' + language + '_right.pt', language, objectiveFunction)
        print(s.getInfo())
        precision+=s.getScores()[0]
        recall+=s.getScores()[1]
        f_score+=s.getScores()[2]
    av = '\nAverages: Precision: ' + str(precision/4) + ' Recall: ' + str(recall/4) + ' F-Score: ' + str(f_score/4)
    f = open(path + 'LM1_models/results.txt','a')
    f.write(av)
    print(av)


def trainLM2():
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/entropy_data/'
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    for language in languages:
        spath = path + 'LM2_models/'
        lname = language + '_left.net'
        lpath = path + language + '_data/left/train.txt'
        rname = language + '_right.net'
        rpath = path + language + '_data/right/train.txt'
        trainModel_(lpath, spath + lname)
        trainModel_(rpath, spath + rname)


def evalLM2(objectiveFunction): 
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/entropy_data/'
    languages = ['ndebele', 'swati', 'xhosa', 'zulu']
    precision, recall, f_score = 0, 0, 0
    open(path + 'LM2_models/results.txt','w')
    for language in languages:
        s = evaluateModel_(path + language + '_data/evaluate.txt', path + 'LM2_models/results.txt', path + 'LM2_models/' + language + '_left.net', path + 'LM2_models/' + language + '_right.net', language, objectiveFunction)
        print(s.getInfo())
        precision+=s.getScores()[0]
        recall+=s.getScores()[1]
        f_score+=s.getScores()[2]
    av = '\nAverages: Precision: ' + str(precision/4) + ' Recall: ' + str(recall/4) + ' F-Score: ' + str(f_score/4)
    f = open(path + 'LM2_models/results.txt','a')
    f.write(av)
    print(av)


def evalInteractive(model):
    print('Enter a list of words belonging to a single language that you would like to segment:')
    words = input().split()
    print('Enter the language of the words given: (zulu, xhosa, ndebele, swati) ')
    language = input()
    print('Enter objective function would you like to use to segment: [1-6]')
    objectiveFunction = input()
    print('Segmenting', words, 'belonging to', language, 'using', model,'with objective function', objectiveFunction)
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/entropy_data/'
    evaluationPath = path + 'interactive.txt'
    f = open(evaluationPath, 'w')
    for w in words:
        f.write(w)
        f.write(' '+ w + '\n')
    f.close()
    s = None
    if model == 'LM1':
        s = evaluateModel(path + language + '_data/left/' + language + '.', evaluationPath, path + 'LM1_models/results.txt', path + 'LM1_models/' + language + '_left.pt', path + 'LM1_models/' + language + '_right.pt', language, objectiveFunction, False)
    else:
        s = evaluateModel_(evaluationPath, path + 'LM2_models/results.txt', path + 'LM2_models/' + language + '_left.net', path + 'LM2_models/' + language + '_right.net', language, objectiveFunction, False)
    print(s.getPredictedSegmentations())


def evalMorfessorInteractive():
    print('Enter a list of words belonging to a single language that you would like to segment:')
    words = input().split()
    print('Enter the language of the words given: (zulu, xhosa, ndebele, swati) ')
    language = input()
    path = '/content/gdrive/My Drive/Colab Notebooks/colab_data/morfessor_data/'
    evaluationPath = path + 'interactive.txt'
    f = open(evaluationPath, 'w')
    for w in words:
        f.write(w)
        f.write(' '+ w + '\n')
    f.close()
    b = baseline()
    s = b.evaluateModel(evaluationPath, None, path + 'saved_models/' + language, language, False)
    print('Segmenting', words, 'belonging to', language, 'using Morfessor\n', s)

## Main

In [80]:
#print('-'*1000, '\nMorfessor: ')
#trainMorfessor()
#evalMorfessor()
#print('-'*1000, '\nLM1: ')
#trainLM1()
#evalLM1(2)
#print('-'*1000, '\nLM2: ')
#trainLM2()
#evalLM2(2)
#evalMorfessorInteractive()
#evalInteractive('LM1')
evalInteractive('LM2')


Enter a list of words belonging to a single language that you would like to segment:
ifom yesicelo okanye
Enter the language of the words given: (zulu, xhosa, ndebele, swati) 
xhosa
Enter objective function would you like to use to segment: [1-6]
2
Segmenting ['ifom', 'yesicelo', 'okanye'] belonging to xhosa using LM2 with objective function 2
[['i', 'fom'], ['yes', 'ic', 'elo'], ['o', 'kanye']]
