In [None]:
#Uncomment the next two lines to use this code in Google Colab
#from google.colab import drive
#drive.mount('/content/gdrive')

#Importing Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import time
import math
from torch.utils.data import DataLoader

In [None]:
#Initializing hyper-parameters and run settings
context_size = 2        #Dont Change: number of context words fed to the model per prediction
embedded_size = 256     #width of the embedding vectors and of the hidden layer
batch_num = 500         #NOTE(review): not referenced by the visible cells (BATCH_NUM is) - confirm before removing
epoch = 3               #number of passes over the training data
LR = 0.01               #initial learning rate handed to the optimizer
BATCH_SIZE = 1          #Dont Change
BATCH_NUM = 10000       #a training log line is emitted every BATCH_NUM batches
DROPOUT = 0.5           #probability passed to nn.Dropout inside the model

In [None]:
#Models Class
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context token ids are embedded, the embeddings are concatenated,
    passed through one hidden ReLU layer with dropout, and projected onto
    the vocabulary. The output is a log-probability distribution over the
    vocabulary for each context.

    Args:
        vocab_size: number of rows of the embedding table and number of
            output classes; must be larger than the biggest token id used.
        embedded_size: size of each embedding vector and of the hidden layer.
        context_size: number of context tokens per prediction.
        dropout: dropout probability. When None (the default) the
            module-level DROPOUT constant is used, as before.
    """

    def __init__(self, vocab_size, embedded_size, context_size, dropout=None):
        super(NGramLanguageModeler, self).__init__()
        if dropout is None:
            #Backward compatible default: fall back to the notebook-wide setting
            dropout = DROPOUT
        self.embeddings = nn.Embedding(vocab_size, embedded_size)
        self.linear1 = nn.Linear(context_size * embedded_size, embedded_size)
        self.linear2 = nn.Linear(embedded_size, vocab_size)
        #Drop out function
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return log-probabilities of shape (number of contexts, vocab_size).

        inputs is a LongTensor of token ids holding one or more contexts
        laid out back to back (context_size ids per context). A single
        context gives one output row, exactly like the previous
        view((1, -1)) implementation; longer inputs give one row per context.
        """
        #linear1.in_features == context_size * embedded_size, i.e. one flattened context
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)
        out = F.relu(self.linear1(embeds))
        out = self.dropout(out)
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

    def init_weights(self):
        """Uniformly re-initialise the weights in [-0.5, 0.5] and zero the biases.

        The previous version referenced self.embedding and self.fc, which do
        not exist on this class, so calling it raised AttributeError.
        """
        initrange = 0.5
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        for layer in (self.linear1, self.linear2):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.zero_()

In [None]:
#Batchifies the data for dataloader class
def generate_batch(batch):
    """Collate function turning (context_words, target_word) pairs into id tensors.

    Every word is looked up in the module-level ``tokens`` dictionary; words
    without an entry (e.g. unseen words from the validation or test data)
    are mapped to the id of "UNK".

    Args:
        batch: sequence of entries where entry[0] is an iterable of context
            words and entry[1] is the target word.

    Returns:
        (label, context): two 1-D LongTensors. ``label`` holds one id per
        entry; ``context`` holds the context ids of all entries concatenated
        in order. Both live on the GPU when CUDA is available and on the CPU
        otherwise (the previous unconditional .cuda() crashed without a GPU).
    """
    unk_id = tokens["UNK"]
    context = []
    target = []
    for entry in batch:
        #Handles unknown words from validation and testing data since they dont have tokens
        context.extend(tokens.get(word, unk_id) for word in entry[0])
        target.append(tokens.get(entry[1], unk_id))

    batch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    label = torch.tensor(target, dtype=torch.long, device=batch_device)
    context = torch.tensor(context, dtype=torch.long, device=batch_device)

    return label, context

In [None]:
#Reads Validation Data
def eval_file(path="nchlt_text.zu.valid"):
    """Read the validation text and wrap its trigrams in a DataLoader.

    Lines are accumulated (digits removed, surrounding whitespace stripped)
    until a line containing '.' is met. The accumulated chunk then has its
    punctuation removed, is lower-cased and split on single spaces, and every
    window of three consecutive words becomes one
    ([word_i, word_i+1], word_i+2) sample. Trigrams never span two chunks.

    Notes:
        * Splitting on " " keeps empty strings (from double or trailing
          spaces) as words; this is kept unchanged so the validation data is
          tokenised exactly like the training data.
        * Text after the last line containing '.' is not turned into samples.

    Args:
        path: file to read. Defaults to the validation file used by the
            notebook.

    Returns:
        DataLoader over the trigram list, unshuffled, using BATCH_SIZE and
        generate_batch as collate function.
    """
    #Use path for Google Colab "/content/gdrive/My Drive/nchlt_text.zu.valid"
    strip_punctuation = str.maketrans('', '', string.punctuation)
    trigrams = []
    batch = ""
    with open(path) as f:
        for line in f:
            #Removes Digits
            line = ''.join(ch for ch in line if not ch.isdigit())
            batch += line.strip() + " "
            #Keep accumulating until a line contains a full stop
            if '.' not in line:
                continue
            #Removes Punctuation
            words = batch.translate(strip_punctuation).lower().split(" ")
            for i in range(len(words) - 2):
                trigrams.append(([words[i], words[i + 1]], words[i + 2]))
            batch = ""
    return DataLoader(trigrams, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)



In [None]:
#Reads Training Data
#Shared state used by the later cells
total_accu = None
lines_num = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocabulary = {}
batch = ""
batch_counter = 0
#Use path for Google Colab "/content/gdrive/My Drive/nchlt_text.zu.train"
with open("nchlt_text.zu.train") as f:
    #lines_num ends up as the number of lines read (stays 0 for an empty file)
    for lines_num, raw_line in enumerate(f, start=1):
        #Removes Digits (after stripping the surrounding whitespace)
        line = ''.join(ch for ch in raw_line.strip() if not ch.isdigit())
        #Removes Punctuation
        line = line.translate(str.maketrans('', '', string.punctuation))
        words = line.lower().split(" ")
        #To create a token for unknown words we count how many times each word exists in the data
        for word in words:
            vocabulary[word] = vocabulary.get(word, 0) + 1
    #No explicit close needed: the with statement closes the file

#Creating tokens for each word that occurs more than once; all other words are treated as Unknown
tokens = {}
index = {}
c = 0
frequent_words = (word for word in vocabulary if vocabulary[word] > 1)
#Ids start at 1; c keeps the last id handed out (0 when no word qualifies)
for c, word in enumerate(frequent_words, start=1):
    tokens[word] = c
    index[c] = word
#The unknown-word token takes the next free id
tokens["UNK"] = c + 1
index[c + 1] = "UNK"

#Preprocessing data with window method
#Use path for Google Colab "/content/gdrive/My Drive/nchlt_text.zu.train"
with open("nchlt_text.zu.train") as f:
    batch_words_num = 0
    words = []
    trigrams = []
    total_loss = 0
    punctuation_table = str.maketrans('', '', string.punctuation)
    for line in f:
        #Removes Digits
        line = ''.join(ch for ch in line if not ch.isdigit())
        batch += line.strip() + " "
        #Lines are accumulated until one of them contains a full stop
        if '.' not in line:
            continue
        batch_counter += 1
        #Removes Punctuation
        batch = batch.translate(punctuation_table)
        allwords = batch.lower().split(" ")
        batch_words_num += len(allwords)
        #Words without a token are replaced by the unknown-word marker
        words = [word if word in tokens else "UNK" for word in allwords]
        #Every window of three consecutive words gives one (context, target) sample
        trigrams.extend(
            ([words[i], words[i + 1]], words[i + 2]) for i in range(len(words) - 2)
        )
        batch = ""
        words = []

In [None]:
#To create new Model
#Token ids run from 1 up to tokens["UNK"] == len(tokens) (id 0 is never assigned),
#so the embedding table and the output layer need exactly len(tokens) + 1 entries.
#len(vocabulary) also counts the words that were folded into "UNK": it is larger than
#needed in general and too small when fewer than two words occur only once.
model = NGramLanguageModeler(len(tokens) + 1, embedded_size, context_size).to(device)

In [None]:
#To load Old Model (run this cell INSTEAD of the "create new Model" cell; it replaces `model`)
#Use path for Google Colab "/content/gdrive/My Drive/model"
#map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine.
#NOTE: the file is a pickled model object; unpickling executes code, so only load files you created yourself.
#NOTE(review): recent PyTorch releases may need weights_only=False to load a full pickled model - confirm for your version.
model = torch.load("model", map_location=device)

In [None]:
#Initializing variables for the model
losses = []
batch_c = 0
best_model = None
best_val_loss = float("inf")
#Dataloader for training data
dataloader = DataLoader(
    trigrams,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=generate_batch,
)
#Loss Function
loss_function = nn.NLLLoss().cuda()
#Optimizer
optimizer = optim.SGD(model.parameters(), lr=LR)
#Scheduler: each scheduler.step() call multiplies the learning rate by gamma
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)
#Dataloader for validation data
dataloader_valid = eval_file()
#Start a fresh log file
#Use path for Google Colab "/content/gdrive/My Drive/log.txt"
with open("log.txt", "w") as file1:
    file1.write("Start Training")
print("Start Training")

In [None]:
#Start Training
#Each epoch: one pass over the training data with periodic log lines, then one pass over
#the validation data, a possible learning-rate decay, and a checkpoint of the model.
for epochnum in range(1, epoch + 1):
    model.train()
    #Restart the running loss here so the averages never include a previous epoch
    #or an interrupted earlier run of this cell
    total_loss = 0
    counter = 0
    totalcount = 0
    #Use path for Google Colab "/content/gdrive/My Drive/log.txt"
    #The with block closes the log even when training is interrupted
    with open("log.txt", "a") as file1:
        start = time.time()
        #Going through batches; the collate function returns (label, context) pairs
        for label, text in dataloader:
            optimizer.zero_grad()
            log_probs = model(text)
            loss = loss_function(log_probs, label)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            #losses keeps the running (cumulative) loss of the current epoch
            losses.append(total_loss)
            counter += 1
            #Prints out log file every BATCH_NUM size
            if counter == BATCH_NUM:
                time_taken = time.time() - start
                totalcount += counter
                cur_loss = total_loss / totalcount
                #Report the rate the optimizer really uses (it changes after scheduler.step()),
                #not the initial LR constant
                current_lr = optimizer.param_groups[0]["lr"]
                #Convert the elapsed seconds for BATCH_NUM batches to milliseconds per batch
                ms_per_batch = time_taken * 1000 / BATCH_NUM
                log_line = '| epoch {:3d} | {:5d}/{:5d} batches | ' 'lr {:02.2f} | ms/batches {:5.2f} | ' 'loss {:5.2f} | ppl {:8.2f}'.format(epochnum, totalcount, len(dataloader), current_lr, ms_per_batch, cur_loss, math.exp(cur_loss))
                #Write to file
                file1.write("\n")
                file1.write(log_line)
                #System Write
                print(log_line)
                counter = 0
                start = time.time()

        #Evaluating model with validation data
        model.eval()
        total_acc, total_count = 0, 0
        start1 = time.time()
        total_loss_eval = 0
        with torch.no_grad():
            #Going through validation data batches
            for label, text in dataloader_valid:
                #The output is already on the model's device; no extra .cuda() needed
                predicted_label = model(text)
                loss = loss_function(predicted_label, label)
                #NLLLoss returns the batch mean; weight it by the batch size so that the
                #division by total_count below gives a per-sample average
                total_loss_eval += loss.item() * label.size(0)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_count += label.size(0)
        #Creating logs from results
        #Time of the validation pass (measured from start1, which was previously unused)
        time_taken = time.time() - start1
        accu_val = total_acc / total_count
        total_loss_eval = total_loss_eval / total_count
        ppl = math.exp(total_loss_eval)
        print("End of Epoch "+str(epochnum))
        print("Evaluation for validation data : "+str(accu_val))
        print("Time : " +str(time_taken))
        print("loss : "+str(total_loss_eval))
        print("ppl : "+str(ppl))
        file1.write("\nEnd of Epoch "+str(epochnum))
        file1.write("\nEvaluation for validation data : "+str(accu_val))
        file1.write("\nTime : " +str(time_taken))
        file1.write("\nloss : "+str(total_loss_eval))
        file1.write("\nppl : "+str(ppl))

    #Changing Learning rate: decay when validation accuracy fell below the best seen so far,
    #otherwise remember the new best accuracy
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    #Use path for Google Colab "/content/gdrive/My Drive/model"
    torch.save(model, "model")


In [None]:
#Reading Test file and preprocessing
def test_file():
    #Use path for Google collab "/content/gdrive/My Drive/nchlt_text.zu.test"
    with open("nchlt_text.zu.test") as f:
      batch_words_num = 0
      words = []
      trigrams = []
      batch_counter = 0
      batch = ""
      for line in f:
          #Remove digits
          line = ''.join([i for i in line if not i.isdigit()])
          batch+=line.strip()+" "
          if '.' in line:
              batch_counter+=1
              #Remove punctuation
              batch = batch.translate(str.maketrans('', '', string.punctuation))
              allwords = batch.lower().split(" ")
              batch_words_num += len(allwords)
              for word in allwords:
                  words.append(word)
              for i in range(len(words) - 2):
                  trigrams.append(([words[i], words[i + 1]], words[i + 2]))
              batch = ""
              words=[]
              
    return DataLoader(trigrams, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)



In [None]:
#Evaluating model with test data
dataloader_test = test_file()
model.eval()
total_acc, total_count = 0, 0
total_loss_test = 0
start = time.time()
with torch.no_grad():
    #The collate function returns (label, context) pairs
    for label, text in dataloader_test:
        #The output is already on the model's device; no extra .cuda() needed
        predicted_label = model(text)
        loss = loss_function(predicted_label, label)
        #NLLLoss returns the batch mean; weight it by the batch size so that the
        #division by total_count below gives a per-sample average
        total_loss_test += loss.item() * label.size(0)
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
#Creating logs from results
time_taken = time.time() - start
accu_test = total_acc / total_count
total_loss_test = total_loss_test / total_count
ppl = math.exp(total_loss_test)
print("Evaluation for TEST data : "+str(accu_test))
print("Time : " +str(time_taken))
print("loss : "+str(total_loss_test))
print("ppl : "+str(ppl))
#Use path for Google Colab "/content/gdrive/My Drive/log.txt"
#The log is opened only for the writes, so the handle cannot leak if the evaluation fails
with open("log.txt", "a") as file1:
    file1.write("\nEvaluation for TEST data : "+str(accu_test))
    file1.write("\nTime : " +str(time_taken))
    file1.write("\nloss : "+str(total_loss_test))
    file1.write("\nppl : "+str(ppl))
#Use path for Google Colab "/content/gdrive/My Drive/model"
torch.save(model, "model")