In [1]:
import sys
import random
import copy
import pdb
import math
import numpy
import torch 

#Returns sequences of shuffled sentence and label
class SequenceBatcher(object):
  def __init__(self, device):
    self.device = device
    return

  def __call__(self, batch):
    x, y = zip(*batch)
    max_x_len = max(len(xx) for xx in x)
    
    x = torch.LongTensor([xx + [0] * (max_x_len - len(xx)) for xx in x])
    y = torch.FloatTensor([yy for yy in y])

    return x.to(self.device),y.to(self.device)

#Shuffles and returns dataset
class Shuffler(torch.utils.data.Dataset):
  def __init__(self, corpus, vocabs, neg_size, corpus_size, max_snt=0, unk='<unknown>', neg_rate=0.1, weight=True):
    self.corpus = corpus
    self.neg_size = neg_size
    self.neg_rate = neg_rate

    self.weight = weight

    num_grammatical_sentences = max_snt if (max_snt > 0) else corpus_size
    self.shuffle_index = [0] + [int(random.random() < self.neg_rate) * random.randrange(self.neg_size) for _ in range(1,num_grammatical_sentences)]
    self.temp_index = np.copy(self.shuffle_index)
    self.num_items = sum(self.shuffle_index) + sum([int(i==0) for i in self.shuffle_index])

    self.vocabs = vocabs
    self.unk = unk

    self.corpus_fp = open(corpus, mode="r", encoding="utf-8")

    self.grammatical_snt_idx = 0

    self.x = [] # contains next grammatical sequence
    self.tmp=[]
    self.j=0
    g=self.create_dataset()
    self.A=g[0]
    self.label=g[1]
    return

  def readline(self):
    return self.corpus_fp.readline().lower().split()

  def tok2id(self, tokens):
    for ii, token in enumerate(tokens):
      try:
        token_id = self.vocabs[token]
      except KeyError:
        token_id = self.vocabs[self.unk]
      tokens[ii] = token_id
    return tokens

  def __len__(self):
    return self.num_items

  def create_dataset(self):
        A=[]
        y=[]
        for i in range(self.num_items):
            if not self.temp_index[self.grammatical_snt_idx]:
                A.append(self.tok2id(self.readline()))
                y.append(1)
                self.grammatical_snt_idx += 1
            else:
                if self.temp_index[self.grammatical_snt_idx]==self.shuffle_index[self.grammatical_snt_idx]:
                    self.tmp=self.tok2id(self.readline())
                self.temp_index[self.grammatical_snt_idx] -= 1
                A.append(random.sample(self.tmp[:-1],len(self.tmp[:-1])))
                y.append(0)
                if self.temp_index[self.grammatical_snt_idx]==0:
                    self.grammatical_snt_idx += 1
                    
                
        return [A,y]
    
  def __getitem__(self, index):
        return self.A[index],self.label[index]
                
            



In [2]:
from scipy.spatial.distance import cosine
import numpy as np
from operator import itemgetter
import utils
import os
import lzma # to read xz files

import pickle

class WordVector :
  def __init__(self) :
    return

  def __init__(self, path, unk="<unk_vocab>", beg='<s>', end='</s>', vcb_list=None) :
    self.path = path 
    self.dim = -1
    self.unk = unk
    self.beg = beg
    self.end = end
    self.delim = " "
    self.dic_pickle_path = self.path + '.dic.pkl'
    self.mat_pickle_path = self.path + '.mat.pkl'
    self.dictionary = {}
    self.batch_sz = 100000
    self.mat = np.zeros(shape=(0,0))

    try :
      with open(self.dic_pickle_path, 'rb') as dic_handle, open(self.mat_pickle_path, 'rb') as mat_handle:
        print(("Loading from the existing pickle file {0}".format(path + '.{dic,mat}.pkl')))
        self.dictionary = pickle.load(dic_handle)
        self.mat = pickle.load(mat_handle)
        self.dim = self.mat.shape[1]
    except FileNotFoundError: 
      self.size = os.stat(path).st_size 
      print("HELOOOOO")
      self.load() 

    if not (self.unk in list(self.dictionary.keys())) : 
      print(("Warning: could not find the unknown token {0}. A zero vector will be used instead".format(self.unk)))
      self.update_item(self.unk, np.array([0]*self.dim)) 
    if not (self.beg in list(self.dictionary.keys())) : 
      print(("Warning: could not find the sentence beginning token {0}. A zero vector will be used instead".format(self.beg)))
      self.update_item(self.beg, np.array([0]*self.dim)) 
    if not (self.end in list(self.dictionary.keys())) : 
      print(("Warning: could not find the sentence ending token {0}. A zero vector will be used instead".format(self.end)))
      self.update_item(self.end, np.array([0]*self.dim)) 

    self.vocabs = list(self.dictionary.keys())

    if not os.path.isfile(self.dic_pickle_path):
      with open(self.dic_pickle_path, 'wb') as handle:
        pickle.dump(self.dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if not os.path.isfile(self.mat_pickle_path):
      with open(self.mat_pickle_path, 'wb') as handle:
        pickle.dump(self.mat, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if vcb_list is not None:
      vcb_intersect = set(self.vocabs).intersection(set(vcb_list))
      vcb_intersect = vcb_intersect.union(set([self.unk, self.beg, self.end]))
      mat = np.zeros((len(vcb_intersect), self.dim))
      dic = {}
      for i,w in enumerate(list(vcb_intersect)):
        dic[w] = i
        mat[i] = self[w]
      self.dictionary = dic
      self.mat = mat
      self.vocabs = list(vcb_intersect)



  def __getitem__(self, w): 
    if isinstance(w,int) :
      return self.mat[w]

    try :
      return self.mat[self.dictionary[w]]
    except KeyError :
      return self.mat[self.dictionary[self.unk]]

  def __len__(self):
    return len(self.dictionary)

  def __setitem__(self, w, val): 
    assert(len(val) == self.dim)
    self.mat[self.dictionary[w]] = np.array(val)


  def load(self) :
    verbose = 1
    if self.path.endswith('xz'):
      openner = lzma.open
      encoding = 'latin-1'
      verbose = 0 # does not show the bar correctly
    else:
      openner = open
      encoding = 'utf-8'
    with openner(self.path, 'rt', encoding=encoding) as fp :
      cnt = 0
      progress = 0

      line = fp.readline()
      tokens = line.split(self.delim) 
      if self.dim < 0:
        if len(tokens) == 2 :
          self.dim = int(tokens[1])
          line = fp.readline()
        else :
          self.dim = len(tokens)-1

      assert(self.dim > 0)
      
      self.mat.resize((0,self.dim), refcheck=False)

      while line :
        if not(cnt%self.batch_sz) :
          (r,c) = self.mat.shape
          self.mat.resize((r+self.batch_sz,c), refcheck=False)

        if verbose and not(cnt%1000):
          progress = fp.tell()*1.0/self.size
          #utils.update_progress(progress,"Loading word vectors", 40)

        tokens = line.rstrip().split(self.delim)
        assert(len(tokens) == self.dim + 1)
        self.dictionary[tokens[0]] = cnt 
        self.mat[cnt] = np.array([float(x) for x in tokens[1:]])

        line = fp.readline()
        cnt += 1

      if cnt < self.mat.shape[0] : self.mat.resize((cnt,self.dim), refcheck=False)

      #utils.update_progress(1,"Loading word vectors", 40)
    return  

  def cosine_dist(self, w1, w2) :
    return cosine(self.dictionary[w1],self.dictionary[w2])

  def update_item(self, word, vector) :
    assert(len(vector) == self.dim)
    try :
      idx = self.dictionary[word]
    except KeyError :
      (r,c) = self.mat.shape 
      self.dictionary[word] = r
      self.mat.resize((r+1,c))
      idx = r
    self.mat[idx] = vector

  def normalize(self):
    mean = np.mean(self.mat, axis=0).tolist()
    std = np.std(self.mat, axis=0).tolist()
    self.mat = (self.mat - mean)/std 

###########################################
class MultiWordVector:
  def __init__(self, word_vectors):
    self.word_vectors = word_vectors
    self.dictionary = {}
    self.dim = -1
    self.unk="<unk_vocab>"
    if self.word_vectors: self.dim = self.word_vectors[0].dim
    if self.word_vectors: self.unk = self.word_vectors[0].unk
    for wv in self.word_vectors:
      assert(self.dim == wv.dim)
      for w in wv.dictionary:
        try:
          self.dictionary[w].append(wv)
        except KeyError:
          self.dictionary[w] = [wv]
    pass

  def __getitem__(self, w):
    try:
      return self.dictionary[w][0][w]
    except KeyError:
      return self.word_vectors[0][w]
      


In [3]:
def load_embeddings(path, vocabs):
  wv = WordVector(path)
  embedding_matrix = numpy.zeros((len(vocabs), wv.dim))

  for w,idx in vocabs.items():
    embedding_matrix[idx] = wv[w]
  return torch.from_numpy(embedding_matrix.astype('float32'))



In [4]:
import glob
import os
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pdb
import sys
import pickle

from tqdm import tqdm

#from self_attention import SelfAttention


class SelfAttention(nn.Module):
  def __init__(self, input_dim, attention_dim, annotation_dim, dropout_rate=0.5, regulization_rate=1e-2):
    super().__init__()
    
    self.input_dim = input_dim
    self.att_dim = attention_dim
    self.ann_dim = annotation_dim
    self.regulization_rate = regulization_rate

    self.w1 = nn.Linear(self.input_dim, self.att_dim, bias=False)
    self.w2 = nn.Linear(self.att_dim, self.ann_dim, bias=False)
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, H): 
    #batch_size, sequence_length, feature_size = H.shape
    x = self.w1(H) #x.shape: (batch_size, sequence_length, attention_dim)
    x = self.dropout(x)
    x = self.w2(x) #x.shape: (batch_size, sequence_len, annotation_dim)
    x = self.dropout(x)
    
    A = F.softmax(x, dim=1) #A.shape: (batch_size, sequence_len, annotation_dim)
    M = torch.bmm(H.transpose(1,-1),A) #M.shape=(batch_size, features_size, annotation_dim)

    #AAT = torch.bmm(A,A.transpose(1,-1))
    #P = torch.norm(AAT - torch.eye(sequence_length))
    #P = self.regulization_rate * torch.norm(AAT - torch.eye(sequence_length))
    return M, A
 
class Encoder(nn.Module):
  def __init__(self, vocab_size, embedding_mat, padding_idx, num_lstm_layers=2, bidirectional=True):
    super().__init__()

    embedding_dim   = embedding_mat.shape[1]    
    #self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx)#, _weight=embedding_mat)

    self.embedding_layer = nn.Embedding.from_pretrained(embedding_mat,padding_idx=0, freeze=True)
    self.embedding_generalizer = nn.Linear(embedding_dim,embedding_dim)

    self.lstm_output_size = 2*embedding_dim
    self.LSTM = nn.LSTM(embedding_dim, self.lstm_output_size, num_layers=num_lstm_layers, batch_first=True, bidirectional=bidirectional)

    #self.n_output_dim = lstm_output_size * num_lstm_layers * (1 + bidirectional)
 
  def forward(self, x):
    emb = self.embedding_layer(x) #Input
    gemb = self.embedding_generalizer(emb)
    output, (hn, cn) = self.LSTM(emb + gemb)
    hn = torch.cat([h for h in hn], dim=-1) # (num_layers * num_directions, batch, hidden_size) -> (batch, hidden_size * num_layers * num_directions)
    return output, hn

class Classifier(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()

    self.classifier = nn.Linear(input_dim, output_dim)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    return self.activation(self.classifier(x))

class SentenceClassifier(nn.Module):
  def __init__(self, encoder, classifier, attention=None,modelpath=None):
    super().__init__()

    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.sequence_batcher = SequenceBatcher(self.device)

    self.encoder = encoder.to(self.device)
    self.attention = attention.to(self.device) if attention else None
    self.flatten = torch.nn.Flatten()
    self.classifier = classifier.to(self.device)
    self.modelpath = modelpath

  def forward(self, x):
    batch_size, sequence_length = x.shape
    enc_output, enc_hn = self.encoder(x)
    if self.attention:
      m_att, a_att = self.attention(enc_output)
      m_att = self.flatten(m_att)
      enc = torch.cat([enc_hn, m_att], dim=-1)
      #AAT = torch.bmm(a_att,a_att.transpose(1,-1))
      p_att = 0 #1e-2 * torch.norm(AAT - torch.eye(sequence_length).to(self.device))
    else:
      enc = enc_hn
      p_att = 0

    return self.classifier(enc), p_att

  def fit(self, train_generator, val_generator, n_epochs=1, lr=1e-2):
    optimizer = optim.Adam(self.parameters(), lr=lr)
    training_generator = torch.utils.data.DataLoader(train_generator, batch_size=64, shuffle=True, num_workers=0, collate_fn=self.sequence_batcher)
    validation_generator = torch.utils.data.DataLoader(val_generator, batch_size=64, shuffle=True, num_workers=0, collate_fn=self.sequence_batcher)

    BCELoss = nn.BCELoss()

    best_val_acc = 0
    best_epoch_index = 0
    early_stop_cnt = 5
    for e in range(n_epochs):
      losses = []
      accs = []
      self.train()
      with tqdm(total=len(train_generator)) as pbar:
        pbar.set_description(f'Training Epoch {e+1}')
        for i, (bx, by) in enumerate(training_generator):

          optimizer.zero_grad()

          output, p_att_loss = self.forward(bx)
          output = output.squeeze(-1)
          loss = BCELoss(output, by) + p_att_loss

          loss.backward()
          optimizer.step()

          losses.append(loss.item())
          acc = ((output > 0.5).float() == by).float().mean()
          accs.append(acc.item())
          pbar.set_postfix(acc=(sum(accs)/len(accs)), loss=(sum(losses) / len(losses)))
          pbar.update(len(bx))

      if val_generator and e % 1 == 0:
        val_losses = []
        val_accs = []
        self.eval()
        with tqdm(total=len(val_generator)) as pbar:
          pbar.set_description(f'Validation Epoch {e+1}')

          for i, (bx, by) in enumerate(validation_generator):
            with torch.no_grad():
              output, p_att_loss = self.forward(bx)
            output = output.squeeze(-1)
            loss = BCELoss(output, by) + p_att_loss

            val_losses.append(loss.item())
            
            acc = ((output > 0.5).float() == by).float().mean()
            val_accs.append(acc.item())
            pbar.set_postfix(acc=(sum(val_accs) / len(val_accs)), loss=(sum(val_losses) / len(val_losses)))
            pbar.update(len(bx))
          
          val_acc = sum(val_accs) / len(val_accs)
          if val_acc > 0.99:
            torch.save(self.state_dict(), self.modelpath)
            break

          if val_acc > best_val_acc:
            torch.save(self.state_dict(), self.modelpath)
            best_epoch_index = e
            best_val_acc = val_acc
          
          if e + 1 - best_epoch_index >= early_stop_cnt:
            break

In [5]:
#From training building a vocab set
def training_language(lang,emb):
    train_corpus_path = "D:\\data\\raw_data\\"+lang+".raw.trn"
    val_corpus_path = "D:\\data\\raw_data\\"+lang+".raw.val"
    vocabs_path = "D:\\data\\raw_data\\"+lang+".raw.trn.vcb"
    neg_size    = 10
    max_vocab   = 1000000
    epochs      = 8
    histpath    = "D:\\data\\raw_data\\"+lang+"_raw.hist.pkl"
    modelpath   = "D:\\data\\raw_data\\lstm_with_attention\\"+lang+"_raw.model"
    vocabs_dict = "D:\\data\\raw_data\\"+lang+"_raw.vcb.pkl"
    unk         = '<unknwon>'
    embpath     = "D:\\data\\raw_data\\word_vectors\\wiki."+emb+".align.vec"


    vfreqs = {}
    
    if os.path.isfile(modelpath):
        print("Already trained :) ", lang)
        return    

    try:
        with open(vocabs_path, mode="r", encoding="utf-8") as fp:
            for line in fp:
                (word,frq) = line.strip().split('\t')
                vfreqs[word] = int(frq)
        corpus_size = vfreqs['<s>']
    except IOError:
        print("No input file")
        return

    vfreqs = {}
    with open(vocabs_path, mode="r", encoding="utf-8") as fp:
        for line in fp:
            (word,frq) = line.strip().split('\t')
            vfreqs[word] = int(frq)
    corpus_size = vfreqs['<s>']

    vocabs = [k for k, v in sorted(vfreqs.items(), key=lambda item: item[1])]
    max_vocabs = min([max_vocab, len(vocabs)])
    vocabs = vocabs[::-1][0:max_vocab-1]
    # 0 is reserved for masking
    vocabs = {k:i for i,k in enumerate(vocabs,2)}
    vocabs[unk] = 1
    vocabs['<PAD>'] = 0
    with open(vocabs_dict, 'wb') as fp:
        pickle.dump(vocabs, fp, protocol=2)

    emb_mat = load_embeddings(embpath, vocabs)

    train_gen = Shuffler(train_corpus_path, vocabs, neg_size, corpus_size, unk=unk, weight=False, max_snt=0)
    val_gen = Shuffler(val_corpus_path, vocabs, neg_size, corpus_size, unk=unk, weight=False, max_snt=0)

    print("train size: {0}".format(len(train_gen)))
    print("validation size: {0}".format(len(val_gen)))

    bidirectional=True
    num_lstm_layers=1
    encoder = Encoder(len(vocabs), emb_mat, 0, bidirectional=bidirectional, num_lstm_layers=num_lstm_layers)

    att_input_dim = (int(bidirectional)+1)*encoder.lstm_output_size
    att_dim = 10
    ann_dim = 1
    attention = SelfAttention(att_input_dim, att_dim, ann_dim)

    cls_input_dim = encoder.lstm_output_size * num_lstm_layers * (int(bidirectional)+1) + (att_input_dim*ann_dim if attention else 0)
    classifier = Classifier(cls_input_dim, 1)

    snt_classifier = SentenceClassifier(encoder, classifier, attention,modelpath)
    snt_classifier.fit(train_gen, val_gen, lr=0.01, n_epochs=epochs)

    print('saving model in ', modelpath)
    torch.save(snt_classifier.state_dict(), modelpath)



In [None]:
import os.path
import gc
import sys
import random
import copy
import pdb
import math
import numpy
import torch


#embedding_dict={"Afrikaans":"af","Arabic":"ar","Bulgarian":"bg","Bengali":"bn","Bosnian":"bs","Catalan":"ca","Czech":"cs","Danish":"da","German":"de","English":"en","Spanish":"es","Estonian":"et","Persian":"fa","Finnish":"fi","French":"fr","Hebrew":"he","Hindi":"hi","Croatian":"hr","Hungarian":"hu","Indonesian":"id","Italian":"it","Korean":"ko","Latvian":"lv","Macedonian":"mk","Malay":"ms","Dutch":"nl","Polish":"pl","Portuguese":"pt","Romanian":"ro","Russian":"ru","Slovak":"sk","Slovenian":"sl","Albanian":"sq","Swedish":"sv","Thai":"th","Tagalog":"tl","Turkish":"tr","Ukrainian":"uk","Vietnamese":"vi"}
embedding_dict = {"Swedish":"sv"}
j=0
for i in embedding_dict.keys():
    gc.collect()
    torch.cuda.empty_cache()
    training_language(i,embedding_dict[i])
    j+=1
print(j)

Loading from the existing pickle file D:\data\raw_data\word_vectors\wiki.sv.align.vec.{dic,mat}.pkl
train size: 1361270
validation size: 1358522


Training Epoch 1: 100%|██████████████████████████████| 1361270/1361270 [21:16<00:00, 1066.64it/s, acc=0.85, loss=0.311]
Validation Epoch 1: 100%|███████████████████████████| 1358522/1358522 [08:08<00:00, 2782.00it/s, acc=0.903, loss=0.214]
Training Epoch 2: 100%|██████████████████████████████| 1361270/1361270 [27:06<00:00, 836.95it/s, acc=0.866, loss=0.285]
Validation Epoch 2: 100%|███████████████████████████| 1358522/1358522 [07:32<00:00, 3004.81it/s, acc=0.874, loss=0.268]
Training Epoch 3:  30%|█████████                     | 408384/1361270 [06:08<14:30, 1094.69it/s, acc=0.861, loss=0.294]

In [6]:
#Testing on native data
import sys
import random
import copy
import pdb
import math
import numpy
import torch 

vocabs_path="D:\\data\\raw_data\\Danish.raw.trn.vcb"
embpath="D:\\data\\raw_data\\word_vectors\\wiki.da.align.vec"
neg_size    = 10
max_vocab   = 1000000
unk         = '<unknwon>'
vfreqs = {}
vocabs_dict = "D:\\data\\raw_data\\Danish_raw.vcb.pkl"


try:
    with open(vocabs_path, mode="r", encoding="utf-8") as fp:
        for line in fp:
            (word,frq) = line.strip().split('\t')
            vfreqs[word] = int(frq)
    corpus_size = vfreqs['<s>']
except IOError:
    print("No input file")


vocabs = [k for k, v in sorted(vfreqs.items(), key=lambda item: item[1])] 
max_vocabs = min([max_vocab, len(vocabs)])
vocabs = vocabs[::-1][0:max_vocab-1]
# 0 is reserved for masking
vocabs = {k:i for i,k in enumerate(vocabs,2)}
vocabs['<unknwon>'] = 1
vocabs['<PAD>'] = 0
with open(vocabs_dict, 'wb') as fp:
    pickle.dump(vocabs, fp, protocol=2)

emb_mat = load_embeddings(embpath, vocabs)  


bidirectional=True
num_lstm_layers=1
encoder = Encoder(len(vocabs), emb_mat, 0, bidirectional=bidirectional, num_lstm_layers=num_lstm_layers)

att_input_dim = (int(bidirectional)+1)*encoder.lstm_output_size
att_dim = 10
ann_dim = 1
attention = SelfAttention(att_input_dim, att_dim, ann_dim)

cls_input_dim = encoder.lstm_output_size * num_lstm_layers * (int(bidirectional)+1) + (att_input_dim*ann_dim if attention else 0)
classifier = Classifier(cls_input_dim, 1)

the_model = SentenceClassifier(encoder, classifier, attention,None)

modelpath="D:\\data\\raw_data\\lstm_with_attention\\Danish_raw.model"
the_model.load_state_dict(torch.load(modelpath),strict=False)

Loading from the existing pickle file D:\data\raw_data\word_vectors\wiki.da.align.vec.{dic,mat}.pkl


<All keys matched successfully>

In [7]:
#Testing on native data
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

data_path="D:\\data\\raw_data\\Danish.raw.tst"
data = Shuffler(data_path, vocabs, neg_size, corpus_size, unk=unk, weight=False, max_snt=0)
data_new = torch.utils.data.DataLoader(data, batch_size=64, shuffle=True, num_workers=0, collate_fn=SequenceBatcher('cuda'))
the_model.eval()
BCELoss=nn.BCELoss()
y_actual=np.array([])
y_pred_prob=np.array([])
for i, (bx, by) in enumerate(data_new):
     with torch.no_grad():
              output,p_att_loss = the_model.forward(bx)
     output = output.squeeze(-1)
     y_actual=np.append(y_actual,by.cpu().numpy())
     y_pred_prob=np.append(y_pred_prob,output.cpu().numpy())
    
y_pred=[1 if i>0.5 else 0 for i in y_pred_prob ]
y_pred=np.array(y_pred)
print(accuracy_score(y_actual,y_pred))
print(precision_score(y_actual,y_pred))
print(recall_score(y_actual,y_pred))
print(f1_score(y_actual,y_pred))
print(BCELoss(torch.tensor(y_pred_prob),torch.tensor(y_actual)))
print(sum(y_actual)/len(y_actual))


0.8263370978052419
0.8385050003620536
0.9166912217433372
0.875856686037961
tensor(0.3945, dtype=torch.float64)
0.6682880592518571


In [6]:
#Cross testing
import sys
import random
import copy
import pdb
import math
import numpy
import torch 

vocabs_path="D:\\data\\raw_data\\Hindi.raw.trn.vcb"
embpath="D:\\data\\raw_data\\word_vectors\\wiki.hi.align.vec"
neg_size    = 10
max_vocab   = 1000000
unk         = '<unknwon>'
vfreqs = {}
vocabs_dict = "D:\\data\\raw_data\\Hindi_raw.vcb.pkl"


try:
    with open(vocabs_path, mode="r", encoding="utf-8") as fp:
        for line in fp:
            (word,frq) = line.strip().split('\t')
            vfreqs[word] = int(frq)
    corpus_size = vfreqs['<s>']
except IOError:
    print("No input file")


vocabs = [k for k, v in sorted(vfreqs.items(), key=lambda item: item[1])] 
max_vocabs = min([max_vocab, len(vocabs)])
vocabs = vocabs[::-1][0:max_vocab-1]
# 0 is reserved for masking
vocabs = {k:i for i,k in enumerate(vocabs,2)}
vocabs['<unknwon>'] = 1
vocabs['<PAD>'] = 0
with open(vocabs_dict, 'wb') as fp:
    pickle.dump(vocabs, fp, protocol=2)

emb_mat = load_embeddings(embpath, vocabs)  


bidirectional=True
num_lstm_layers=1
encoder = Encoder(len(vocabs), emb_mat, 0, bidirectional=bidirectional, num_lstm_layers=num_lstm_layers)

att_input_dim = (int(bidirectional)+1)*encoder.lstm_output_size
att_dim = 10
ann_dim = 1
attention = SelfAttention(att_input_dim, att_dim, ann_dim)

cls_input_dim = encoder.lstm_output_size * num_lstm_layers * (int(bidirectional)+1) + (att_input_dim*ann_dim if attention else 0)
classifier = Classifier(cls_input_dim, 1)



Loading from the existing pickle file D:\data\raw_data\word_vectors\wiki.hi.align.vec.{dic,mat}.pkl


In [11]:
#Cross testing
the_model = SentenceClassifier(encoder, classifier, attention,None)
modelpath="D:\\data\\raw_data\\lstm_with_attention\\Danish_raw.model"

m1=torch.load(modelpath)
own_state=the_model.state_dict()
for name, param in m1.items():
    if name=='encoder.embedding_layer.weight':
        continue
    else:
        param = param.data
        own_state[name].data.copy_(param)
del m1

In [12]:
#Cross testing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

data_path="D:\\data\\raw_data\\Hindi.raw.trn"
data = Shuffler(data_path, vocabs, neg_size, corpus_size, unk=unk, weight=False, max_snt=0)
data_new = torch.utils.data.DataLoader(data, batch_size=64, shuffle=True, num_workers=0, collate_fn=SequenceBatcher('cuda'))
the_model.eval()
BCELoss=nn.BCELoss()
y_actual=np.array([])
y_pred_prob=np.array([])
for i, (bx, by) in enumerate(data_new):
     with torch.no_grad():
              output,p_att_loss = the_model.forward(bx)
     output = output.squeeze(-1)
     y_actual=np.append(y_actual,by.cpu().numpy())
     y_pred_prob=np.append(y_pred_prob,output.cpu().numpy())
    
y_pred=[1 if i>0.5 else 0 for i in y_pred_prob]
y_pred=np.array(y_pred)
print(accuracy_score(y_actual,y_pred))
print(precision_score(y_actual,y_pred))
print(recall_score(y_actual,y_pred))
print(f1_score(y_actual,y_pred))
print(BCELoss(torch.tensor(y_pred_prob),torch.tensor(y_actual)))
print(sum(y_actual)/len(y_actual))



0.6514623335212744
0.7015306522836993
0.8349221941662656
0.762436039391557
tensor(0.6945, dtype=torch.float64)
0.6698793300662014
