In [1]:
# pip install bert-embedding

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import tensor
from torch.utils.data import DataLoader
import numpy

### Working with Bert

In [3]:
from bert_embedding import BertEmbedding

  from ._conv import register_converters as _register_converters


In [4]:
# Sample sentence kept around for ad-hoc experiments.
sentence="I will come tomorrow"
# Module-level embedder; seq2seq_collate_bert below calls it for every sample.
bert_embedding = BertEmbedding()

In [5]:
def bert_to_tensor(result,nonzero=False):
    """Stack the first embedding vector of each entry in `result` into a 2-D tensor.

    Args:
        result: sequence of pairs; only element ``[1]`` of each pair (a sequence of
            embedding vectors) is read, and only its first vector is used.
            Presumably the output of calling ``BertEmbedding`` -- TODO confirm.
        nonzero: when False (default) the tensor has one row per non-empty
            embedding list. When True it has one row per entry of `result`;
            rows that are not filled stay zero.

    Returns:
        Tensor of shape ``(rows, dim)``. ``dim`` is taken from the first available
        vector and falls back to 768 when there is none. Filled rows are packed
        at the front in input order.
    """
    default_dim = 768
    pairs = list(result)
    if not pairs:
        # Nothing to embed: return an empty tensor instead of raising IndexError.
        return torch.zeros(0, default_dim)

    embeds = [pair[1] for pair in pairs]
    # Count entries that actually carry a vector. The previous np.nonzero call
    # counted non-zero scalars, which over-allocated (or failed on ragged input).
    filled = [k for k in embeds if len(k) > 0]
    dim = len(filled[0][0]) if filled else default_dim
    rows = len(embeds) if nonzero else len(filled)

    # zeros, not empty: rows without an embedding must not expose uninitialised memory.
    tens = torch.zeros(rows, dim)
    for idx, k in enumerate(filled):
        tens[idx] = tensor(k[0])
    return tens

### Data Preparation

In [6]:
from pickle import load
from pickle import dump
from collections import Counter

# load a clean dataset
def load_clean_sentences(filename):
    """Unpickle and return the object stored in `filename`.

    NOTE(security): unpickling can execute arbitrary code, so only open files
    written by this notebook or another trusted source.
    """
    # The context manager closes the handle deterministically instead of leaking it.
    with open(filename, 'rb') as handle:
        return load(handle)

def to_vocab(lines):
    """Count how often each whitespace-separated token occurs across `lines`."""
    return Counter(token for line in lines for token in line.split())


def save_clean_sentences(sentences, filename):
    """Pickle `sentences` to `filename` and print a confirmation line."""
    # Closing via the context manager guarantees the data is flushed to disk
    # before the confirmation is printed.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)


### Subset Dataset

In [None]:
# Load the pickled corpora. Despite the "vocab" file names, both objects are used
# below as parallel, position-aligned sequences of sentence strings.
# NOTE(security): unpickling runs arbitrary code -- only load trusted files.
en_lines = load_clean_sentences('english_vocab.pkl')
fr_lines = load_clean_sentences('french_vocab.pkl')

In [None]:
# Keep only the aligned pairs whose French and English sides are both at most
# 150 characters long.
en_subset_sentences, fr_subset_sentences = [], []
for idx, fr_sentence in enumerate(fr_lines):
    if len(fr_sentence) > 150 or len(en_lines[idx]) > 150:
        continue
    en_subset_sentences.append(en_lines[idx])
    fr_subset_sentences.append(fr_sentence)

In [None]:
# Both lists are appended to in lockstep, so the two counts should match.
len(en_subset_sentences),len(fr_subset_sentences)

In [None]:
# Number of distinct whitespace-separated tokens on each side of the subset.
len(to_vocab(en_subset_sentences)),len(to_vocab(fr_subset_sentences))

In [None]:
# Longest French sentence in the subset, measured in characters (key=len on strings).
max(fr_subset_sentences,key=len)

In [None]:
# Build the French token <-> index lookup tables.
# Index 0 is reserved for padding and index 1 for unknown tokens; corpus tokens
# are numbered from 2 in vocabulary iteration order.
# `fr_vocab` was not assigned by any earlier cell, so derive it here; otherwise
# this cell raises NameError on a fresh kernel.
fr_vocab = to_vocab(fr_subset_sentences)
fr_vocab_to_idx = {'PAD': 0, 'unk': 1}
fr_idx_to_vocab = {0: 'PAD', 1: 'unk'}
for idx, word in enumerate(fr_vocab, start=2):
    fr_vocab_to_idx[word] = idx
    fr_idx_to_vocab[idx] = word
    

In [None]:
# Build the English token <-> index lookup tables.
# Index 0 is reserved for padding and index 1 for unknown tokens; corpus tokens
# are numbered from 2 in vocabulary iteration order.
# `en_vocab` was not assigned by any earlier cell, so derive it here; otherwise
# this cell raises NameError on a fresh kernel.
en_vocab = to_vocab(en_subset_sentences)
en_vocab_to_idx = {'PAD': 0, 'unk': 1}
en_idx_to_vocab = {0: 'PAD', 1: 'unk'}
for idx, word in enumerate(en_vocab, start=2):
    en_vocab_to_idx[word] = idx
    en_idx_to_vocab[idx] = word

In [None]:
# Encode every English sentence as the list of its tokens' vocabulary indices.
en_sentences_tokens = [
    [en_vocab_to_idx[word] for word in sentence.split()]
    for sentence in en_subset_sentences
]

In [None]:
# Encode every French sentence as the list of its tokens' vocabulary indices.
fr_sentences_tokens = [
    [fr_vocab_to_idx[word] for word in sentence.split()]
    for sentence in fr_subset_sentences
]

In [None]:
# Number of encoded French sentences.
len(fr_sentences_tokens)

In [None]:
# Inspect one encoded French sentence (a list of vocabulary indices).
fr_sentences_tokens[1]

In [None]:
# Inspect the encoded English sentence at the same position.
en_sentences_tokens[1]

### Train set and valid set

In [None]:
# Reproducible 80/20 split of sentence positions into training and validation.
np.random.seed(42)
idxs = np.random.permutation(len(en_subset_sentences))
border = int(0.8 * len(idxs))
train_idxs, valid_idxs = np.split(idxs, [border])

In [None]:
# Materialise the split: raw English sentences are the inputs and encoded
# French sentences are the targets, selected by the shuffled positions.
en_subset_sentences_train = [en_subset_sentences[idx] for idx in train_idxs]
fr_sentences_tokens_train = [fr_sentences_tokens[idx] for idx in train_idxs]
en_subset_sentences_valid = [en_subset_sentences[idx] for idx in valid_idxs]
fr_sentences_tokens_valid = [fr_sentences_tokens[idx] for idx in valid_idxs]

In [None]:
# Train + validation sizes; should add up to the number of subset sentences.
len(en_subset_sentences_train)+len(en_subset_sentences_valid)

In [None]:
# Peek at the first English training sentence.
en_subset_sentences_train[0]

In [None]:
# Persist the four splits so later sessions can skip the preparation cells.
for split_data, split_path in (
    (en_subset_sentences_train, "en_train.pkl"),
    (en_subset_sentences_valid, "en_valid.pkl"),
    (fr_sentences_tokens_train, "fr_train.pkl"),
    (fr_sentences_tokens_valid, "fr_valid.pkl"),
):
    save_clean_sentences(split_data, split_path)


### Load Train and Valid sets

In [7]:
# Reload the four splits written by the data-preparation section.
(
    en_subset_sentences_train,
    en_subset_sentences_valid,
    fr_sentences_tokens_train,
    fr_sentences_tokens_valid,
) = [
    load_clean_sentences(split_path)
    for split_path in ("en_train.pkl", "en_valid.pkl", "fr_train.pkl", "fr_valid.pkl")
]

### Dataset and Data Loader

In [8]:
from torch import LongTensor

In [9]:
class Dataset:
    """Index-addressable pairing of source sentences (`x`) with targets (`y`)."""

    def __init__(self, x,y):
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the source side."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(source with every space removed, target)`` for position `idx`."""
        source = self.x[idx]
        return source.replace(" ", ""), self.y[idx]


def collatebatch(batch):
    """Zero-pad a batch of ``(sequence, tags, length)`` samples to a common width.

    Returns two long tensors of shape ``(len(batch), max(length))``: the padded
    sequences and the padded tags. Each row holds the first `length` items of
    its sample, followed by zeros.
    """
    seqs, tags, lengths = zip(*batch)
    shape = (len(batch), max(lengths))

    padded_seqs = torch.zeros(shape, dtype=torch.long)
    padded_tags = torch.zeros(shape, dtype=torch.long)
    for row, (seq, tag, length) in enumerate(zip(seqs, tags, lengths)):
        padded_seqs[row, :length] = tensor(seq[:length])
        padded_tags[row, :length] = tensor(tag[:length])
    return padded_seqs, padded_tags

def seq2seq_collate(samples,pad_first=False,backwards=False,pad_idx=0):
    """Pad a batch of ``(x_ids, y_ids)`` samples into two long tensors.

    Args:
        samples: sequence of samples; ``s[0]`` and ``s[1]`` are the source and
            target id sequences.
        pad_first: put the padding before the sequence instead of after it.
        backwards: reverse every row along the sequence axis. Padding is placed
            on the opposite side first, so after the flip it ends up on the
            side requested by `pad_first`.
        pad_idx: value used for padding positions (default 0, the previous
            hard-coded value).

    Returns:
        ``(res_x, res_y)`` with shapes ``(len(samples), max_len_x)`` and
        ``(len(samples), max_len_y)``.
    """
    max_len_x = max(len(s[0]) for s in samples)
    max_len_y = max(len(s[1]) for s in samples)
    res_x = torch.full((len(samples), max_len_x), pad_idx, dtype=torch.long)
    res_y = torch.full((len(samples), max_len_y), pad_idx, dtype=torch.long)
    if backwards: pad_first = not pad_first

    def _place(row, seq):
        # Write `seq` into `row` (a view into the result tensor) on the right side.
        n = len(seq)
        if n == 0:
            # An empty sequence leaves the row as pure padding. Without this
            # guard, `row[-0:]` would select the whole row and the assignment
            # would fail with a size mismatch.
            return
        if pad_first:
            row[-n:] = LongTensor(seq)
        else:
            row[:n] = LongTensor(seq)

    for i, s in enumerate(samples):
        _place(res_x[i], s[0])
        _place(res_y[i], s[1])
    if backwards: res_x,res_y = res_x.flip(1),res_y.flip(1)
    return res_x,res_y

def seq2seq_collate_bert(samples,pad_first=False,backwards=False):
    """Collate ``(source, target_ids)`` samples, embedding each source with BERT.

    Each source ``s[0]`` is passed to the module-level ``bert_embedding`` and
    converted with ``bert_to_tensor(..., True)``. The result fills the first
    ``len(s[0])`` rows of that sample's slice; remaining rows stay zero.
    Targets are right-padded with zeros.

    `pad_first` and `backwards` are accepted but not used by this function.

    Returns:
        ``(res_x, res_y)``: a float tensor of shape
        ``(len(samples), max_len_x, 768)`` and a long tensor of shape
        ``(len(samples), max_len_y)``.
    """
    batch = len(samples)
    max_len_x = max(len(s[0]) for s in samples)
    max_len_y = max(len(s[1]) for s in samples)
    res_x = torch.zeros(batch, max_len_x, 768, dtype=torch.float32)
    res_y = torch.zeros(batch, max_len_y, dtype=torch.long)
    for row, sample in enumerate(samples):
        source, target = sample[0], sample[1]
        embedded = bert_to_tensor(bert_embedding(source), True)
        res_x[row, :len(source)] = embedded
        res_y[row, :len(target)] = LongTensor(target)
    return res_x, res_y


In [10]:
# vec=bert_to_tensor(bert_embedding(en_subset_sentences[0].replace(" ","")),True)
# vec.shape

In [11]:
def get_dls(dataset,collate_fn, bs, num_workers=1, shuffle=False):
    """Wrap `dataset` in a ``DataLoader``.

    Args:
        dataset: object to iterate in batches.
        collate_fn: callable that turns a list of samples into one batch.
        bs: batch size.
        num_workers: number of loader worker processes (default 1, the previous
            hard-coded value). Pass 0 to collate in the main process, e.g. when
            `collate_fn` depends on state that should not be copied to a worker.
        shuffle: reshuffle the data every epoch (default False, matching the
            previous behaviour).

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, bs, shuffle=shuffle, collate_fn=collate_fn,
                      num_workers=num_workers)

In [12]:
# Inputs are raw English sentences; targets are the encoded French sentences.
train_dataset=Dataset(en_subset_sentences_train,fr_sentences_tokens_train)
valid_dataset=Dataset(en_subset_sentences_valid,fr_sentences_tokens_valid)

In [13]:
# Batch size 2 for training and 8 for validation; both loaders compute the BERT
# embeddings inside the collate function.
train_dl=get_dls(train_dataset,seq2seq_collate_bert,2)
valid_dl=get_dls(valid_dataset,seq2seq_collate_bert,8)

In [14]:
# Pull a single (inputs, targets) batch to smoke-test the pipeline.
tens1=next(iter(train_dl))


### Model 

In [15]:
# The longest French sentence has 36 words,
# so the final output layer should be of size (58802, 36).

In [16]:
def conv3x3(in_planes, out_planes, stride=1, groups=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, groups=groups, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 ``nn.Conv2d`` (no padding)."""
    conv_kwargs = dict(kernel_size=1, stride=stride, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

In [17]:
class BasicBlock(nn.Module):
    """Residual block: two 3x3 convolutions with normalisation plus a skip connection.

    The input (optionally passed through `downsample`) is added to the output
    of the second normalisation layer before the final ReLU.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, norm_layer=None):
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        unsupported = groups != 1 or base_width != 64
        if unsupported:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        # When stride != 1, conv1 and the downsample layer both reduce the input.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = make_norm(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = make_norm(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply conv-norm-relu, conv-norm, add the shortcut, then relu."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The shortcut is the raw input unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)


In [18]:
# Residual block (1 -> 64 channels), a 3x3 convolution up to 512 channels, then
# an adaptive max-pool to a fixed (1, 36, 58802) output.
# Presumably 36 is the longest French sentence and 58802 the French vocabulary
# size -- TODO confirm.
basemodel3 = nn.Sequential(
    BasicBlock(1, 64),
    nn.Conv2d(64, 512, 3, padding=1),
    nn.AdaptiveMaxPool3d((1, 36, 58802)),
)

In [20]:
# Add a channel axis at dim 1 and run the sample batch through the model to
# inspect the output shape.
vec=basemodel3(tens1[0].unsqueeze(1))
vec.shape

torch.Size([2, 1, 36, 58802])

In [21]:
# Collapse the model output to (batch, 36, classes). The batch size is read
# from the tensor instead of being hard-coded to 2, so this cell keeps working
# if the loader's batch size changes (the training loop does the same).
vec=vec.reshape(vec.shape[0],36,-1)

In [22]:
# Confirm the shape after the reshape.
vec.shape

torch.Size([2, 36, 58802])

In [23]:
def seq2seq_loss(input, target):
    """Cross-entropy between batch-first logits and target ids.

    Args:
        input: tensor of shape ``(batch, seq_len_in, n_classes)``.
        target: integer tensor of shape ``(batch, seq_len)``.

    `input` is zero-padded along the sequence axis when it is shorter than
    `target`, and truncated when it is longer, so both cover ``seq_len``
    positions before the loss is averaged over all of them.

    Returns:
        Scalar loss tensor.
    """
    bs,sl = target.size()
    bs_in,sl_in,nc = input.size()
    if sl > sl_in:
        # F.pad takes (left, right) pairs starting from the LAST dimension:
        # first the class axis (untouched), then the sequence axis. The former
        # six-element tuple padded the batch axis instead, which made the
        # cross-entropy call fail with a batch-size mismatch.
        input = F.pad(input, (0, 0, 0, sl - sl_in))
    input = input[:,:sl]
    return F.cross_entropy(input.reshape(-1, nc), target.reshape(-1))

In [24]:
# Loss of the untrained model on the sample batch.
seq2seq_loss(vec,tens1[1])

tensor(11.4495, grad_fn=<NllLossBackward>)

### Training loop

In [21]:
# Adam over all model parameters, using the optimizer's default hyperparameters.
opt=torch.optim.Adam(basemodel3.parameters())

In [22]:
# One pass over the training loader, printing the loss of every batch.
for epoch in range(1):
    for x, y in train_dl:
        # `logits` rather than `input`: rebinding the builtin `input` at notebook
        # scope would break any later call to input().
        logits = basemodel3(x.unsqueeze(1))
        # Drop the pooled channel axis. The sequence length is read from the
        # model output instead of repeating the literal 36 from the pooling layer.
        logits = logits.reshape(x.shape[0], logits.shape[-2], -1)
        loss = seq2seq_loss(logits, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
        print(loss.item())
    

11.426058769226074
11.315314292907715
11.137598991394043
11.039987564086914
10.876242637634277
10.79249095916748
10.739994049072266
10.74013900756836
10.536087989807129
10.50688648223877
10.301801681518555
10.40124797821045
10.206629753112793
10.119711875915527
10.112310409545898
9.83863639831543


Process Process-1:
Traceback (most recent call last):
  File "/home/phani/anaconda2/envs/kaggle/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/phani/anaconda2/envs/kaggle/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/phani/anaconda2/envs/kaggle/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-9-c79455d9888e>", line 46, in seq2seq_collate_bert
    vec=bert_to_tensor(bert_embedding(s[0]),True)
  File "/home/phani/anaconda2/envs/kaggle/lib/python3.6/site-packages/bert_embedding/bert.py", line 110, in __call__
    return self.embedding(sentences, oov_way='avg')
  File "/home/phani/anaconda2/envs/kaggle/lib/python3.6/site-packages/bert_embedding/bert.py", line 138, in embedding
    sequence_outputs.asnumpy()):
  File "/home/phani/anaconda2/envs/kaggl

KeyboardInterrupt: 