# Mount drive 

In [1]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import sys
# Put the pygcn source directory (stored on Drive) on the module search path.
# NOTE(review): presumably this is what makes the later `from models import GCN, EncoderRNN`
# resolvable — confirm that `models.py` lives in this directory.
sys.path.append('/content/drive/My Drive/MLDL/NLP/pygcn-master/pygcn')

# Preprocessing

Tokenization and pairing

In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math 

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
# Reserved vocabulary indices; Lang.index2word maps 0 to "SOS" and 1 to "EOS".
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary of a single language.

    Attributes:
        name: label of the language.
        word2index: word -> integer index.
        word2count: word -> number of times the word was added.
        index2word: integer index -> word (0 and 1 are pre-filled with
            "SOS" and "EOS").
        n_words: number of indices handed out so far, SOS and EOS included.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Indices 0 and 1 are already taken by SOS and EOS.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [0]:
# Strip accents: decompose the text (NFD) and drop the combining marks.
def unicodeToAscii(s):
    """Return ``s`` with every character of Unicode category 'Mn' removed.

    The string is first NFD-normalised so that accented letters split into a
    base letter plus combining marks; characters without such a decomposition
    are left untouched.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and replace everything that is not a letter or . ! ? with a space


def normalizeString(s):
    """Normalise one raw sentence.

    Steps, in order: lowercase and strip surrounding whitespace, remove
    accents via ``unicodeToAscii``, put a space in front of each ``.``, ``!``
    or ``?``, then collapse every run of characters outside ``a-zA-Z.!?``
    into a single space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

In [0]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated sentence-pair file for ``lang1``-``lang2``.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when True, each pair is reversed and the roles of the two
            languages are swapped (``lang2`` becomes the input language).

    Returns:
        ``(input_lang, output_lang, pairs)`` where the two ``Lang`` objects
        are still empty and ``pairs`` is a list of lists of normalised
        sentences.
    """
    print("Reading lines...")

    data_path = '/content/drive/My Drive/Colab Notebooks/data/%s-%s.txt' % (lang1, lang2)

    # Read inside a context manager so the file handle is closed even if
    # reading fails (the handle used to be left open).
    with open(data_path, encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Each line holds tab-separated sentences; normalise every field.
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Optionally flip the translation direction, then create the (empty)
    # vocabularies with matching names.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [0]:
# A sentence is kept only when it splits into fewer than this many tokens.
MAX_LENGTH = 10

# Accepted beginnings of the second sentence of a pair (passed to str.startswith).
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Tell whether pair ``p`` is short enough and starts with a known prefix.

    Both ``p[0]`` and ``p[1]`` must split (on single spaces) into fewer than
    ``MAX_LENGTH`` tokens, and ``p[1]`` must start with one of
    ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

In [0]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Looks tokens up in ``lang.word2index``; a token missing from that mapping
    raises ``KeyError``.
    """
    vocabulary = lang.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(vocabulary[token])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns a ``torch.long`` tensor on ``device`` reshaped to ``(-1, 1)``,
    i.e. one row per token plus a final row holding ``EOS_token``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a sentence pair as ``(input_tensor, target_tensor)``.

    ``pair[0]`` is encoded with the module-level ``input_lang`` and
    ``pair[1]`` with the module-level ``output_lang``.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

In [9]:
def prepareData(lang1, lang2, reverse=False):
    """Load, filter and index the sentence pairs of ``lang1``-``lang2``.

    Reads the pairs with ``readLangs``, keeps those accepted by
    ``filterPairs``, and feeds the survivors into both vocabularies.
    Progress is reported with ``print``.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered pair list.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for sentence_pair in kept_pairs:
        input_lang.addSentence(sentence_pair[0])
        output_lang.addSentence(sentence_pair[1])

    print("Counted words:")
    for vocabulary in (input_lang, output_lang):
        print(vocabulary.name, vocabulary.n_words)

    return input_lang, output_lang, kept_pairs


# Build the vocabularies and the filtered pairs from the eng-fra file. reverse=True flips
# every pair, so 'fra' becomes the input language and 'eng' the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Print one randomly chosen pair as a quick sanity check.
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['elle est mariee a un etranger .', 'she s married to a foreigner .']


# Adjacency Matrix

We use a spaCy model to construct the dependency tree of each input sentence.

In [10]:
# Download the spaCy model (German)
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [0]:
# Load the spaCy pipeline used for dependency parsing, plus displacy for rendering parses.
# NOTE(review): only the German model ('de') is loaded here, while the input sentences come
# from prepareData('eng', 'fra', True), i.e. the 'fra' side of the data — confirm that the
# model language matches the data.
import spacy
from spacy import displacy
nlp = spacy.load('de')

In [12]:
# Parse every input-side sentence with the spaCy pipeline, keeping the results in order.
input_sentence = [pair[0] for pair in pairs]
doc = []
print("reading german sentences...")
for processed, sentence in enumerate(input_sentence, start=1):
    doc.append(nlp(sentence))
    # Report progress once every 1000 sentences.
    if processed % 1000 == 0:
        print(str(processed)+" sentences were processed")

# Report how many sentences were parsed in total.
print(str(len(doc))+" sentences were preocessed")

reading german sentences...
1000 sentences were processed
2000 sentences were processed
3000 sentences were processed
4000 sentences were processed
5000 sentences were processed
6000 sentences were processed
7000 sentences were processed
8000 sentences were processed
9000 sentences were processed
10000 sentences were processed
10599 sentences were preocessed


In [13]:
# Render the dependency parse of one sample sentence (index 10) inline in the notebook.
print('depdendency tree for a german sentence')
displacy.render(doc[10], style='dep', jupyter=True, options={'distance': 90})

depdendency tree for a german sentence


In [0]:
# Construct one adjacency matrix per parsed sentence.
def _build_adjacency(parsed):
    """Return the (n, n) 0/1 adjacency matrix of one parsed sentence.

    Row ``i`` holds a 1 in column ``i`` (self loop), in the column of token
    ``i``'s syntactic head, and in the column of each of its children; every
    other entry is 0. ``parsed`` must be a full spaCy ``Doc`` so that
    ``token.i`` equals the token's position in the sentence.
    """
    n_tokens = len(parsed)
    # Fill the matrix on the CPU and move it once at the end, instead of
    # writing n*n single elements to a tensor that may live on the GPU.
    adj = torch.zeros(n_tokens, n_tokens)
    for token in parsed:
        adj[token.i, token.i] = 1
        adj[token.i, token.head.i] = 1
        for child in token.children:
            adj[token.i, child.i] = 1
    return adj.to(device)


adjs = []
for sen, parsed in enumerate(doc):
    adj = _build_adjacency(parsed)
    # Also publish the matrix as a module-level name adj0, adj1, ..., adj{N-1}:
    # a later cell reads `adj1000` directly, so these names are kept.
    globals()['adj{}'.format(sen)] = adj
    adjs.append(adj)

In [16]:
# Inspect the shape of one sample adjacency matrix (sentence index 1000).
adjs[1000].shape

torch.Size([6, 6])

# EncoderGCN Model

In [0]:
class EncoderGCN(nn.Module):
    """Encoder that passes the wrapped RNN encoder's features through a GCN.

    Args:
        encoder: module exposing ``initHidden()`` and returning
            ``(features, hidden)`` when called as ``encoder(src, hidden)``.
        gcn: module called as ``gcn(features, adj)``.
    """

    def __init__(self, encoder, gcn):
        super().__init__()
        self.encoder = encoder
        self.gcn = gcn

    def forward(self, src, adj):
        """Encode ``src`` and return ``(outputs, dec_hid)``.

        ``outputs`` is the GCN result and ``dec_hid`` is ``outputs`` summed
        over dimension 1.
        """
        # RNN (GRU) stage; its final hidden state is not used afterwards.
        initial_state = self.encoder.initHidden()
        rnn_features, _ = self.encoder(src, initial_state)

        # Swap the first two axes before the GCN; per the original annotation
        # this turns [input_length, batch_size, out_features] into
        # [batch_size, input_length, out_features].
        batch_first = rnn_features.permute(1, 0, 2)
        outputs = self.gcn(batch_first, adj)

        return outputs, outputs.sum(dim=1)

# Attention

In [0]:
class Attention(nn.Module):
    """Additive attention over batch-first encoder outputs.

    Args:
        enc_hid_dim: feature size of the encoder outputs.
        dec_hid_dim: size of the decoder hidden state.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()

        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape ``[batch size, src len]``.

        Args:
            hidden: decoder state, ``[batch size, dec hid dim]``.
            encoder_outputs: ``[batch size, src len, enc hid dim]``.

        Each row of the result is a softmax over the source positions.
        """
        src_len = encoder_outputs.shape[1]

        # Insert the source-length axis before repeating. Calling
        # repeat(1, src_len, 1) directly on the 2-D state yields
        # [1, batch * src_len, dec hid dim], which only lines up with
        # encoder_outputs when the batch size is 1.
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)

        # hidden = [batch size, src len, dec hid dim]

        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2)))

        # energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)

        # attention = [batch size, src len]

        return F.softmax(attention, dim=1)


# Decoder

In [0]:
class Decoder(nn.Module):
    """One-step attention decoder.

    Embeds the previous token, attends over the encoder outputs, advances a
    GRU by one step and projects to scores over the output vocabulary.

    Args:
        output_dim: size of the output vocabulary.
        emb_dim: embedding size.
        enc_hid_dim: feature size of the encoder outputs.
        dec_hid_dim: GRU hidden size.
        dropout: dropout probability applied to the embedding.
        attention: module called as ``attention(hidden, encoder_outputs)``.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the embedding concatenated with the attention context.
        self.rnn = nn.GRU(enc_hid_dim + emb_dim, dec_hid_dim)
        # The projection sees the GRU output, the context and the embedding together.
        self.fc_out = nn.Linear(enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step and return ``(prediction, hidden)``.

        Shapes as annotated in the original notebook:
            input: [batch size]
            hidden: [batch size, dec hid dim]
            encoder_outputs: [batch size, src len, enc hid dim]
            prediction: [batch size, output dim]
        """
        # [batch size] -> [1, batch size] -> [1, batch size, emb dim]
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention weights: [batch size, src len] -> [batch size, 1, src len]
        attn_weights = self.attention(hidden, encoder_outputs).unsqueeze(1)

        # Weighted sum of encoder outputs: [batch size, 1, enc hid dim],
        # then moved to [1, batch size, enc hid dim] for the GRU.
        context = torch.bmm(attn_weights, encoder_outputs).permute(1, 0, 2)

        # [1, batch size, enc hid dim + emb dim]
        step_input = torch.cat((embedded, context), dim = 2)

        output, hidden = self.rnn(step_input, hidden.unsqueeze(0))

        # With a single step, layer and direction the GRU output and its final
        # hidden state are the same tensor of values: [1, batch size, dec hid dim].
        assert (output == hidden).all()

        projection_input = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim = 1)
        prediction = self.fc_out(projection_input)

        return prediction, hidden.squeeze(0)

# Seq2seq Model

In [0]:
class seq2seq(nn.Module):
    """Sequence-to-sequence wrapper: encode once, then decode step by step.

    Args:
        encoder: module called as ``encoder(src, adj)`` and returning
            ``(encoder_outputs, hidden)``.
        decoder: module exposing ``output_dim`` and called as
            ``decoder(input, hidden, encoder_outputs)``, returning
            ``(scores, hidden)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, adj, trg, teacher_forcing_ratio = 0.5):
        """Return decoder scores of shape ``[trg len, batch size, output dim]``.

        Row 0 of the result is never written and stays zero. At every step the
        next decoder input is the ground-truth token with probability
        ``teacher_forcing_ratio`` and the decoder's own arg-max otherwise.
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Buffer for the decoder outputs, allocated on the target's device so
        # the module does not depend on a module-level `device` variable.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)

        # Output and hidden state of the encoder.
        encoder_outputs, hidden = self.encoder(src, adj)

        # The first decoder input is the first row of the target.
        # NOTE(review): tensorFromSentence never prepends SOS_token, so this is
        # presumably the first real target word rather than <sos> — confirm
        # that this is intended.
        input = trg[0,:]

        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output

            # Decide whether to use teacher forcing for the next step.
            teacher_force = random.random() < teacher_forcing_ratio

            # Highest-scoring token of this step.
            top1 = output.argmax(1)

            # With teacher forcing feed the actual next token, otherwise the
            # predicted one.
            input = trg[t] if teacher_force else top1

        return outputs

# Training the Seq2Seq model

We initialize our EncoderRNN, GCN, Decoder and seq2Seq model

In [0]:
from models import GCN, EncoderRNN

# Tokenization: encode every input-side and output-side sentence as an index tensor.
# NOTE(review): the "germ" names suggest German, but the input side comes from
# prepareData('eng', 'fra', True) — confirm the intended language.
germ_tensors = [tensorFromSentence(input_lang, pair[0]) for pair in pairs]
# Sample 1000 without its last row (the EOS index appended by tensorFromSentence).
germ_tensor = germ_tensors[1000][:-1]

eng_tensors = [tensorFromSentence(output_lang, pair[1]) for pair in pairs]
# Same sample on the output side, also without its trailing EOS row.
eng_tensor = eng_tensors[1000][:-1]


# EncoderRNN and GCN sizes
enc_hid_dim = 256
gcn_hid_dim_1 = 256
gcn_hid_dim_2 = 512

# The GCN takes the encoder's hidden size as its input feature size (nfeat).
encoder = EncoderRNN(input_lang.n_words, enc_hid_dim)
gcn = GCN(nfeat=enc_hid_dim,
            nhid=gcn_hid_dim_1,
            nclass=gcn_hid_dim_2,
            dropout=0.5)

# EncoderGCN: the RNN encoder followed by the GCN
enc_gcn = EncoderGCN(encoder, gcn)

In [0]:
# Attention and Decoder hyper-parameters
embed_dim = 256
dec_hid_dim = 512
dec_dropout = 0.5

# The attention and the decoder both receive gcn_hid_dim_2 as their encoder feature size,
# i.e. the size of the GCN output (nclass).
attn = Attention(gcn_hid_dim_2, dec_hid_dim)
dec = Decoder(output_lang.n_words, embed_dim , gcn_hid_dim_2, dec_hid_dim, dec_dropout, attn)

# seq2seq: full model, moved to the selected device
model = seq2seq(enc_gcn, dec).to(device)

In [0]:
# Smoke test: a single forward pass on sample 1000. `adj1000` is the module-level name
# created by the adjacency-construction cell; unsqueeze(0) adds a leading axis of size 1.
output=model(germ_tensor, adj1000.unsqueeze(0), eng_tensor)

## We initialize weights from a normal distribution with mean 0 and standard deviation 0.01, and biases to 0

In [98]:
def init_weights(m):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and standard deviation 0.01; all other
    parameters are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)
            
# Run init_weights over the model; nn.Module.apply calls it on every submodule and on the
# model itself, and returns the model (hence the printed architecture below).
model.apply(init_weights)

seq2seq(
  (encoder): EncoderGCN(
    (encoder): EncoderRNN(
      (embedding): Embedding(4345, 256)
      (gru): GRU(256, 256)
    )
    (gcn): GCN(
      (gc1): GraphConvolution (256 -> 256)
      (gc2): GraphConvolution (256 -> 512)
    )
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1024, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(2803, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=2803, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

## We set optimizer and loss function

In [0]:
# Optimizer: Adam over all model parameters, with its default settings.
optimizer = optim.Adam(model.parameters())

# Loss function: cross-entropy between the decoder scores and the target indices.
criterion = nn.CrossEntropyLoss()

In [0]:
def train(model, iterators, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterators`` and return the mean loss.

    Args:
        model: called as ``model(src[:-1], adj.unsqueeze(0), trg)``.
        iterators: iterable of ``(src, adj, trg)`` triples.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss taking ``(scores, targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-item losses divided by the number of items.

    Raises:
        ValueError: if ``iterators`` yields nothing (for example an iterator
            that was already consumed by a previous epoch).
    """
    model.train()

    epoch_loss = 0
    num_batches = 0

    for src, adj, trg in iterators:

        optimizer.zero_grad()

        # Drop the last source row. In this notebook that row is the EOS index
        # appended by tensorFromSentence (not the '.' token), which leaves one
        # row per entry of the adjacency matrix.
        output = model(src[:-1], adj.unsqueeze(0), trg)

        output_dim = output.shape[-1]

        # Row 0 of the model output is never written, so it is skipped on both
        # sides before computing the loss.
        # flat_output = [(trg len - 1) * batch size, output dim]
        # flat_target = [(trg len - 1) * batch size]
        flat_output = output[1:].view(-1, output_dim)
        flat_target = trg[1:].view(-1)

        loss = criterion(flat_output, flat_target)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError(
            "train() received no data; if a one-shot iterator such as zip(...) "
            "is reused across epochs, pass a list instead")

    return epoch_loss / num_batches

In [0]:
def evaluate(model, iterators, criterion):
    """Return the mean loss over ``iterators`` without updating the model.

    The model is put in eval mode, gradients are disabled and teacher forcing
    is switched off (``teacher_forcing_ratio = 0``).

    Args:
        model: called as ``model(src[:-1], adj.unsqueeze(0), trg,
            teacher_forcing_ratio = 0)``.
        iterators: iterable of ``(src, adj, trg)`` triples.
        criterion: loss taking ``(scores, targets)``.

    Raises:
        ValueError: if ``iterators`` yields nothing.
    """
    model.eval()

    epoch_loss = 0
    num_batches = 0

    with torch.no_grad():

        for src, adj, trg in iterators:

            # Drop the last source row before encoding, as in train().
            output = model(src[:-1], adj.unsqueeze(0), trg, teacher_forcing_ratio = 0)

            # trg = [trg len, batch size]
            # output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            # Skip row 0, which the model never writes.
            # flat_output = [(trg len - 1) * batch size, output dim]
            # flat_target = [(trg len - 1) * batch size]
            flat_output = output[1:].view(-1, output_dim)
            flat_target = trg[1:].view(-1)

            loss = criterion(flat_output, flat_target)

            epoch_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError(
            "evaluate() received no data; if a one-shot iterator such as zip(...) "
            "is reused, pass a list instead")

    return epoch_loss / num_batches

In [0]:
def epoch_time(start_time, end_time):
    """Split ``end_time - start_time`` (seconds) into whole minutes and seconds.

    Returns ``(minutes, seconds)``, both truncated toward zero with ``int``.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [103]:
# Compare the first input tensor with its adjacency matrix. The tensor has one more row
# than the matrix because tensorFromSentence appends an EOS index to every sentence.
print(germ_tensors[0].shape, adjs[0].shape)

torch.Size([5, 1]) torch.Size([4, 4])


In [0]:
N_EPOCHS = 10
CLIP = 1

# zip() returns a one-shot iterator: it would be exhausted after the first epoch and every
# later epoch would receive no data. Materialise it so each epoch sees the full data set.
train_iterator = list(zip(germ_tensors, adjs, eng_tensors))

# Not used by the loop below (no validation pass is run); kept so the name stays defined.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

# Save the weights once, after the last epoch.
torch.save(model.state_dict(), 'tut3-model.pt')