#Run to import modules + define fcts

In [1]:
from IPython.display import clear_output
import glob
import json
import re
import pickle
import os
import shutil
import functools
import torch
import gdown

import numpy as np
import random as rd
import pandas as pd

from math import log
from collections import defaultdict

from nltk.tokenize import RegexpTokenizer,TweetTokenizer


from torchtext.vocab import FastText,vocab
from torch.utils.data import Dataset,DataLoader
from torch import nn
import torch.nn.functional as F


from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Simple regular expression based tokenizer which tokenizes based on non alpha numeric characters, and which
#tokenizes "d'une" as "d'" and "une"
tokenizer = RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')


def tokenize_sentence(sentence: str,tokenizer: RegexpTokenizer):
  '''Simple tokenizer, removes or replaces special characters
  sentence : str sentence to be tokenized
  tokenizer : tokenizer with tokenize method '''

  #Lower capital leters
  tokenized=sentence.lower()
  #Change special character
  tokenized=re.sub("’","'",tokenized)
  #Remove unwanted characters
  tokenized=re.sub("(@\w*\\b\s?|#\w*\\b\s?|&\w*\\b\s?|\n\s?|\\\\|\<|\>|\||\*)","",tokenized)
  tokenized=re.sub("\/","",tokenized)
  #Replace articles since model does not embed contractions well
  tokenized=re.sub("l'","le ",tokenized)
  tokenized=re.sub("d'","de ",tokenized)
  tokenized=re.sub("j'","je ",tokenized)
  tokenized=re.sub("qu'","que ",tokenized)
  tokenized=re.sub("t'","te ",tokenized)
  tokenized=re.sub("c'","ce ",tokenized)
  #Tokenize sentence
  tokenized=tokenizer.tokenize(tokenized)
  return(tokenized)

In [3]:
class text_Dataset(Dataset):
  '''Class to be used to generate a pytorch dataloader of sentences extracted from french articles '''

  def __init__(self, paths, tokenizer_function, vocab_stoi, sequence_size=10, df =None, pad_token="<pad>", start_token="<s>", end_token="</s>",file_type="csv"):
    '''Instantiate french article dataset class
    path_to_articles : list containing path
    tokenizer_function : callable function which tokenizes a string 
    vocab_stoi : stoi indexing
    sequence_size : len of sentence sampled without start of sentence and end of sentence token
    pad_token : padding token default <pad>
    start_token : start of sentence token default <s>
    end_token : end of sentence token default </s>'''

    super().__init__()

    self.paths = paths#list containing list of paths to articles and proust
    self.tokenizer_function = tokenizer_function#tokenizer used
    self.sequence_size = sequence_size
    self.pad_token = pad_token
    self.start_token = start_token
    self.end_token = end_token
    self.stoi=vocab_stoi
    self.file_type=file_type
    self.df=df

    if self.file_type == "json":
      self.len_data = len(self.paths)
    else:
      self.len_data = len(df)
      pass
  def __len__(self):
    '''number of elements in the dataset'''

    return(self.len_data)
  

  def pad(self,token_sequence):
    '''Function to pad sequence
    token_sequence : list of tokens of len < self.sequence_size'''

    pad_size = self.sequence_size-len(token_sequence)
    result = [self.start_token] + token_sequence + [self.pad_token for i in range(pad_size)] + [self.end_token]
    
    return result
  

  def sample(self,tokens):
    ''' sample sequence from token sequence
    tokens : list of tokens'''

    nb_tokens=len(tokens)
    starting_index=np.random.randint(nb_tokens)

    
    if starting_index + self.sequence_size < nb_tokens :
      result = [self.start_token] + tokens[starting_index : starting_index + self.sequence_size] + [self.end_token]
    
    else:
      result= tokens[starting_index : nb_tokens]
      result=self.pad(result)
    
    return result


    pass

  def __getitem__(self, idx):
    '''get a sentence of size sequence_size from file of index idx
    '''

    if self.file_type=="json":
      text_file=read_json(self.paths[idx])

      label=text_file["label"]
      text=text_file["text"]

      tokens=self.tokenizer_function(text)
      
      sequence=[self.stoi[token] for token in self.sample(tokens)]
    
    else:

      label=(self.df).iloc[idx]["label"]
      text=(self.df).iloc[idx]["text"]

      tokens=self.tokenizer_function(text)
      sequence=[self.stoi[token] for token in self.sample(tokens)]

    return(torch.LongTensor(sequence),label)


In [4]:
class rnn(nn.Module):
  '''simple single gru cell rnn class'''

  def __init__(self, param_dicts,  pretrained_embedding = None ):
    '''Instantiates rnn encoder class
    param_dicts : dictionnary of parameter
    pretrained_embedding : vectors embedding with __getitem__ method
    batch_size : batch size
    '''
    
    super().__init__()

    try:
      self.nb_layers = param_dicts["nb_layers"]
    except:
      self.nb_layers = 1
    
    try:
      self.embed_size = param_dicts["embed_size"]
    except:
      self.embed_size = 300
    
    try:
      self.hidden_dim = param_dicts["hidden_dim"]
    except:
      self.hidden_dim = 256
    
    #instantiates gru function with imput of dimension embed_size and hidden state of the same dimension
    self.gru=nn.GRU(self.embed_size, self.hidden_dim, num_layers = self.nb_layers, batch_first = True)
  
  
  def forward(self, x, initial_state ):

    '''forward function
    x : batch of sequences
    state : previous hidden state'''
    
    previous_states,final_state = self.gru(x,initial_state)

    return previous_states, final_state

In [5]:
class style_transfer(nn.Module):
  '''Style transfer class.  '''
  def __init__(self, parameters_dicts, pretrained_embedding):
    '''parameters_dict : dictionnary containing parameters 
    batch_size : batch size'''
    super().__init__()

    #First of all retrieve general parameters from parameter dictionnary
    #label smoothing parameter
    try:
      self.label_smoothing_sigma = parameters_dicts["style"]["label_smoothing_sigma"]
    except:
      self.label_smoothing_sigma = 0.1
    #learning rate
    try :
      self.lr = parameters_dicts["style"]["lr"]
    except:
      self.lr = 0.001
    # Style embedding dimension
    try :
      self.style_embed_dim = parameters_dicts["style"]["style_embed_dim"]
    except :
      self.style_embed_dim = 128
    #Max sequence length when generating sequence with forward
    try:
      self.max_generated_sequence_len = parameters_dicts["style"]["max_generated_sequence_len"]
    except:
      self.max_generated_sequence_len = 15
    #end of sentence index
    try:
      self.eos_idx = parameters_dicts["style"]["eos_idx"]
    except:
      self.eos_idx = 1
    #start of sentence index
    try:
      self.sos_idx = parameters_dicts["style"]["sos_idx"]
    except:
      self.sos_idx = 2#1152451
    #size of the vocabulary
    try:
      self.vocab_size = parameters_dicts["style"]["vocab_size"]
    except:
      self.vocab_size = pretrained_embedding.shape[0]
    # word embedding size
    try :
      self.embed_size = parameters_dicts["style"]["embed_size"]
    except :
      self.embed_size = 10
    # number of additionnal tokens
    try :
      self.additionnal_tokens = parameters_dicts["style"]["additionnal_tokens"]
    except :
      self.additionnal_tokens = 3 #pad, sos, unk
    #dropout probability
    try :
      self.dropout_p = parameters_dicts["style"]["dropout_p"]
    except:
      self.dropout_p = 0.3
    #batch size
    try :
      self.batch_size = parameters_dicts["style"]["batch_size"]
    except:
      self.batch_size = 1


    #Parameter of gumbel softmax
    self.tau = 1.0
    # For eval to eval on batches of size != batch size
    self.size = self.batch_size
    ##########################################################################################################
    # Autoencoder
    ##########################################################################################################
    
    #Define encoder
    try:
      encoder_dict = parameters_dicts["autoencoder"]["encoder"]
    except:
      encoder_dict = None
    encoder = rnn(encoder_dict)

    self.hidden_dim = encoder.hidden_dim

    #Define decoder
    try:
      decoder_dict = parameters_dicts["autoencoder"]["decoder"]
    except:
      decoder_dict = None
    
    decoder = rnn(decoder_dict)
    
    #Define linear network used to get a vector of the size of the vocabulary from result of decoder
    self.convert_to_vocab = nn.Linear(self.hidden_dim, self.vocab_size + self.additionnal_tokens)


    #Autoencoder
    self.autoencoder={"encoder" : encoder, "decoder" : decoder}

    self.dropout_layer=nn.Dropout(self.dropout_p)

    self.encoder_initial_state = None
    self.decoder_initial_state = None
    ##########################################################################################################
    # Layers used to embed text idx tokens
    ##########################################################################################################
    

    #Below : in pretrained_embed we create an Embedding layer from the pretrained embedding vectors given as argument,
    #We also add an embedding vector full of 0s which will be associated to padding token
    embedding_vectors=torch.cat( (pretrained_embedding, torch.zeros(size=(1,self.embed_size))))
    self.pretrained_embed=nn.Embedding.from_pretrained(embedding_vectors, freeze=True, padding_idx = self.vocab_size)

    #Below : in learnable_embedding we create an Embedding layer consisting of 2 trainable vectors.
    #First vector will be embedding of <unk> token and second will be embedding of <s> token
    self.learnable_embed=nn.Embedding.from_pretrained( torch.randn(size=(2,self.embed_size)), freeze=False )

    ##########################################################################################################
    # Layers used to embed style
    ##########################################################################################################

    #Simple linear model to embed style (0 or 1 labels) into vector of defined style size
    self.encoder_label_embedder = nn.Linear(1, self.style_embed_dim)
    self.decoder_label_embedder = nn.Linear(1, self.style_embed_dim)

    ################################################
    #Loss / optimizer
    #################################

    self.reconstruction_loss = nn.CrossEntropyLoss()

    self.autoencoder_optim = torch.optim.Adam([{"params" : self.learnable_embed.parameters()},
            {'params': (self.autoencoder)["encoder"].parameters()},
            {'params': (self.autoencoder)["decoder"].parameters()},
             {'params': self.encoder_label_embedder.parameters()},
             {'params': self.decoder_label_embedder.parameters()},
             {'params': self.convert_to_vocab.parameters()}],
            lr=self.lr)

    return None #end of __init__
  
  #######################################################################################################################################

  #############################################
  #
  #############################################
  def get_final_hiddens(self,lengths,hiddens):
    '''We can hope to predict an eos token. In this case, we want to retrieve the hidden state right after the eos was predicted (and not the later ones)
    lengths : int tensor which contains the position in which the eos token has been predicted in our generated sentences
    hiddens : list of final hidden layer. Shape (batch_size,max_seq_length,hidden_dim), hiddens[i,j] is the hidden state obtained
    after predicting the jth token of the ith batch using the previous j-1 tokens'''

    final_hiddens = torch.zeros((self.size,1, self.hidden_size))
    for i in range(self.size):
      final_hiddens[i,:,:] = hiddens[i,lengths[i],:]
    return final_hiddens

  
  #############################################
  #embbeding from token indexes function
  #############################################
  def embed(self,x):
    '''Embedding of input using the 2 embedding layers defined above
    x : batch input or text indexes of dim 2 (batch_size,sequence_size) ex [[0,1,2],[3,4,5]] '''
    
    embedding_mask = x> self.vocab_size #mask : tensor of boolean, True means that index belongs to either <unk> or to <s> (not in pretrained vectors)

    pretrained_x = x.clone() #In this vector, we will store embedding of the tokens which are not <unk> or <s>
    
    #Set indexes of <unk> and <s> to the index of padding index, embedding of these vectors is zero tensor
    pretrained_x[embedding_mask] = self.vocab_size

    #Now embed this vector using pretrained embedding vectors
    embedded_x = self.pretrained_embed(pretrained_x)
    embedded_x[embedding_mask] = self.learnable_embed(x[embedding_mask]-self.vocab_size-1)

    return embedded_x

  #############################################
  # Label smoothing function
  #############################################
  def label_smoothing(self,labels) :
    ''' return a smooth (noisy) version of the labels 
    labels : batch tensor of labels (0 or 1) dim 3 (batch_size,1,1)'''
    
    mask = (labels == 1)#Label 1
    labels=torch.tensor(labels).float()

    noise=torch.rand(mask.sum())
    labels[mask] -= self.label_smoothing_sigma * noise#remove noise to labels 1
    
    noise=torch.rand((~mask).sum())
    labels[~mask] += self.label_smoothing_sigma *noise#add noise to labels 0

    return labels
  
  #############################################
  # Use Gumbel trick to get embedding
  #############################################

  def gumbel_word_sample(self, embeddings, output):
    '''Uses gumbel softmax to differentiably sample hot encoded vector and return average embedding using softmax.
    embeddings : embedding vectors
    output : last hidden state of the decoder (embeddings of size hidden_dim)  '''

    if self.training:
      drop = nn.Dropout(p=0.2)
      output=drop(output)
      
    vocab_embedd = self.convert_to_vocab(output)#embed to vocabulary size
    gumbel_softmax_output=F.gumbel_softmax(vocab_embedd, tau = self.tau)#apply gumber softmax

    token_approx = torch.matmul(gumbel_softmax_output,embeddings)#average embeddings by softmax probability to get new embedding

    return token_approx, gumbel_softmax_output
  
  #############################################
  # Create initial states from both the encoder and the generator
  #############################################
  def initial_state_embedder(self, input, labels, same_labels = True, obj_labels = None):
    '''Create initial hidden state for rnn training. Encoder's state is stored under  self.encoder_initial_state and decoder's state under 
    self.decoder_initial_state. returns None
    input : inputs of the encoder (batch_size,seq_len, embed_dim)
    labels : style labels 3 (batch_size,1,1)
    same_labels : use same labels / styles for encoder and decoder
    obj_labels : if same_labels is set to False, use obj_labels embedding as input to the decoder
    '''
    
    #First smooth the labels
    smooth_labels=self.label_smoothing(labels.float())
    #Get encoder style embedding
    encoder_style_embed = self.encoder_label_embedder(smooth_labels)
    #Create encoder's initial state
    self.encoder_initial_state = torch.cat((encoder_style_embed , torch.zeros(size=(self.size, 1, self.hidden_dim - self.style_embed_dim)) ), 2)
    self.encoder_initial_state = self.encoder_initial_state.view(1,self.size,self.hidden_dim)

    ###################################################
    #Now we want the decoder's initial state. To do that, since the decoder is content dependent, we need to retrieve the content
    #embedding from the output of the encoder
    
    # First if we want the outputs to have the same style as the input
    if same_labels:
      
      _, encoder_final_state = self.autoencoder["encoder"](input,self.encoder_initial_state)
      content = encoder_final_state[:,:,self.style_embed_dim:] #retrive from hidden states embeddings of sizes content_dim representing the content

      #Now, using this content and the encoder's inputs, we will create the decoder's input
      decoder_style_embed = self.decoder_label_embedder(smooth_labels)
      self.decoder_initial_state = torch.cat( (decoder_style_embed.view(1,self.size,-1), content.view(1,self.size,-1)) , 2)
    
    # Now if we want the outputs to have the styles encoded in other_labels
    else:

      smooth_obj_labels = self.label_smoothing(obj_labels.float())
      _, encoder_final_state = self.autoencoder["encoder"](input,self.encoder_initial_state)
      content = encoder_final_state[:,:,self.style_embed_dim:] #retrive from hidden states embeddings of sizes content_dim representing the content

      decoder_style_embed = self.decoder_label_embedder(smooth_obj_labels)
      self.decoder_initial_state = torch.cat( (decoder_style_embed.view(1,self.size,-1), content.view(1,self.size,-1)) , 2)
    
    return None

  #############################################
  # get encoder's outputs
  #############################################
  def run_encoder(self, x):
    ''' get output and last hidden state of embedding x, creating the initial states using the smooth labels and passing them to encoder
    x : embedded_input
    initial_state : self.encoder_initial_state'''

    
    state = self.encoder_initial_state
    output,result=self.autoencoder["encoder"](x,state)

    return(output,result)
  
  #############################################
  # autoencode data
  #############################################
  def generate_tokens(self, embedded_tokens):
      '''given a list of embedded_tokens, return all the outputs of the decoder embedded into vocab size using convert_to_vocab and raw
      embedded_tokens : current tokens generated'''

      initial_state = self.decoder_initial_state
      output, hidden_state = self.autoencoder["decoder"](embedded_tokens, initial_state)


      if self.training:
        output = self.dropout_layer(output)

      to_vocab_ = self.convert_to_vocab(output)
      return to_vocab_, output

  #############################################
  # get batch reconstruction loss
  #############################################
  def batch_loss(self, gen_input, targets_tokens, labels, eval_size=None  ):
    '''compute loss over a batch
    gen_input : output of encoder
    targets_tokens : indexes
    labels : style labels
    '''

    if self.training:
      self.size = self.batch_size
    else:
      self.size = eval_size
    
    reconstruction_loss =0
    f_labels = labels

    self.initial_state_embedder(gen_input, labels, True)
    vocab_output, _ = self.generate_tokens(gen_input)

    loss = self.reconstruction_loss(vocab_output.view(-1,self.vocab_size + self.additionnal_tokens),targets_tokens.view(-1))

    return(loss)
  
  #############################################
  # train model on batch
  #############################################
  def train_batch(self, input_idx_tokens, labels, cur_iter, iter_verbose = 100):
    '''train model en batch
    input_idx_tokens : input of token vocab indexes
    labels : style labels
    cur_iter : current_iteration'''
    self.train()#train mode
    self.autoencoder_optim.zero_grad()#zero gradient
    gen_input=self.embed(input_idx_tokens)#embed input indexes
    labels = labels.float()

    loss = self.batch_loss(gen_input, torch.tensor(input_idx_tokens), labels)

    if cur_iter % iter_verbose == 0 :
      print("reconstruction loss is", loss.item())

    loss.backward()
    self.autoencoder_optim.step()
    return loss
  
  #############################################
  # eval model on batch
  #############################################
  def eval_batch(self, input_idx_tokens, labels):
    self.eval()
    gen_input=self.embed(input_idx_tokens)
    labels = labels

    loss = self.batch_loss(gen_input, input_idx_tokens, labels, len(input_idx_tokens))
    return loss
  
  #############################################
  # generate sequence
  #############################################
  def hiddens_tokens_embed_generation(self):
    '''returns list of last hiddens states and tokens given by model for every step (step : 1 generate token from <bos> , step 2
    generate from <bos> and generated output ...
    '''

    
    #get all embeddings : pretrained + trainable ones
    embeddings=torch.cat((self.pretrained_embed.parameters().__next__(), self.learnable_embed.parameters().__next__())).clone()
    #initial state
    state=self.decoder_initial_state
    
    hiddens = torch.zeros(self.size, self.max_generated_sequence_len, self.hidden_dim)
    tokens = torch.zeros(self.size, self.max_generated_sequence_len, self.embed_size)

    sentence=(torch.tensor([[self.sos_idx]])).repeat((self.size,1))
    #predicted_indexes=torch.tensor([self.sos_idx]).repeat((self.batch_size,1,1))

    lengths=torch.tensor([self.max_generated_sequence_len for k in range(self.size)])

    for i in range(self.max_generated_sequence_len):

      sentence_embedding=self.embed(sentence)
      _,state = (self.autoencoder)["decoder"](sentence_embedding,state)
      #to_vocab = self.convert_to_vocab(state)
      token_approx, gumbel_softmax_output = self.gumbel_word_sample(embeddings , state)
      gumbel_softmax_output = gumbel_softmax_output.view(self.size,self.vocab_size+self.additionnal_tokens)
      predicted_indexes = torch.argmax(gumbel_softmax_output.squeeze(1),1)
      predicted_eos_mask = predicted_indexes == self.eos_idx

      lengths[predicted_eos_mask] = i #set index of eos token (current length + 1 for eos token)

      sentence = torch.cat((sentence, predicted_indexes.view(self.size,1)),1)

      hiddens[:,i,:] = state
      tokens[:,i,:] = token_approx

    return hiddens , tokens, lengths, sentence
  
  #############################################
  # forward function, to be called only to predict a sequence of opposite style from input
  #############################################
  def predict(self,input_idx_tokens, input_labels, obj_labels):
    '''return predicted index sequence
    input_idx_tokens : input index tokens sequence
    labels : style labels'''
    #eval mode
    self.eval()
    #get sequences length
    self.size = len(input_idx_tokens)
    #embed tokens
    gen_input=self.embed(input_idx_tokens)
    #Now create initial hidden state for the encoder using the real input labels
    #And for the decoder using the objective labels
    self.initial_state_embedder(gen_input,input_labels, False, obj_labels)
    #We generate the predicted sentences. The function takes no argument as
    # the output of the encoder was taken into account when
    #generating the initial state of the decoder
    _,_,_,predicted_sentences = self.hiddens_tokens_embed_generation()

    return(predicted_sentences)

# Download custom embeddings

In [6]:
import gdown
ids = ["1RshSrc75VhX-5p0cQwDKO_lrIVR7AKbr", "1OBb9ZDGwPuZv3crnBYFNYkpbfLHeZGZp", "1NPqaoWQ7cdEnedc-ZIXA67czLVy0WgpO"]
filenames = ["embedding.pt", "itos.pickle", "stoi.pickle"]
url = "https://drive.google.com/uc?id="
output = "embeddings/embedding.pt"
gdown.download(url, output, quiet=False)
for filename, idx in zip(filenames, ids):
    gdown.download(url=(url + idx), output=filename, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=
To: /home/ellana/projects/style_transfer_proust/embeddings/embedding.pt
1.69kB [00:00, 2.72MB/s]
Downloading...
From: https://drive.google.com/uc?id=1RshSrc75VhX-5p0cQwDKO_lrIVR7AKbr
To: /home/ellana/projects/style_transfer_proust/embedding.pt
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36.9M/36.9M [00:00<00:00, 91.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1OBb9ZDGwPuZv3crnBYFNYkpbfLHeZGZp
To: /home/ellana/projects/style_transfer_proust/itos.pickle
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 551k/551k [00:00<00:00, 29.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NPqaoWQ7cdEnedc-ZIXA67czLV

In [7]:
pretrained_vectors = torch.load("embedding.pt")

with open("stoi.pickle","rb") as f:
  stoi = pickle.load(f)

with open("itos.pickle","rb") as f:
  itos = pickle.load(f)

set vocabulary

In [8]:
ft_vocab=vocab(stoi)
print("index of </s> token is",ft_vocab["</s>"])
#Now add special tokens at the end of the vocab to avoid embedding issues later on
#since inserting them in the middle of the stoi vocab will shift all other indexes, which would mess us the embedding
#when using nn.embedding later on
index_pad=len(ft_vocab)#padding
index_unk=len(ft_vocab)+1#unknown 
index_sos=len(ft_vocab)+2#start of sentence


ft_vocab.insert_token("<pad>",index_pad)
ft_vocab.insert_token("<unk>",index_unk)
ft_vocab.insert_token("<s>",index_sos)

ft_vocab.set_default_index(index_unk)

index of </s> token is 0


dl data

In [9]:
!wget https://raw.githubusercontent.com/Ellana42/style_transfer_proust/main/datasets/dataset.csv
clear_output()

In [10]:
tokenizer_function=functools.partial(tokenize_sentence,tokenizer=tokenizer)
df = pd.read_csv('dataset.csv', sep='|', index_col=0).dropna()
dataset = text_Dataset("", tokenizer_function, ft_vocab, df=df)


In [11]:
from torch.utils.data import random_split

In [12]:
train_ratio, val_ratio, test_ratio = (0.8, 0.1, 0.1)
train_size = int(len(dataset) * train_ratio)
val_size = int(len(dataset) * val_ratio)
test_size = len(dataset) - train_size - val_size
train, val, test = random_split(dataset, [train_size, val_size, test_size])

In [13]:
batch_size = 64
train_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test, batch_size=batch_size, shuffle = True)
val_dataloader = DataLoader(val, batch_size=batch_size, shuffle = True)

get one batch of elements

In [14]:
#input_idx ,labels = next(iter(test_dataload))
#labels = labels.view(-1,1,1)

# running fct

parameter dictionary

In [15]:
parameters_dict={}
batch_size=64
style_embed_dim = 128 # dim of the Latent representation of the style variable
max_generated_sequence_len = 15
vocab_size = len(ft_vocab)-3 # since pretrained vectors do not have embedding for <s> <pad> and <unk> which are in vocabulary
hidden_dim=258
embed_size = 300
lr = 0.01

style_dict={"batch_size" :batch_size,"label_smoothing_sigma" : 0.1, "lr": lr, "style_embed_dim" : style_embed_dim, "max_generated_sequence_len":max_generated_sequence_len, "eos_idx":0, "vocab_size":vocab_size, "embed_size":embed_size, "self.sos_idx":index_sos}

encoder_dict={"nb_layers":1, "embed_size":embed_size,"hidden_dim":hidden_dim}
decoder_dict={"nb_layers":1, "embed_size":embed_size,"hidden_dim":hidden_dim}

autoencoder_dict={"encoder":encoder_dict,"decoder":decoder_dict}

parameters_dict["style"]=style_dict
parameters_dict["autoencoder"]=autoencoder_dict
parameters_dict

{'style': {'batch_size': 64,
  'label_smoothing_sigma': 0.1,
  'lr': 0.01,
  'style_embed_dim': 128,
  'max_generated_sequence_len': 15,
  'eos_idx': 0,
  'vocab_size': 30756,
  'embed_size': 300,
  'self.sos_idx': 30758},
 'autoencoder': {'encoder': {'nb_layers': 1,
   'embed_size': 300,
   'hidden_dim': 258},
  'decoder': {'nb_layers': 1, 'embed_size': 300, 'hidden_dim': 258}}}

In [16]:
style_model = style_transfer(parameters_dict,pretrained_vectors)

Predict sentences : careful this function only works in eval mode

In [18]:
#style_model.predict(input_idx,labels,labels)

train on batch

In [19]:
#for _ in range(20):
    #style_model.train_batch(input_idx, labels, 0)

In [21]:
import time

In [22]:
def train(epochs=4, evaluation=False):
    print("Start training...\n")
    batch_counts = 0
    for epoch_i in range(epochs):
        print(
        f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
        )
        print("-" * 70)
        t0_epoch, t0_batch = time.time(), time.time()
        step, total_loss, batch_loss, batch_count = 0, 0, 0, 0
        for _ in range(len(train_dataloader)):
            try:
                input_idx, labels = next(iter(train_dataloader))
            except ValueError:
                continue
            labels = labels.view(-1,1,1)
            batch_count += 1
            step += 1
            loss = style_model.train_batch(input_idx, labels, 0)
            batch_loss += loss
            total_loss += loss
        t0_epoch, t0_batch = time.time(), time.time()
        if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
            time_elapsed = time.time() - t0_batch
            print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
            batch_loss, batch_count = 0, 0
            t0_batch = time.time()
        avg_train_loss = total_loss / len(train_dataloader)
        print("-" * 70)
        if (evaluation==True):
            total_loss = 0
            n_batch = 0
            for input_idx ,labels in val_dataloader:
                labels = labels.view(-1,1,1)
                n_batch += 1
                total_loss += style_model.eval_batch(input_idx, labels)
                time_elapsed = time.time() - t0_epoch
            val_loss = total_loss/len(val_dataloader)
            print(
                f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {'-':^9} | {time_elapsed:^9.2f}"
            )
            print("-" * 70)
            print("\n")
            print("Training complete!")
            return (0)

In [23]:
train(evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


  loss = self.batch_loss(gen_input, torch.tensor(input_idx_tokens), labels)
  labels=torch.tensor(labels).float()


reconstruction loss is 10.350297927856445
reconstruction loss is 9.364091873168945
reconstruction loss is 6.93774938583374
reconstruction loss is 6.1809773445129395
reconstruction loss is 6.308010101318359
reconstruction loss is 6.238048076629639
reconstruction loss is 6.062816619873047
reconstruction loss is 6.008533954620361
reconstruction loss is 5.9430365562438965
reconstruction loss is 5.594308376312256
reconstruction loss is 5.566992282867432
reconstruction loss is 5.4683356285095215
reconstruction loss is 5.109234809875488
reconstruction loss is 5.0404438972473145
reconstruction loss is 5.085691928863525
reconstruction loss is 4.822578430175781
reconstruction loss is 4.785800933837891
reconstruction loss is 4.976200580596924
reconstruction loss is 4.880957126617432
reconstruction loss is 4.865368366241455
reconstruction loss is 4.414326190948486
reconstruction loss is 4.483524799346924
reconstruction loss is 4.349912166595459
reconstruction loss is 4.267012119293213
reconstructi

reconstruction loss is 0.9614198803901672
reconstruction loss is 1.0729657411575317
reconstruction loss is 1.0710725784301758
reconstruction loss is 1.0396872758865356
reconstruction loss is 0.9931395649909973
reconstruction loss is 0.8480016589164734
reconstruction loss is 0.9688658714294434
reconstruction loss is 1.0132614374160767
reconstruction loss is 0.9964383244514465
reconstruction loss is 1.001675009727478
reconstruction loss is 1.013398289680481
reconstruction loss is 1.0171486139297485
reconstruction loss is 0.8925380110740662
reconstruction loss is 1.097469687461853
reconstruction loss is 0.912207305431366
reconstruction loss is 0.8431041836738586
reconstruction loss is 0.8899450302124023
reconstruction loss is 0.802812397480011
reconstruction loss is 0.9011675715446472
reconstruction loss is 0.8143156170845032
reconstruction loss is 0.9549314975738525
reconstruction loss is 1.1809768676757812
reconstruction loss is 0.9548759460449219
reconstruction loss is 0.89222532510757

reconstruction loss is 0.5038682818412781
reconstruction loss is 0.4514045715332031
reconstruction loss is 0.5343685150146484
reconstruction loss is 0.418082594871521
reconstruction loss is 0.5423470139503479
reconstruction loss is 0.5704240798950195
reconstruction loss is 0.47047653794288635
reconstruction loss is 0.4745709002017975
reconstruction loss is 0.4634797275066376
reconstruction loss is 0.5417660474777222
reconstruction loss is 0.48550212383270264
reconstruction loss is 0.5578956007957458
reconstruction loss is 0.44810226559638977
reconstruction loss is 0.43913665413856506
reconstruction loss is 0.3916504681110382
reconstruction loss is 0.42297324538230896
reconstruction loss is 0.5999931693077087
reconstruction loss is 0.5318906307220459
reconstruction loss is 0.5041128993034363
reconstruction loss is 0.4773534834384918
reconstruction loss is 0.5862573981285095
reconstruction loss is 0.6700155735015869
reconstruction loss is 0.6466161608695984
reconstruction loss is 0.50924

reconstruction loss is 0.31084996461868286
reconstruction loss is 0.4211095869541168
reconstruction loss is 0.4837020933628082
reconstruction loss is 0.5494161248207092
reconstruction loss is 0.37441158294677734
reconstruction loss is 0.5529966950416565
reconstruction loss is 0.48389020562171936
reconstruction loss is 0.46776703000068665
reconstruction loss is 0.4122180640697479
reconstruction loss is 0.377361923456192
reconstruction loss is 0.2720414400100708
reconstruction loss is 0.2696150839328766
reconstruction loss is 0.23815983533859253
reconstruction loss is 0.44866621494293213
reconstruction loss is 0.33827534317970276
reconstruction loss is 0.45185065269470215
reconstruction loss is 0.4608032703399658
reconstruction loss is 0.26822131872177124
reconstruction loss is 0.3953976631164551
reconstruction loss is 0.3865283727645874
reconstruction loss is 0.34800246357917786
reconstruction loss is 0.43830832839012146
reconstruction loss is 0.33316585421562195
reconstruction loss is 

reconstruction loss is 0.33138468861579895
reconstruction loss is 0.30341413617134094
reconstruction loss is 0.2546052038669586
reconstruction loss is 0.322433739900589
reconstruction loss is 0.247453972697258
reconstruction loss is 0.27619823813438416
reconstruction loss is 0.1969505101442337
reconstruction loss is 0.28265705704689026
reconstruction loss is 0.267774760723114
reconstruction loss is 0.4369959533214569
reconstruction loss is 0.20129044353961945
reconstruction loss is 0.25621920824050903
reconstruction loss is 0.39109864830970764
reconstruction loss is 0.22406315803527832
reconstruction loss is 0.2054278701543808
reconstruction loss is 0.19537241756916046
reconstruction loss is 0.1543249487876892
reconstruction loss is 0.26602646708488464
reconstruction loss is 0.39629828929901123
reconstruction loss is 0.38575685024261475
reconstruction loss is 0.2602022886276245
reconstruction loss is 0.23195497691631317
reconstruction loss is 0.2820631265640259
reconstruction loss is 0

reconstruction loss is 0.24095018208026886
reconstruction loss is 0.35185983777046204
reconstruction loss is 0.28852325677871704
reconstruction loss is 0.1687774807214737
reconstruction loss is 0.2814968526363373
reconstruction loss is 0.32052114605903625
reconstruction loss is 0.26338866353034973
reconstruction loss is 0.21669451892375946
reconstruction loss is 0.2318258136510849
reconstruction loss is 0.2583489716053009
reconstruction loss is 0.2391882687807083
reconstruction loss is 0.20633800327777863
reconstruction loss is 0.21645642817020416
reconstruction loss is 0.16757990419864655
reconstruction loss is 0.24565304815769196
reconstruction loss is 0.2910510003566742
reconstruction loss is 0.25112754106521606
----------------------------------------------------------------------


In [24]:
#for _ in range(20):
#    style_model.train_batch(input_idx, labels, 0)

In [25]:
torch.save(style_model.state_dict(), 'style_model')

In [26]:
ls

big_camembert     embedding.pt           [0m[01;34mnotebooks[0m/    style_model
classifier.ipynb  [01;34membeddings[0m/            [01;34m__pycache__[0m/  [01;34mtest_trainer[0m/
dataset.csv       itos.pickle            README.md     Untitled.ipynb
dataset.csv.1     minimal_example.ipynb  [01;34msrcs[0m/
[01;34mdatasets[0m/         model_synonyms.ipynb   stoi.pickle
