check:

https://becominghuman.ai/generative-adversarial-networks-for-text-generation-part-3-non-rl-methods-70d1be02350b

https://arxiv.org/pdf/1810.06640.pdf

In [51]:
#!pip install "tensorflow==2.8.*"
!pip install fasttext &> /dev/null
!pip install "tensorflow-text==2.8.*" &> /dev/null
#!pip uninstall -y nltk
!pip install -U nltk &> /dev/null

In [52]:
import sys

# The notebook is treated as running on Google Colab exactly when the
# `google.colab` package can be imported.
try:
    from google.colab import drive
except ModuleNotFoundError:
    COLAB = False
else:
    COLAB = True

# On Colab the Drive is mounted first; PATH is the folder holding the data,
# the cached resources and the local modules.
if COLAB:
    drive.mount('/content/drive')
PATH = '/content/drive/MyDrive/NLP_3' if COLAB else './'  # formerly '/content/drive/MyDrive/UniBO/NLP'

# Make modules stored under PATH importable.
sys.path.insert(0, PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
#!ls drive/MyDrive/NLP_3

In [54]:
from os.path import join
import inspect

class Resource_handler:
	"""Pickle-backed store that saves/loads notebook variables under one directory."""

	def __init__(self, path_='.'):
		"""Remember `path_` as the directory where resource files are kept."""
		self.path = path_

	def save_resource(self, what):
		"""Pickle the caller's value of `what` into the file `<path>/<what>`.

		`what` is evaluated as a Python expression in the scope of the code
		that called this method (normally it is just a variable name) and is
		also used verbatim as the file name.

		Returns the result of `pickle.dump` (i.e. None).
		Raises whatever evaluating `what` raises (e.g. NameError); in that
		case no file is created or overwritten.
		"""
		from pickle import dump
		prev_frame = inspect.stack()[1].frame
		try:
			# Resolve the value BEFORE opening the file: a failing lookup must
			# not leave behind an empty file that `load_resource` cannot unpickle.
			# NOTE(review): eval() runs arbitrary code - only pass trusted names.
			value = eval(what, prev_frame.f_globals, prev_frame.f_locals)
		finally:
			# Frame objects take part in reference cycles; drop ours promptly.
			del prev_frame
		with open(join(self.path, what), 'wb') as dfile:
			return dump(value, dfile)

	def load_resource(self, from_, alt_action=None, args_tuple=(), args_dict=None):
		"""Unpickle and return the resource stored in `<path>/<from_>`.

		If the file does not exist, return `alt_action(*args_tuple, **args_dict)`
		when `alt_action` is given (truthy), otherwise None.

		NOTE(review): pickle.load can execute code embedded in the file - only
		load files this notebook produced itself.
		"""
		from pickle import load
		try:
			with open(join(self.path, from_), 'rb') as dfile:
				return load(dfile)
		except FileNotFoundError:
			if not alt_action:
				return None
			# `None` instead of a shared mutable default dict.
			return alt_action(*args_tuple, **(args_dict or {}))

RH = Resource_handler(PATH)

In [55]:
import io,re
import tensorflow as tf
print(f'Using tf version: {tf.version.VERSION}')

from tensorflow.keras import backend as K
try:
    from keras.preprocessing.sequence import pad_sequences
except ImportError:
    from keras.utils import pad_sequences
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#from collections import OrderedDict
from nltk.tokenize import word_tokenize
import fasttext
import fasttext.util
import typing
from typing import Any, Tuple
import tensorflow_text as tf_text
from os.path import join, dirname


def clean_data(path = ''):
	"""Load, clean and merge the two sentence-pair corpora stored under `path`.

	Reads `data/PACCSS-IT.txt` (tab separated) and `data/simpitiki-v2.xml`
	and returns a single DataFrame with the columns `sentence_1` and
	`sentence_2`: PACCSS-IT rows first, SIMPITIKI rows after. The original row
	indices are kept, so index values may repeat.

	Only the PACCSS-IT text is normalised (quotes and "- " removed, double
	spaces collapsed once, lower-cased); SIMPITIKI sentences only have their
	XML tags stripped. Both shapes are printed as a progress message.
	"""
	DS1 = join(path, 'data/PACCSS-IT.txt')
	DS2 = join(path, 'data/simpitiki-v2.xml')
	COLS = ['sentence_1', 'sentence_2']
	# Explicit encoding: the platform default is not UTF-8 everywhere and the
	# corpora contain accented characters.
	ENCODING = 'utf-8'

	CSV_PARR_1 = {
		'on_bad_lines':'warn',
		'delimiter':'\t',
		'engine':'python',
		'quotechar':'§',
		'skipinitialspace':True,
		'decimal':'.'
	}

	def to_inmem_file(txt):
		'Wrap `txt` in a text buffer positioned at its start.'
		return io.StringIO(txt)

	def clean_1(txt):
		'Normalise the raw text of the whole file (header line included).'
		# (!!!) Performed on raw data!
		txt = txt\
			.replace('"','')	\
			.replace("'",'')	\
			.replace("- ",'')	\
			.replace("  ",' ')	\
			.lower()
		return txt

	def keep_inbetween(txt, tag):
		'Keep only the outermost `<tag ...> ... </tag>` element of `txt`.'
		# count/flags passed by keyword: passing them positionally is deprecated.
		return re.sub(f'.*(<{tag}[^<>]*>.+</{tag}>).*', r'\1', txt, count=0, flags=re.DOTALL)

	def remove_tags(txt):
		'Remove tags; enclosed information is preserved.'
		# Non-string cells (None, NaN) are passed through untouched.
		return re.sub(r'</?[^<>]+>', '', txt) if isinstance(txt, str) else txt

	# ----------- loading data -----------
	with open(DS1, 'r', encoding=ENCODING) as imf:
		inmem_DS1 = to_inmem_file(clean_1(imf.read()))

	with open(DS2, 'r', encoding=ENCODING) as imf2:
		xml_txt = imf2.read()

	# ----------- creating DFs -----------
	df1 = pd.read_csv(inmem_DS1, usecols=COLS, **CSV_PARR_1)
	print(f'DF 1 has shape {df1.shape}')

	simplifications = keep_inbetween(xml_txt, 'simplifications')
	df2 = pd.read_xml(to_inmem_file(simplifications), parser='etree')

	# The first two columns are dropped; the remaining two are the sentence pair.
	df2 = df2.drop(df2.columns[[0,1]], axis=1)
	df2.columns = COLS
	# DataFrame.map replaces the deprecated applymap in recent pandas.
	elementwise = df2.map if hasattr(df2, 'map') else df2.applymap
	df2 = elementwise(remove_tags)
	print(f'DF 2 has shape {df2.shape}')

	return pd.concat([df1, df2])
    

def plot_trend(data_x, data_y, x_lab='x', y_lab='y', title='', lines=True, regress=True, figsize=(15,6), line_style='-o'):
	"""Plot `data_y` against `data_x`, optionally with guide lines and a linear fit.

	Parameters
	----------
	data_x, data_y : indexable sequences of equal length (numeric).
	x_lab, y_lab, title : axis labels and optional title.
	lines : draw a dotted vertical line from the x axis up to every point.
	regress : draw the least-squares line (red) and a dashed horizontal line
		at the fitted value of the last x. Skipped when fewer than two points
		are available, because a regression is undefined there.
	figsize : figure size; its first item also caps the number of x ticks.
	line_style : matplotlib format string for the data series.

	The figure is shown with `plt.show()`; nothing is returned.
	"""
	from scipy.stats import linregress
	import matplotlib.pyplot as plt
	n_points = len(data_x)
	plt.figure(figsize=figsize)
	plt.margins(0)
	plt.plot(data_x, data_y, line_style, linewidth=1, markersize=10)
	min_ , max_ = plt.ylim()
	# Guard against a degenerate (zero-height) y range.
	rg = (max_ - min_) or 1.0
	plt.xlabel(x_lab, fontsize=18)
	if n_points < figsize[0]:
		ticks = data_x
	else:
		# Thin out the ticks, always keeping the first and the last value.
		step = int(n_points/figsize[0])
		ticks = [data_x[0]] + [x for i, x in enumerate(data_x) if i % step == 0] + [data_x[-1]]
	plt.xticks(ticks)
	if lines:
		for i, x in enumerate(data_x):
			# ymax is expressed as a fraction of the axis height.
			plt.axvline(x, color='gray', linestyle=":", ymax=(data_y[i]-min_)/rg)
	if regress and n_points >= 2:
		lr_res = linregress(data_x, data_y)
		# Only the two end points of the fitted line are needed.
		y_first = lr_res.intercept + lr_res.slope*data_x[0]
		y_last = lr_res.intercept + lr_res.slope*data_x[-1]
		plt.plot([data_x[0], data_x[-1]], [y_first, y_last], 'r')
		plt.plot([0, data_x[-1]], [y_last, y_last], color='gray', linestyle='--')
	plt.ylabel(y_lab, fontsize=18)
	if title: plt.title(title, fontsize=18)
	plt.show()

def memoization(limit):
	"""Return a decorator that caches results by positional arguments.

	Parameters
	----------
	limit : maximum number of cached results per decorated function;
		0 (or any non-positive value) means "no limit".

	The decorated function accepts positional, hashable arguments only. When
	the cache is full the OLDEST entry is discarded, so recently computed
	results stay available.
	"""
	from functools import wraps

	# decorator:
	def memoizza__(f):
		# One cache per decorated function: a cache shared between functions
		# would return the results of one function for another.
		cache = dict()

		@wraps(f)
		def g(*x):
			if x in cache:
				return cache[x]
			res = f(*x)
			if limit > 0 and len(cache) >= limit:
				# dicts preserve insertion order, so the first key is the oldest.
				del cache[next(iter(cache))]
			cache[x] = res
			return res
		return g
	return memoizza__

tf.config.run_functions_eagerly(True)

@memoization(0)
def word_tk_mem(arg):
    """Tokenize `arg` with NLTK's `word_tokenize`, caching the result.

    The cache has no size limit (`memoization(0)`). The very same list object
    is handed out on every hit, so callers must not modify it in place.
    """
    tokens = word_tokenize(arg)
    return tokens

use_builtins = True

Using tf version: 2.8.2


In [56]:
def build_vocabulary(train_df, test_df, tokenizer):
    """
    Build the word vocabulary shared by the train and the test corpus.

    Every sentence found in the `sentence_1` / `sentence_2` columns of both
    frames is split with `tokenizer`; the distinct tokens are sorted and
    numbered from 1 (index 0 is left unused).

    --------------
    Return: 
      - idx_to_word: dict, vocabulary index -> word
      - word_to_idx: dict, word -> vocabulary index
      - word_listing: sorted list of the unique terms
    """
    sentence_columns = ('sentence_1', 'sentence_2')
    # Collect the distinct tokens of every sentence of every column
    unique_tokens = {
        token
        for frame in (train_df, test_df)
        for column in sentence_columns
        for sentence in frame[column]
        for token in tokenizer(sentence)
    }
    word_listing = sorted(unique_tokens)
    # <idx : word>, indices start at 1
    idx_to_word = {position: word for position, word in enumerate(word_listing, start=1)}
    # <word : idx>
    word_to_idx = {word: position for position, word in idx_to_word.items()}

    return idx_to_word, word_to_idx, word_listing

In [57]:
df = clean_data(PATH)
df.dropna(inplace=True)
df.head(5)

DF 1 has shape (63006, 2)
DF 2 has shape (1166, 2)


Unnamed: 0,sentence_1,sentence_2
0,ma questo a cosa servirebbe ?,a che servono queste cose ?
1,"salve, avrei bisogno di una informazione piutt...",ho bisogno di una informazione urgente .
2,ciao a tutti avrei bisogno di un consiglio .,ho bisogno di un suo consiglio .
3,possibilmente uno che avesse bisogno dell aiuto .,ho bisogno di un vostro aiuto .
4,questa sarebbe una cosa positiva.,questa era una nuova cosa .


In [58]:
# Fixed seed: without it every kernel restart yields a different split, so
# results obtained on TRAIN_DF / TEST_DF cannot be reproduced.
SPLIT_SEED = 42
TRAIN_DF, TEST_DF = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)

In [59]:
import nltk
nltk.download('punkt')

idx_to_word, word_to_idx, word_listing = build_vocabulary(TRAIN_DF, TEST_DF, word_tk_mem)
VOCAB_SIZE = len(word_listing)
print(VOCAB_SIZE)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


17976


In [60]:
EMBEDD_DIM = 200

EMBEDD_FNAME = f'cc.it.{EMBEDD_DIM}.bin'  #ex. 'cc.it.100.bin'

# The model is large: load it only if it is not already in the kernel.
try: EMBEDDING_MODEL
except NameError:
    # Silence fastText's load-time message.
    fasttext.FastText.eprint = lambda x: None
    try:
        print('Loading fasttext model...', flush=True)
        EMBEDDING_MODEL = fasttext.load_model(join(PATH, EMBEDD_FNAME))
    except ValueError as e:
        # The reduced model is not cached under PATH yet.
        print(e)
        # Downloading FastText italian pre-trained model: this fetches the
        # full-size file into the working directory and returns its name, so
        # that file (not EMBEDD_FNAME, which does not exist yet) is loaded.
        full_model_fname = fasttext.util.download_model('it', if_exists='ignore')
        EMBEDDING_MODEL = fasttext.load_model(full_model_fname)
        if EMBEDDING_MODEL.get_dimension() != EMBEDD_DIM:
            fasttext.util.reduce_model(EMBEDDING_MODEL, EMBEDD_DIM)
        # Cache the reduced model so the next run takes the fast path.
        EMBEDDING_MODEL.save_model(join(PATH, EMBEDD_FNAME))

print(EMBEDDING_MODEL.get_dimension())


200


In [61]:
def check_OOV_terms(embedding_model, vocabulary_terms):
  '''Find the out-of-vocabulary (OOV) terms.

  Returns a pair: the list of the distinct terms of `vocabulary_terms` that
  are absent from `embedding_model.words` (in no particular order) and the
  number of such terms.
  '''
  known_words = set(embedding_model.words)
  missing = set(vocabulary_terms) - known_words
  return list(missing), len(missing)

_, n_oov_terms = check_OOV_terms(EMBEDDING_MODEL, word_listing)
print(f"Total OOV terms: {n_oov_terms} ({n_oov_terms*100/len(word_listing):.03f}%)")

Total OOV terms: 1274 (7.087%)


In [62]:
EMBEDDING_DIM = EMBEDDING_MODEL.get_dimension()
LOAD_EM = False

def compute_embedding_matrix(embedding_model, idx_to_word):
  """
  Return the embedding matrix of the vocabulary.

  Row `i` holds `embedding_model.get_word_vector(idx_to_word[i])`; rows whose
  index is not a key of `idx_to_word` (row 0 for a vocabulary numbered from 1)
  stay all-zero. The matrix has `max(index) + 1` rows and as many columns as
  the embedding model has dimensions, dtype float32.
  """
  n_rows = max(idx_to_word.keys(), default=0) + 1
  embedding_matrix = np.zeros((n_rows, embedding_model.get_dimension()), dtype='float32')

  for idx, word in idx_to_word.items():
    embedding_matrix[idx] = embedding_model.get_word_vector(word)
  return embedding_matrix

# Use the cached matrix only when asked to, when the file exists and when its
# shape matches the current vocabulary; otherwise rebuild and re-cache it.
# (load_resource returns None when the file is missing.)
EMBEDDING_MATRIX = RH.load_resource('EMBEDDING_MATRIX') if LOAD_EM else None
if getattr(EMBEDDING_MATRIX, 'shape', None) != (VOCAB_SIZE+1, EMBEDDING_DIM):
  EMBEDDING_MATRIX = compute_embedding_matrix(EMBEDDING_MODEL, idx_to_word)

  RH.save_resource('EMBEDDING_MATRIX')

In [63]:
print(f'Weights matrix size: {EMBEDDING_MATRIX.shape}')
print(f'Vocab size: {VOCAB_SIZE}')

Weights matrix size: (17977, 200)
Vocab size: 17976


In [64]:
def word_to_idx_conversion(df, word_to_idx,len_sentence=None): 
  """
  Return input data (sentences) encoded and padded
  
  Parameters
  ----------
  - df: corpus (train or test dataframe); every cell is a sentence string
  - word_to_idx: dict mapping every token to its vocabulary index
  - len_sentence: length every encoded sentence is padded/truncated to;
    None means "the longest sentence found in `df`" (over ALL columns, so
    that every column ends up with the same length)

  Return
  ------
  - DataFrame: sentences encoded using the word_to_idx dict and padded with
    zeros at the end; sentences longer than `len_sentence` keep their FIRST
    `len_sentence` tokens

  Raises
  ------
  - KeyError if a token is not a key of `word_to_idx`
  """
  # Conversion (tokenization is memoized by word_tk_mem)
  encode = lambda sentence: [word_to_idx[token] for token in word_tk_mem(sentence)]
  # DataFrame.map replaces the deprecated applymap in recent pandas
  elementwise = df.map if hasattr(df, 'map') else df.applymap
  converted_df = elementwise(encode)
  if len_sentence is None:
    # pad_sequences alone would pad each column to its own maximum
    len_sentence = max(
      (len(seq) for _, column in converted_df.items() for seq in column),
      default=None)
  # Padding; truncating="post" keeps the beginning of over-long sentences,
  # consistently with `iterate`, which feeds the first tokens to the model
  padded_df = converted_df.apply(
    lambda column: list(pad_sequences(column, padding="post", truncating="post", maxlen=len_sentence)),
    axis='index')
  return padded_df

In [65]:
# Encode the train sentences; no explicit length is given for the train set.
ENCODED_TRAIN_DF = word_to_idx_conversion(TRAIN_DF, word_to_idx)
m_len_input=max([len(i) for i in ENCODED_TRAIN_DF['sentence_1']])# in theory the length is fixed and the same for every sentence, but as usual better be safe
# The test set is padded to the length measured on the train set.
ENCODED_TEST_DF = word_to_idx_conversion(TEST_DF, word_to_idx,len_sentence=m_len_input)
print(ENCODED_TRAIN_DF.shape, ENCODED_TEST_DF.shape)
ENCODED_TRAIN_DF.head(3)

(44918, 2) (19251, 2)


Unnamed: 0,sentence_1,sentence_2
50227,"[5823, 13, 11306, 11049, 6885, 12168, 15934, 6...","[11049, 6885, 12168, 15934, 13781, 6950, 15522..."
43336,"[11049, 11330, 5676, 7635, 6986, 15934, 3810, ...","[11049, 11330, 5676, 7635, 6986, 15934, 3810, ..."
12067,"[11306, 14020, 17953, 17304, 13630, 7311, 8071...","[14020, 17953, 17304, 13630, 8404, 19, 0, 0, 0..."


In [67]:
TRAIN_DATASET = ENCODED_TRAIN_DF
#EMBEDDING_MATRIX

# Autoencoder (AE)

"we first train an auto-encoder on a large corpus of real sentences. Then, while training the GAN, to get “real” samples we input real sentences to the encoder of the auto-encoder and get the corresponding sentence vectors."

In [68]:
from keras.models import Model,Sequential
Adam = tf.optimizers.Adam
from keras.layers import Dense, Dropout, Input, LeakyReLU, Rescaling, LSTM, RepeatVector
from keras import backend as K
#import tensorflow as tf
#from tqdm import tqdm
import pandas as pd
import numpy as np
#from auxiliary import no_out

import io, sys

In [69]:
MAX_SENTENCE_LEN = 20 #TRAIN_DATASET.iloc[0][0].shape[0]; MAX_SENTENCE_LEN

In [70]:
def create_AE(timesteps, embedd_dim, latent_dim, vocab_size):
  '''Creates an autoencoder for sequences using LSTM.

  Input: `timesteps` token indices per sentence. The tokens go through a
  frozen embedding initialised from the global EMBEDDING_MATRIX (index 0 is
  masked), an encoder LSTM and a decoder LSTM (both return one `latent_dim`
  vector per time step) and a softmax layer over `vocab_size` words.

  Returns (autoencoder, encoder, None): the full model, the model that stops
  at the encoder LSTM output, and a placeholder for a stand-alone decoder,
  which is not built.
  '''
  token_ids = Input(shape=(timesteps, ))

  embedding = tf.keras.layers.Embedding(
    vocab_size,     # number of rows of the embedding table
    embedd_dim,     # size of each word vector
    embeddings_initializer = tf.keras.initializers.Constant(EMBEDDING_MATRIX),
    mask_zero = True,
    trainable = False
  )
  embedded = embedding(token_ids)

  encoder_lstm = LSTM(latent_dim, return_sequences=True)
  encoded = encoder_lstm(embedded)

  decoder_lstm = LSTM(latent_dim, return_sequences=True)
  hidden = decoder_lstm(encoded)
  word_probabilities = Dense(vocab_size, activation=tf.keras.activations.softmax)(hidden)

  autoencoder = Model(token_ids, word_probabilities)
  encoder = Model(token_ids, encoded)

  return autoencoder, encoder, None

In [71]:
INNER_DIM = 50

AE, ENC, _ = create_AE(MAX_SENTENCE_LEN, EMBEDD_DIM, INNER_DIM, VOCAB_SIZE+1)  # EMBEDD_DIM//MAX_SENTENCE_LEN
AE.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 20)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 20, 200)           3595400   
                                                                 
 lstm_2 (LSTM)               (None, 20, 50)            50200     
                                                                 
 lstm_3 (LSTM)               (None, 20, 50)            20200     
                                                                 
 dense_28 (Dense)            (None, 20, 17977)         916827    
                                                                 
Total params: 4,582,627
Trainable params: 987,227
Non-trainable params: 3,595,400
_________________________________________________________________


In [72]:
adam_pp = {
	'learning_rate': .002,	# def. 0.001
	'beta_1': .9,			# def. 0.9
	'beta_2': .999,			# def. 0.999
	'epsilon': 1e-12
}

class MaskedLoss(tf.keras.losses.Loss):
  """Mean squared error summed over the batch, ignoring padded time steps.

  `y_true` holds one target vector per time step, i.e. it is indexed as
  (batch, time, class). A time step counts as padding when its target has no
  non-zero entry outside class 0, the padding index: this covers both an
  all-zero target and a target that is "hot" on class 0 only.
  """
  def __init__(self):
    super().__init__(reduction=tf.compat.v1.losses.Reduction.NONE)
    self.name = 'masked_loss'
    self.loss = tf.keras.losses.mean_squared_error
    self.reduction = tf.compat.v1.losses.Reduction.NONE #losses_utils.ReductionV2.SUM

  def __call__(self, y_true, y_pred, sample_weight=None):
    """Return the scalar sum of the per-step MSE over the non-padding steps.

    `sample_weight` is accepted for compatibility with Keras and ignored.
    """
    # Calculate the loss for each time step of each item in the batch.
    loss = self.loss(y_true, y_pred)
    # Mask off the losses on padding. Testing `y_true.any(axis=2)` is not
    # enough: a padding step whose target is one-hot on class 0 would still
    # count as a real token, and nothing would ever be masked.
    targets = tf.convert_to_tensor(y_true)
    is_real_token = tf.reduce_any(tf.not_equal(targets[..., 1:], 0), axis=-1)
    mask = tf.cast(is_real_token, loss.dtype)
    # Return the total.
    return tf.reduce_sum(loss * mask)

AE.compile(optimizer=Adam(**adam_pp), loss=MaskedLoss()) #mse ?

def iterate(ds, batch_s=16, silent=True):
  """Endlessly yield training batches taken from the encoded sentences `ds`.

  Parameters
  ----------
  - ds: pandas Series whose items are padded index sequences, each at least
    MAX_SENTENCE_LEN long
  - batch_s: number of sentences per batch; the sentences left over at the
    end of a pass (fewer than `batch_s`) are skipped
  - silent: if False, print a message each time a new pass starts

  Yields
  ------
  - (el, one_h): `el` is a tensor with the first MAX_SENTENCE_LEN indices of
    each sentence; `one_h` is a float32 array of shape
    (batch_s, MAX_SENTENCE_LEN, VOCAB_SIZE+1) holding .9 (smoothed one-hot)
    at the position of every index, padding index 0 included

  Raises
  ------
  - ValueError (at the first `next`) if `ds` cannot fill a single batch:
    without this check the generator would loop forever without yielding
  """
  if batch_s <= 0 or len(ds) < batch_s:
    raise ValueError(f'cannot build batches of {batch_s} items out of {len(ds)} sentences')
  rows = np.arange(batch_s)[:, None]
  while True:
    for i in range(0, len(ds)-batch_s+1, batch_s):
      # One positional lookup per sentence (Series.iloc is slow)
      ids = np.stack([np.asarray(ds.iloc[i+k])[:MAX_SENTENCE_LEN] for k in range(batch_s)])
      steps = np.arange(MAX_SENTENCE_LEN)[None, :]
      one_h = np.zeros((batch_s, MAX_SENTENCE_LEN, VOCAB_SIZE+1), dtype='float32')
      # Vectorized equivalent of one_h[k, step, ids[k, step]] = .9 for all k, step
      one_h[rows, steps, ids] = .9
      yield tf.constant(ids), one_h
    if not silent: print('\nRESTARTING ITERATION')

In [73]:
import gc
gc.collect();
#AE.load_weights('AE2')

BATCH_SIZE = 512
WHOLE_DS = pd.concat([
  TRAIN_DATASET['sentence_1'],
  TRAIN_DATASET['sentence_2']],
  ignore_index=True
)

sentence_ds_itr = iterate(WHOLE_DS, batch_s=BATCH_SIZE, silent=True)

In [74]:
# When True, the auto-encoder weights saved on Drive are copied to the
# working directory and loaded instead of starting from scratch.
LOAD_W = True

if LOAD_W:
  !cp drive/MyDrive/NLP_3/AE3* .
  AE.load_weights('AE3')

# Number of batches drawn per epoch.
N = len(WHOLE_DS)//BATCH_SIZE  # //10: one ds iteration every 10 epochs
losses = []

try:
  # range(0): training is currently disabled, only the saved weights are used.
  for epoch in range(0):
    ts0 = tf.timestamp()
    tot_loss = 0.
    for batch in range(N):
      el, one_hot = next(sentence_ds_itr)
      tot_loss += AE.train_on_batch(el, one_hot, return_dict=True)['loss']
    print(end=f'epoch {epoch} loss: {tot_loss*1000/(N*BATCH_SIZE):.4f} e-3  (epoch took {tf.timestamp()-ts0.numpy():.1f})')
    # From the 4th epoch on, checkpoint the weights and back them up on Drive.
    if epoch>=3:
      AE.save_weights('AE3')
      !cp -u AE3* drive/MyDrive/NLP_3/
      print(end = '  -- weights saved')
      # NOTE(review): the loss history skips the first three epochs because
      # this append sits inside the `if` - confirm that this is intended.
      losses.append(tot_loss)
      gc.collect()
    print()
except KeyboardInterrupt:
  # Manual interruption: show the losses recorded so far.
  plot_trend(range(len(losses)), losses)
  print('\nMANUAL STOP')

# epoch 27 loss: 0.0739 e-3  (epoch took 244.2)
# epoch +12 loss: 0.0305 e-3  (epoch took 243.0)

In [75]:
class Translator2:
	"""Two-way converter between sentences and sequences of vocabulary indices."""

	def __init__(self, word_to_idx, idx_to_word, tokenizer, fun=lambda _:_):
		"""Store the two vocabularies, the tokenizer and `fun`.

		`fun` is applied to the encoded result of `tr` (identity by default),
		e.g. to turn the index list into a tensor.
		"""
		self.w2i = word_to_idx
		self.i2w = idx_to_word
		self.tk = tokenizer
		self.f = fun	

	def tr(self, what, flat=False):
		"""Translate in the direction implied by the type of `what`.

		- str: tokenize and encode it. Returns `fun([indices])` (a batch with
		  one sentence) or, with `flat=True`, `fun(indices)`. Raises KeyError
		  for a token missing from the vocabulary.
		- otherwise: `what` is an iterable of index sequences; returns the
		  decoded sentences, words joined by spaces, one sentence per line.

		`flat` used to be read from an undefined name, which made every string
		translation fail; it is now an explicit option.
		"""
		if isinstance(what, str):
			indices = [self.w2i[token] for token in self.tk(what)]
			return self.f(indices) if flat else self.f([indices])
		sentences = (' '.join(self.i2w[token] for token in s) for s in what)
		return '\n'.join(sentences)

In [76]:
tr = Translator2(word_to_idx,{0:'<ZERO>', **idx_to_word}, word_tokenize) #, fun=tf.convert_to_tensor)

def run_model(model_, sentence):
  """Run `model_` on one sentence or on a list of sentences.

  The sentences are encoded with the global `word_to_idx` vocabulary, padded
  to MAX_SENTENCE_LEN and fed to the model as a single batch. Returns a numpy
  array with, for every sentence and time step, the index of the
  highest-scoring output class.
  """
  sentences = sentence if type(sentence) is list else [sentence]
  encoded = word_to_idx_conversion(
    pd.DataFrame(sentences),
    word_to_idx,
    len_sentence=MAX_SENTENCE_LEN)
  # A single batch containing every sentence
  batches = iterate(encoded.iloc[:,0], batch_s=len(sentences))
  token_ids, _targets = next(batches)  # the one-hot targets are not needed here
  scores = model_.predict(token_ids)
  best_classes = tf.math.argmax(scores, axis=-1)
  return np.asarray(best_classes)

print(tr.tr(run_model(AE, [
  "ho constatato che davide ha una brutta voce.",
  "ho constatato che davide ha una brutta barba.",
  "ho constatato che davide ha una barba brutta.",
  "ho constatato che una barba brutta ha davide.",
  "Giovanni.",
  "barba di Giovanni.",
  "assai periglioso è il cammino.",
  "più periglioso è il cammino.",
  "più vago è il cammino.",
  "più vago fu il cammino.",
  "vago è il cammino.",
  "vago.",
  "più veloce è il cammino.",
  "poi si farà.",
]
)))  #.lower()

  "Even though the `tf.config.experimental_run_functions_eagerly` "


ho constatato che davide ha una brutta voce . . . . . . . . . . . .
ho constatato che davide ha una brutta avevano . . . . . . . . . . . .
ho constatato che davide ha una avevano brutta . . . . . . . . . . . .
ho constatato che una avevano brutta ha davide . . . . . . . . . . . .
l.d . . . . . . . . . . . . . . . . . . .
avevano di discorsi . . . . . . . . . . . . . . . . .
assai nell'ambiente è il cammino . . . . . . . . . . . . . . .
più decadenza è il cammino . . . . . . . . . . . . . . .
più propriamente è il cammino . . . . . . . . . . . . . . .
più propriamente fu il cammino . . . . . . . . . . . . . . .
propriamente è il cammino . . . . . . . . . . . . . . . .
propriamente . . . . . . . . . . . . . . . . . . .
più veloce è il cammino . . . . . . . . . . . . . . .
poi si farà . . . . . . . . . . . . . . . . .


# GAN

In [77]:
adam_pp = {
	'learning_rate': .00002,	# def. 0.001
	'beta_1': .8,			# def. 0.9
	'beta_2': .999,			# def. 0.999
	'epsilon': 1e-10
}
relu_initializer = tf.keras.initializers.HeUniform()
#oth_initializer  = tf.keras.initializers.GlorotNormal()

In [78]:
# !!! do NOT RUN this cell to continue training! !!!
acc_loss = []
nets = None
#gc.collect();

In [79]:
def create_gen(input_shape, dims, id_='0'):
    """Build and compile the generator: a stack of Dense layers.

    Parameters
    ----------
    input_shape : shape of one input sample (batch axis excluded)
    dims : layer widths; all but the last become LeakyReLU(0.2) layers with
        He-uniform initialisation, the last one is the softmax output layer
    id_ : name given to the model

    Returns the model, compiled with summed binary cross-entropy and an Adam
    optimizer configured by the global `adam_pp`.
    """
    hidden_dims, output_dim = dims[:-1], dims[-1]

    model_input = Input(input_shape)
    features = model_input
    for width in hidden_dims:
      features = Dense(units=width, activation=LeakyReLU(0.2), kernel_initializer=relu_initializer)(features)
    # NOTE(review): softmax restricts the outputs to [0, 1]; if the generator
    # is meant to imitate LSTM encodings this may be too narrow - confirm.
    model_output = Dense(output_dim, activation=tf.keras.activations.softmax)(features)

    generator = Model(model_input, model_output, name=id_)
    summed_bce = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM
    )
    generator.compile(loss=summed_bce, optimizer=Adam(**adam_pp))
    return generator

if not nets:
  GEN = create_gen((MAX_SENTENCE_LEN, INNER_DIM), (32, 512, 1024, 128, INNER_DIM), id_='generator')
  GEN.summary()

Model: "generator"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 20, 50)]          0         
                                                                 
 dense_29 (Dense)            (None, 20, 32)            1632      
                                                                 
 dense_30 (Dense)            (None, 20, 512)           16896     
                                                                 
 dense_31 (Dense)            (None, 20, 1024)          525312    
                                                                 
 dense_32 (Dense)            (None, 20, 128)           131200    
                                                                 
 dense_33 (Dense)            (None, 20, 50)            6450      
                                                                 
Total params: 681,490
Trainable params: 681,490
Non-train

In [80]:
def create_dis(input_shape, id_='0'):
    """Build and compile the discriminator.

    Architecture: Dense(64) on every time step, Flatten, then Dense(512) +
    Dropout(0.2) and Dense(256) + Dropout(0.3), all with LeakyReLU(0.2) and
    He-uniform initialisation, and a final sigmoid unit.

    Parameters
    ----------
    input_shape : shape of one input sample (batch axis excluded)
    id_ : name given to the model

    Returns the model, compiled with summed binary cross-entropy and an Adam
    optimizer configured by the global `adam_pp`.
    """
    model_input = Input(input_shape)

    features = Dense(units=64, activation=LeakyReLU(0.2), kernel_initializer=relu_initializer)(model_input)
    features = tf.keras.layers.Flatten()(features)

    # (units, dropout rate) of the fully connected blocks after flattening
    for width, drop_rate in ((512, 0.2), (256, 0.3)):
        features = Dense(units=width, activation=LeakyReLU(0.2), kernel_initializer=relu_initializer)(features)
        features = Dropout(drop_rate)(features)

    verdict = Dense(units=1, activation='sigmoid')(features)

    discriminator = Model(model_input, verdict, name=id_)
    summed_bce = tf.keras.losses.BinaryCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM
    )
    discriminator.compile(loss=summed_bce, optimizer=Adam(**adam_pp))
    return discriminator

if not nets:
  DIS = create_dis((MAX_SENTENCE_LEN, INNER_DIM), id_='discr')
  DIS.summary()

Model: "discr"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 20, 50)]          0         
                                                                 
 dense_34 (Dense)            (None, 20, 64)            3264      
                                                                 
 flatten_3 (Flatten)         (None, 1280)              0         
                                                                 
 dense_35 (Dense)            (None, 512)               655872    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_36 (Dense)            (None, 256)               131328    
                                                                 
 dropout_7 (Dropout)         (None, 256)               0     

In [81]:
def create_gan(discriminator, generator, in_shape):
  '''Chain `generator` and `discriminator` into the model that trains the generator.

  The input goes through a Masking layer, the generator and finally the
  discriminator, whose `trainable` flag is switched off on the very object
  passed in (side effect), so that fitting this model is meant to update the
  generator only.

  Parameters
  ----------
  discriminator, generator : the two Keras models to chain
  in_shape : shape of one input sample (batch axis excluded)

  Returns the combined model, compiled with summed binary cross-entropy and
  an Adam optimizer configured by the global `adam_pp`.
  '''
  # (An abandoned MaskedLoss2 draft used to sit here as a string literal and
  # therefore acted as this function's docstring; it has been removed.)
  discriminator.trainable = False
  gan_input = Input(shape=in_shape)
  x = tf.keras.layers.Masking()(gan_input)
  x = generator(x)  # gan_input
  gan_output = discriminator(x)
  gan = Model(inputs=gan_input, outputs=gan_output)
  gan.compile(loss=tf.keras.losses.BinaryCrossentropy(
    #from_logits=True,
    reduction=tf.keras.losses.Reduction.SUM
  ), optimizer=Adam(**adam_pp))  # 'binary_crossentropy'
  return gan

if not nets:
  GAN = create_gan(DIS, GEN, (MAX_SENTENCE_LEN, INNER_DIM))
  GAN.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 20, 50)]          0         
                                                                 
 masking_3 (Masking)         (None, 20, 50)            0         
                                                                 
 generator (Functional)      (None, 20, 50)            681490    
                                                                 
 discr (Functional)          (None, 1)                 790721    
                                                                 
Total params: 1,472,211
Trainable params: 681,490
Non-trainable params: 790,721
_________________________________________________________________


In [82]:
# TO BE REPLACED WITH A HUMAN IMPLEMENTATION
class DECc:
  """Decoder half of the auto-encoder, made callable on encoder outputs.

  It re-uses the layers of `AE` starting from its second LSTM layer.
  """
  def __init__(self, AE):
    self.AE = AE
    lstm_positions = [
      position for position, layer in enumerate(AE.layers)
      if type(layer) is tf.keras.layers.LSTM
    ]
    # The first LSTM is the encoder, the second one opens the decoder.
    self.dec_1st_layer = lstm_positions[1]
  def __call__(self, x):
    """Apply the decoder layers to `x`; return the arg-max over the last axis."""
    hidden = x
    for layer in self.AE.layers[self.dec_1st_layer:]:
      hidden = layer(hidden)
    return tf.math.argmax(hidden, axis=-1)

DEC = DECc(AE)

In [83]:
#ENC( tf.ones((1,20)))

In [84]:
RED_FACTOR = 10
LOAD_W_GAN = False

def training(epochs=1, batch_size=1024):
    global acc_loss, nets, ds_iter, DIS, GEN, GAN, MAX_SENTENCE_LEN, INNER_DIM, DEC, ENC, iterate

    if LOAD_W_GAN:
      !cp drive/MyDrive/NLP_3/GAN* .
      AE.load_weights('GAN1')


    def iterate_4gan(ds_iterator1, ds_iterator2):
      while True:
        (complex_, _), (simple, _) = next(ds_iterator1), next(ds_iterator2)
        yield ENC(complex_), ENC(simple), (complex_, simple)
    #EOFUN

    ds_iterator = iterate_4gan(
        iterate(TRAIN_DATASET['sentence_1'], batch_s=batch_size),
        iterate(TRAIN_DATASET['sentence_2'], batch_s=batch_size)
    )

    if (not nets):
      nets = [GEN, DIS, GAN]
      # print(gan.metrics_names)
    else:
      print('already existent gan, continuing...\n')
    g, d, gan = nets

    N = len(TRAIN_DATASET['sentence_1'])//batch_size//RED_FACTOR # N. OF BATCHES RUN, FOR EACH EPOCH
    for e in range(1, epochs+1):
      ts0 = tf.timestamp()
      d_loss = 0.
      e_loss = 0.
      print(f" === Epoch {len(acc_loss)+1:2d} ===")
      for batch in range(N):
        # STEP 1 -------------------------------------------------
        E1, E2, (complex_, _) = next(ds_iterator)
        # Generate "fake" data from E1
        G1 = g.predict(E1) #.astype(np.int32) #no_out
        x = np.concatenate([G1, E2]) ###[indices]
        #print(len(REAL_BATCH), len( generated_batch),x.shape)

        # Labels for generated and real data
        y = np.zeros(2*batch_size)
        y[0:batch_size] = .9  # label smoothing

        #Pre train discriminator on  fake and real data  before starting the gan. 
        d.trainable = True
        d_loss += d.train_on_batch(x, y=y, reset_metrics=True, return_dict=True)['loss'] #no_out
        #print(end=f'Discriminator loss: {d_loss}; ')

        # STEP 2 -------------------------------------------------
        # Treating the noised input of the Generator as real data
        y_gen = np.ones(batch_size) #np.ones(batch_size)

        # During the training of gan, 
        # the weights of discriminator should be fixed. 
        # We can enforce that by setting the trainable flag
        d.trainable = False

        #training  the GAN by alternating the training of the Discriminator 
        #and training the chained GAN model with Discriminator’s weights freezed.
        e_loss += gan.train_on_batch(next(ds_iterator)[0], y_gen, reset_metrics=True, return_dict=True)['loss']

      acc_loss += [(d_loss, e_loss)]
      print(f'dis loss is {acc_loss[-1][0]*1000.//(N*batch_size):0.7f} e-3')
      print(f'gan loss is {acc_loss[-1][1]*1000.//(N*batch_size):0.7f} e-3')  #*1000/(N*batch_size)
      print(f'epoch took {tf.timestamp()-ts0:.1f}s')
      if e>2:
        print(tr.tr(np.asarray(complex_)[:1]))
        print(tr.tr(np.asarray(DEC(G1))[:1]))  # !!!using the last G1
        gan.save_weights('GAN1')
        !cp -u GAN* drive/MyDrive/NLP_3/

      gc.collect()

In [None]:
#training(500, batch_size=512)