In [None]:
import tensorflow as tf
from numpy import pi, sqrt
import gc

In [None]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping (rather than indexing [0]) keeps the cell working on CPU-only machines,
# where the device list is empty, and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
physical_devices

## Position and Embedding

In [None]:
class PositionEmbeding(tf.keras.layers.Layer):
    """Learned token embedding plus learned position embedding.

    Position indices are counted from ``self.start`` (1 by default) and are
    replaced by 0 wherever the input token id is 0.
    """

    def __init__(self, max_len, vocab_size, embedding_dim):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.pos_emb = tf.keras.layers.Embedding(max_len, embedding_dim, mask_zero=True)
        self.start = 1

    def get_token_embedding(self):
        """Return the weight list of the token embedding layer."""
        return self.token_emb.weights

    def set_start(self, N):
        """Set the index assigned to the first position of the next input."""
        self.start = N

    def position_embedding(self, x):
        """Embed the position of every token in ``x`` (indexed as [batch, length])."""
        dims = tf.shape(x)
        n_rows, n_cols = dims[0], dims[1]

        # One row of consecutive indices, repeated for every batch element.
        first = self.start
        index_row = tf.range(first, n_cols + first)
        index_grid = tf.reshape(tf.tile(index_row, [n_rows]), [n_rows, n_cols])
        index_grid = tf.cast(index_grid, tf.int32)

        # Zero the index wherever the token id is 0.
        non_zero_tokens = tf.cast(tf.not_equal(x, 0), tf.int32)
        return self.pos_emb(index_grid * non_zero_tokens)

    def token_embedding(self, x):
        """Embed the token ids in ``x``."""
        return self.token_emb(x)

    def call(self, input):
        """Return token embedding + position embedding for ``input``."""
        pos_part = self.position_embedding(input)
        tok_part = self.token_embedding(input)
        return tok_part + pos_part

## GELU Function

In [None]:
def GELU(x):
    """Tanh approximation of the Gaussian Error Linear Unit, applied element-wise."""
    inner = x + 0.044715 * tf.pow(x, 3)
    gate = 1 + tf.tanh(sqrt(2 / pi) * inner)
    return 0.5 * x * gate

## Multi Head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention over a fused q/k/v tensor.

    The layer owns no weights: the caller passes a tensor whose last axis
    already holds query, key and value side by side (3 equal parts).
    """

    def __init__(self, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads

    def split_heads(self, x):
        """Reshape [batch, seq, dim] into [batch, heads, seq, dim // heads]."""
        batch = tf.shape(x)[0]
        embedding_dim = x.shape.as_list()[-1]
        x = tf.reshape(x, [batch, -1, self.num_heads, embedding_dim // self.num_heads])
        return tf.transpose(x, [0, 2, 1, 3])

    def merge_heads(self, x):
        """Inverse of ``split_heads``: [batch, heads, seq, sub] -> [batch, seq, heads * sub]."""
        batch = tf.shape(x)[0]
        x = tf.transpose(x, [0, 2, 1, 3])
        heads, sub_dim = x.shape.as_list()[-2:]
        return tf.reshape(x, [batch, -1, heads * sub_dim])

    def call(self, input):
        """Attend over the current tokens plus any cached keys/values.

        Args:
            input: tuple ``(qkv, past, mask)``.
                qkv  -- [batch, seq_len, 3 * embedding] fused projection.
                past -- ``None`` or the ``present`` tensor returned by an
                        earlier call (keys and values stacked on axis 1).
                mask -- ``None`` or a tensor broadcastable to the score
                        matrix; entries equal to 1 are blocked.

        Returns:
            ``(output, present)``: the merged attention output and the keys and
            values (cache included) stacked on axis 1 for reuse on the next call.
        """
        input, past, mask = input

        # The last axis is [batch, seq_len, embedding * 3]: cut it into three
        # parts (query, key, value) and split each part into the heads.
        q, k, v = map(self.split_heads, tf.split(input, 3, axis=2))

        if past is not None:
            # Prepend the cached keys/values along the sequence axis.
            pk, pv = tf.unstack(past, axis=1)
            k = tf.concat([pk, k], axis=-2)
            v = tf.concat([pv, v], axis=-2)

        present = tf.stack([k, v], axis=1)

        # w has shape [batch, heads, dst_seq, src_seq]: for every batch element
        # and every head, a matrix whose rows and columns are tokens and whose
        # entries score how strongly one token relates to the other.
        w = tf.matmul(q, k, transpose_b=True)
        # Scale the scores DOWN by sqrt(d_head). rsqrt is already 1/sqrt, so it
        # has to be multiplied in; dividing by it (as before) scaled the scores
        # UP by sqrt(d_head) and saturated the softmax.
        w *= tf.math.rsqrt(tf.cast(v.shape.as_list()[-1], w.dtype))

        if mask is not None:
            # Blocked positions get a large negative score so softmax gives ~0.
            w += (mask * -1e9)

        # Softmax normalises each row to [0, 1]; the weights then mix the values.
        w = tf.nn.softmax(w, axis=-1)
        output = tf.matmul(w, v)

        return self.merge_heads(output), present
        

## Decoder Block

In [None]:
class DecoderBlock(tf.keras.layers.Layer):
    """One decoder block: layer-norm -> attention -> residual, then
    layer-norm -> feed-forward -> residual.

    Every projection is a ``Conv1D`` with kernel size 1.
    """

    def __init__(self, num_heads, d_model):
        super().__init__()
        self.MultiHeadAttention = MultiHeadAttention(num_heads=num_heads)

        self.LayerNorm1 = tf.keras.layers.LayerNormalization()
        self.LayerNorm2 = tf.keras.layers.LayerNormalization()

        self.GELU = tf.keras.layers.Activation(GELU)

        # conv1D_1 feeds the attention layer (3 * d_model channels),
        # conv1D_2 is the feed-forward expansion (4 * d_model channels).
        self.conv1D_1 = tf.keras.layers.Conv1D(d_model*3, 1)
        self.conv1D_2 = tf.keras.layers.Conv1D(d_model*4, 1)

        # conv1D_3 / conv1D_4 project back to d_model after attention / feed-forward.
        self.conv1D_3 = tf.keras.layers.Conv1D(d_model, 1)
        self.conv1D_4 = tf.keras.layers.Conv1D(d_model, 1)

    def call(self, input):
        """Run the block on ``(x, past, mask)`` and return ``(x, present)``."""
        hidden, past, mask = input

        # --- attention sub-layer -------------------------------------------
        qkv = self.conv1D_1(self.LayerNorm1(hidden))
        attended, present = self.MultiHeadAttention((qkv, past, mask))
        attn_out = self.conv1D_3(attended)
        hidden = tf.keras.layers.Add()([attn_out, hidden])

        # --- feed-forward sub-layer ----------------------------------------
        expanded = self.conv1D_2(self.LayerNorm2(hidden))

        # https://mlfromscratch.com/activation-functions-explained/#/
        activated = self.GELU(expanded)

        # Some pages describe two dense layers here, but the reference code
        # uses something else:
        # https://www.reddit.com/r/MachineLearning/comments/b1c6sn/d_is_gpt2_source_code_publically_available/eilbqas/
        # (that link mentions a possible reduction of the code).
        # With two dense layers the weight count stated in the paper is not
        # reached -- is it less powerful, or are weights outside the model
        # being counted? Open question.
        ffn_out = self.conv1D_4(activated)

        hidden = tf.keras.layers.Add()([ffn_out, hidden])
        return hidden, present
        

## GPT2

In [None]:
class GPT2(tf.keras.Model):
    """Decoder-only transformer language model.

    The output logits are produced by multiplying the final hidden states with
    the token-embedding weights. The model also keeps a per-layer key/value
    cache (``self.pasts``) that ``generate_next_output`` fills so that text can
    be generated one step at a time.
    """

    def __init__(self, num_layers, num_heads, d_model, vocab_size, max_len):
        super(GPT2, self).__init__()
        self.DecoderBlocks = [DecoderBlock(num_heads, d_model) for _ in range(num_layers)]
        self.PosEmb = PositionEmbeding(max_len, vocab_size, d_model)
        self.Norm = tf.keras.layers.LayerNormalization()
        self.vocab = vocab_size
        self.d_model = d_model
        self.pasts = list([None] * num_layers)  # one cache slot per decoder block
        self.conditioned = False                # True once the cache holds a context
        self.num_layers = num_layers

    def reset_past_status(self):
        """Clear the key/value cache and restart position numbering at 1."""
        # Use the instance attribute: the bare name `num_layers` only resolved
        # when a notebook-level global with that name happened to exist.
        self.pasts = [None] * self.num_layers
        self.conditioned = False
        self.PosEmb.set_start(1)

    def generate_next_output(self, X):
        """Run one generation step on ``X`` and store the resulting cache.

        The attention mask is built from ``X`` alone, so once a cache exists
        ``X`` has to hold a single token per sequence.
        """
        if self.conditioned:
            # Each cache entry stacks keys and values whose second-to-last axis
            # is the cached length. Positions are numbered from 1, so the next
            # free position is cached_length + 1 (using cached_length itself
            # re-used the position of the last cached token).
            start = tf.shape(self.pasts[0])[-2] + 1
        else:
            start = 1
        self.PosEmb.set_start(start)

        output, present = self.call(X)
        self.pasts = present
        self.conditioned = True
        return output

    def create_look_ahead_mask(self, size):
        """Return a [size, size] matrix with 1 where column > row (future tokens).

        A 1 means "blocked": the attention layer adds ``mask * -1e9`` to the
        scores. ``band_part(..., -1, 0)`` is the lower triangle including the
        diagonal, i.e. the positions that must stay visible, so it is inverted.
        """
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask

    def create_paddding_mask(self, inp):
        """Return a [batch, 1, 1, length] mask with 1 where the token id is 0."""
        padded = tf.cast(tf.math.equal(inp, 0), tf.float32)
        return padded[:, tf.newaxis, tf.newaxis, :]

    def create_mask(self, x):
        """Combine the look-ahead and padding masks (1 = blocked in either)."""
        matrix_dim = tf.shape(x)[1]

        mask_ahead = self.create_look_ahead_mask(matrix_dim)
        mask_padd = self.create_paddding_mask(x)

        mask = tf.maximum(mask_padd, mask_ahead)
        return mask

    def call(self, input):
        """Return ``(logits, presents)`` for a batch of token ids.

        ``logits`` is reshaped to [batch, seq_len, vocab]; ``presents`` holds one
        key/value tensor per decoder block.
        """
        presents = []

        assert len(self.pasts) == self.num_layers

        mask = self.create_mask(input)
        x = self.PosEmb(input)

        for DecoderB, past in zip(self.DecoderBlocks, self.pasts):
            x, present = DecoderB((x, past, mask))
            presents.append(present)

        x = self.Norm(x)
        seq_len = tf.shape(x)[1]
        # Flatten to [batch * seq_len, d_model] and project onto the vocabulary
        # with the token-embedding weights.
        x = tf.reshape(x, [-1, self.d_model])
        x = tf.matmul(x, self.PosEmb.get_token_embedding(), transpose_b=True)
        logits = tf.reshape(x, [-1, seq_len, self.vocab])
        return logits, presents

    def train_step(self, data):
        """One optimisation step; reports the loss and exp(loss) as perplexity."""
        gc.collect()
        x, y = data
        x = tf.cast(x, tf.int32)

        with tf.GradientTape() as tape:
            y_pred, _ = self(x, training=True)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        perplexity = tf.exp(loss)

        return {"Loss": loss, "Perplexity": perplexity}

    def test_step(self, data):
        """One evaluation step; reports the loss and exp(loss) as perplexity."""
        x, y = data

        y_pred, _ = self(x, training=False)

        loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        self.compiled_metrics.update_state(y, y_pred)

        perplexity = tf.exp(loss)

        return {"Loss": loss, "Perplexity": perplexity}
        

# Generate Dataset

In [None]:
import os
# Base directory for the dataset lookup: the current working directory of the kernel.
Root = os.getcwd()

In [None]:
# Collect every .txt file of the raw dataset folder.
# os.listdir returns entries in arbitrary order, so the list is sorted to keep
# the order of the concatenated corpus identical from run to run.
path_originalData = os.path.join(Root, "Dataset")
all_files = sorted(
    os.path.join(path_originalData, file)
    for file in os.listdir(path_originalData)
    if file.endswith('.txt')
)

#### Fix possible problems in Unicode files

In [None]:
from ftfy import fix_text

In [None]:
# Concatenate all raw files into one corpus file, repairing broken Unicode
# line by line. The `with` blocks close both handles even if fix_text or a
# read fails part-way through (the manual close() calls leaked them then).
procesed_path = "./Processed.txt"
with open(procesed_path, "w", encoding='utf-8') as writer:
    for file in all_files:
        with open(file, "r", encoding='utf-8') as f:
            writer.writelines(fix_text(line, normalization="NFKC") for line in f)

#### Byte Pair Encoding

In [None]:
from collections import Counter
import sentencepiece as spm
import numpy as np
import csv

In [None]:
# Frequency of every lower-cased, whitespace-separated token in the corpus.
with open(procesed_path, 'r', encoding='utf-8') as f:
    token_count = Counter(
        token
        for line in f
        for token in line.lower().split()
    )

In [None]:
# Dump the token frequencies as "<token>\t<count>" rows.
counter_path = "./Counter.txt"
with open(counter_path, 'w', newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter='\t').writerows(token_count.items())

In [None]:
# Library that builds the byte-pair encoding for us (vocab_size is our own choice).
Model_path = "./BPE_Model"
vocab_size = 24512  # int(len(token_count)/4)

# Trainer flags; the placeholders are filled from the variables defined above.
spm_args = dict(spm_input=counter_path, spm_model=Model_path, vocab_size=vocab_size)
spmcmd = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'.format(**spm_args)
spm.SentencePieceTrainer.train(spmcmd)

In [None]:
# Tokenizer used by every later cell: load the model file "<Model_path>.model".
s = spm.SentencePieceProcessor()
s.Load(Model_path + ".model")

In [None]:
# Token ids prepended to every input sequence (BOS) and appended to every
# target sequence (EOS) when the dataset is built.
# NOTE(review): presumably the ids SentencePiece assigned to the user-defined
# symbols [BOS] and [EOS] given the trainer flags (pad=0, unk=1, then
# [SEP],[BOS],[EOS]) -- confirm against the trained model.
BOS = 3
EOS = 4

In [None]:
# Build (input, target) id pairs: the input starts with BOS, the target ends with EOS.
dataset = []
count = 0

# Only lines whose encoded length lies strictly between these bounds are kept.
min_seq_len = 15
max_seq_len = 128

with open(procesed_path, 'r', encoding='utf-8') as f:
    for line in f:
        encod = s.encode_as_ids(line)
        if not (min_seq_len < len(encod) < max_seq_len):
            continue
        dataset.append([[BOS] + encod, encod + [EOS]])
            

# Load Dataset

In [None]:
# Fraction of the examples that goes to the training split.
train_percent = 0.85

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the pairs into model inputs (BOS + ids) and targets (ids + EOS).
X = [pair[0] for pair in dataset]
Y = [pair[1] for pair in dataset]

In [None]:
# Hold out (1 - train_percent) of the pairs for validation. The fixed
# random_state makes the split -- and everything derived from it -- reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=1-train_percent, random_state=42)

In [None]:
# Longest input sequence over both splits, plus one.
longest_train = max(map(len, X_train))
longest_test = max(map(len, X_test))
max_len = max(longest_train, longest_test) + 1

In [None]:
# Disabled code kept as a string literal: manual zero-padding of the training
# sequences up to max_len. The dataset cell below pads per batch with
# padded_batch instead.
'''for i in range(len(X_train)):
    X_train[i] += [0 for _ in range(max_len-len(X_train[i]))]
    y_train[i] += [0 for _ in range(max_len-len(y_train[i]))]'''

In [None]:
# Disabled code kept as a string literal: manual zero-padding of the test
# sequences up to max_len. The dataset cell below pads per batch with
# padded_batch instead.
'''for i in range(len(X_test)):
    X_test[i] += [0 for _ in range(max_len-len(X_test[i]))]
    y_test[i] += [0 for _ in range(max_len-len(y_test[i]))]'''

In [None]:
# Number of sequences per padded batch.
BATCH_SIZE = 8
# NOTE(review): EPOCHS and BUFFER_SIZE are not referenced by any visible cell;
# the fit call hard-codes epochs=50 and the datasets are never shuffled.
EPOCHS = 10
BUFFER_SIZE = 10000

In [None]:
def _padded_batches(inputs, targets):
    """Turn two lists of variable-length id lists into zero-padded batches.

    Each batch holds BATCH_SIZE (input, target) pairs padded to the longest
    sequence in that batch.
    """
    ragged_pair = (tf.ragged.constant(inputs), tf.ragged.constant(targets))
    pairs = tf.data.Dataset.from_tensor_slices(ragged_pair)
    # The identity map is kept from the original pipeline.
    # NOTE(review): presumably it makes the ragged slices acceptable to
    # padded_batch -- confirm before removing.
    pairs = pairs.map(lambda x, y: (x, y))
    return pairs.padded_batch(BATCH_SIZE)


train_dataset = _padded_batches(X_train, y_train)
test_dataset = _padded_batches(X_test, y_test)

# Training

In [None]:
# Model size, passed to the GPT2 constructor in the next cell.
num_layers = 8   # number of stacked decoder blocks
num_heads = 8    # attention heads per block
d_model = 768    # embedding / hidden width

In [None]:
# Instantiate the model with the hyperparameters and tokenizer sizes defined above.
gpt2 = GPT2(
    num_layers=num_layers,
    num_heads=num_heads,
    d_model=d_model,
    vocab_size=vocab_size,
    max_len=max_len,
)

In [None]:
def Loss(y_true, y_pred):
    """Cross-entropy over the non-padding tokens only.

    Args:
        y_true: integer token ids indexed as [batch, seq]; id 0 is padding.
        y_pred: unnormalised logits indexed as [batch, seq, vocab].

    Returns:
        Scalar: the mean per-token cross-entropy of each sequence (padding
        excluded), averaged over the batch. Because it is a per-token mean,
        ``exp(loss)`` is a meaningful perplexity.
    """
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))

    # reduction='none' keeps one loss value per token. With the default
    # reduction the loss is already averaged to a scalar (padding included), so
    # multiplying by the mask afterwards could not remove the padded positions.
    per_token = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)

    mask = tf.cast(mask, per_token.dtype)
    per_token *= mask

    per_sequence = tf.reduce_sum(per_token, axis=1)
    # Guard against a sequence made only of padding (0 / 0).
    real_tokens = tf.maximum(tf.reduce_sum(mask, axis=1), tf.cast(1, mask.dtype))

    return tf.reduce_mean(per_sequence / real_tokens)

In [None]:
# Adam with a fixed 1e-3 learning rate, minimising the custom Loss defined above.
gpt2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=Loss,
)

In [None]:
# Train for 50 epochs, validating on the held-out split after each one.
# NOTE(review): the EPOCHS constant defined earlier (10) is not used here.
hist = gpt2.fit(
    train_dataset,
    epochs=50,
    validation_data=test_dataset,
)

In [None]:
def top_k_logits(logits, k):
    """Keep the ``k`` largest logits of every row and push the rest to -1e10.

    Args:
        logits: scores indexed as [batch, vocab].
        k: number of candidates to keep per row; 0 disables the filter.

    Returns:
        A tensor shaped like ``logits``.
    """
    if k == 0:
        return logits

    values, _ = tf.nn.top_k(logits, k=k)
    # Keep a trailing axis of size 1 so each row is compared with its own
    # threshold. Without it the [batch] vector is broadcast along the vocab
    # axis, which only works by accident when batch == 1.
    min_values = values[:, -1, tf.newaxis]

    return tf.where(
        logits < min_values,
        tf.ones_like(logits, dtype=logits.dtype) * -1e10,
        logits
    )

In [None]:
def extract_data(logits, temperature=1, top_k=50):
    """Sample one token id per sequence from the logits of the last position.

    The last-step logits are divided by ``temperature``, filtered with
    ``top_k_logits`` and then sampled with ``tf.random.categorical``.
    """
    last_step = logits[:, -1, :]
    scaled = last_step / tf.cast(temperature, tf.float32)
    filtered = top_k_logits(scaled, k=top_k)
    return tf.random.categorical(filtered, num_samples=1, dtype=tf.int32)

In [None]:
def generator_text(initial_sentence, model, seq_len, temp=1, k=10, clear_status=True):
    """Generate up to ``seq_len`` tokens that continue ``initial_sentence``.

    Args:
        initial_sentence: prompt text; it is encoded and prefixed with BOS.
        model: object providing ``reset_past_status`` and ``generate_next_output``.
        seq_len: maximum number of tokens to sample.
        temp: sampling temperature forwarded to ``extract_data``.
        k: top-k cut-off forwarded to ``extract_data``.
        clear_status: when True, the model's cache is cleared first.

    Returns:
        The decoded text (prompt included) with "[SEP]" removed and runs of
        whitespace collapsed to single spaces.
    """
    if clear_status:
        # Reset the model that was passed in, not the notebook-level `gpt2`.
        model.reset_past_status()

    context = tf.expand_dims(([BOS] + s.encode_as_ids(initial_sentence)), 0)
    prev = context
    output = context
    for step in range(seq_len):
        logits = model.generate_next_output(prev)
        # temp and k used to be accepted but never forwarded, so sampling
        # always ran with extract_data's defaults.
        sample = extract_data(logits, temperature=temp, top_k=k)
        if tf.equal(sample, EOS):
            print(step, "END")
            break
        output = tf.concat([output, sample], axis=-1)
        # After the first step only the new token is fed; the rest is cached.
        prev = sample

    result = tf.squeeze(output, axis=0)
    pred = [int(i) for i in result]
    # Skip the leading BOS id before decoding.
    generated_seq = s.decode_ids(pred[1:])
    generated_seq = generated_seq.replace("[SEP]", "").strip()
    generated_seq = ' '.join(generated_seq.split())
    return generated_seq

In [None]:
# Sample up to 50 tokens that continue the prompt below.
Initial_Sentence = "i have a friend called "
generator_text(Initial_Sentence, gpt2, 50, temp=0.7, k=8)

In [None]:
# Scratch check: loss of the first training target against `out`.
# NOTE(review): `out` is only assigned in a later cell
# (out = gpt2.generate_next_output(...)), so this cell raises NameError on a
# fresh top-to-bottom run.
Loss([y_train[0]],  out)

In [None]:
# Scratch check: loss restricted to the first 4 positions.
# NOTE(review): N is assigned but the slices below hard-code 4; `out` comes
# from a later cell.
N = 4
Loss([y_train[0][:4]], out[:, :4, :])

In [None]:
# Scratch check: same as the full-sequence loss, using a full slice copy of the target.
# NOTE(review): `out` comes from a later cell.
Loss([y_train[0][:]], out)

In [None]:
# Scratch: select the non-zero ids of the first training target with a boolean mask.
# NOTE(review): l1 and l2 are not used by any visible cell.
l1 = [x for x in range(98)]
l2 = [0 for x in range(98)]
m = tf.math.logical_not(tf.math.equal(tf.constant([y_train[0]]), 0))
tf.constant([y_train[0]])[m]


In [None]:
# Run the model on the first training input (as a batch of one) from a clean
# cache and keep the logits in `out` for the inspection cells.
gpt2.reset_past_status()
out = gpt2.generate_next_output(tf.constant([X_train[0]]))
out

In [None]:
# Index of the largest logit along the last axis of `out`.
tf.math.argmax(out, axis=-1)

In [None]:
# Display the first training input as a batch of one.
tf.constant([X_train[0]])

In [None]:
# Display the first training target as a batch of one.
tf.constant([y_train[0]])

In [None]:
# Print the input tensor of the first padded training batch.
for ex in train_dataset.take(1):
    print(ex[0])

In [None]:
# Pull the first training batch as NumPy arrays (inputs, targets).
# NOTE(review): this rebinds X and Y, which earlier held the full Python lists
# of inputs and targets.
first_batch = next(train_dataset.as_numpy_iterator())
X, Y = first_batch[0], first_batch[1]
print(X,Y)

In [None]:
# Shape of the target array taken from the first batch.
Y.shape

In [None]:
# Boolean mask of the non-zero entries of `a`.
# NOTE(review): `a` is not defined in any visible cell; this fails on a fresh
# run unless it is assigned elsewhere.
m = tf.math.logical_not(tf.math.equal(a, 0))
m

In [None]:
# Inspect the entry of the mask at index 3.
m[3]

In [None]:
# Size of the first axis of `a`.
# NOTE(review): `a` is not defined in any visible cell.
batch = tf.shape(a)[0]
batch

In [None]:
# Keep the entries of `a` selected by the mask and reshape them to (batch, 98).
# NOTE(review): the reshape only succeeds if exactly batch * 98 entries are
# selected; `a` is not defined in any visible cell.
x = a[m]
np.reshape(x, (batch, 98))