# Making a Transformer Model From Scratch

# Importing Required Libraries

In [1]:
import PyPDF2  # For PDF image extraction
import docx  # For DOCX text extraction
from PIL import Image  # For image loading
import numpy as np
import tensorflow as tf
from tensorflow import keras  # For Transformer layers


  from pandas.core import (


# Extracting Images From Pdf

In [2]:
def extract_images_from_pdf(pdf_path):
  """
  Extracts the embedded images of a PDF file using PyPDF2 and saves them to disk.

  Every image is written to the current working directory as
  ``page_<page>_image_<index>.<ext>``, where ``<page>`` is the 1-based page
  number, ``<index>`` is a running counter over the whole document and
  ``<ext>`` is the extension of the name PyPDF2 reports for the image.

  Args:
      pdf_path: Path to the PDF file.

  Returns:
      A list with the filenames of the saved images, or None if no images
      were found.
  """
  import os  # Local import: only needed to split the extension off the image name

  extracted_images = []

  with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    for page_num, page in enumerate(pdf_reader.pages, start=1):
      # `page.images` is PyPDF2's accessor for embedded images. Each item
      # carries the image name (`.name`, e.g. "Im0.jpg") and its encoded
      # bytes (`.data`); page objects have no `extract_embedded_images()`.
      for image in page.images:
        # Fall back to a neutral extension if the reported name has none
        extension = os.path.splitext(image.name)[1].lstrip('.') or 'bin'
        image_filename = f"page_{page_num}_image_{len(extracted_images)}.{extension}"

        # Save the image next to the notebook/script
        with open(image_filename, 'wb') as image_file:
          image_file.write(image.data)

        extracted_images.append(image_filename)

  return extracted_images if extracted_images else None


In [None]:
# Run the image extraction on the sample PDF. The result (a list of saved
# filenames, or None) is not stored in a variable.
extract_images_from_pdf('Padilla - Nobleza virtuosa_testExtract.pdf')


# Extracting Text From Docx

In [5]:
def extract_text_from_docx(docx_path):
  """
  Reads a DOCX file and returns the text of its paragraphs.

  Args:
      docx_path: Path to the DOCX file.

  Returns:
      A single string holding the text of every entry in the document's
      `paragraphs` collection, joined with newline characters.
  """

  document = docx.Document(docx_path)

  # One text entry per paragraph, in iteration order, one paragraph per line
  return '\n'.join(para.text for para in document.paragraphs)


In [None]:
# Run the text extraction on the sample transcription. The returned string is
# not stored in a variable.
extract_text_from_docx('Padilla - 1 Nobleza virtuosa_testTranscription.docx')

# Preprocessing Images

In [6]:
def preprocess_image(image_path, target_size=(224, 224)):
  """
  Loads, resizes, and normalizes an image for the Transformer model.

  The image is always converted to 3-channel RGB first, so grayscale,
  palette and RGBA files all produce an array with the same layout.

  Args:
      image_path: Path to the image file.
      target_size: Target size for resizing, passed straight to PIL's
          `resize` as a (width, height) pair (default: (224, 224)).

  Returns:
      A NumPy float array of shape (height, width, 3) with values in [0, 1].
  """

  # The context manager closes the underlying file once the pixels are copied
  with Image.open(image_path) as img:
    # Force 8-bit RGB: without this, palette images would yield color-table
    # indices and grayscale/RGBA images would yield 2-D or 4-channel arrays
    img = img.convert('RGB')

    # Resize the image to the target size using high-quality Lanczos resampling
    img = img.resize(target_size, Image.Resampling.LANCZOS)

    # Convert the image to a NumPy array and normalize to range [0, 1]
    image_data = np.array(img) / 255.0

  return image_data


# Transformer(Encoder-Decoder Model)

Self-Attention Mechanism: This is the heart of the Transformer and allows each element in the sequence (words in the text or features from the image) to attend to other elements in the same sequence.

Implementation Details:
Each element is projected by three separate linear layers into a query (Q), a key (K), and a value (V) vector.

The query vector of a specific element is compared against the key vectors of all elements in the sequence (including its own).

Attention weights are calculated based on the compatibility between the query and key vectors. These weights represent how relevant each element is to the element calculating the attention.

The attention weights are used to weight the value vectors of all elements, creating a context vector for the specific element based on relevant information from other elements in the sequence.

A multi-head self-attention mechanism performs multiple independent attention calculations with different weight projections, capturing diverse relationships within the sequence.

The outputs from the multi-head attention are concatenated and projected through a linear layer.

- The layer takes the embedding dimension, the number of heads, and an optional dropout rate as hyperparameters.

- It defines dense layers for projecting queries, keys, and values.

- An additional dense layer combines the outputs from the multiple heads.

- A dropout layer is applied to the attention weights.

In [7]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """
  Multi-head attention layer for the Transformer model.

  Queries, keys and values are each projected by their own dense layer, split
  into `num_heads` heads of size `embedding_dim // num_heads`, combined with
  scaled dot-product attention, merged again and passed through a final dense
  layer.

  Args:
      embedding_dim: Output size of every projection; must be divisible by
          `num_heads`.
      num_heads: Number of attention heads.
      dropout_rate: Dropout rate applied to the attention weights
          (default: 0.1).
      **kwargs: Standard Keras layer arguments (e.g. `name`, `dtype`), so the
          layer can be rebuilt from `get_config()`.

  Raises:
      ValueError: If `embedding_dim` is not divisible by `num_heads`.
  """

  def __init__(self, embedding_dim, num_heads, dropout_rate=0.1, **kwargs):
    super().__init__(**kwargs)

    # The head split reshapes the last axis into (num_heads, head_dim), which
    # only works when the division below is exact
    if embedding_dim % num_heads != 0:
      raise ValueError(
          f"embedding_dim ({embedding_dim}) must be divisible by "
          f"num_heads ({num_heads})"
      )

    self.embedding_dim = embedding_dim
    self.num_heads = num_heads
    self.dropout_rate = dropout_rate  # Kept so get_config() can report it
    self.head_dim = embedding_dim // num_heads  # Dimension per head

    # Dense layers for query, key, and value projections
    self.dense_q = tf.keras.layers.Dense(embedding_dim)
    self.dense_k = tf.keras.layers.Dense(embedding_dim)
    self.dense_v = tf.keras.layers.Dense(embedding_dim)

    # Dense layer for combining multiple heads' outputs
    self.dense = tf.keras.layers.Dense(embedding_dim)

    # Dropout layer
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Trainable scalar that scales the attention scores, initialised to
    # 1 / sqrt(head_dim). Created through add_weight so the layer registers
    # it as one of its own weights.
    self.scale = self.add_weight(
        name='attention_scale',
        shape=(),
        initializer=tf.keras.initializers.Constant(1 / (self.head_dim ** 0.5)),
        trainable=True,
    )

  def split_heads(self, x, batch_size):
    """
    Splits the last axis of the input tensor into multiple heads.

    Args:
        x: Projected tensor whose last axis has size `embedding_dim`.
        batch_size: Size of the first axis of `x`.

    Returns:
        A tensor of shape (batch_size, num_heads, seq_len, head_dim).
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
    return tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, num_heads, seq_len, head_dim)

  def call(self, inputs, training=False):
    """
    Performs multi-head attention on the input tensors.

    Args:
        inputs: A tuple of tensors (queries, keys, values); assumed to be
            shaped (batch_size, seq_len, features) -- TODO confirm against
            the callers.
        training: Boolean, whether in training mode (affects dropout).

    Returns:
        The attention output after the final dense projection, with a last
        axis of size `embedding_dim`.
    """

    queries, keys, values = inputs
    batch_size = tf.shape(queries)[0]

    # Project queries, keys and values using dense layers
    q = self.dense_q(queries)
    k = self.dense_k(keys)
    v = self.dense_v(values)

    # Split heads for parallel attention calculations
    q_heads = self.split_heads(q, batch_size)
    k_heads = self.split_heads(k, batch_size)
    v_heads = self.split_heads(v, batch_size)

    # Scaled dot-product scores: (batch_size, num_heads, seq_len_q, seq_len_k)
    attention_scores = tf.matmul(q_heads, k_heads, transpose_b=True) * self.scale

    # NOTE: no masking is applied here; every query position attends to
    # every key position.
    # ... (implement masking logic based on your needs)

    # Normalise the scores over the key axis
    attention_weights = tf.nn.softmax(attention_scores, axis=-1)

    # Apply dropout to the attention weights (active only when training)
    attention_weights = self.dropout(attention_weights, training=training)

    # Context vector weighted sum
    context_vector = tf.matmul(attention_weights, v_heads)

    # Combine heads back into a single tensor
    context_vector = tf.transpose(context_vector, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, head_dim)
    context_vector = tf.reshape(context_vector, (batch_size, -1, self.embedding_dim))

    # Applying final linear layer
    output = self.dense(context_vector)

    return output

  def get_config(self):
    """Returns the constructor arguments needed to re-create this layer."""
    config = super().get_config()
    config.update({
        'embedding_dim': self.embedding_dim,
        'num_heads': self.num_heads,
        'dropout_rate': self.dropout_rate,
    })
    return config


# Encoder-Decoder Attention:

This attention mechanism allows the decoder to attend to the encoded representation of the image (generated by the encoder) while generating the caption.

The decoder query vector attends to the encoder key and value vectors.

Similar to self-attention, attention weights are calculated and used to create a context vector for the decoder, incorporating information from the encoded image representation.

class TransformerDecoderLayer(keras.layers.Layer):
  """
  Skeleton of a Transformer decoder layer.

  Only the two attention sub-layers are created; the remaining components and
  most of the forward pass are elided with `...` placeholder comments.

  NOTE(review): as written, `call` cannot run -- it reads `decoder_output`
  and returns `output`, neither of which is assigned anywhere in this block.
  """

  def __init__(self, embedding_dim, num_heads, dropout_rate):
    """
    Creates the self-attention and encoder-decoder attention sub-layers.

    Args:
        embedding_dim: Forwarded to both MultiHeadAttention layers.
        num_heads: Forwarded to both MultiHeadAttention layers.
        dropout_rate: Accepted but not used in this block.
            NOTE(review): presumably meant to be forwarded to the attention
            layers and/or a dropout layer -- confirm.
    """
    super(TransformerDecoderLayer, self).__init__()

    self.self_attention = MultiHeadAttention(embedding_dim, num_heads)
    self.encoder_decoder_attention = MultiHeadAttention(embedding_dim, num_heads)
    # ... (other layer components)

  def call(self, inputs, encoder_outputs, training=False):
    """
    Outline of the decoder forward pass.

    Args:
        inputs: Decoder-side input; not referenced by the visible code.
        encoder_outputs: Encoder representation, used as both keys and
            values of the encoder-decoder attention.
        training: Boolean forwarded to the attention layer.

    Returns:
        `output`, which the elided steps are expected to produce.
    """
    # ... (decoder self-attention)
    # NOTE(review): the elided self-attention step must define
    # `decoder_output` (presumably from `inputs` via `self.self_attention`).

    # Encoder-decoder attention
    # Queries come from the decoder; keys and values from the encoder
    context_vector = self.encoder_decoder_attention((decoder_output, encoder_outputs, encoder_outputs), training=training)

    # ... (add & norm, feed forward, add & norm)
    # NOTE(review): the elided steps must define `output`; `context_vector`
    # is not used by any visible line.
    return output


# Evaluating Metrics

# BLEU SCORE 

In [None]:
from itertools import zip_longest

from sacrebleu import BLEU

# Load generated captions (hypotheses) from a file, one caption per line
with open("generated_captions.txt", "r", encoding="utf-8") as file:
  generated_captions = [line.strip() for line in file]

# Load reference captions from a file. Line i holds the reference(s) for
# generated caption i; several references for the same image are separated
# by tab characters.
with open("reference_captions.txt", "r", encoding="utf-8") as file:
  references_per_caption = []
  for line in file:
    candidates = (ref.strip() for ref in line.split("\t"))
    references_per_caption.append([ref for ref in candidates if ref])

if not generated_captions:
  raise ValueError("generated_captions.txt contains no captions")

# corpus_score pairs hypotheses and references by position, so both files
# must describe the same number of images
if len(references_per_caption) != len(generated_captions):
  raise ValueError(
      f"Found {len(generated_captions)} generated captions but "
      f"{len(references_per_caption)} reference lines"
  )

# sacrebleu expects a list of reference *streams*: stream k holds the k-th
# reference of every caption. Captions with fewer references are padded with
# None, which sacrebleu treats as "no reference in this stream".
reference_captions = [
    list(stream) for stream in zip_longest(*references_per_caption, fillvalue=None)
]

# Calculate the corpus-level BLEU score
bleu = BLEU()
bleu_score = bleu.corpus_score(generated_captions, reference_captions)

print(f"BLEU Score: {bleu_score.score:.4f}")
