In this notebook we measure how well the attention-score matrix is approximated by the formula
$$
(I_nP+E)
$$

In [None]:
import numpy as np
import scipy
import pickle

First, we implement the base functions that generate matrices of the desired form, belonging to the three sets $\Sigma_1,\Sigma_2,\Sigma_3$.

In [None]:
# Sigma_1 family: near-identity matrix perturbed by sparse noise
def creatediagonal(n):
    """Sample a row-normalised, near-diagonal ``n x n`` matrix.

    The diagonal entries are drawn from (0.9, 1]. A sparse random matrix of
    density ``1/n``, scaled by 0.5 and with its diagonal cleared, is added
    on top, and every row is finally divided by its own sum.

    Parameters
    ----------
    n : int
        Side length of the square matrix.

    Returns
    -------
    Dense ``n x n`` matrix (the result of arithmetic on ``todense()``)
    whose rows each sum to 1.
    """
    # Diagonal first, so the random stream is consumed in a fixed order.
    diagonal = np.ones(n) - 0.1 * np.random.rand(n)

    # Off-diagonal perturbation; the diagonal must stay untouched.
    noise = 0.5 * scipy.sparse.random(n, n, density=1 / n)
    noise.setdiag(0)

    unnormalised = np.diag(diagonal) + noise.todense()

    # Row-wise normalisation: every row is rescaled to sum to 1.
    totals = np.array(unnormalised.sum(axis=1)).flatten()
    return unnormalised / totals[:, np.newaxis]

# Almost diagonal matrix
def context(n, w, dropout_prob=0.4):
    """Sample an unnormalised banded ``n x n`` matrix with random dropout.

    Every entry ``(i, j)`` with ``|i - j| <= w`` is kept with probability
    ``1 - dropout_prob`` and, when kept, set to a value drawn uniformly
    from [0.7, 1). All other entries are 0. Rows are NOT normalised.

    Parameters
    ----------
    n : int
        Side length of the square matrix.
    w : int
        Window size: half-width of the band around the diagonal.
    dropout_prob : float, optional
        Probability of zeroing an in-band entry. Defaults to 0.4, the value
        that used to be hard-coded here.

    Returns
    -------
    numpy.ndarray
        The ``n x n`` banded matrix.
    """
    context_matrix = np.zeros((n, n))
    for i in range(n):
        # Only columns at distance <= w from the diagonal are candidates.
        for j in range(max(0, i - w), min(i + w + 1, n)):
            # Apply the dropout: one draw decides whether the entry survives.
            if np.random.rand() > dropout_prob:
                context_matrix[i, j] = np.random.uniform(0.7, 1)
    return context_matrix


# Syntactic matrices
def createsintact(n, w, dropout_prob=0.4):
    """Sample a row-normalised banded ``n x n`` matrix with sparse noise.

    In-band entries (``|i - j| <= w``) survive with probability
    ``1 - dropout_prob`` and receive a value drawn uniformly from [0.7, 1).
    A sparse random matrix of density ``1/n``, scaled by 0.5 and with its
    diagonal cleared, is added, and each row is divided by its sum.

    Parameters
    ----------
    n : int
        Side length of the square matrix.
    w : int
        Window size: half-width of the band around the diagonal.
    dropout_prob : float, optional
        Probability of zeroing an in-band entry (default 0.4).

    Returns
    -------
    Dense ``n x n`` matrix (the result of arithmetic on ``todense()``).
    Every non-empty row sums to 1; a row that ends up with no entries at
    all is returned as all zeros.
    """
    context_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(max(0, i - w), min(i + w + 1, n)):
            # Apply the dropout
            if np.random.rand() > dropout_prob:
                context_matrix[i, j] = np.random.uniform(0.7, 1)

    # Adding noise and normalizing
    sparse_matrix = 0.5 * scipy.sparse.random(n, n, density=1 / n)
    sparse_matrix.setdiag(0)  # the noise must not touch the diagonal
    sigma_2 = context_matrix + sparse_matrix.todense()
    row_sums = np.array(sigma_2.sum(axis=1)).flatten()
    # Dropout can wipe out a whole row; dividing it by its zero sum would
    # fill the row with NaN. Dividing by 1 instead leaves it all-zero.
    safe_sums = np.where(row_sums == 0, 1.0, row_sums)
    sigma_2 = sigma_2 / safe_sums[:, np.newaxis]

    return sigma_2

def verticalmatrix(w):  # how many other words that word will attend
    """Draw the ``w`` weights of a single 'vertical' attention column.

    Parameters
    ----------
    w : int
        Number of weights to draw.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w`` with values drawn uniformly from [0.7, 1).
    """
    # Draw the column directly: no w x w scratch matrix is needed, and
    # w == 0 yields an empty vector instead of an IndexError.
    return np.random.uniform(size=w, low=0.7, high=1)




In [None]:
def generate_positions(n, num_positions, w):
    """Pick sorted random 1-based positions that are at least ``w`` apart.

    Positions are drawn from ``1 .. n - w + 1`` and any two of them differ
    by at least ``w``. They are sampled directly rather than by rejection,
    so the call always terminates.

    Parameters
    ----------
    n : int
        Size of the range the positions refer to.
    num_positions : int
        How many positions to return.
    w : int
        Minimum distance between two positions; also shrinks the upper
        bound to ``n - w + 1``.

    Returns
    -------
    numpy.ndarray
        The positions in ascending order.

    Raises
    ------
    ValueError
        If ``num_positions`` positions at distance ``>= w`` do not fit
        into ``1 .. n - w + 1``.
    """
    if num_positions <= 0:
        return np.sort([])

    max_position = n - w + 1
    # Extra spacing forced between neighbours. A non-positive w imposes no
    # constraint at all, so repeated positions remain possible (gap == -1).
    gap = max(w, 0) - 1

    # Removing the forced gap between neighbours maps every valid
    # configuration to a strictly increasing sequence in 1..reduced_range.
    reduced_range = max_position - (num_positions - 1) * gap
    if reduced_range < num_positions:
        raise ValueError(
            f"cannot place {num_positions} positions at distance >= {w} "
            f"in the range 1..{max_position}"
        )

    compact = np.sort(
        np.random.choice(reduced_range, size=num_positions, replace=False)
    ) + 1
    # Re-insert the forced gaps to obtain the actual positions.
    return compact + gap * np.arange(num_positions)


def raretoken(n,num_elements,w):
    """Sample a row-normalised 'rare token' ``n x n`` matrix.

    Starts from a diagonal matrix with entries drawn uniformly from
    [0.7, 1). For each 1-based position ``pos`` returned by
    ``generate_positions(n, num_elements, w)``, the ``w - 1`` rows below
    row ``pos - 1`` have their diagonal value moved into column
    ``pos - 1``. Sparse noise (density ``1/n``, scaled by 0.5, diagonal
    cleared) is then added and each row is divided by its sum.

    Parameters
    ----------
    n : int
        Side length of the square matrix.
    num_elements : int
        Number of positions requested from ``generate_positions``.
    w : int
        Number of consecutive rows, starting at the position's own row,
        that point at the position's column.

    Returns
    -------
    Dense ``n x n`` matrix (the result of arithmetic on ``todense()``).
    """
    base = np.diag(np.random.uniform(size=n, low=0.7, high=1))

    for pos in generate_positions(n, num_elements, w):
        anchor = pos - 1  # 0-based row/column of the attended token
        # The anchor row keeps its diagonal entry; the following w - 1
        # rows redirect their diagonal weight to the anchor column.
        for row in range(anchor + 1, anchor + w):
            base[row, anchor] = base[row, row]
            base[row, row] = 0

    # Add the sparse noise, leaving the diagonal alone, then normalise rows.
    noise = 0.5 * scipy.sparse.random(n, n, density=1 / n)
    noise.setdiag(0)
    sigma_3 = base + noise.todense()
    totals = np.array(sigma_3.sum(axis=1)).flatten()
    return sigma_3 / totals[:, np.newaxis]



In [None]:
# Sample a 10x10 rare-token matrix with 2 positions and window 3.
A=raretoken(10,2,3)

In [None]:
# Show the sampled matrix.
print(A)

[[0.55 0.   0.   0.31 0.   0.   0.   0.   0.13 0.  ]
 [0.7  0.   0.   0.   0.3  0.   0.   0.   0.   0.  ]
 [0.65 0.   0.   0.   0.   0.35 0.   0.   0.   0.  ]
 [0.   0.   0.   0.65 0.   0.   0.   0.3  0.06 0.  ]
 [0.   0.28 0.   0.   0.72 0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.71 0.29 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   1.   0.  ]
 [0.23 0.   0.   0.   0.   0.   0.   0.15 0.   0.61]]


Now that we have implemented the matrices that approximate the attention scores according to the new formula, we compute their distance from the original matrix obtained in the attention experiments. Recall that, according to our definition,

$$  
|H-X|:= \sum_{i,j} |h_{i,j} -x_{i,j} | \quad h_{i,j}\in H, \quad x_{i,j}\in X.
$$
It is worth noting that there are many hyperparameters that may be learned: the bandwidth $w$, the dropout probability $p$, the coefficient multiplying the sparse matrix (set to $0.5$ in our case), the parameter `num_pos` (the number of rare-token positions), and the various coefficients used in the random generation. These parameters can affect the overall performance, and several experiments must be carried out to determine them. Nevertheless, we presume that many of them may depend on the language in use, as may be the case for the bandwidth $w$; an unpredictable random component is also clearly inevitable.

In [None]:
# Print NumPy floats with 2 digits of precision.
np.set_printoptions(precision=2)

In [None]:
def computedistance(A,num_pos=2,w=3,n_attempts=10):
    """Search the three random matrix families for the best match to ``A``.

    ``n_attempts`` matrices are sampled from each family in turn
    (``creatediagonal``, then ``createsintact``, then ``raretoken``) and
    the one minimising the entry-wise L1 distance ``sum(|A - X|)`` is kept.

    Parameters
    ----------
    A : array-like
        Reference matrix; ``len(A)`` is used as the side length of the
        sampled matrices.
    num_pos : int, optional
        Number of positions passed to ``raretoken`` (default 2).
    w : int, optional
        Window size passed to ``createsintact`` and ``raretoken``
        (default 3).
    n_attempts : int, optional
        Number of samples drawn from each family (default 10).

    Returns
    -------
    tuple
        ``(best_matrix, best_distance)``. If no candidate ever improves on
        the initial infinite distance, this is ``(None, inf)``.
    """
    size = len(A)

    # One sampler per family, tried in this fixed order.
    samplers = (
        lambda: creatediagonal(size),
        lambda: createsintact(size, w),
        lambda: raretoken(size, num_pos, w),
    )

    best_matrix, best_distance = None, float('inf')
    for draw in samplers:
        for _ in range(n_attempts):
            candidate = draw()
            distance = np.sum(np.abs(A - candidate))
            # Strict '<': ties keep the earlier candidate, and a NaN
            # distance never wins because the comparison is False.
            if distance < best_distance:
                best_matrix, best_distance = candidate, distance

    return best_matrix, best_distance

# Example: fit the 10x10 identity matrix with the default settings.
A = np.eye(10)  # Example matrix
best_matrix, best_distance = computedistance(A)
print("Best matrix:")
print(best_matrix)
print("\n Minimum Distance:", best_distance)


Matrice migliore:
[[1.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.95 0.   0.   0.   0.05 0.   0.   0.   0.  ]
 [0.   0.   0.97 0.   0.   0.   0.   0.03 0.   0.  ]
 [0.   0.   0.   1.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.83 0.06 0.   0.11 0.  ]
 [0.   0.21 0.   0.   0.   0.   0.79 0.   0.   0.  ]
 [0.   0.13 0.2  0.   0.   0.   0.   0.68 0.   0.  ]
 [0.   0.   0.   0.15 0.   0.   0.   0.   0.85 0.  ]
 [0.   0.02 0.   0.   0.   0.   0.   0.28 0.   0.7 ]]

 Distanza minima: 2.443403098209869


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the pickled list of attention matrices from Drive.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('/content/drive/MyDrive/Transformer/matrix_list.pkl', 'rb') as f:
    loaded_matrix_list = pickle.load(f)

In [None]:
# Keep only the top-left 16x16 block of every matrix to cut the padding
# (presumably 16 is the sentence length -- TODO confirm).
for idx, padded in enumerate(loaded_matrix_list):
    loaded_matrix_list[idx] = padded[:16, :16]

In [None]:
# Fit every loaded matrix (2 positions, window 4, 50 samples per family)
# and print the smallest distance found for it.
for i, matrix in enumerate(loaded_matrix_list):
    best_matrix, best_distance = computedistance(matrix,num_pos=2,w=4,n_attempts=50)
    print("\nDistanza minima:", best_distance)

  sigma_2 = sigma_2 / row_sums[:, np.newaxis]



Distanza minima: 21.069530669032968

Distanza minima: 19.233454623228344

Distanza minima: 22.352018368252036

Distanza minima: 22.468749502906167

Distanza minima: 20.48712989798162

Distanza minima: 20.871859012299915

Distanza minima: 20.61355644223331

Distanza minima: 21.525276899413527
