In [15]:
import torch
from safetensors import safe_open
from scipy.linalg import svd
import numpy as np

# Load the LoRA tensors from .safetensors files
with safe_open("lora1.safetensors", framework="pt", device="cpu") as f:
    lora1_tensors = {}
    for k in f.keys():
        lora1_tensors[k] = f.get_tensor(k)

# Print the available keys
print("Keys in lora1_tensors:")
for key in lora1_tensors.keys():
    print(key)

Keys in lora1_tensors:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self

In [17]:
with safe_open("lora2.safetensors", framework="pt", device="cpu") as f:
    lora2_tensors = {}
    for k in f.keys():
        lora2_tensors[k] = f.get_tensor(k)

print("\nKeys in lora2_tensors:")
for key in lora2_tensors.keys():
    print(key)


Keys in lora2_tensors:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_sel

Now, in the paper on LoRAs, the following concept is introduced: The subspace similarity measure is a way of measuring the similarity between the subspaces spanned by the top singular vectors of two low-rank adaptation matrices, $A_{r=8}$ and $A_{r=64}$, from the same pre-trained model. Here's how it's done:

First, you perform a singular value decomposition (SVD) on each of these matrices to obtain their right-singular unitary matrices, denoted $U_{A_{r=8}}$ and $U_{A_{r=64}}$.

The goal is then to quantify how much of the subspace spanned by the top $i$ singular vectors in $U_{A_{r=8}}$ is contained in the subspace spanned by the top $j$ singular vectors of $U_{A_{r=64}}$.

This is measured using a normalized subspace similarity based on the Grassmann distance. The formula for this measure, denoted $\phi(A_{r=8}, A_{r=64}, i, j)$, is given as follows:

$$
\phi(A_{r=8}, A_{r=64}, i, j) = \frac{||U_{A_{r=8}}^{(i)} {U_{A_{r=64}}^{(j)}}^T||_F^2}{\min(i, j)}
$$

where $U_{A_{r}}^{(i)}$ represents the columns of $U_{A_{r}}$ corresponding to the top $i$ singular vectors, and $||\cdot||_F$ denotes the Frobenius norm.

The measure $\phi(·)$ ranges from 0 to 1, where 1 represents a complete overlap of the subspaces (i.e., they are the same), and 0 represents a complete separation (i.e., they are orthogonal). This is a normalized measure because it's divided by $\min(i, j)$, which is the maximum possible square of the Frobenius norm of the product matrix $U_{A_{r=8}}^{(i)} {U_{A_{r=64}}^{(j)}}^T$.

This process is performed for all pairs $(i, j)$ where $1 \leq i \leq 8$ and $1 \leq j \leq 64$. The results give an understanding of how much the learned subspaces for different ranks overlap with each other.

This can also be performed on two layers $\Delta W_1 = B_1A_1$ and $\Delta W_2 = B_2A_2$  in two different LoRAs. In particular, suppose we choose a layer `n` of each LoRA and run the subspace similarity measure comparison on $U_{\Delta W_1}^{(i)} {U_{\Delta W_2}^{(j)}}^T$. Then this will tell us how much those to LoRAs overlap with one another. 

This could be useful in determining which LoRAs to merge. If we run this analysis on all of the weight matrices of two different LoRAs, then we can determine how much layer `n` of `lora1` overlaps with layer `n` of `lora2`. If the overlap is small, then the two weight martices $\Delta W_1^{(n)} = B_1^{(n)}A_1^{(n)}$ and $\Delta W_2^{(n)} = B_2^{(n)}A_2^{(n)}$ may express very different things because the subspaces that they span do not overlap very much. So, to be more explicit, we compute

$$
\phi(\Delta W_1^{(n)}, \Delta W_2^{(n)}, i, j) = \frac{||U_{\Delta W_1^{(n)}}^{(i)} {U_{\Delta W_2^{(n)}}^{(j)}}^T||_F^2}{\min(i, j)}
$$

for a weight matrix $\Delta W_1^{(n)}$ from the first LoRA, and the corresponding $\Delta W_2^{(n)}$ from the second LoRA. This could indicate that merging the two LoRAs will create a more general model, able to create a wider range of diverse styles. This might also help in explaining why two LoRAs create something very muddy or undesirable when merges. Obviously, this is all conjecture based on a mathematical analysis that needs to be tested, and it does not provide a precise theshold for the overlap. What upper or lower bound might we use for this subspace similarity measure $\phi$? Could this hypthesis be wrong, or inverted? That is, is it possible that in some cases we actually want *high* overlap between models so that we merge very similar concepts?

In [18]:
import torch

# The A matrices
A1 = lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].float()
A2 = lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].float()

# The B matrices
B1 = lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight'].float()
B2 = lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight'].float()

# Compute the update matrices
Delta_W1 = torch.matmul(B1, A1)
Delta_W2 = torch.matmul(B2, A2)

# Compute the SVD of the update matrices
U1, _, _ = torch.svd(Delta_W1)
U2, _, _ = torch.svd(Delta_W2)

# Define the subspace similarity measure
def phi(U1, U2, i, j):
    U1_i = U1[:, :i]  # First i columns of U1
    U2_j = U2[:, :j]  # First j columns of U2
    
    product = torch.matmul(U1_i.t(), U2_j)  # Matrix multiplication
    norm = torch.norm(product)  # Frobenius norm
    
    return norm ** 2 / min(i, j)

# Calculate the subspace similarity measure
i = U1.size(1)  # Number of columns in U1
j = U2.size(1)  # Number of columns in U2
result = phi(U1, U2, i, j)  # Replace i and j with the desired values

print(f'The subspace similarity measure is: {result}')

The subspace similarity measure is: 0.26528483629226685


In [47]:
import torch

# Gather all keys and sort them
all_keys = sorted(list(lora1_tensors.keys()))

# Filter keys for lora_down and lora_up pairs
lora_down_keys = [key for key in all_keys if 'lora_down' in key]
lora_up_keys = [key for key in all_keys if 'lora_up' in key]

# Ensure we have matching pairs of keys
assert len(lora_down_keys) == len(lora_up_keys), "Mismatch in number of 'lora_down' and 'lora_up' keys"

# Define the subspace similarity measure
def phi(U1, U2, i, j):
    U1_i = U1[:, :i]  # First i columns of U1
    U2_j = U2[:, :j]  # First j columns of U2
    
    product = torch.matmul(U1_i.t(), U2_j)  # Matrix multiplication
    norm = torch.norm(product)  # Frobenius norm
    
    return norm ** 2 / min(i, j)

# Iterate over all layers
for layer in range(len(lora_down_keys)):
    try:
        # Extract the corresponding A and B matrices
        A1 = lora1_tensors[lora_down_keys[layer]].float()
        B1 = lora1_tensors[lora_up_keys[layer]].float()

        A2 = lora2_tensors[lora_down_keys[layer]].float()
        B2 = lora2_tensors[lora_up_keys[layer]].float()

        # Print the shapes of A1 and B1 matrices for troubleshooting
        print(f"A1 shape: {A1.shape}")
        print(f"B1 shape: {B1.shape}")

        print(f"A2 shape: {A2.shape}")
        print(f"B2 shape: {B2.shape}")

        # Compute the update matrices
        Delta_W1 = torch.matmul(B1, A1)
        print(f"ΔW1 shape: {Delta_W1.shape}")
        Delta_W2 = torch.matmul(B2, A2)
        print(f"ΔW2 shape: {Delta_W2.shape}")


        # Compute the SVD of the update matrices
        U1, _, _ = torch.svd(Delta_W1)
        U2, _, _ = torch.svd(Delta_W2)

        # Calculate the subspace similarity measure
        i = U1.size(1)  # Number of columns in U1
        j = U2.size(1)  # Number of columns in U2
        result = phi(U1, U2, i, j)  # Replace i and j with the desired values

        print(f"The subspace similarity measure for layer {layer} is: {result}")

    except RuntimeError as e:
        # Print the layer number and the error message
        print(f"Error occurred at layer {layer}: {e}")

A1 shape: torch.Size([32, 768])
B1 shape: torch.Size([3072, 32])
A2 shape: torch.Size([64, 768])
B2 shape: torch.Size([3072, 64])
ΔW1 shape: torch.Size([3072, 768])
ΔW2 shape: torch.Size([3072, 768])
The subspace similarity measure for layer 0 is: 0.26528483629226685
A1 shape: torch.Size([32, 3072])
B1 shape: torch.Size([768, 32])
A2 shape: torch.Size([64, 3072])
B2 shape: torch.Size([768, 64])
ΔW1 shape: torch.Size([768, 3072])
ΔW2 shape: torch.Size([768, 3072])
The subspace similarity measure for layer 1 is: 0.9999878406524658
A1 shape: torch.Size([32, 768])
B1 shape: torch.Size([768, 32])
A2 shape: torch.Size([64, 768])
B2 shape: torch.Size([768, 64])
ΔW1 shape: torch.Size([768, 768])
ΔW2 shape: torch.Size([768, 768])
The subspace similarity measure for layer 2 is: 0.9999885559082031
A1 shape: torch.Size([32, 768])
B1 shape: torch.Size([768, 32])
A2 shape: torch.Size([64, 768])
B2 shape: torch.Size([768, 64])
ΔW1 shape: torch.Size([768, 768])
ΔW2 shape: torch.Size([768, 768])
The su

## Grassmannian (Fréchet/Karcher) Mean For Square Update Matrices in two LoRA Models

The following is supposed to implement the Fréchet (a.k.a. Karcher) mean of two points on the Grassmannian. If we know that the matrices are square, as most of the weight matrices in Stable Diffusion are (and therefore so are the update matrices of LoRA models), then we can use it to compute the Fréchet mean of any of the square matrices for two LoRAs. In particular, we can compute the Fréchet mean of two update weight matrices $\Delta W_1^{(n)}$ and $\Delta W_2^{(n)}$ for some fixed layer `n` of the LoRA models. 

In [20]:
import numpy as np
from scipy.linalg import svd

def grassmann_mean(points, max_iters=100, tol=1e-6):
    """
    Computes the Fréchet mean of a set of points on the Grassmann manifold.

    Parameters:
    - points: a list of numpy arrays, each representing a point on the Grassmannian.
    - max_iters: the maximum number of iterations for the algorithm.
    - tol: the tolerance for convergence.

    Returns:
    - The Fréchet mean as a numpy array.
    """
    # Initialize the mean to the first point
    mean = np.linalg.qr(points[0])[0]
    for _ in range(max_iters):
        # Compute the tangent space at the current mean
        tangent_space = sum([log_map(mean, np.linalg.qr(p)[0]) for p in points]) / len(points)
        # Update the mean
        new_mean = exp_map(mean, tangent_space)
        # Check for convergence
        if np.linalg.norm(new_mean - mean) < tol:
            return new_mean
        mean = new_mean
    return mean

def log_map(p, q):
    """
    Computes the logarithm map at p of q, i.e., the tangent vector at p that points towards q.

    Parameters:
    - p: a numpy array representing a point on the Grassmannian.
    - q: a numpy array representing a point on the Grassmannian.

    Returns:
    - The tangent vector at p that points towards q.
    """
    # Compute the SVD of p^T q
    u, s, vt = svd(np.dot(p.T, q))
    # Clip the singular values to the range [-1, 1]
    s = np.clip(s, -1, 1)
    # Compute the logarithm map
    return np.dot(p, np.dot(u, np.dot(np.diag(np.arccos(s)), vt)))


def exp_map(p, x):
    """
    Computes the exponential map at p of x, i.e., the point on the Grassmannian reached by following the geodesic in the direction of x from p.

    Parameters:
    - p: a numpy array representing a point on the Grassmannian.
    - x: a numpy array representing a tangent vector at p.

    Returns:
    - The point on the Grassmannian reached by following the geodesic in the direction of x from p.
    """
    # Compute the SVD of x
    u, s, vt = svd(x)
    # Compute the exponential map
    return np.dot(p, np.dot(u, np.dot(np.diag(np.cos(s)), vt))) + np.dot(u, np.dot(np.diag(np.sin(s)), vt))

In [21]:
import numpy as np

def generate_random_orthogonal_matrix(dim):
    """
    Generates a random orthogonal matrix of the given dimension.

    Parameters:
    - dim: the dimension of the matrix.

    Returns:
    - A numpy array representing an orthogonal matrix of the given dimension.
    """
    # Generate a random matrix
    mat = np.random.randn(dim, dim)
    # Compute the QR decomposition of the matrix
    q, _ = np.linalg.qr(mat)
    return q

# Set a random seed for reproducibility
np.random.seed(0)

# Generate a list of points on the Grassmannian
points = [generate_random_orthogonal_matrix(3) for _ in range(5)]

# Compute the Fréchet mean of the points
mean = grassmann_mean(points)

print(mean)

[[ 0.41942679  0.55888978 -0.84660875]
 [ 0.2804597  -0.98964856 -0.45203556]
 [-0.9643001   0.02577428 -0.5265871 ]]


---

## Another Method Using UQpy

---

In [38]:
import numpy as np

# Generate two random 3x3 matrices
matrix1 = np.random.rand(5, 3)
matrix2 = np.random.rand(5, 3)

# Perform QR decomposition on the matrices to get orthonormal matrices
Q1, _ = np.linalg.qr(matrix1)
Q2, _ = np.linalg.qr(matrix2)

# Ensure the matrices are of type float
Q1 = Q1.astype(float)
Q2 = Q2.astype(float)

print("Orthonormal matrix 1:\n", Q1)
print("Orthonormal matrix 2:\n", Q2)


Orthonormal matrix 1:
 [[-0.54467462 -0.25164423  0.4287992 ]
 [-0.44596758 -0.43639108 -0.66882015]
 [-0.00357031  0.71634447 -0.4014099 ]
 [-0.55902041  0.44829261  0.34847733]
 [-0.43809348  0.17922708 -0.29367412]]
Orthonormal matrix 2:
 [[-0.14734214  0.83070799 -0.11600762]
 [-0.55904048 -0.0441168   0.80508795]
 [-0.53750149 -0.34500269 -0.29623097]
 [-0.38392218  0.40969309 -0.10137276]
 [-0.47901978 -0.14526752 -0.49025136]]


In [40]:
import numpy as np

def is_orthonormal(matrix):
    return np.allclose(matrix.T @ matrix, np.eye(matrix.shape[1]))

print(is_orthonormal(orthonormal_matrix1))
print(is_orthonormal(orthonormal_matrix2))



True
True


## Gram-Schidt from Scratch
Just in case we need it, we can use Gram-Schmidt to produce orthonormal matrices. 

In [34]:
import numpy as np

def gram_schmidt(A):
    Q = np.zeros_like(A)
    for i in range(A.shape[1]):
        # Start with the i'th column of A
        q = A[:, i]
        # Subtract projections onto the previous vectors
        for j in range(i):
            q -= np.dot(Q[:, j], A[:, i]) * Q[:, j]
        # Normalize
        Q[:, i] = q / np.linalg.norm(q)
    return Q


In [35]:
orthonormal_matrix1 = gram_schmidt(matrix1)
orthonormal_matrix2 = gram_schmidt(matrix2)

## Karcher Mean

In [44]:
# Import necessary classes and methods
from UQpy.utilities.GrassmannPoint import GrassmannPoint
from UQpy.dimension_reduction.grassmann_manifold import GrassmannOperations
from UQpy.utilities.distances.grassmannian_distances.AsimovDistance import AsimovDistance
from UQpy.utilities.distances.grassmannian_distances.BinetCauchyDistance import BinetCauchyDistance
from UQpy.utilities.distances.grassmannian_distances.FubiniStudyDistance import FubiniStudyDistance
from UQpy.utilities.distances.grassmannian_distances.GeodesicDistance import GeodesicDistance
from UQpy.utilities.distances.grassmannian_distances.MartinDistance import MartinDistance
from UQpy.utilities.distances.grassmannian_distances.ProcrustesDistance import ProcrustesDistance
from UQpy.utilities.distances.grassmannian_distances.ProjectionDistance import ProjectionDistance
from UQpy.utilities.distances.grassmannian_distances.SpectralDistance import SpectralDistance


# Define the points on the Grassmann manifold
X1 = GrassmannPoint(orthonormal_matrix1)
X2 = GrassmannPoint(orthonormal_matrix2)

# List of points
grassmann_points = [X1, X2]

# Define the distance measure
# distance = AsimovDistance()
# distance = BinetCauchyDistance()
# distance = FubiniStudyDistance()
distance = GeodesicDistance()
# distance = MartinDistance()
# distance = ProcrustesDistance()
# distance = ProjectionDistance()
# distance = SpectralDistance()

# Compute Karcher mean using StochasticGradientDescent
karcher_mean = GrassmannOperations.karcher_mean(
    grassmann_points=grassmann_points, 
    optimization_method='StochasticGradientDescent', 
    distance=distance, 
    acceleration=False, 
    tolerance=0.001, 
    maximum_iterations=1000
)

print("The Karcher mean of the points is:", karcher_mean)
print(karcher_mean.data)


The Karcher mean of the points is: <UQpy.utilities.GrassmannPoint.GrassmannPoint object at 0x7fb50a543070>
[[ 0.54457918  0.81385501  0.20266606]
 [ 0.41149671 -0.46982705  0.78098207]
 [ 0.73082417 -0.34191016 -0.59075669]]
