# Computing Karcher Means of LoRA Models

In [11]:
from safetensors import safe_open
from scipy.linalg import svd
import numpy as np

# Load the LoRA tensors from .safetensors files
with safe_open("fashigirl-v5.5-lora-naivae-64dim.safetensors", framework="pt", device="cpu") as f:
    lora1_tensors = {}
    for k in f.keys():
        lora1_tensors[k] = f.get_tensor(k)

# Print the available keys
print("Keys in lora1:")
for key in lora1_tensors.keys():
    print(key)

Keys in lora1:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_v_

In [2]:
with safe_open("lora2.safetensors", framework="pt", device="cpu") as f:
    lora2_tensors = {}
    for k in f.keys():
        lora2_tensors[k] = f.get_tensor(k)

# Print the available keys
print("Keys in lora2:")
for key in lora2_tensors.keys():
    print(key)

Keys in lora2:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_v_

In [3]:
import numpy as np
from numpy.linalg import matrix_rank, norm, qr
from UQpy.utilities.GrassmannPoint import GrassmannPoint
from UQpy.dimension_reduction.grassmann_manifold import GrassmannOperations
from UQpy.utilities.distances.grassmannian_distances.AsimovDistance import AsimovDistance
from UQpy.utilities.distances.grassmannian_distances.BinetCauchyDistance import BinetCauchyDistance
from UQpy.utilities.distances.grassmannian_distances.FubiniStudyDistance import FubiniStudyDistance
from UQpy.utilities.distances.grassmannian_distances.GeodesicDistance import GeodesicDistance
from UQpy.utilities.distances.grassmannian_distances.MartinDistance import MartinDistance
from UQpy.utilities.distances.grassmannian_distances.ProcrustesDistance import ProcrustesDistance
from UQpy.utilities.distances.grassmannian_distances.ProjectionDistance import ProjectionDistance
from UQpy.utilities.distances.grassmannian_distances.SpectralDistance import SpectralDistance


def compute_karcher_mean(X1, X2, distance_metric):
    # Check if the matrices are full rank
    if matrix_rank(X1) != min(X1.shape) or matrix_rank(X2) != min(X2.shape):
        raise ValueError('Input matrices are not full rank.')

    # Perform the Gram-Schmidt process to get orthonormal matrices
    Q1, _ = qr(X1)
    Q2, _ = qr(X2)

    # Define the points on the Grassmann manifold
    X1_grassmann = GrassmannPoint(Q1)
    X2_grassmann = GrassmannPoint(Q2)

    # List of points
    grassmann_points = [X1_grassmann, X2_grassmann]

    # Define the distance measure based on user input
    if distance_metric == 'AsimovDistance':
        distance = AsimovDistance()
    elif distance_metric == 'BinetCauchyDistance':
        distance = BinetCauchyDistance()
    elif distance_metric == 'FubiniStudyDistance':
        distance = FubiniStudyDistance()
    elif distance_metric == 'GeodesicDistance':
        distance = GeodesicDistance()
    elif distance_metric == 'MartinDistance':
        distance = MartinDistance()
    elif distance_metric == 'ProcrustesDistance':
        distance = ProcrustesDistance()
    elif distance_metric == 'ProjectionDistance':
        distance = ProjectionDistance()
    elif distance_metric == 'SpectralDistance':
        distance = SpectralDistance()
    else:
        raise ValueError('Invalid distance metric.')

    # Compute Karcher mean using StochasticGradientDescent
    karcher_mean = GrassmannOperations.karcher_mean(
        grassmann_points=grassmann_points,
        optimization_method='StochasticGradientDescent',
        distance=distance,
        acceleration=False,
        tolerance=0.001,
        maximum_iterations=1000
    )

    return karcher_mean.data

In [9]:
print(lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'])
print(lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy())
print(lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].shape)
print(lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().shape)


tensor([[ 2.1881e-02, -2.8610e-02,  2.4281e-03,  ...,  9.0122e-04,
         -1.1467e-02,  1.7441e-02],
        [-1.1864e-02, -3.4046e-04,  2.6031e-02,  ...,  3.2623e-02,
         -5.7182e-03, -1.9608e-02],
        [ 1.6434e-02, -8.0490e-04,  2.4612e-02,  ..., -9.8343e-03,
          6.9797e-05, -1.8158e-02],
        ...,
        [-1.9241e-02,  3.4424e-02,  1.8143e-02,  ...,  1.6022e-02,
         -1.8692e-03, -2.4033e-02],
        [-2.4124e-02, -3.4821e-02, -1.5358e-02,  ...,  3.0579e-02,
          3.6102e-02, -4.0436e-03],
        [ 2.5772e-02, -1.8509e-02, -1.0094e-02,  ..., -2.5131e-02,
          1.3260e-02, -1.1574e-02]], dtype=torch.float16)
[[-0.0275    0.00898  -0.02756  ...  0.02109   0.02454   0.01735 ]
 [-0.001681 -0.03345   0.014114 ... -0.01412   0.002762  0.0167  ]
 [-0.01953   0.02481   0.005184 ...  0.0335    0.008255  0.007458]
 ...
 [ 0.03305   0.0233    0.0181   ...  0.02495   0.0361   -0.01753 ]
 [-0.01171  -0.01368  -0.0298   ...  0.032     0.009254  0.03534 ]
 [ 0.01

In [12]:
X1 = lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().astype(np.float64)
X2 = lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().astype(np.float64)
karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')
print(X1.shape)
print(X2.shape)
print(karcher_mean.shape)
print(karcher_mean)

(64, 768)
(64, 768)
(64, 64)
[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [17]:
def compute_karcher_mean_of_models(model1_path, model2_path, key):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {}
        for k in f.keys():
            model1_tensors[k] = f.get_tensor(k)

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {}
        for k in f.keys():
            model2_tensors[k] = f.get_tensor(k)

    # Get the update weight matrices
    X1 = model1_tensors[key].numpy().astype(np.float64)
    X2 = model2_tensors[key].numpy().astype(np.float64)

    # Compute the Karcher mean
    try:
        karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
        return karcher_mean
    except:
        return "Failed to compute Karcher mean"

# Specify the paths to your models and the key
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'
key = 'lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'  # adjust as per your requirements

# Compute the Karcher mean
karcher_mean = compute_karcher_mean_of_models(model1_path, model2_path, key)
print(karcher_mean)


[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [20]:
import torch
from safetensors import safe_open
from scipy.linalg import svd
import numpy as np

def compute_karcher_mean_of_models(model1_path, model2_path, key):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {k: f.get_tensor(k) for k in f.keys()}

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Get the update weight matrices
    X1 = model1_tensors[key].numpy().astype(np.float64)
    X2 = model2_tensors[key].numpy().astype(np.float64)

    # Compute the Karcher mean
    try:
        karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
        return karcher_mean
    except:
        return "Failed to compute Karcher mean"

# Specify the paths to your models and the key
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'
key = 'lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'  # adjust as per your requirements

# Compute the Karcher mean
karcher_mean = compute_karcher_mean_of_models(model1_path, model2_path, key)
print(karcher_mean)



[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [22]:
def compute_karcher_mean_for_all_keys(model1_path, model2_path):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {k: f.get_tensor(k) for k in f.keys()}

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Compute the Karcher mean for each key
    karcher_means = {}
    for key in model1_tensors.keys():
        X1 = model1_tensors[key].numpy().astype(np.float64)
        X2 = model2_tensors[key].numpy().astype(np.float64)
        if key.endswith(".alpha"):
            # Compute the usual average for keys with '.alpha' suffixes
            karcher_means[key] = (X1 + X2) / 2
            print(f"Arithmetic Averaged key: {key}")
        else:
            try:
                karcher_means[key] = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
                print(f"Karcher mean for key {key} computed successfully.")
            except:
                print(f"Arithmetic Averaged key: {key}")
                karcher_means[key] = (X1 + X2) / 2

    return karcher_means

# Specify the paths to your models
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'

# Compute the Karcher mean for all keys
karcher_means = compute_karcher_mean_for_all_keys(model1_path, model2_path)


Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_d