# Computing Karcher Means of LoRA Models

In [11]:
from safetensors import safe_open
from scipy.linalg import svd
import numpy as np

# Load the LoRA tensors from .safetensors files
with safe_open("fashigirl-v5.5-lora-naivae-64dim.safetensors", framework="pt", device="cpu") as f:
    lora1_tensors = {}
    for k in f.keys():
        lora1_tensors[k] = f.get_tensor(k)

# Print the available keys
print("Keys in lora1:")
for key in lora1_tensors.keys():
    print(key)

Keys in lora1:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_v_

In [2]:
with safe_open("lora2.safetensors", framework="pt", device="cpu") as f:
    lora2_tensors = {}
    for k in f.keys():
        lora2_tensors[k] = f.get_tensor(k)

# Print the available keys
print("Keys in lora2:")
for key in lora2_tensors.keys():
    print(key)

Keys in lora2:
lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight
lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.alpha
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_down.weight
lora_te_text_model_encoder_layers_0_self_attn_q_proj.lora_up.weight
lora_te_text_model_encoder_layers_0_self_attn_v_

In [3]:
import numpy as np
from numpy.linalg import matrix_rank, norm, qr
from UQpy.utilities.GrassmannPoint import GrassmannPoint
from UQpy.dimension_reduction.grassmann_manifold import GrassmannOperations
from UQpy.utilities.distances.grassmannian_distances.AsimovDistance import AsimovDistance
from UQpy.utilities.distances.grassmannian_distances.BinetCauchyDistance import BinetCauchyDistance
from UQpy.utilities.distances.grassmannian_distances.FubiniStudyDistance import FubiniStudyDistance
from UQpy.utilities.distances.grassmannian_distances.GeodesicDistance import GeodesicDistance
from UQpy.utilities.distances.grassmannian_distances.MartinDistance import MartinDistance
from UQpy.utilities.distances.grassmannian_distances.ProcrustesDistance import ProcrustesDistance
from UQpy.utilities.distances.grassmannian_distances.ProjectionDistance import ProjectionDistance
from UQpy.utilities.distances.grassmannian_distances.SpectralDistance import SpectralDistance


def compute_karcher_mean(X1, X2, distance_metric):
    # Check if the matrices are full rank
    if matrix_rank(X1) != min(X1.shape) or matrix_rank(X2) != min(X2.shape):
        raise ValueError('Input matrices are not full rank.')

    # Perform the Gram-Schmidt process to get orthonormal matrices
    Q1, _ = qr(X1)
    Q2, _ = qr(X2)

    # Define the points on the Grassmann manifold
    X1_grassmann = GrassmannPoint(Q1)
    X2_grassmann = GrassmannPoint(Q2)

    # List of points
    grassmann_points = [X1_grassmann, X2_grassmann]

    # Define the distance measure based on user input
    if distance_metric == 'AsimovDistance':
        distance = AsimovDistance()
    elif distance_metric == 'BinetCauchyDistance':
        distance = BinetCauchyDistance()
    elif distance_metric == 'FubiniStudyDistance':
        distance = FubiniStudyDistance()
    elif distance_metric == 'GeodesicDistance':
        distance = GeodesicDistance()
    elif distance_metric == 'MartinDistance':
        distance = MartinDistance()
    elif distance_metric == 'ProcrustesDistance':
        distance = ProcrustesDistance()
    elif distance_metric == 'ProjectionDistance':
        distance = ProjectionDistance()
    elif distance_metric == 'SpectralDistance':
        distance = SpectralDistance()
    else:
        raise ValueError('Invalid distance metric.')

    # Compute Karcher mean using StochasticGradientDescent
    karcher_mean = GrassmannOperations.karcher_mean(
        grassmann_points=grassmann_points,
        optimization_method='StochasticGradientDescent',
        distance=distance,
        acceleration=False,
        tolerance=0.001,
        maximum_iterations=1000
    )

    return karcher_mean.data

In [9]:
print(lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'])
print(lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy())
print(lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].shape)
print(lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().shape)


tensor([[ 2.1881e-02, -2.8610e-02,  2.4281e-03,  ...,  9.0122e-04,
         -1.1467e-02,  1.7441e-02],
        [-1.1864e-02, -3.4046e-04,  2.6031e-02,  ...,  3.2623e-02,
         -5.7182e-03, -1.9608e-02],
        [ 1.6434e-02, -8.0490e-04,  2.4612e-02,  ..., -9.8343e-03,
          6.9797e-05, -1.8158e-02],
        ...,
        [-1.9241e-02,  3.4424e-02,  1.8143e-02,  ...,  1.6022e-02,
         -1.8692e-03, -2.4033e-02],
        [-2.4124e-02, -3.4821e-02, -1.5358e-02,  ...,  3.0579e-02,
          3.6102e-02, -4.0436e-03],
        [ 2.5772e-02, -1.8509e-02, -1.0094e-02,  ..., -2.5131e-02,
          1.3260e-02, -1.1574e-02]], dtype=torch.float16)
[[-0.0275    0.00898  -0.02756  ...  0.02109   0.02454   0.01735 ]
 [-0.001681 -0.03345   0.014114 ... -0.01412   0.002762  0.0167  ]
 [-0.01953   0.02481   0.005184 ...  0.0335    0.008255  0.007458]
 ...
 [ 0.03305   0.0233    0.0181   ...  0.02495   0.0361   -0.01753 ]
 [-0.01171  -0.01368  -0.0298   ...  0.032     0.009254  0.03534 ]
 [ 0.01

In [12]:
X1 = lora1_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().astype(np.float64)
X2 = lora2_tensors['lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'].numpy().astype(np.float64)
karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')
print(X1.shape)
print(X2.shape)
print(karcher_mean.shape)
print(karcher_mean)

(64, 768)
(64, 768)
(64, 64)
[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [17]:
def compute_karcher_mean_of_models(model1_path, model2_path, key):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {}
        for k in f.keys():
            model1_tensors[k] = f.get_tensor(k)

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {}
        for k in f.keys():
            model2_tensors[k] = f.get_tensor(k)

    # Get the update weight matrices
    X1 = model1_tensors[key].numpy().astype(np.float64)
    X2 = model2_tensors[key].numpy().astype(np.float64)

    # Compute the Karcher mean
    try:
        karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
        return karcher_mean
    except:
        return "Failed to compute Karcher mean"

# Specify the paths to your models and the key
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'
key = 'lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'  # adjust as per your requirements

# Compute the Karcher mean
karcher_mean = compute_karcher_mean_of_models(model1_path, model2_path, key)
print(karcher_mean)


[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [20]:
import torch
from safetensors import safe_open
from scipy.linalg import svd
import numpy as np

def compute_karcher_mean_of_models(model1_path, model2_path, key):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {k: f.get_tensor(k) for k in f.keys()}

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Get the update weight matrices
    X1 = model1_tensors[key].numpy().astype(np.float64)
    X2 = model2_tensors[key].numpy().astype(np.float64)

    # Compute the Karcher mean
    try:
        karcher_mean = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
        return karcher_mean
    except:
        return "Failed to compute Karcher mean"

# Specify the paths to your models and the key
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'
key = 'lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight'  # adjust as per your requirements

# Compute the Karcher mean
karcher_mean = compute_karcher_mean_of_models(model1_path, model2_path, key)
print(karcher_mean)



[[-0.16601927  0.05208861  0.12220976 ...  0.13459587 -0.25987938
   0.00226502]
 [-0.01015164 -0.19896531 -0.09372324 ...  0.05044137  0.12135019
   0.17535809]
 [-0.11792711  0.14660218 -0.04240003 ... -0.12304302 -0.10536765
   0.11791925]
 ...
 [ 0.19955479  0.14011982 -0.04992667 ... -0.10905791 -0.05989728
   0.07369256]
 [-0.0707102  -0.08189841  0.1446561  ... -0.10068816 -0.06625681
   0.2213837 ]
 [ 0.0619578  -0.01110882 -0.13774743 ...  0.00147635  0.04940113
   0.17591866]]


In [22]:
def compute_karcher_mean_for_all_keys(model1_path, model2_path):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {k: f.get_tensor(k) for k in f.keys()}

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Compute the Karcher mean for each key
    karcher_means = {}
    for key in model1_tensors.keys():
        X1 = model1_tensors[key].numpy().astype(np.float64)
        X2 = model2_tensors[key].numpy().astype(np.float64)
        if key.endswith(".alpha"):
            # Compute the usual average for keys with '.alpha' suffixes
            karcher_means[key] = (X1 + X2) / 2
            print(f"Arithmetic Averaged key: {key}")
        else:
            try:
                karcher_means[key] = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
                print(f"Karcher mean for key {key} computed successfully.")
            except:
                print(f"Arithmetic Averaged key: {key}")
                karcher_means[key] = (X1 + X2) / 2

    return karcher_means

# Specify the paths to your models
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'

# Compute the Karcher mean for all keys
karcher_means = compute_karcher_mean_for_all_keys(model1_path, model2_path)


Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight computed successfully.
Averaged key: lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_out_proj.lora_d

In [23]:
def compute_karcher_mean_for_all_keys(model1_path, model2_path):
    # Load the tensors from the models
    with safe_open(model1_path, framework="pt", device="cpu") as f:
        model1_tensors = {k: f.get_tensor(k) for k in f.keys()}

    with safe_open(model2_path, framework="pt", device="cpu") as f:
        model2_tensors = {k: f.get_tensor(k) for k in f.keys()}

    # Compute the Karcher mean for each key
    karcher_means = {}
    for key in model1_tensors.keys():
        X1 = model1_tensors[key].numpy().astype(np.float64)
        X2 = model2_tensors[key].numpy().astype(np.float64)
        if key.endswith(".alpha"):
            # Compute the usual average for keys with '.alpha' suffixes
            karcher_means[key] = (X1 + X2) / 2
            print(f"Arithmetic Averaged key: {key}")
        else:
            try:
                # If the tensor can be reshaped into a matrix, do so
                if len(X1.shape) > 2 and X1.shape[-2:] == (1, 1):
                    X1 = X1.reshape(X1.shape[:-2])
                    X2 = X2.reshape(X2.shape[:-2])
                karcher_means[key] = compute_karcher_mean(X1, X2, 'GeodesicDistance')  # select the appropriate distance metric
                print(f"Karcher mean for key {key} computed successfully.")
            except:
                print(f"Failed to compute mean: {key}")
                karcher_means[key] = (X1 + X2) / 2

    return karcher_means

# Specify the paths to your models
model1_path = 'fashigirl-v5.5-lora-naivae-64dim.safetensors'
model2_path = 'lora2.safetensors'

# Compute the Karcher mean for all keys
karcher_means = compute_karcher_mean_for_all_keys(model1_path, model2_path)


Arithmetic Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc1.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight computed successfully.
Arithmetic Averaged key: lora_te_text_model_encoder_layers_0_mlp_fc2.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_mlp_fc2.lora_up.weight computed successfully.
Arithmetic Averaged key: lora_te_text_model_encoder_layers_0_self_attn_k_proj.alpha
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight computed successfully.
Karcher mean for key lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_up.weight computed successfully.
Arithmetic Averaged key: lora_te_text_model_encoder_layers_0_self_attn_out_proj.alpha
Karcher mean for key lora_te_text_mode

This code is a script that computes the Karcher mean of the tensors from two different models, using the Grassmannian geometry and different distances on the Grassmann manifold. It employs the UQpy library, which is a Python package for uncertainty quantification (UQ). The script is divided into two main functions: `compute_karcher_mean` and `compute_karcher_mean_for_all_keys`.

Before explaining these two functions, it is important to introduce the concepts of Grassmannian geometry and Karcher mean. 

The Grassmannian $Gr(p, n)$ is the set of all $p$-dimensional subspaces in $\mathbb{R}^n$ where $0 \leq p \leq n$. In machine learning and deep learning, the Grassmannian is used to model the space of subspaces spanned by the columns of weight matrices, for example. A point $X$ in the Grassmannian $Gr(p, n)$ is a $n \times p$ matrix whose columns form an orthonormal basis for a $p$-dimensional subspace in $\mathbb{R}^n$. This is represented mathematically as follows:

$$
X = [x_1, x_2, \ldots, x_p], \quad x_i \in \mathbb{R}^n, \quad X^T X = I_p
$$

where $x_i$ are the columns of $X$ and $I_p$ is the $p \times p$ identity matrix. 

The Karcher mean (also known as the Fréchet mean) is a generalization of the arithmetic mean to a manifold setting. Given a set of points $X_1, X_2, \ldots, X_m$ in a Riemannian manifold $\mathcal{M}$, the Karcher mean $\mu$ is defined as the minimizer of the function:

$$
F(\mu) = \frac{1}{m} \sum_{i=1}^{m} d^2(X_i, \mu)
$$

where $d(X_i, \mu)$ is the geodesic distance between $X_i$ and $\mu$.

## Function `compute_karcher_mean`

This function takes two matrices, $X1$ and $X2$, and a distance metric as input and returns the Karcher mean of the two matrices with respect to the given distance metric.

The function first checks if the input matrices are full rank. This check is crucial because a point on the Grassmannian is a full rank matrix. Mathematically, for a matrix $X$ to be full rank, its rank must equal the minimum of its number of rows and columns:

$$
rank(X) = min(\# \text{ of rows}, \# \text{ of columns})
$$

If either matrix is not full rank, the function raises a ValueError. 

The function then applies the Gram-Schmidt process to the input matrices to generate orthonormal matrices $Q1$ and $Q2$. The Gram-Schmidt process is a method for orthonormalizing a set of vectors in an inner product space, most commonly the Euclidean space $\mathbb{R}^n$. The orthonormalization process can be expressed as follows:

$$
Q = [q_1, q_2, \ldots, q_p], \quad q_i = \frac{a_i - \sum_{j=1}^{i-1} (q_j^T a_i)q_j}{\| a_i - \sum_{j=1}^{i-1} (q_j^T a_i)q_j \|}, \quad i = 1, 2, \ldots, p
$$

where $a_i$ are the columns of the original matrix $A = [a_1, a_2, \ldots, a_p]$.

Next, the function creates Grassmann points $X1\_grassmann$ and $X2\_grassmann$ from the orthonormal matrices $Q1$ and $Q2$. These points represent the subspaces spanned by the columns of $Q1$ and $Q2$ in the Grassmannian.

The function then determines the distance measure to be used in the computation of the Karcher mean based on user input. This is a crucial step as different distance measures on the Grassmannian give rise to different geodesics and consequently different Karcher means. The user can choose from a variety of distance measures, including:

1. Asimov distance
2. Binet-Cauchy distance
3. Fubini-Study distance
4. Geodesic distance
5. Martin distance
6. Procrustes distance
7. Projection distance
8. Spectral distance

The selection of the appropriate distance measure depends on the specific application and data. For example, the geodesic distance is often used in applications involving data on the Grassmannian as it respects the Riemannian structure of the Grassmannian. The geodesic distance between two points $X$ and $Y$ in the Grassmannian $Gr(p, n)$ is given by:

$$
d_G(X, Y) = \left( \sum_{i=1}^p (\cos^{-1})^2 \sigma_i(X^T Y)\right)^{1/2}
$$

where $\sigma_i(X^T Y)$ are the singular values of the matrix $X^T Y$. Or equivalently, 

$$
d_G(X, Y) = || \Theta ||_2 = \left(\sum_l \theta_l^2 \right)^{1/2}
$$

The two formulas given represent the same measure -- the geodesic distance on the Grassmann manifold. Let's first describe the context and then connect the two formulas.

The first formula is based on the singular values of the matrix $X^T Y$. The singular values are computed from the singular value decomposition (SVD) of $X^T Y$, and they represent the cosines of the principal angles between the subspaces $X$ and $Y$. The geodesic distance is then defined as the square root of the sum of the squares of the arccosines of these singular values:

$$
d_G(X, Y) = \left( \sum_{i=1}^p (\cos^{-1})^2 \sigma_i(X^T Y)\right)^{1/2}
$$

The second formula is based on the principal angles themselves. The principal angles $\theta_l$ between the subspaces $X$ and $Y$ are defined as the angles between pairs of corresponding vectors in $X$ and $Y$. The geodesic distance is then defined as the 2-norm of the vector of principal angles, which is the square root of the sum of the squares of these angles:

$$
d_G(X, Y) = || \Theta ||_2 = \left(\sum_l \theta_l^2 \right)^{1/2}
$$

To connect the two formulas, note that the arccosine of the singular value is equal to the principal angle, i.e., $\cos^{-1} \sigma_i = \theta_i$. Therefore, when you square and sum over all principal angles or all arccosines of the singular values, you get the same quantity, which is the square of the geodesic distance.

This connection between the two formulas is a result of the intimate relationship between the SVD of $X^T Y$ and the principal angles between the subspaces $X$ and $Y$, which is a fundamental fact in the geometry of the Grassmann manifold.

Finally, the function computes the Karcher mean of the Grassmann points using the stochastic gradient descent method provided by the UQpy library. The Karcher mean minimizes the sum of squared distances to the points on the Grassmannian, which is a non-convex optimization problem. Stochastic gradient descent is a popular method for solving non-convex optimization problems due to its scalability to large data sets and its ability to escape local minima.

The tolerance and maximum number of iterations for the stochastic gradient descent are set to 0.001 and 1000, respectively. If the optimization does not converge within these limits, the function returns the current estimate of the Karcher mean.

## Function `compute_karcher_mean_for_all_keys`

This function computes the Karcher mean of the tensors from two different models for all keys. It first loads the tensors from the models and then computes the Karcher mean for each key.

If a key ends with ".alpha", the function computes the arithmetic average of the corresponding tensors. This is a simple elementwise operation represented by the formula:

$$
\mu = \frac{X1 + X2}{2}
$$

If a key does not end with ".alpha", the function attempts to compute the Karcher mean using the `compute_karcher_mean` function. If this computation fails (for example, if the corresponding tensors are not full rank), the function defaults to computing the arithmetic average.

The function `compute_karcher_mean_for_all_keys` provides a way to combine models in deep learning by computing the Karcher mean of their weight matrices. This is a novel and promising approach that leverages the geometry of the Grassmannian to extract common features from different models, potentially leading to improved performance and robustness.



Ensemble learning and knowledge distillation are two crucial concepts in machine learning and deep learning. Ensemble learning involves training multiple models and combining their predictions, typically resulting in better performance than any individual model. Knowledge distillation, on the other hand, is the process of transferring knowledge from a complex, usually high performing, model (the teacher) to a simpler model (the student), with the aim of creating a model that performs nearly as well but is less computationally expensive.

The computation of the Karcher mean of model weights on the Grassmannian as described above can be seen as a sophisticated form of ensemble learning. In traditional ensemble learning, the predictions of individual models are combined, often through simple averaging or voting. However, this approach does not take into account the geometric structure of the model parameters. By computing the Karcher mean on the Grassmannian, we are effectively creating an ensemble model that respects the underlying geometric structure of the parameter space.

Let $\mathcal{M}_1, \mathcal{M}_2, \ldots, \mathcal{M}_k$ be the models in the ensemble, where $\mathcal{M}_i$ represents the $i$-th model's weights represented as points on the Grassmannian. The ensemble model $\mathcal{M}_E$ can be represented as:

$$
\mathcal{M}_E = \text{KarcherMean}(\mathcal{M}_1, \mathcal{M}_2, \ldots, \mathcal{M}_k)
$$

where the Karcher mean is computed on the Grassmannian. The ensemble model's prediction $y$ for an input $x$ is then given by $\mathcal{M}_E(x)$.

This approach can be particularly useful when the models in the ensemble are complementary, i.e., they excel in different regions of the input space. By computing the Karcher mean, we can create a model that shares the strengths of all the models in the ensemble, leading to robust and reliable predictions.

In the context of knowledge distillation, the Karcher mean can be used to distill the knowledge from multiple teacher models into a single student model. Suppose we have teacher models $\mathcal{T}_1, \mathcal{T}_2, \ldots, \mathcal{T}_k$ and a student model $\mathcal{S}$. We can compute the Karcher mean of the teacher models' weights and then train $\mathcal{S}$ to mimic the output of the ensemble model. This can be achieved by minimizing the following loss function during training:

$$
\mathcal{L}(\mathcal{S}) = \mathbb{E}_{(x, y) \sim \mathcal{D}}[d(\mathcal{S}(x), \mathcal{M}_E(x))^2]
$$

where $\mathcal{D}$ is the training data, $d(\cdot, \cdot)$ is a distance measure (e.g., the squared Euclidean distance), and the expectation is taken over all data points $(x, y)$ in $\mathcal{D}$.

By distilling knowledge from multiple teacher models, we can create a student model that not only is simpler and more computationally efficient, but also retains the predictive performance of the ensemble of teacher models.

For neural networks and deep learning, this approach can be particularly beneficial. The Karcher mean can serve as a form of geometric regularization, encouraging the model to stay close to the subspace spanned by the columns of the weight matrices. This could potentially improve the generalization ability of the model by preventing overfitting to the training data.

Moreover, this approach could be used to optimize training and inference in a variety of applications, including visual recognition, statistical learning, recommender systems, wireless communications, and natural language processing, as noted in the referenced research paper【11†source】.

Grassmannian learning is particularly powerful due to its ability to harness the structural information embedded in a problem, thereby leading to reduced complexity and improved performance. For example, Grassmannian Discriminant Analysis (GDA) applied to image-set classification can better capture the subspace invariance of facial expressions than traditional methods do. Additionally, in visual domain adaptation, Grassmannian Geodesic Flow Kernel (GFK) can exploit the domain-invariant features hidden in the geodesic connecting the source and target domains, both being Grassmann manifolds, to enable effective knowledge transfer between them【12†source】.

The Grassmann manifold itself can be represented by a collection of generator matrices, and the distance between two elements can be measured using principal angles【13†source】【14†source】. Gradient-based learning algorithms on the Grassmann manifold require the notion of tangency, which is crucial to compute gradients and perform optimization【15†source】.

Given two points on a manifold, a geodesic refers to the shortest curve on the manifold connecting the points, and solving for geodesic is a classical problem in the calculus of variation. On the Grassmann manifold, there exists a relatively simple method of computing geodesics based on the Singular Value Decomposition (SVD)【16†source】.

These properties of the Grassmann manifold and the Karcher mean can be leveraged to develop more efficient and effective learning algorithms. For instance, the geometric structure of the Grassmann manifold can be used to regularize the weights of a neural network during training, preventing overfitting and improving generalization. The Karcher mean can be used to compute the central tendency of a set of models in an ensemble or a set of teacher models in knowledge distillation, leading to better performance and more efficient computation.

Furthermore, it is conceivable that one could use this method to create a more dynamic form of ensemble learning or knowledge distillation, where the weights of the models in the ensemble or the student model are updated iteratively based on the Karcher mean of the current weights and the weights of a new model or a teacher model. This would allow the ensemble or student model to adapt more quickly to changes in the data or the task.

As a potential future direction, it would be interesting to investigate how the Karcher mean on the Grassmannian can be combined with other recent advances in deep learning, such as self-supervised learning or meta-learning. For example, could we use the Karcher mean to aggregate the weights of different models trained with different self-supervised tasks or different tasks in a meta-learning setup? This could potentially lead to more robust and versatile models that can adapt to a wide range of tasks and domains. However, further research is needed to explore these possibilities and understand the theoretical and practical implications of this approach.