In [38]:
import numpy as np
import scipy.stats as stats

In [18]:
# euclidean
u1 = np.array([4, 1, 1, 4])
u3 = np.array([2, 0, 4,5 ])

euc_sim = np.sqrt(np.sum((u1 - u3)**2))
euc_sim

3.872983346207417

In [None]:
# Mean squared Difference (MSD)
# euclidean에서 null 값을 제거한 것들끼리의 유사도

u1 = np.array([4, 1, 1, 4])
u3 = np.array([2, 0, 4, 5])

MSD = (2**2+ 3**2+ 1**2)/3
MSD

4.666666666666667

In [53]:
# JMSD
u1 = np.array([4, 1, 1, 4])
u1_norm = u1/5

u3 = np.array([2, 0, 4, 5])
u3_norm = u3/5

jcd = 3/4
MSD = ((u1_norm[0] - u3_norm[0])**2 + (u1_norm[2] - u3_norm[2])**2 + (u1_norm[3] - u3_norm[3])**2)/3

JMSD = jcd * (1-MSD)
JMSD

0.61

In [37]:
# cosine
u1 = np.array([5, 4, 3])
u3 = np.array([3, 5, 4])

# cos_sim = np.sum((u1 * u3))/(np.sqrt(np.sum(u1**2)) * np.sqrt(np.sum((u3**2))))
cos_sim = np.sum((u1 * u3))/(np.linalg.norm(u1, ord = 2) * np.linalg.norm(u3, ord = 2))

cos_sim

0.9399999999999998

In [64]:
# pearson correlation coefficient (PCC)
u1 = np.array([4, 1, 1, 4])
u3 = np.array([2,    4, 5])

u1_mean = np.mean(u1)
u3_mean = np.mean(u3)

new_u1 = np.array([4, 1, 4])
new_u3 = np.array([2, 4, 5])

a = new_u1-u1_mean
b = new_u3-u3_mean

pcc = np.sum(a*b) / (np.linalg.norm(a) * np.linalg.norm(b))
pcc

-0.17817416127494962

In [63]:
# Constrained Pearson
u1 = np.array([4, 1, 1, 4])
u3 = np.array([2,    4, 5])

u1_med = np.median(u1)
u3_med = np.median(u3)

new_u1 = np.array([4, 1, 4])
new_u3 = np.array([2, 4, 5])

a = new_u1-u1_med
b = new_u3-u3_med

pcc = np.sum(a*b) / (np.linalg.norm(a) * np.linalg.norm(b))
pcc

-0.2581988897471611

# Predicting rating

In [1]:
import torch
import numpy as np
import torch.nn as nn

In [11]:
# L1 norm

H, N, T = 4, 5, 300
np.random.seed(0)

# attention map
A = np.random.rand(H, N, T)
print(A.shape)
# print(A)

# normalization
A_sum = A.sum(axis=(1,2), keepdims=True) # 각 헤드의 attention map의 합
A_tilde = A / A_sum
print(A_tilde.shape)

# 각 헤드 L_sparse
# L_sparse_h = np.abs(A_tilde).sum(axis=(1,2))
L_sparse_h = -torch.sum(A_tilde ** 2, dim = (1,2))
print()
print(L_sparse_h.shape)
print(L_sparse_h)

# L_sparse
L_sparse = np.sum(L_sparse_h) / H
print('L_sparse:', L_sparse)

(4, 5, 300)
(4, 5, 300)


TypeError: sum() received an invalid combination of arguments - got (numpy.ndarray, dim=tuple), but expected one of:
 * (Tensor input, *, torch.dtype dtype)
 * (Tensor input, tuple of ints dim, bool keepdim, *, torch.dtype dtype, Tensor out)
 * (Tensor input, tuple of names dim, bool keepdim, *, torch.dtype dtype, Tensor out)


In [108]:
# L2 norm

H, N, T = 4, 5, 300
# np.random.seed(0)

# attention map
A = np.random.rand(H, N, T)
print(A.shape)
# print(A)

# normalization
A_sum = A.sum(axis=(1,2), keepdims=True) # 각 헤드의 attention map의 합
A_tilde = A / A_sum
print(A_tilde.shape)

# 각 헤드 L_sparse
# L_sparse_h = np.abs(A_tilde).sum(axis=(1,2))
L_sparse_h = -(np.power(A_tilde,2).sum(axis=(1,2)))
print()
print(L_sparse_h.shape)
print(L_sparse_h)

# L_sparse
L_sparse = np.sum(L_sparse_h) / H
print('L_sparse:', L_sparse)

(4, 5, 300)
(4, 5, 300)

(4,)
[-0.00089222 -0.00089486 -0.00089428 -0.00087901]
L_sparse: -0.0008900942127996425


In [20]:
np.sum(L_sparse)

-0.0008941309162695329

In [8]:
L_sparse_h

array([-0.00089682, -0.00088438, -0.00089418, -0.0008965 ])

In [91]:

class MultiHeadSparsityLoss(nn.Module):
    """
    Implements the multi-head sparsity regularization:
        For each head h, A^{(h)} ∈ R^{N×T} is normalized by its sum,
        then L_sparse^{(h)} = ||Â^{(h)}||_1, and
        L_sparse = (1/H) * sum_h L_sparse^{(h)}.
    Supports input shapes (B,H,N,T) or (B,N,T). Optionally supports a mask over frames.
    """
    def __init__(self, eps: float = 1e-12, reduction: str = "mean"):
        """
        eps: small constant to avoid division by zero
        reduction: 'mean' (default), 'sum', or 'none' over the batch
        """
        super().__init__()
        self.eps = eps
        assert reduction in ("mean", "sum", "none")
        self.reduction = reduction

    def forward(self, A: torch.Tensor, frame_mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args
        ----
        A : torch.Tensor
            Attention maps with shape (B,H,N,T) or (B,N,T).
        frame_mask : torch.Tensor, optional
            Valid-frame mask of shape (B,T) or (B,1,T). Ones for valid frames, zeros for padded frames.

        Returns
        -------
        loss : torch.Tensor
            Scalar loss if reduction != 'none', else (B,) vector.
        """
        if A.ndim == 3:
            # (B,N,T) -> add H=1
            A = A.unsqueeze(1)  # (B,1,N,T)
        elif A.ndim != 4:
            raise ValueError("A must be (B,H,N,T) or (B,N,T).")

        B, H, N, T = A.shape

        # Optionally apply frame mask before normalization (mask invalid frames to zero)
        if frame_mask is not None:
            if frame_mask.ndim == 2:
                frame_mask = frame_mask.unsqueeze(1)  # (B,1,T)
            if frame_mask.ndim == 3:
                # Broadcast to (B,1,1,T)
                frame_mask = frame_mask.unsqueeze(1) if frame_mask.shape[1] != 1 else frame_mask
            # reshape to (B,1,1,T) for broadcast over heads and joints
            frame_mask = frame_mask.view(B, 1, 1, T)
            A = A * frame_mask  # zero-out invalid frames

        # Head-wise normalization by total sum over (N,T) so that sum_i,t Â^{(h)}_{i,t} = 1
        sums = A.sum(dim=(2, 3), keepdim=True).clamp_min(self.eps)  # (B,H,1,1)
        A_tilde = A / sums  # (B,H,N,T)

        # L1 per head: ||Â^{(h)}||_1 = sum_{i,t} |Â^{(h)}_{i,t}|
        # A_tilde >= 0 assumed (attention weights). If not guaranteed, abs() is safe.
        L_sparse_h = A_tilde.abs().sum(dim=(2, 3))  # (B,H)

        # L_sparse_h = -torch.sqrt(torch.sum(A_tilde ** 2, dim = (2,3))) + self.eps
        # L_sparse_h = -torch.sum(A_tilde ** 2, dim = (2,3))

        # # 여기만;;
        # # L1 per head: ||Â^{(h)}||_1 = sum_{i,t} |Â^{(h)}_{i,t}|
        # # A_tilde >= 0 assumed (attention weights). If not guaranteed, abs() is safe.
        # L_sparse_h = A.abs().sum(dim=(2, 3))  # (B,H)

 

        # Average over heads: (1/H) * sum_h L_sparse^{(h)}
        L_sparse = L_sparse_h.mean(dim=1)  # (B,)

        if self.reduction == "mean":
            return L_sparse.mean()
        elif self.reduction == "sum":
            return L_sparse.sum()
        else:  # 'none'
            return L_sparse  # (B,)


# ---------------------------
# 사용 예시
# ---------------------------
if __name__ == "__main__":
    B, H, N, T = 8, 4, 25, 48
    A = torch.rand(B, H, N, T, requires_grad=True)  # 예시 attention
    y = torch.randint(0, 300, (B,))                 # 예시 라벨
    logits = torch.randn(B, 300, requires_grad=True)

    # 손실들
    ce = nn.CrossEntropyLoss()(logits, y)

    sparse_loss_fn = MultiHeadSparsityLoss(eps=1e-12, reduction="mean")
    L_sparse = sparse_loss_fn(A)  # \mathcal{L}_{sparse}

    # 가중합 (예: lambda_s = 0.1)
    lambda_s = 0.1
    loss = ce + lambda_s * L_sparse
    loss.backward()

    print('L_sparse:', L_sparse)
    print(f"L_sparse={L_sparse.item():.12f}")

    print(loss)

L_sparse: tensor(1., grad_fn=<MeanBackward0>)
L_sparse=1.000000000000
tensor(5.9592, grad_fn=<AddBackward0>)
