In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import scipy.sparse as sp

from torch.utils.data import DataLoader, Dataset


In [2]:
# Training ratings: a headerless, tab-separated file with one
# (user_id, movie_id, rating) triple per line.
train_data = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/train.txt',
    names=['user_id', 'movie_id', 'rating'],
    header=None,
    sep='\t',
)

# Sanity check: first rows, then the (rows, columns) shape.
print(train_data.head(), train_data.shape, sep='\n')


   user_id  movie_id  rating
0        1         1       5
1        1         2       3
2        1         3       4
3        1         4       3
4        1         5       3
(90570, 3)


In [3]:
# Re-index raw ids to contiguous 0-based integers, numbered in order of first
# appearance in the training file.
user_map = {
    raw_id: idx
    for idx, raw_id in enumerate(train_data['user_id'].unique())
}
item_map = {
    raw_id: idx
    for idx, raw_id in enumerate(train_data['movie_id'].unique())
}

# Store the contiguous indices next to the raw ids.
train_data['uid'] = train_data['user_id'].map(user_map)
train_data['iid'] = train_data['movie_id'].map(item_map)

# One map entry per distinct id, so the map sizes are the entity counts.
num_users, num_items = len(user_map), len(item_map)

print("num_users:", num_users)
print("num_items:", num_items)


num_users: 943
num_items: 1680


In [4]:
# Two-column int64 tensor of training pairs: column 0 is uid, column 1 is iid.
train_interactions = torch.tensor(
    train_data.loc[:, ['uid', 'iid']].to_numpy(),
    dtype=torch.long,
)


In [5]:
def build_adj_matrix(interactions, num_users, num_items):
    """Build the symmetrically normalised user-item bipartite adjacency matrix.

    The graph has ``num_users + num_items`` nodes: users occupy indices
    ``0 .. num_users - 1`` and items occupy ``num_users .. num_users +
    num_items - 1``. Every (user, item) pair in ``interactions`` becomes one
    undirected edge of weight 1, and the result is ``D^{-1/2} A D^{-1/2}``
    where ``D`` is the diagonal degree matrix of ``A``.

    Args:
        interactions: 2-D integer array-like (CPU torch tensor or NumPy
            array) with one ``(user index, item index)`` pair per row.
        num_users: Number of user nodes.
        num_items: Number of item nodes.

    Returns:
        A scipy sparse matrix of shape
        ``(num_users + num_items, num_users + num_items)``. Rows and columns
        of nodes without any edge are all zero.
    """
    pairs = np.asarray(interactions)
    users, items = pairs[:, 0], pairs[:, 1]

    ui_mat = sp.coo_matrix(
        (np.ones(len(users), dtype=np.float32), (users, items)),
        shape=(num_users, num_items),
    ).tocsr()
    # tocsr() sums repeated (user, item) pairs; reset every stored weight to 1
    # so a duplicated interaction row does not inflate the edge or the degrees.
    ui_mat.data[:] = 1.0

    # Block layout: [[0, R], [R^T, 0]] with R the user-item matrix.
    adj = sp.vstack([
        sp.hstack([sp.coo_matrix((num_users, num_users)), ui_mat]),
        sp.hstack([ui_mat.T, sp.coo_matrix((num_items, num_items))]),
    ])

    # D^{-1/2} A D^{-1/2}
    rowsum = np.asarray(adj.sum(axis=1)).ravel()
    # Only invert the degree of connected nodes; isolated nodes keep 0, which
    # avoids computing 0 ** -0.5 (inf plus a divide-by-zero RuntimeWarning).
    d_inv = np.zeros_like(rowsum)
    connected = rowsum > 0
    d_inv[connected] = np.power(rowsum[connected], -0.5)
    D = sp.diags(d_inv)

    return D @ adj @ D
# Normalised adjacency of the training interaction graph.
adj_matrix = build_adj_matrix(
    interactions=train_interactions,
    num_users=num_users,
    num_items=num_items,
)


In [6]:
class XSimGCL(nn.Module):
    """User/item embedding model with parameter-free graph propagation.

    NOTE(review): despite the class name, the code below performs only
    LightGCN-style propagation (repeated multiplication of the embeddings by
    the adjacency matrix). No noise perturbation and no contrastive objective
    is implemented in this class.

    Args:
        num_users: Number of user nodes (first rows of the adjacency matrix).
        num_items: Number of item nodes (remaining rows).
        adj_matrix: scipy sparse matrix of shape
            ``(num_users + num_items, num_users + num_items)``.
        embedding_dim: Width of the user and item embeddings.
        n_layers: Number of propagation steps performed by ``forward``.
        device: Device the adjacency tensor is placed on at construction.
            The value is stored in ``self.device`` as given and is not
            refreshed by later ``.to(...)`` calls.
    """

    def __init__(self, num_users, num_items, adj_matrix,
                 embedding_dim=64, n_layers=3, device='cuda'):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.n_layers = n_layers
        self.device = device

        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)

        # Registered as a buffer so that model.to(...) / .cuda() / .cpu() move
        # the adjacency together with the embeddings. persistent=False keeps
        # it out of state_dict, as it was when it was a plain attribute.
        self.register_buffer(
            'adj',
            self._convert_sp_mat_to_tensor(adj_matrix).to(device),
            persistent=False,
        )

    def _convert_sp_mat_to_tensor(self, mat):
        """Convert a scipy sparse matrix into a coalesced float32 COO tensor."""
        mat = mat.tocoo()
        # Stack into a single (2, nnz) array first: building a tensor from a
        # Python list of ndarrays is slow.
        indices = torch.tensor(np.vstack((mat.row, mat.col)), dtype=torch.long)
        values = torch.tensor(mat.data, dtype=torch.float32)
        # torch.sparse_coo_tensor replaces the deprecated
        # torch.sparse.FloatTensor constructor.
        return torch.sparse_coo_tensor(
            indices, values, torch.Size(mat.shape)
        ).coalesce()

    def forward(self):
        """Return the embeddings of ALL layers.

        Returns:
            A list of ``n_layers + 1`` tensors, each of shape
            ``(num_users + num_items, embedding_dim)``: element 0 is the
            concatenated user/item embedding table, element k is that table
            multiplied k times by the adjacency matrix.
        """
        ego = torch.cat(
            [self.user_embedding.weight, self.item_embedding.weight],
            dim=0
        )

        layer_embeddings = [ego]

        for _ in range(self.n_layers):
            ego = torch.sparse.mm(self.adj, ego)
            layer_embeddings.append(ego)

        return layer_embeddings


In [7]:
def bpr_loss(u_emb, pos_emb, neg_emb):
    """Bayesian Personalised Ranking loss, averaged over the batch.

    Args:
        u_emb: User embeddings, one row per training triple.
        pos_emb: Embeddings of the positive items, same shape as ``u_emb``.
        neg_emb: Embeddings of the negative items, same shape as ``u_emb``.

    Returns:
        Scalar tensor: mean of ``-log(sigmoid(pos_score - neg_score))`` where
        each score is the row-wise dot product with the user embedding.
    """
    pos_score = (u_emb * pos_emb).sum(dim=1)
    neg_score = (u_emb * neg_emb).sum(dim=1)
    # logsigmoid is evaluated in log space, so it stays finite and keeps a
    # non-zero gradient even when sigmoid(pos - neg) underflows to 0. The
    # previous log(sigmoid(x) + 1e-8) form capped the loss at about 18.4 and
    # zeroed the gradient for badly ranked pairs.
    return -F.logsigmoid(pos_score - neg_score).mean()

In [8]:
class BPRDataset(Dataset):
    """Dataset of (user, positive item, sampled negative item) triples.

    Args:
        interactions: 2-D integer tensor/array with one
            ``(user index, item index)`` pair per row.
        num_items: Negatives are drawn uniformly from ``[0, num_items)``.
    """

    def __init__(self, interactions, num_items):
        self.users = interactions[:, 0]
        self.items = interactions[:, 1]
        self.num_items = num_items

        # user index -> set of item indices that user interacted with, used
        # to reject "negatives" that are in fact observed positives.
        self._seen = {}
        for user, item in zip(self.users.tolist(), self.items.tolist()):
            self._seen.setdefault(user, set()).add(item)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, positive item, negative item)`` for row ``idx``.

        ``user`` and ``pos`` are the entries of the interaction columns at
        ``idx``; ``neg`` is a Python int drawn uniformly at random and
        re-drawn until it is not an item the user interacted with.
        """
        u = self.users[idx]
        pos = self.items[idx]
        seen = self._seen[int(u)]

        neg = torch.randint(0, self.num_items, (1,)).item()
        # Skip rejection when the user has at least num_items observed items:
        # there may be no valid negative and the loop would never end.
        if len(seen) < self.num_items:
            while neg in seen:
                neg = torch.randint(0, self.num_items, (1,)).item()
        return u, pos, neg


In [9]:
def train_xsimgcl(
    model,
    interactions,
    epochs=50,
    batch_size=2048,
    lr=1e-3,
    lambda_reg=0.1
):
    """Train ``model`` with a BPR ranking loss plus an L2 embedding penalty.

    For every mini-batch the model is called once to propagate over the whole
    graph, the per-layer embeddings are averaged, and the loss
    ``bpr + lambda_reg * reg`` is minimised with Adam. One summary line with
    the mean losses is printed per epoch. Nothing is returned.

    NOTE(review): the penalty is applied to the propagated (layer-averaged)
    embeddings of the batch users and positive items only; negative items are
    not penalised, and no contrastive term is part of the loss.

    Args:
        model: Module exposing ``device``, ``num_users``, ``num_items`` and
            ``parameters()``, and returning a list of per-layer embedding
            tensors when called.
        interactions: ``(user index, item index)`` pairs, passed unchanged to
            ``BPRDataset``; each batch is moved to ``model.device`` here.
        epochs: Number of passes over ``interactions``.
        batch_size: Number of triples per optimisation step.
        lr: Adam learning rate.
        lambda_reg: Weight of the L2 penalty in the total loss.
    """
    device = model.device
    model.train()

    dataset = BPRDataset(interactions, model.num_items)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(loader)

    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        total_loss = 0.0
        total_bpr = 0.0
        total_reg = 0.0

        for users, pos_items, neg_items in loader:
            users = users.to(device)
            pos_items = pos_items.to(device)
            neg_items = neg_items.to(device)

            optimizer.zero_grad()

            # ===== 1. Forward: embeddings at all propagation layers =====
            layer_embs = model()  # list: [E^(0), E^(1), ..., E^(K)]

            # ===== 2. Final embedding: mean over the layers =====
            final_emb = torch.mean(
                torch.stack(layer_embs, dim=1),
                dim=1
            )  # [num_users + num_items, dim]

            user_final, item_final = torch.split(
                final_emb,
                [model.num_users, model.num_items]
            )

            # Gather the batch rows once; both loss terms reuse them.
            batch_user = user_final[users]
            batch_pos = item_final[pos_items]
            batch_neg = item_final[neg_items]

            # ===== 3. BPR loss =====
            loss_bpr = bpr_loss(batch_user, batch_pos, batch_neg)

            # ===== 4. L2 penalty on the propagated representations =====
            # (users and positive items of the batch, not the raw parameters)
            loss_reg = (
                batch_user.pow(2).sum(dim=1).mean()
                +
                batch_pos.pow(2).sum(dim=1).mean()
            )

            # ===== 5. Total loss =====
            loss = loss_bpr + lambda_reg * loss_reg

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_bpr += loss_bpr.item()
            total_reg += loss_reg.item()

        print(
            f"[Epoch {epoch+1}/{epochs}] "
            f"Loss: {total_loss / num_batches:.4f} | "
            f"BPR: {total_bpr / num_batches:.4f} | "
            f"XSimReg: {total_reg / num_batches:.4f}"
        )


In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed torch's global generator so that embedding initialisation, batch
# shuffling and negative sampling repeat across runs.
# NOTE(review): GPU kernels may still add small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

model = XSimGCL(
    num_users=num_users,
    num_items=num_items,
    adj_matrix=adj_matrix,   # already normalised by build_adj_matrix
    embedding_dim=64,
    n_layers=3,
    device=device
).to(device)


train_xsimgcl(
    model=model,
    interactions=train_interactions,  # do NOT call .to(device): batches are moved inside train_xsimgcl
    epochs=50,
    batch_size=2048,
    lr=1e-3,
    lambda_reg=1e-4
)


[Epoch 1/50] Loss: 0.6865 | BPR: 0.6865 | XSimReg: 0.0733
[Epoch 2/50] Loss: 0.6132 | BPR: 0.6131 | XSimReg: 0.8292
[Epoch 3/50] Loss: 0.4984 | BPR: 0.4981 | XSimReg: 2.5664
[Epoch 4/50] Loss: 0.4414 | BPR: 0.4409 | XSimReg: 4.2290
[Epoch 5/50] Loss: 0.4282 | BPR: 0.4277 | XSimReg: 5.1495
[Epoch 6/50] Loss: 0.4181 | BPR: 0.4175 | XSimReg: 5.5560
[Epoch 7/50] Loss: 0.4187 | BPR: 0.4182 | XSimReg: 5.7306
[Epoch 8/50] Loss: 0.4150 | BPR: 0.4144 | XSimReg: 5.7601
[Epoch 9/50] Loss: 0.4154 | BPR: 0.4148 | XSimReg: 5.7664
[Epoch 10/50] Loss: 0.4107 | BPR: 0.4101 | XSimReg: 5.7686
[Epoch 11/50] Loss: 0.4099 | BPR: 0.4093 | XSimReg: 5.7796
[Epoch 12/50] Loss: 0.4092 | BPR: 0.4086 | XSimReg: 5.7772
[Epoch 13/50] Loss: 0.4038 | BPR: 0.4032 | XSimReg: 5.7863
[Epoch 14/50] Loss: 0.4015 | BPR: 0.4009 | XSimReg: 5.7872
[Epoch 15/50] Loss: 0.3979 | BPR: 0.3973 | XSimReg: 5.8199
[Epoch 16/50] Loss: 0.3962 | BPR: 0.3956 | XSimReg: 5.8725
[Epoch 17/50] Loss: 0.3925 | BPR: 0.3920 | XSimReg: 5.8936
[Epoch

In [19]:
import pandas as pd
import torch
import numpy as np

# Pairs to score: a headerless, tab-separated file with one
# (user_id, movie_id) pair per line.
test_df = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/test.txt',
    names=['user_id', 'movie_id'],
    header=None,
    sep='\t',
)

print(test_df.head())
print("Test size:", len(test_df))

# 1-based row id used as the key column of the submission file
test_df['Id'] = np.arange(len(test_df)) + 1


   user_id  movie_id
0        1        20
1        1        33
2        1        61
3        1       117
4        1       155
Test size: 9430


In [20]:
# Translate raw ids with the maps built from the training data; an id that is
# not a key of the map becomes NaN.
test_df['uid'] = test_df['user_id'].map(user_map)
test_df['iid'] = test_df['movie_id'].map(item_map)

# Number of test rows whose user / movie has no training index.
missing_users, missing_items = test_df[['uid', 'iid']].isna().sum()

print(f"Missing users: {missing_users}")
print(f"Missing items: {missing_items}")


Missing users: 0
Missing items: 2


In [22]:
device = model.device
model.eval()

# Inference only, so no autograd graph is recorded.
with torch.no_grad():
    # Embeddings at every propagation layer
    layer_embs = model()

    # Average the layers, the same aggregation used during training
    final_emb = torch.stack(layer_embs, dim=1).mean(dim=1)

    # The first num_users rows are users, the remaining num_items rows are items
    user_emb, item_emb = final_emb.split([model.num_users, model.num_items])


In [23]:
def score_to_rating(score):
    """Squash a single-element score tensor into the 1-5 range.

    The score goes through a sigmoid and is rescaled as ``1 + 4 * sigmoid``;
    the result is returned as a Python number via ``.item()``.
    """
    squashed = torch.sigmoid(score)
    return (squashed * 4 + 1).item()


In [24]:
# Rating assigned to rows whose user or movie has no training index.
FALLBACK_RATING = 2.5

# Rows where both the user and the movie were mapped to an index.
known = (test_df['uid'].notna() & test_df['iid'].notna()).to_numpy()

# The mapped columns are float when they contain NaN, so cast the known rows
# back to integer indices before using them to index the embeddings.
u_idx = torch.as_tensor(
    test_df.loc[known, 'uid'].to_numpy(dtype=np.int64),
    device=user_emb.device,
)
i_idx = torch.as_tensor(
    test_df.loc[known, 'iid'].to_numpy(dtype=np.int64),
    device=item_emb.device,
)

# Score every known pair in one batched operation instead of one dot product
# and one tensor-to-Python transfer per row.
with torch.no_grad():
    scores = (user_emb[u_idx] * item_emb[i_idx]).sum(dim=1)
    # Same sigmoid scaling as score_to_rating, applied to the whole batch.
    known_ratings = (1 + 4 * torch.sigmoid(scores)).cpu().numpy()

# Start from the fallback everywhere, then overwrite the rows that were scored.
predictions = np.full(len(test_df), FALLBACK_RATING, dtype=np.float64)
predictions[known] = known_ratings

test_df['Score'] = predictions


In [25]:
# The submission keeps only the row id and the predicted score.
submit_path = '/kaggle/working/xSimGCL.csv'
submit_df = test_df.loc[:, ['Id', 'Score']]

# index=False: the DataFrame index is not written as an extra column.
submit_df.to_csv(submit_path, index=False)

print("✅ Submission saved to:", submit_path)
print(submit_df.head())


✅ Submission saved to: /kaggle/working/xSimGCL.csv
   Id     Score
0   1  4.480495
1   2  4.676014
2   3  4.480831
3   4  4.898435
4   5  4.700726


In [26]:
# Summary statistics of the predicted scores, as a quick distribution check.
print(submit_df.loc[:, 'Score'].describe())


count    9430.000000
mean        4.726521
std         0.364798
min         1.365061
25%         4.685679
50%         4.851260
75%         4.927363
max         4.998294
Name: Score, dtype: float64
