In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import coo_matrix


In [2]:
# Load data
train_data = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/train.txt',
    sep='\t',
    header=None,
    names=['user_id', 'movie_id', 'rating']
)

# Implicit feedback
train_data = train_data[train_data['rating'] > 0]

# Re-index
user_map = {u: i for i, u in enumerate(train_data['user_id'].unique())}
item_map = {i: j for j, i in enumerate(train_data['movie_id'].unique())}

train_data['uid'] = train_data['user_id'].map(user_map)
train_data['iid'] = train_data['movie_id'].map(item_map)

num_users = len(user_map)
num_items = len(item_map)


In [3]:
def build_adj_matrix(df, num_users, num_items):
    row = np.concatenate([df['uid'], df['iid'] + num_users])
    col = np.concatenate([df['iid'] + num_users, df['uid']])
    data = np.ones(len(row))

    adj = coo_matrix(
        (data, (row, col)),
        shape=(num_users + num_items, num_users + num_items)
    )

    # D^{-1/2} A D^{-1/2}
    deg = np.array(adj.sum(axis=1)).flatten()
    deg_inv_sqrt = np.power(deg, -0.5)
    deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.0

    D_inv_sqrt = coo_matrix(
        (deg_inv_sqrt, (np.arange(len(deg)), np.arange(len(deg))))
    )

    norm_adj = D_inv_sqrt @ adj @ D_inv_sqrt
    return norm_adj


In [5]:
norm_adj = build_adj_matrix(train_data, num_users, num_items)

norm_adj = norm_adj.tocoo()  # ðŸ”¥ Báº®T BUá»˜C

norm_adj = torch.sparse_coo_tensor(
    torch.LongTensor([norm_adj.row, norm_adj.col]),
    torch.FloatTensor(norm_adj.data),
    torch.Size(norm_adj.shape)
)


  torch.LongTensor([norm_adj.row, norm_adj.col]),


In [6]:
class BPRDataset(Dataset):
    def __init__(self, df, num_items):
        self.users = df['uid'].values
        self.items = df['iid'].values
        self.num_items = num_items
        self.user_pos = df.groupby('uid')['iid'].apply(set).to_dict()

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        u = self.users[idx]
        i = self.items[idx]

        while True:
            j = np.random.randint(self.num_items)
            if j not in self.user_pos[u]:
                break
        return u, i, j


In [7]:
class SimGCL(nn.Module):
    def __init__(
        self,
        num_users,
        num_items,
        emb_dim=64,
        n_layers=3,
        noise_eps=0.1,
        tau=0.2
    ):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.n_layers = n_layers
        self.noise_eps = noise_eps
        self.tau = tau

        self.embedding = nn.Embedding(num_users + num_items, emb_dim)
        nn.init.xavier_uniform_(self.embedding.weight)

    def lightgcn(self, adj):
        all_emb = self.embedding.weight
        embs = [all_emb]

        for _ in range(self.n_layers):
            all_emb = torch.sparse.mm(adj, all_emb)
            embs.append(all_emb)

        return torch.mean(torch.stack(embs), dim=0)

    def forward(self, adj, add_noise=False):
        emb = self.lightgcn(adj)

        if add_noise:
            noise = F.normalize(torch.rand_like(emb), dim=1)
            emb = emb + self.noise_eps * noise

        user_emb = emb[:self.num_users]
        item_emb = emb[self.num_users:]
        return user_emb, item_emb


In [8]:
def bpr_loss(u_emb, i_emb, j_emb):
    pos = torch.sum(u_emb * i_emb, dim=1)
    neg = torch.sum(u_emb * j_emb, dim=1)
    return -torch.mean(F.logsigmoid(pos - neg))


In [9]:
def simgcl_cl_loss(z1, z2, tau):
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)

    sim = torch.matmul(z1, z2.T) / tau
    labels = torch.arange(sim.size(0), device=sim.device)
    return F.cross_entropy(sim, labels)


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SimGCL(num_users, num_items).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

dataset = BPRDataset(train_data, num_items)
loader = DataLoader(dataset, batch_size=2048, shuffle=True)

norm_adj = norm_adj.to(device)

lambda_cl = 0.2


In [11]:
for epoch in range(1, 101):
    model.train()
    total_loss = 0

    for u, i, j in loader:
        u, i, j = u.to(device), i.to(device), j.to(device)

        # View 1
        u1, it1 = model(norm_adj, add_noise=True)
        # View 2
        u2, it2 = model(norm_adj, add_noise=True)

        # BPR
        loss_bpr = bpr_loss(
            u1[u], it1[i], it1[j]
        )

        # CL
        loss_cl = (
            simgcl_cl_loss(u1[u], u2[u], model.tau) +
            simgcl_cl_loss(it1[i], it2[i], model.tau)
        )

        loss = loss_bpr + lambda_cl * loss_cl

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch:03d} | Loss {total_loss / len(loader):.4f}")


Epoch 001 | Loss 2.5814
Epoch 002 | Loss 2.2277
Epoch 003 | Loss 2.1263
Epoch 004 | Loss 2.0755
Epoch 005 | Loss 2.0439
Epoch 006 | Loss 2.0221
Epoch 007 | Loss 2.0061
Epoch 008 | Loss 1.9937
Epoch 009 | Loss 1.9839
Epoch 010 | Loss 1.9755
Epoch 011 | Loss 1.9691
Epoch 012 | Loss 1.9631
Epoch 013 | Loss 1.9578
Epoch 014 | Loss 1.9533
Epoch 015 | Loss 1.9494
Epoch 016 | Loss 1.9460
Epoch 017 | Loss 1.9428
Epoch 018 | Loss 1.9399
Epoch 019 | Loss 1.9373
Epoch 020 | Loss 1.9348
Epoch 021 | Loss 1.9323
Epoch 022 | Loss 1.9303
Epoch 023 | Loss 1.9281
Epoch 024 | Loss 1.9263
Epoch 025 | Loss 1.9248
Epoch 026 | Loss 1.9230
Epoch 027 | Loss 1.9214
Epoch 028 | Loss 1.9197
Epoch 029 | Loss 1.9183
Epoch 030 | Loss 1.9170
Epoch 031 | Loss 1.9154
Epoch 032 | Loss 1.9144
Epoch 033 | Loss 1.9130
Epoch 034 | Loss 1.9120
Epoch 035 | Loss 1.9105
Epoch 036 | Loss 1.9092
Epoch 037 | Loss 1.9083
Epoch 038 | Loss 1.9070
Epoch 039 | Loss 1.9058
Epoch 040 | Loss 1.9049
Epoch 041 | Loss 1.9037
Epoch 042 | Loss

In [27]:
import pandas as pd
import torch
import numpy as np

# 1. Load test
test_df = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/test.txt',
    sep='\t',
    header=None,
    names=['user_id', 'movie_id']
)

# 2. Táº¡o Id
test_df['Id'] = np.arange(1, len(test_df) + 1)

# 3. Map index
test_df['uid'] = test_df['user_id'].map(user_map)
test_df['iid'] = test_df['movie_id'].map(item_map)

# 4. Model eval
model.eval()
with torch.no_grad():
    user_emb, item_emb = model(norm_adj, add_noise=False)

# 5. Vectorized prediction
uids = torch.LongTensor(test_df['uid'].fillna(-1).values).to(device)
iids = torch.LongTensor(test_df['iid'].fillna(-1).values).to(device)

scores = torch.zeros(len(test_df), device=device)

valid_mask = (uids >= 0) & (iids >= 0)

scores[valid_mask] = torch.sum(
    user_emb[uids[valid_mask]] * item_emb[iids[valid_mask]],
    dim=1
)

# User/item má»›i â†’ score = 0 (implicit CF chuáº©n)
scores[~valid_mask] = 0.0

# 6. Táº¡o submission
submit_df = pd.DataFrame({
    'Id': test_df['Id'],
    'Score': scores.cpu().numpy()
})

submit_path = '/kaggle/working/SimGCL_root.csv'
submit_df.to_csv(submit_path, index=False)

print("âœ… Submission saved:", submit_path)
print(submit_df.head())


âœ… Submission saved: /kaggle/working/SimGCL_root.csv
   Id     Score
0   1 -0.032680
1   2 -0.125063
2   3  0.050505
3   4  0.119273
4   5 -0.035640
