In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader



In [3]:
# Ratings are read as headerless, tab-separated (user, movie, rating) triples.
train_data = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/train.txt',
    names=['user_id', 'movie_id', 'rating'],
    header=None,
    sep='\t',
)

# Re-index raw ids to contiguous 0-based integers, numbered in order of first appearance.
user2id = {raw_id: idx for idx, raw_id in enumerate(train_data['user_id'].unique())}
item2id = {raw_id: idx for idx, raw_id in enumerate(train_data['movie_id'].unique())}

# Store the dense indices alongside the raw ids.
train_data['uid'] = train_data['user_id'].map(user2id)
train_data['iid'] = train_data['movie_id'].map(item2id)

# Every distinct id received exactly one index, so the dict sizes are the entity counts.
n_users, n_items = len(user2id), len(item2id)


In [4]:
from scipy.sparse import coo_matrix


In [5]:
def build_adj_matrix(df, n_users, n_items):
    """Build the symmetrically normalized user-item adjacency matrix.

    Users occupy node indices ``0 .. n_users - 1`` and items are shifted to
    ``n_users .. n_users + n_items - 1``. Every row of ``df`` contributes one
    edge, stored in both directions, and the result is
    ``D^{-1/2} A D^{-1/2}`` where ``D`` is the diagonal matrix of row sums of
    the adjacency matrix ``A``.

    Args:
        df: Object whose ``'uid'`` and ``'iid'`` entries are array-likes of
            0-based user and item indices (one pair per interaction).
        n_users: Number of user nodes.
        n_items: Number of item nodes.

    Returns:
        A scipy sparse matrix of shape
        ``(n_users + n_items, n_users + n_items)``. Rows and columns that
        belong to nodes without any edge are entirely zero.
    """
    n_nodes = n_users + n_items

    user_nodes = np.asarray(df['uid'])
    item_nodes = np.asarray(df['iid']) + n_users  # shift items past the user block

    # Each interaction appears twice so that the matrix is symmetric.
    rows = np.concatenate([user_nodes, item_nodes])
    cols = np.concatenate([item_nodes, user_nodes])
    adj = coo_matrix(
        (np.ones(len(rows)), (rows, cols)),
        shape=(n_nodes, n_nodes)
    )

    # Symmetric normalization: scale row i and column j by deg^-1/2.
    deg = np.asarray(adj.sum(axis=1)).ravel()

    # Invert only positive degrees. Isolated nodes keep a zero scale factor
    # instead of going through 0 ** -0.5, which yields inf and emits a
    # divide-by-zero RuntimeWarning.
    deg_inv_sqrt = np.zeros(n_nodes, dtype=np.float64)
    connected = deg > 0
    deg_inv_sqrt[connected] = np.power(deg[connected], -0.5)

    # Pass the shape explicitly rather than relying on inference from the
    # index arrays, which is impossible when there are no nodes at all.
    diag = np.arange(n_nodes)
    D_inv = coo_matrix((deg_inv_sqrt, (diag, diag)), shape=(n_nodes, n_nodes))

    return D_inv @ adj @ D_inv


In [6]:
# Required: switch to COO format so the .row / .col index arrays read below are available.
norm_adj = build_adj_matrix(train_data, n_users, n_items).tocoo()

# Pack the COO triplets (indices, values, shape) into a torch sparse tensor;
# .float() casts the values to float32.
norm_adj = torch.sparse_coo_tensor(
    np.vstack([norm_adj.row, norm_adj.col]),
    norm_adj.data,
    norm_adj.shape,
).float()


In [7]:
import random

class BPRDataset(Dataset):
    """Triples ``(user, positive item, sampled negative item)`` for BPR training.

    Each row of ``df`` is one observed (user, item) interaction. Indexing the
    dataset returns that interaction together with one item index drawn
    uniformly from ``0 .. n_items - 1`` that is not among the user's observed
    items.
    """

    def __init__(self, df, n_items):
        """Store the interactions and index each user's observed items.

        Args:
            df: Frame with ``'uid'`` and ``'iid'`` columns of 0-based indices.
            n_items: Total number of items; negatives are sampled from
                ``0 .. n_items - 1``.
        """
        self.users = df['uid'].values
        self.items = df['iid'].values
        self.n_items = n_items
        # uid -> set of observed item indices, for O(1) rejection checks.
        self.user_pos = df.groupby('uid')['iid'].apply(set).to_dict()

    def __len__(self):
        """Return the number of observed interactions."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, positive item, negative item)`` for interaction ``idx``.

        Raises:
            ValueError: If the user has interacted with every item, so that no
                negative item exists (rejection sampling would never stop).
        """
        u = self.users[idx]
        pos = self.items[idx]
        seen = self.user_pos[u]

        # Cheap size check first; the exact coverage scan only runs in the rare
        # case where the user's item set is at least as large as the catalogue.
        if len(seen) >= self.n_items and all(i in seen for i in range(self.n_items)):
            raise ValueError(
                f"user {u} has interacted with all {self.n_items} items; "
                "no negative item can be sampled"
            )

        # Rejection sampling: redraw until the item is not one the user has seen.
        while True:
            neg = random.randint(0, self.n_items - 1)
            if neg not in seen:
                return u, pos, neg


In [8]:
import torch.nn as nn
import torch.nn.functional as F


In [9]:
class LightGCN(nn.Module):
    """Embedding propagation over a fixed sparse adjacency matrix.

    User and item embedding tables are concatenated into one node matrix,
    multiplied by ``adj`` ``n_layers`` times, and the input plus every
    propagated matrix are averaged to give the final node embeddings.
    """

    def __init__(self, n_users, n_items, emb_dim, n_layers, adj):
        """
        Args:
            n_users: Number of user nodes (rows of the user table).
            n_items: Number of item nodes (rows of the item table).
            emb_dim: Embedding width.
            n_layers: Number of propagation steps.
            adj: Sparse ``(n_users + n_items)``-square matrix used for
                propagation. It is kept as a plain attribute, so it must
                already live on the device the module will run on.
        """
        super().__init__()
        self.n_users, self.n_items = n_users, n_items
        self.n_layers = n_layers
        self.adj = adj

        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)

        # Replace the default initialization with Xavier-uniform on both tables.
        for table in (self.user_emb, self.item_emb):
            nn.init.xavier_uniform_(table.weight)

    def forward(self):
        """Return ``(user_embeddings, item_embeddings)`` after propagation."""
        current = torch.cat((self.user_emb.weight, self.item_emb.weight), dim=0)

        # Layer 0 is the raw embedding table; each further layer is adj @ previous.
        per_layer = [current]
        for _ in range(self.n_layers):
            current = torch.sparse.mm(self.adj, current)
            per_layer.append(current)

        # Average over the layer axis.
        pooled = torch.stack(per_layer, dim=1).mean(dim=1)

        split_at = self.n_users
        return pooled[:split_at], pooled[split_at:]


In [10]:
def bpr_loss(u_emb, pos_emb, neg_emb):
    """Pairwise ranking loss over (user, positive, negative) embedding rows.

    Scores are dot products taken along dim 1; the loss is the mean of
    ``-logsigmoid(positive_score - negative_score)`` over the batch.

    Args:
        u_emb: User embeddings, one row per triple.
        pos_emb: Embeddings of the observed items, aligned with ``u_emb``.
        neg_emb: Embeddings of the sampled negative items, aligned with ``u_emb``.

    Returns:
        Scalar tensor holding the mean loss.
    """
    score_pos = (u_emb * pos_emb).sum(dim=1)
    score_neg = (u_emb * neg_emb).sum(dim=1)
    margin = score_pos - score_neg
    return -F.logsigmoid(margin).mean()

In [11]:
# Use the GPU when one is visible, otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Shuffled mini-batches of (user, positive, negative) triples.
dataset = BPRDataset(train_data, n_items)
loader = DataLoader(dataset, shuffle=True, batch_size=1024)

# The adjacency is a plain attribute of the model, so it is moved to the
# device explicitly before being handed over.
model = LightGCN(
    n_users,
    n_items,
    emb_dim=64,
    n_layers=3,
    adj=norm_adj.to(device),
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [12]:
for epoch in range(25):
    model.train()
    total_loss = 0

    for u, pos, neg in loader:
        u, pos, neg = u.to(device), pos.to(device), neg.to(device)

        optimizer.zero_grad()

        # Run the full propagation for every batch: the embedding tables
        # change after each optimizer step, so earlier outputs are stale.
        user_emb, item_emb = model()
        loss = bpr_loss(user_emb[u], item_emb[pos], item_emb[neg])

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

Epoch 1, Loss: 0.6485
Epoch 2, Loss: 0.4313
Epoch 3, Loss: 0.3518
Epoch 4, Loss: 0.3409
Epoch 5, Loss: 0.3363
Epoch 6, Loss: 0.3351
Epoch 7, Loss: 0.3324
Epoch 8, Loss: 0.3288
Epoch 9, Loss: 0.3253
Epoch 10, Loss: 0.3224
Epoch 11, Loss: 0.3135
Epoch 12, Loss: 0.3093
Epoch 13, Loss: 0.3064
Epoch 14, Loss: 0.2951
Epoch 15, Loss: 0.2890
Epoch 16, Loss: 0.2847
Epoch 17, Loss: 0.2790
Epoch 18, Loss: 0.2775
Epoch 19, Loss: 0.2727
Epoch 20, Loss: 0.2718
Epoch 21, Loss: 0.2672
Epoch 22, Loss: 0.2663
Epoch 23, Loss: 0.2613
Epoch 24, Loss: 0.2583
Epoch 25, Loss: 0.2577


score_mean: 3.7944698333740234
score_std : 1.6145498752593994


Global mean rating: 3.5238268742409184


In [25]:
import pandas as pd
import torch
import numpy as np

# 1. Load the headerless, tab-separated (user, movie) test pairs.
test_df = pd.read_csv(
    '/kaggle/input/movie-recomendation-fall-2020/test.txt',
    names=['user_id', 'movie_id'],
    header=None,
    sep='\t',
)

# 2. Create the submission Id (starting from 1).
test_df['Id'] = np.arange(len(test_df)) + 1

# 3./4. Map raw ids with the dictionaries built from the training data;
#       ids that never appeared in training map to NaN and are replaced by -1.
test_df['uid'] = test_df['user_id'].map(user2id).fillna(-1).astype(int)
test_df['iid'] = test_df['movie_id'].map(item2id).fillna(-1).astype(int)

# 5. Compute the final embeddings once, in eval mode and without gradients.
model.eval()
with torch.no_grad():
    user_emb, item_emb = model()

# 6. Vectorized prediction.
uids = torch.tensor(test_df['uid'].to_numpy(), dtype=torch.long).to(device)
iids = torch.tensor(test_df['iid'].to_numpy(), dtype=torch.long).to(device)

scores = torch.zeros(len(test_df), device=device)

# Only pairs whose user AND item were both seen in training get a dot-product score.
valid_mask = (uids >= 0) & (iids >= 0)
if valid_mask.any():
    scores[valid_mask] = (
        user_emb[uids[valid_mask]] * item_emb[iids[valid_mask]]
    ).sum(dim=1)

# Pairs with a new user or item keep the initial score of 0.
# NOTE(review): confirm that 0 is the intended fallback for unseen users/items.

# 7. Assemble the submission table.
submit_df = pd.DataFrame({
    'Id': test_df['Id'],
    'Score': scores.cpu().numpy(),
})

# 8. Save the file.
submit_path = '/kaggle/working/LightGCN.csv'
submit_df.to_csv(submit_path, index=False)

print("âœ… Submission saved to", submit_path)
print("\nFirst 5 rows:")
print(submit_df.head())

âœ… Submission saved to /kaggle/working/LightGCN.csv

First 5 rows:
   Id     Score
0   1  2.548569
1   2  3.349087
2   3  2.464196
3   4  5.602284
4   5  3.395258
