In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load data
print("Loading data...")
train_data = pd.read_csv('/kaggle/input/movie-recomendation-fall-2020/train.txt', sep='\t', header=None, names=['user_id', 'movie_id', 'rating'])

print(f"Dataset shape: {train_data.shape}")
print(f"Number of unique users: {train_data['user_id'].nunique()}")
print(f"Number of unique movies: {train_data['movie_id'].nunique()}")
print(f"Rating range: {train_data['rating'].min()} - {train_data['rating'].max()}")



Using device: cuda
Loading data...
Dataset shape: (90570, 3)
Number of unique users: 943
Number of unique movies: 1680
Rating range: 1 - 5


In [2]:
user_ids = sorted(train_data['user_id'].unique())  # unique(): L·∫•y ra danh s√°ch c√°c ph·∫ßn t·ª≠ duy nh·∫•t trong c·ªôt; sorted():S·∫Øp x·∫øp theo th·ª© t·ª± t·ª´ l·ªõn ->b√©
user_id_mapping = {id: i for i, id in enumerate(user_ids)} # Dictionary Chuy·ªÉn id th√†nh s·ªë theo th·ª© t·ª± 
item_ids = sorted(train_data['movie_id'].unique())  # 1 to 1682
item_id_mapping = {id: i for i, id in enumerate(item_ids)}

train_data['user_id'] = train_data['user_id'].map(user_id_mapping) # L√∫c n√†y t·∫•t c·∫£ c√°c id ƒë·ªÅu theo th·ª© t·ª± v√† kh√¥ng b·ªã m·∫•t d·ªØ li·ªáu th·ª© t·ª± c·ªßa user_id
train_data['movie_id'] = train_data['movie_id'].map(item_id_mapping)

num_users = len(user_ids)
num_items = len(item_ids)

print("Number of unique users:", num_users)
print("Number of unique items:", num_items)

Number of unique users: 943
Number of unique items: 1680


In [3]:
train_interactions = torch.tensor(train_data[['user_id', 'movie_id']].values, dtype=torch.long)
print(train_interactions.shape)
print(train_interactions[:10])
print("Data type:", train_interactions.dtype)
print("Device:", train_interactions.device)

torch.Size([90570, 2])
tensor([[0, 0],
        [0, 1],
        [0, 2],
        [0, 3],
        [0, 4],
        [0, 5],
        [0, 6],
        [0, 7],
        [0, 8],
        [0, 9]])
Data type: torch.int64
Device: cpu


In [4]:
# Adjacency matrix
rows = torch.cat([train_interactions[:, 0], train_interactions[:, 1] + num_users], dim=0) #N·ªëi 2 c·ªôt user_id v√† movie_id ƒë·ªÉ h·ªçc t∆∞∆°ng t√°c
cols = torch.cat([train_interactions[:, 1] + num_users, train_interactions[:, 0]], dim=0)
# rows = [0, 0, 1, 4, 5, 3]  ## C√°c ph·∫ßn t·ª≠ trong rows v√† cols l√† gi·ªëng nhau, ch·ªâ kh√°c nhau v·ªÅ th·ª© t·ª± ƒë·ªÉ h·ªçc c√°c t∆∞∆°ng t√°c
# cols = [4, 5, 3, 0, 0, 1]
#          ‚Üì   ‚Üì  ‚Üì  ‚Üì  ‚Üì  ‚Üì
# C·∫°nh: (0,4), (0,5), (1,3), (4,0), (5,0), (3,1)
indices = torch.stack([rows, cols], dim=0).to(device)
values = torch.ones(indices.shape[1], device=device) #values :Tensor to√†n s·ªë 1 v√† indices.shape[1] th·ªÉ hi·ªán cho s·ªë c·∫°nh 
adj = torch.sparse_coo_tensor(indices, values, size=(num_users + num_items, num_users + num_items), device=device)
#adj :Ma tr·∫≠n k·ªÅ bi·ªÉu di·ªÖn t∆∞∆°ng t√°c gi·ªØa item v√† user
# Normalized adjacency matrix
degrees = torch.sparse.sum(adj, dim=1).to_dense() #T√≠nh b·∫≠c c·ªßa c√°c node b·∫±ng c√°c t√≠nh t·ªïng t·ª´ng h√†ng->degrees: m·∫£ng bi·ªÉu di·ªÖn b·∫≠c c·ªßa t·ª´ng node
norm_values = 1.0 / (torch.sqrt(degrees[rows]) * torch.sqrt(degrees[cols])).to(device) #T√≠nh h·ªá s·ªë c·∫°nh gi·ªØa user v√† item
#A_norm[u, i] = 1 / (‚àödegree(u) * ‚àödegree(i)):Ph·∫ßn t·ª≠ ƒë·∫°i di·ªán cho g√≠a tr·ªã c·∫°nh trong ma tr·∫≠n norm_adj
norm_adj = torch.sparse_coo_tensor(indices, norm_values, size=(num_users + num_items, num_users + num_items), device=device) 


In [5]:
class SimGCL(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, num_layers, norm_adj, device):
        super(SimGCL, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.device = device
        self.register_buffer('norm_adj', norm_adj)
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        nn.init.normal_(self.user_embeddings.weight, std=0.01)  # Initialize user embeddings
        nn.init.normal_(self.item_embeddings.weight, std=0.01)  # Initialize item embeddings
        self.eps = 0.1 

    #H√†m n√†y truy·ªÅn th√¥ng tin qua c√°c  layer GNN ƒë·ªÉ t·∫°o ra c√°c Embedding cu·ªëi c√πng cho user v√† item
    def forward(self, perturbed=False):
        # Concatenate initial user and item embeddings
        ego_embeddings = torch.cat([self.user_embeddings.weight, self.item_embeddings.weight], dim=0)
        #self.user_embedding: (num_user,embedding_dim); self.item_embedding: (num_item,embedding_dim)
        ## ---> ego_embeddings: (num_user+num_item,embedding_dim):Gh√©p 2 ma tr·∫≠n tr√™n l·∫°i vs nhau
        #ego_embeddings ch·ªâ th·ªÉ hi·ªán embedding c·ªßa c√°c node m√† ko c√≥ s·ª± t∆∞∆°ng t√°c  gi·ªØa c√°c node 
        all_embeddings = []

        #L·∫∑p qua s·ªë l∆∞·ª£ng c√°c layer GNN
        for k in range(self.num_layers):
            
            ego_embeddings = torch.spmm(self.norm_adj, ego_embeddings) # Cho norm_adj(num_item+num_user;num_item+num_user)*(num_item+num_user;embeddings_dim)
           #ego_embeddings: l√∫c n√†y l√† c√°c embedding th·ªÉ hi·ªán ƒëc ƒë·∫∑c tr∆∞ng c·ªßa node(ƒë√£ bao g·ªìm vs s·ª± t∆∞∆°ng t√°c c·ªßa c√°c node kh√°c vs n√≥)

            #Nhi·ªÖu n√†y kh√¥ng ch·ªâ th√™m 1 l·∫ßn v√†o ban ƒë·∫ßu m√† th√™m v√†o m·ªói l·ªõp GNN
            #Th√™m nhi·ªÖu v√†o m√¥ h√¨nh 
            if perturbed:
                #Ma tr·∫≠n nhi·ªÖm t·ª´ [0-1]
                random_noise = torch.rand_like(ego_embeddings).to(self.device)
                # Add normalized noise scaled by eps
                ego_embeddings += torch.sign(ego_embeddings) * F.normalize(random_noise, dim=-1) * self.eps

            #Qua m·ªói l·ªõp GNN, l∆∞u l·∫°i to√†n b·ªô embedding ƒë·∫°i di·ªán cho c√°c node
            all_embeddings.append(ego_embeddings)

        # Stack embeddings across layers and compute the mean
        all_embeddings = torch.stack(all_embeddings, dim=1)
        all_embeddings = torch.mean(all_embeddings, dim=1)

        # Split into user and item embeddings
        user_all_embeddings, item_all_embeddings = torch.split(
            all_embeddings, [self.num_users, self.num_items]) 
 # chia ma tr·∫≠n all_embeddings(num_user+item_user,embedding) th√†nh 2 ma tr·∫≠n user_all_embedding(num_user,embedding)v√† item_all_embedding(item_user,embedding)

        return user_all_embeddings, item_all_embeddings

    def get_embeddings(self):
        all_embeddings = torch.cat([self.user_embeddings.weight, self.item_embeddings.weight], dim=0)
        ego_embeddings = all_embeddings
        for _ in range(self.num_layers):
            all_embeddings = torch.spmm(self.norm_adj, all_embeddings)
            ego_embeddings += all_embeddings
        final_embeddings = ego_embeddings / (self.num_layers + 1)
        user_emb = final_embeddings[:self.num_users]
        item_emb = final_embeddings[self.num_users:]
        return user_emb, item_emb

In [7]:
import torch.optim as optim
# Hyperparameters
embedding_dim = 64
num_layers = 3
learning_rate =1e-3
lambda_reg = 1e-6  # Regularization weight
cl_rate = 0.01  # Contrastive loss weight

# Initialize model
model = SimGCL(num_users, num_items, embedding_dim, num_layers, norm_adj, device)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [8]:
def InfoNCE(view1, view2, temperature: float, b_cos: bool = True):
    if b_cos:
        view1, view2 = F.normalize(view1, dim=1), F.normalize(view2, dim=1)
    pos_score = (view1 @ view2.T) / temperature
    score = torch.diag(F.log_softmax(pos_score, dim=1))
    return -score.mean()

def bpr_loss(user_emb, pos_item_emb, neg_item_emb):
    pos_score = (user_emb * pos_item_emb).sum(dim=1)
    neg_score = (user_emb * neg_item_emb).sum(dim=1)
    loss = -torch.log(torch.sigmoid(pos_score - neg_score) + 1e-8)
    return loss.mean()



In [13]:
def train_simgcl(
    model,
    train_interactions,
    num_epochs=50,
    batch_size=256,
    temperature=0.2,
    learning_rate=0.001,
    lambda_cl=0.1
):
    device = model.device
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_dataset = TensorDataset(
        train_interactions[:, 0],  # user
        train_interactions[:, 1]   # pos item
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        total_loss = 0

        for batch_users, batch_pos_items in train_loader:
            batch_users = batch_users.to(device)
            batch_pos_items = batch_pos_items.to(device)

            # --------- Negative sampling ---------
            batch_neg_items = torch.randint(
                low=0,
                high=model.num_items,
                size=batch_pos_items.size(),
                device=device
            )

            optimizer.zero_grad()

            # ==================================================
            # 1. BPR LOSS (NO perturbation)
            # ==================================================
            user_emb, item_emb = model(perturbed=False)

            u_emb = user_emb[batch_users]
            pos_emb = item_emb[batch_pos_items]
            neg_emb = item_emb[batch_neg_items]

            loss_bpr = bpr_loss(u_emb, pos_emb, neg_emb)

            # ==================================================
            # 2. CONTRASTIVE LOSS (WITH perturbation)
            # ==================================================
            user_emb1, item_emb1 = model(perturbed=True)
            user_emb2, item_emb2 = model(perturbed=True)

            cl_user = InfoNCE(
                user_emb1[batch_users],
                user_emb2[batch_users],
                temperature
            )

            cl_item = InfoNCE(
                item_emb1[batch_pos_items],
                item_emb2[batch_pos_items],
                temperature
            )

            loss_cl = cl_user + cl_item

            # ==================================================
            # 3. JOINT LOSS
            # ==================================================
            loss = loss_bpr + lambda_cl * loss_cl

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(
            f"Epoch [{epoch+1}/{num_epochs}] | "
            f"Loss: {total_loss / len(train_loader):.4f} | "
            f"BPR: {loss_bpr.item():.4f} | CL: {loss_cl.item():.4f}"
        )

    return model


In [14]:
 train_interactions_tensor = torch.tensor(train_interactions, dtype=torch.long).to(device)

  train_interactions_tensor = torch.tensor(train_interactions, dtype=torch.long).to(device)


In [15]:
from torch.utils.data import TensorDataset, DataLoader
trained_model = train_simgcl(
        model=model,
        train_interactions=train_interactions_tensor,
        num_epochs=50,
        batch_size=256,
        temperature=0.2,
        learning_rate=0.001
    )

Epoch [1/50] | Loss: 0.9398 | BPR: 0.6865 | CL: 2.1589
Epoch [2/50] | Loss: 0.9385 | BPR: 0.6856 | CL: 2.1898
Epoch [3/50] | Loss: 0.9376 | BPR: 0.6842 | CL: 2.2012
Epoch [4/50] | Loss: 0.9366 | BPR: 0.6814 | CL: 2.1593
Epoch [5/50] | Loss: 0.9357 | BPR: 0.6824 | CL: 2.1644
Epoch [6/50] | Loss: 0.9342 | BPR: 0.6802 | CL: 2.2465
Epoch [7/50] | Loss: 0.9327 | BPR: 0.6794 | CL: 2.2315
Epoch [8/50] | Loss: 0.9312 | BPR: 0.6802 | CL: 2.2227
Epoch [9/50] | Loss: 0.9299 | BPR: 0.6694 | CL: 2.2504
Epoch [10/50] | Loss: 0.9280 | BPR: 0.6673 | CL: 2.2031
Epoch [11/50] | Loss: 0.9261 | BPR: 0.6735 | CL: 2.1967
Epoch [12/50] | Loss: 0.9242 | BPR: 0.6688 | CL: 2.2405
Epoch [13/50] | Loss: 0.9218 | BPR: 0.6696 | CL: 2.2431
Epoch [14/50] | Loss: 0.9196 | BPR: 0.6493 | CL: 2.2510
Epoch [15/50] | Loss: 0.9170 | BPR: 0.6351 | CL: 2.3024
Epoch [16/50] | Loss: 0.9145 | BPR: 0.6655 | CL: 2.2218
Epoch [17/50] | Loss: 0.9116 | BPR: 0.6545 | CL: 2.2363
Epoch [18/50] | Loss: 0.9082 | BPR: 0.6685 | CL: 2.2793
E

In [16]:
import pandas as pd
import numpy as np
import torch

# 1. T·∫£i l·∫°i t·∫≠p test v·ªõi ƒë√∫ng ƒë·ªãnh d·∫°ng
test_df = pd.read_csv('/kaggle/input/movie-recomendation-fall-2020/test.txt', 
                     sep='\t', 
                     header=None,
                     names=['user_id', 'movie_id'])

print("Test data shape:", test_df.shape)
print("Test data columns:", test_df.columns.tolist())
print("\nFirst few rows:")
print(test_df.head())
print("\nD·ªØ li·ªáu test:")
print(test_df.info())

# 2. T·∫°o c·ªôt Id cho file submit (t·ª´ 1 ƒë·∫øn s·ªë l∆∞·ª£ng m·∫´u)
test_df['Id'] = range(1, len(test_df) + 1)

# 3. √Ånh x·∫° user_id v√† movie_id sang index ƒë√£ chu·∫©n h√≥a
test_df['user_id_mapped'] = test_df['user_id'].map(user_id_mapping)
test_df['movie_id_mapped'] = test_df['movie_id'].map(item_id_mapping)

# ƒê·∫øm s·ªë user/item kh√¥ng c√≥ trong t·∫≠p train
missing_users = test_df['user_id_mapped'].isna().sum()
missing_items = test_df['movie_id_mapped'].isna().sum()
print(f"\nS·ªë user kh√¥ng c√≥ trong t·∫≠p train: {missing_users}/{len(test_df)}")
print(f"S·ªë item kh√¥ng c√≥ trong t·∫≠p train: {missing_items}/{len(test_df)}")

# 4. ƒê·∫£m b·∫£o model ·ªü ch·∫ø ƒë·ªô ƒë√°nh gi√°
trained_model.eval()

# 5. L·∫•y embeddings t·ª´ model
with torch.no_grad():
    user_emb, item_emb = trained_model.get_embeddings()

# 6. T·∫°o d·ª± ƒëo√°n
predictions = []

for _, row in test_df.iterrows():
    user_idx = row['user_id_mapped']
    item_idx = row['movie_id_mapped']
    
    # Ki·ªÉm tra xem user/item c√≥ trong t·∫≠p train kh√¥ng
    if pd.isna(user_idx) or pd.isna(item_idx):
        # N·∫øu user ho·∫∑c item m·ªõi, d√πng rating trung b√¨nh = 2.5
        pred_rating = 2.5
    else:
        # Chuy·ªÉn sang integer
        user_idx = int(user_idx)
        item_idx = int(item_idx)
        
        # L·∫•y embeddings
        user_embedding = user_emb[user_idx]
        item_embedding = item_emb[item_idx]
        
        # T√≠nh rating d·ª± ƒëo√°n b·∫±ng dot product
        pred_rating = torch.dot(user_embedding, item_embedding).item()
        
        # Chu·∫©n h√≥a rating v·ªÅ kho·∫£ng [1, 5]
        # C√°ch 1: D√πng sigmoid function
        pred_rating = 1 + 4 * (1 / (1 + np.exp(-pred_rating)))
        
        # C√°ch 2: Ho·∫∑c scale d·ª±a tr√™n min-max (c·∫ßn t√≠nh t·ª´ train)
        # Gi·∫£ s·ª≠ dot product trong kho·∫£ng [-10, 10]
        # pred_rating = (pred_rating + 10) / 20 * 4 + 1
        
        # ƒê·∫£m b·∫£o trong kho·∫£ng [1, 5]
        pred_rating = max(1.0, min(5.0, pred_rating))
    
    predictions.append(pred_rating)

# 7. T·∫°o file submit
test_df['Score'] = predictions

# Ch·ªâ l·∫•y 2 c·ªôt c·∫ßn thi·∫øt cho submission
submit_df = test_df[['Id', 'Score']].copy()

# 8. L∆∞u file submit
submit_path = '/kaggle/working/SimGCL2.csv'
submit_df.to_csv(submit_path, index=False)

print(f"\n‚úÖ File submit ƒë√£ ƒë∆∞·ª£c t·∫°o t·∫°i: {submit_path}")
print(f"K√≠ch th∆∞·ªõc file: {submit_df.shape}")
print(f"\n10 d√≤ng ƒë·∫ßu ti√™n c·ªßa file submit:")
print(submit_df.head(10))
print(f"\nTh·ªëng k√™ rating d·ª± ƒëo√°n:")
print(submit_df['Score'].describe())

# 9. Ki·ªÉm tra ƒë·ªãnh d·∫°ng file
print(f"\nüìã Ki·ªÉm tra file submit:")
sample = pd.read_csv(submit_path)
print(f"C·ªôt: {sample.columns.tolist()}")
print(f"Ki·ªÉu d·ªØ li·ªáu: {sample.dtypes.tolist()}")
print(f"Min Score: {sample['Score'].min():.4f}")
print(f"Max Score: {sample['Score'].max():.4f}")
print(f"Mean Score: {sample['Score'].mean():.4f}")

# 10. L∆∞u th√™m b·∫£n backup ƒë·ªÉ ki·ªÉm tra
backup_path = '/kaggle/working/submission_with_details.csv'
test_df[['Id', 'user_id', 'movie_id', 'user_id_mapped', 'movie_id_mapped', 'Score']].to_csv(backup_path, index=False)
print(f"\nüìÅ File backup chi ti·∫øt: {backup_path}")

Test data shape: (9430, 2)
Test data columns: ['user_id', 'movie_id']

First few rows:
   user_id  movie_id
0        1        20
1        1        33
2        1        61
3        1       117
4        1       155

D·ªØ li·ªáu test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9430 entries, 0 to 9429
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   9430 non-null   int64
 1   movie_id  9430 non-null   int64
dtypes: int64(2)
memory usage: 147.5 KB
None

S·ªë user kh√¥ng c√≥ trong t·∫≠p train: 0/9430
S·ªë item kh√¥ng c√≥ trong t·∫≠p train: 2/9430

‚úÖ File submit ƒë√£ ƒë∆∞·ª£c t·∫°o t·∫°i: /kaggle/working/SimGCL2.csv
K√≠ch th∆∞·ªõc file: (9430, 2)

10 d√≤ng ƒë·∫ßu ti√™n c·ªßa file submit:
   Id     Score
0   1  4.999327
1   2  1.110645
2   3  3.881274
3   4  3.088794
4   5  4.762532
5   6  3.679695
6   7  4.930430
7   8  4.987152
8   9  1.886794
9  10  1.696910

Th·ªëng k√™ rating d·ª± ƒëo√°n:
count    9430.000000