In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error

1. Data Preparation

1.1 Imports and dataset

In [2]:
# Load the dataset
url = "../../ratings_Electronics (1).csv"
col_names = ['userId', 'productId', 'rating', 'timestamp']
df = pd.read_csv(url,
                 header = None,
                 names = col_names)

df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


1.2 Preprocessing

In [4]:

# Clean the data
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Crop data
df = df.head(10000)

# Encode user IDs and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['userId'] = user_encoder.fit_transform(df['userId'])
df['productId'] = item_encoder.fit_transform(df['productId'])

# Split the data into training, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Display the shape of the datasets
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

# user_item_matrix = train_df.pivot_table(index='userId', columns='productId', values='rating', fill_value=0)
# print(user_item_matrix)

Training set shape: (6400, 4)
Validation set shape: (1600, 4)
Test set shape: (2000, 4)


In [10]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,8690,0,5.0,1365811200
1,3470,1,5.0,1341100800
2,4247,2,1.0,1367193600
3,4889,2,3.0,1374451200
4,1206,2,1.0,1334707200


2. Graph Construction

In [12]:
# Create edge index from user-item interactions
edge_index = torch.tensor(np.array([train_df['userId'].values, train_df['productId'].values]), dtype=torch.long)

# Create edge attributes (ratings)
edge_attr = torch.tensor(train_df['rating'].values, dtype=torch.float)

# Create the PyTorch Geometric data object
data = Data(edge_index=edge_index, edge_attr=edge_attr)

# Display the data object
data

Data(edge_index=[2, 6400], edge_attr=[6400])

3. Feature Engineering

In [13]:
num_users = df['userId'].nunique()
num_items = df['productId'].nunique()
num_nodes = num_users + num_items

# Create node features
node_features = torch.eye(num_nodes)

# Add node features to the data object
data.x = node_features

# Display the updated data object
data

Data(edge_index=[2, 6400], edge_attr=[6400], x=[11131, 11131])

4. Model

In [16]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels * 2, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Apply the final linear layer on the concatenated edge features
        edge_pred = self.fc(torch.cat([x[edge_index[0]], x[edge_index[1]]], dim=1))
        return edge_pred.squeeze()

# Initialize the model
model = GCN(in_channels=node_features.size(1), hidden_channels=16, out_channels=1)

print(model)

GCN(
  (conv1): GCNConv(11131, 16)
  (conv2): GCNConv(16, 16)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


5. Training

In [17]:
# Prepare the data loader
train_loader = DataLoader([data], batch_size=2, shuffle=True)

# Define the loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(200):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.edge_attr.view(-1, 1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 18.296546936035156
Epoch 2, Loss: 17.326868057250977
Epoch 3, Loss: 15.77271556854248
Epoch 4, Loss: 13.883223533630371
Epoch 5, Loss: 11.822896957397461
Epoch 6, Loss: 9.822543144226074
Epoch 7, Loss: 8.233381271362305
Epoch 8, Loss: 7.431187629699707
Epoch 9, Loss: 7.485739707946777
Epoch 10, Loss: 7.745590686798096
Epoch 11, Loss: 7.448578357696533
Epoch 12, Loss: 6.584700107574463
Epoch 13, Loss: 5.592834949493408
Epoch 14, Loss: 4.798435211181641
Epoch 15, Loss: 4.304017066955566
Epoch 16, Loss: 4.054843902587891
Epoch 17, Loss: 3.9369192123413086
Epoch 18, Loss: 3.8436503410339355
Epoch 19, Loss: 3.709399700164795
Epoch 20, Loss: 3.523167133331299
Epoch 21, Loss: 3.3166046142578125
Epoch 22, Loss: 3.1433513164520264
Epoch 23, Loss: 3.049170732498169
Epoch 24, Loss: 3.0435893535614014
Epoch 25, Loss: 3.085623025894165
Epoch 26, Loss: 3.1048836708068848
Epoch 27, Loss: 3.054145574569702
Epoch 28, Loss: 2.9405934810638428
Epoch 29, Loss: 2.8095643520355225
Epoch 30, L

6. Evaluation

In [19]:
# Convert validation and test data to PyTorch Geometric format
val_edge_index = torch.tensor([val_df['userId'].values, val_df['productId'].values], dtype=torch.long)
val_edge_attr = torch.tensor(val_df['rating'].values, dtype=torch.float)

test_edge_index = torch.tensor([test_df['userId'].values, test_df['productId'].values], dtype=torch.long)
test_edge_attr = torch.tensor(test_df['rating'].values, dtype=torch.float)

# Create data objects for validation and test sets
val_data = Data(edge_index=val_edge_index, edge_attr=val_edge_attr, x=node_features)
test_data = Data(edge_index=test_edge_index, edge_attr=test_edge_attr, x=node_features)

# Evaluate the model
model.eval()
with torch.no_grad():
    val_out = model(val_data)
    test_out = model(test_data)

# Calculate evaluation metrics
val_rmse = mean_squared_error(val_edge_attr.numpy(), val_out.numpy())
val_mae = mean_absolute_error(val_edge_attr.numpy(), val_out.numpy())

test_rmse = mean_squared_error(test_edge_attr.numpy(), test_out.numpy())
test_mae = mean_absolute_error(test_edge_attr.numpy(), test_out.numpy())

print(f'Validation RMSE: {val_rmse}, Validation MAE: {val_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Validation RMSE: 2.4902966022491455, Validation MAE: 1.422216534614563
Test RMSE: 2.4232308864593506, Test MAE: 1.3926533460617065


In [28]:
# Assuming val_edge_index, val_edge_attr, and val_out are defined
# Convert val_edge_index to a DataFrame
user_ids = val_edge_index[0].numpy()  # User IDs
product_ids = val_edge_index[1].numpy()  # Product IDs
ratings = val_edge_attr.numpy()  # Actual ratings
predicted_ratings = val_out.numpy()  # Predicted ratings

# Create a DataFrame
comparison_df = pd.DataFrame({
    'User ID': user_ids,
    'Product ID': product_ids,
    'Rating (val_edge_attr)': ratings,
    'Predicted Rating (val_out)': predicted_ratings
})

# Display the DataFrame
print(comparison_df.head(50))

    User ID  Product ID  Rating (val_edge_attr)  Predicted Rating (val_out)
0      4649          63                     4.0                    3.349208
1      3774        1229                     5.0                    3.506133
2      9226          57                     5.0                    3.268442
3      3495        1200                     3.0                    3.103469
4      5074          53                     4.0                    3.527458
5      2208          38                     5.0                    4.289050
6      4122          61                     5.0                    3.666212
7      9461          53                     5.0                    3.531311
8      3750          38                     5.0                    4.283827
9      8434          65                     2.0                    3.256761
10     8589         262                     3.0                    3.225618
11     1350         782                     5.0                    3.311023
12     5311 