
###Downloading Dataset

In [1]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("saurav9786/amazon-product-reviews")

# print("Path to dataset files:", path)

1.Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from torchviz import make_dot

In [2]:
# Load the dataset
url = "./1/ratings_Electronics (1).csv"
col_names = ["user_id", "product_id", "rating", "timestamp"]
df = pd.read_csv(url, 
                  header = None,
                 names = col_names)
# Crop data
df = df.head(10000)

df.dropna(inplace = True)
df.drop_duplicates(inplace=True)


# Encode user IDs and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['product_id'] = item_encoder.fit_transform(df['product_id'])

# Split the data into training, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Display the shape of the datasets
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("Test set shape:", test_df.shape)

df.head()

Training set shape: (8000, 4)
Validation set shape: (1000, 4)
Test set shape: (1000, 4)


Unnamed: 0,user_id,product_id,rating,timestamp
0,8690,0,5.0,1365811200
1,3470,1,5.0,1341100800
2,4247,2,1.0,1367193600
3,4889,2,3.0,1374451200
4,1206,2,1.0,1334707200


2.Graph Contruction

In [3]:
# Create edge index from user-item interactions
edge_index = torch.tensor(np.array([train_df['user_id'].values, train_df['product_id'].values]), dtype=torch.long)

# Create edge attributes (ratings)
edge_attr = torch.tensor(train_df['rating'].values, dtype=torch.float)

# Create the PyTorch Geometric data object
data = Data(edge_index=edge_index, edge_attr=edge_attr)

# Display the data object
data

Data(edge_index=[2, 8000], edge_attr=[8000])

3. Feature Engineering 

In [4]:
num_users = df['user_id'].nunique()
num_items = df['product_id'].nunique()
num_nodes = num_users + num_items

# Create node features
node_features = torch.eye(num_nodes)

# Add node features to the data object
data.x = node_features

# Display the updated data object
data

Data(edge_index=[2, 8000], edge_attr=[8000], x=[11131, 11131])

4. Model

In [5]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels * 2, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Apply the final linear layer on the concatenated edge features
        edge_pred = self.fc(torch.cat([x[edge_index[0]], x[edge_index[1]]], dim=1))
        return edge_pred.squeeze()

# Initialize the model
model = GCN(in_channels=node_features.size(1), hidden_channels=16, out_channels=1)

print(model)

GCN(
  (conv1): GCNConv(11131, 16)
  (conv2): GCNConv(16, 16)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


4.1. Drawing the architecture

In [6]:

# Create a dummy input for visualization
dummy_data = torch.zeros((1, node_features.size(1)))  # Adjust the size as needed
dummy_edge_index = torch.tensor([[0], [0]], dtype=torch.long)  # Example edge index
dummy_data_object = Data(x=dummy_data, edge_index=dummy_edge_index)

# Forward pass to get the output
output = model(dummy_data_object)

# Visualize the model architecture
dot = make_dot(output, params=dict(model.named_parameters()))
dot.render("gcn_model_architecture", format="png")  # Save as PNG file

'gcn_model_architecture.png'

5. Training

In [7]:
# Prepare the data loader
train_loader = DataLoader([data], batch_size=16, shuffle=True)

# Define the loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(200):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.edge_attr.view(-1, 1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 18.598840713500977
Epoch 2, Loss: 17.447160720825195
Epoch 3, Loss: 15.80548095703125
Epoch 4, Loss: 13.80807113647461
Epoch 5, Loss: 11.646232604980469
Epoch 6, Loss: 9.628479957580566
Epoch 7, Loss: 8.216821670532227
Epoch 8, Loss: 7.831818580627441
Epoch 9, Loss: 8.222282409667969
Epoch 10, Loss: 8.225130081176758
Epoch 11, Loss: 7.493508815765381
Epoch 12, Loss: 6.430135726928711
Epoch 13, Loss: 5.502135276794434
Epoch 14, Loss: 4.850258827209473
Epoch 15, Loss: 4.511412620544434
Epoch 16, Loss: 4.360482215881348
Epoch 17, Loss: 4.254871845245361
Epoch 18, Loss: 4.110317230224609
Epoch 19, Loss: 3.9027109146118164
Epoch 20, Loss: 3.6668694019317627
Epoch 21, Loss: 3.464456081390381
Epoch 22, Loss: 3.3498716354370117
Epoch 23, Loss: 3.3334872722625732
Epoch 24, Loss: 3.3537817001342773
Epoch 25, Loss: 3.3182876110076904
Epoch 26, Loss: 3.199431896209717
Epoch 27, Loss: 3.043088674545288
Epoch 28, Loss: 2.902818441390991
Epoch 29, Loss: 2.808452606201172
Epoch 30, Loss

6.Evaluation

In [10]:
# Convert validation and test data to PyTorch Geometric format
val_edge_index = torch.tensor([val_df['user_id'].values, val_df['product_id'].values], dtype=torch.long)
val_edge_attr = torch.tensor(val_df['rating'].values, dtype=torch.float)

test_edge_index = torch.tensor([test_df['user_id'].values, test_df['product_id'].values], dtype=torch.long)
test_edge_attr = torch.tensor(test_df['rating'].values, dtype=torch.float)

# Create data objects for validation and test sets
val_data = Data(edge_index=val_edge_index, edge_attr=val_edge_attr, x=node_features)
test_data = Data(edge_index=test_edge_index, edge_attr=test_edge_attr, x=node_features)


# Calculate evaluation metrics
val_mse = mean_squared_error(val_edge_attr.numpy(), val_out.numpy())
val_rmse = np.sqrt(val_mse)  # Manually calculate RMSE
val_mae = mean_absolute_error(val_edge_attr.numpy(), val_out.numpy())

test_mse = mean_squared_error(test_edge_attr.numpy(), test_out.numpy())
test_rmse = np.sqrt(test_mse)  # Manually calculate RMSE
test_mae = mean_absolute_error(test_edge_attr.numpy(), test_out.numpy())

print(f'Validation RMSE: {val_rmse}, Validation MAE: {val_mae}')
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}')

Validation RMSE: 1.4742927666112078, Validation MAE: 1.275151252746582
Test RMSE: 1.4363036775310978, Test MAE: 1.2575424909591675


In [11]:
test_data

Data(x=[11131, 11131], edge_index=[2, 1000], edge_attr=[1000])