In [2]:
#!pip install datasets torch 

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

In [3]:
movies = load_dataset("ExecuteAutomation/ImdbMovieDataSet")
music = load_dataset("maharshipandya/spotify-tracks-dataset")
books = load_dataset("Eitanli/goodreads")

README.md:   0%|          | 0.00/850 [00:00<?, ?B/s]

imdb_movies.csv:   0%|          | 0.00/6.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/4.68k [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/114000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/737 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


goodreads_data.csv:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
movies_df = movies['train'].to_pandas()
music_df = music['train'].to_pandas()
books_df = books['train'].to_pandas()

In [5]:
print(movies_df.columns)
print(music_df.columns)
print(books_df.columns)

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')
Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')
Index(['Unnamed: 0', 'Book', 'Author', 'Description', 'Genres', 'Avg_Rating',
       'Num_Ratings', 'URL'],
      dtype='object')


In [6]:
# Assign unique Item_IDs
movies_df["Item_ID"] = "Movie_" + movies_df["names"].astype(str)
music_df["Item_ID"] = "Music_" + music_df["track_id"].astype(str)
books_df["Item_ID"] = "Book_" + books_df["Book"].astype(str)

# Standardizing Columns
movies_df = movies_df[["Item_ID", "orig_title", "genre", "overview", "score", "crew", "date_x"]]
music_df = music_df[["Item_ID", "track_name", "track_genre", "popularity", "artists"]]
books_df = books_df[["Item_ID", "Book", "Genres", "Description", "Avg_Rating", "Author"]]

# Rename for consistency
movies_df.rename(columns={"orig_title": "Title", "genre": "Genre", "overview": "Description", "score": "Popularity", "crew": "Creator", "date_x": "Timestamp"}, inplace=True)
music_df.rename(columns={"track_name": "Title", "track_genre": "Genre", "popularity": "Popularity", "artists": "Creator"}, inplace=True)
books_df.rename(columns={"Book": "Title", "Genres": "Genre", "Description": "Description", "Avg_Rating": "Popularity", "Author": "Creator"}, inplace=True)

# Add Item_Type
movies_df["Item_Type"] = "Movie"
music_df["Item_Type"] = "Music"
books_df["Item_Type"] = "Book"

# Handle missing timestamps (generate random timestamps)
movies_df["Timestamp"] = pd.to_datetime(movies_df["Timestamp"], errors="coerce")
music_df["Timestamp"] = pd.to_datetime("2024-03-17")  # Static timestamp
books_df["Timestamp"] = pd.to_datetime("2024-03-17")  # Static timestamp

# Combine datasets
merged_df = pd.concat([movies_df, music_df, books_df], ignore_index=True)

# Save preprocessed dataset
merged_df.to_csv("session_data.csv", index=False)

print(merged_df.head())


                             Item_ID                        Title  \
0                    Movie_Creed III                    Creed III   
1     Movie_Avatar: The Way of Water     Avatar: The Way of Water   
2  Movie_The Super Mario Bros. Movie  The Super Mario Bros. Movie   
3                      Movie_Mummies                       Momias   
4                    Movie_Supercell                    Supercell   

                                           Genre  \
0                                  Drama, Action   
1             Science Fiction, Adventure, Action   
2  Animation, Adventure, Family, Fantasy, Comedy   
3  Animation, Comedy, Family, Adventure, Fantasy   
4                                         Action   

                                         Description  Popularity  \
0  After dominating the boxing world, Adonis Cree...        73.0   
1  Set more than a decade after the events of the...        78.0   
2  While working underground to fix a water main,...        76.0   


In [7]:
# Simulating user sessions
session_ids = np.random.randint(1000, 5000, size=len(merged_df))
action_types = np.random.choice(["Clicked", "Searched", "Scrolled"], size=len(merged_df))

merged_df["Session_ID"] = session_ids
merged_df["Action_Type"] = action_types

# Save session-based data
merged_df.to_csv("session_events.csv", index=False)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Load session data
data = pd.read_csv("session_events.csv")

# Convert Item_IDs into unique numerical values
item_vocab = {item: idx for idx, item in enumerate(set(data["Item_ID"]))}
inv_vocab = {idx: item for item, idx in item_vocab.items()}

# Convert sessions into numerical sequences
sessions = data.groupby("Session_ID")["Item_ID"].apply(lambda x: [item_vocab[i] for i in x]).tolist()

# Dataset Class
from torch.nn.utils.rnn import pad_sequence

class SessionDataset(Dataset):
    def __init__(self, sessions, max_length=50):
        self.sessions = sessions
        self.max_length = max_length

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        session = self.sessions[idx]
        session_tensor = torch.tensor(session, dtype=torch.long)

        # Pad sequences to the max length
        if len(session_tensor) < self.max_length:
            pad_size = self.max_length - len(session_tensor)
            session_tensor = torch.cat([session_tensor, torch.zeros(pad_size, dtype=torch.long)])

        return session_tensor[:-1], session_tensor[1:]  # Input & Target

# Collate function to pad sequences dynamically in the DataLoader
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)
    return inputs_padded, targets_padded


# Define GRU Model
class GRU4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super(GRU4Rec, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        return self.fc(out)

# Training Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = SessionDataset(sessions)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

model = GRU4Rec(len(item_vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

for epoch in range(50):
    for batch in dataloader:
        #inputs, targets = batch
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, len(item_vocab)), targets.view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

#torch.save(model.state_dict(), "session_rec_model.pth")
torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "session_rec_model.pth")

Using 2 GPUs


  inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
  targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)


Epoch 1, Loss: 7.690988540649414
Epoch 2, Loss: 7.596072673797607
Epoch 3, Loss: 7.072775840759277
Epoch 4, Loss: 6.372457027435303
Epoch 5, Loss: 5.28287935256958
Epoch 6, Loss: 4.004252910614014
Epoch 7, Loss: 2.8613224029541016
Epoch 8, Loss: 1.953985571861267
Epoch 9, Loss: 1.0319585800170898
Epoch 10, Loss: 0.7213608622550964
Epoch 11, Loss: 0.41617482900619507
Epoch 12, Loss: 0.25872746109962463
Epoch 13, Loss: 0.20532339811325073
Epoch 14, Loss: 0.11697680503129959
Epoch 15, Loss: 0.08248290419578552
Epoch 16, Loss: 0.08017588406801224
Epoch 17, Loss: 0.05788752809166908
Epoch 18, Loss: 0.044767845422029495
Epoch 19, Loss: 0.038895055651664734
Epoch 20, Loss: 0.03399191424250603
Epoch 21, Loss: 0.03168043494224548
Epoch 22, Loss: 0.02802925743162632
Epoch 23, Loss: 0.028693964704871178
Epoch 24, Loss: 0.02787163481116295
Epoch 25, Loss: 0.02221776731312275
Epoch 26, Loss: 0.017336327582597733
Epoch 27, Loss: 0.01764662377536297
Epoch 28, Loss: 0.018843399360775948
Epoch 29, Loss

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch.nn.utils.rnn import pad_sequence

# Load session data
data = pd.read_csv("session_events.csv")

# Convert Item_IDs into unique numerical values
item_vocab = {item: idx for idx, item in enumerate(set(data["Item_ID"]))}
inv_vocab = {idx: item for item, idx in item_vocab.items()}

# Convert sessions into numerical sequences
sessions = data.groupby("Session_ID")["Item_ID"].apply(lambda x: [item_vocab[i] for i in x]).tolist()

# Dataset Class
class SessionDataset(Dataset):
    def __init__(self, sessions, max_length=50):
        self.sessions = sessions
        self.max_length = max_length

    def __len__(self):
        return len(self.sessions)

    def __getitem__(self, idx):
        session = self.sessions[idx]
        session_tensor = torch.tensor(session, dtype=torch.long)

        # Pad sequences to the max length
        if len(session_tensor) < self.max_length:
            pad_size = self.max_length - len(session_tensor)
            session_tensor = torch.cat([session_tensor, torch.zeros(pad_size, dtype=torch.long)])

        return session_tensor[:-1], session_tensor[1:]  # Input & Target

# Collate function to pad sequences dynamically in the DataLoader
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# Define GRU Model
class GRU4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super(GRU4Rec, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        return self.fc(out)

# Training Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = SessionDataset(sessions)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

model = GRU4Rec(len(item_vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)

# Early Stopping Parameters
patience = 5  # Number of epochs to wait before stopping
best_loss = float('inf')
epochs_no_improve = 0
early_stop = False

for epoch in range(50):
    epoch_loss = 0
    for batch in dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, len(item_vocab)), targets.view(-1).long())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss}")

    # Early Stopping Logic
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        epochs_no_improve = 0
        # Save best model
        torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "best_session_rec_model.pth")
    else:
        epochs_no_improve += 1
        print(f"No improvement for {epochs_no_improve}/{patience} epochs.")

    if epochs_no_improve >= patience:
        print("Early stopping triggered!")
        early_stop = True
        break  # Stop training

# Save final model
torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict(), "session_rec_model.pth")


Using 2 GPUs


  inputs_padded = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
  targets_padded = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=0)


Epoch 1, Loss: 8.307071125696575
Epoch 2, Loss: 7.65139435208033
Epoch 3, Loss: 7.225096967485216
Epoch 4, Loss: 6.392066531711155
Epoch 5, Loss: 5.274069975292872
Epoch 6, Loss: 4.047372151934911
Epoch 7, Loss: 2.8272931954217335
Epoch 8, Loss: 1.7609415338152932
Epoch 9, Loss: 1.0468707964533852
Epoch 10, Loss: 0.6365305544838072
Epoch 11, Loss: 0.3960812318892706
Epoch 12, Loss: 0.2561607233115605
Epoch 13, Loss: 0.1735328539969429
Epoch 14, Loss: 0.12308307441454085
Epoch 15, Loss: 0.09184038166015868
Epoch 16, Loss: 0.07152377128128022
Epoch 17, Loss: 0.05786299740984326
Epoch 18, Loss: 0.04808862211685332
Epoch 19, Loss: 0.0411058192451795
Epoch 20, Loss: 0.03562455198594502
Epoch 21, Loss: 0.03134127413587911
Epoch 22, Loss: 0.027880316068019186
Epoch 23, Loss: 0.02512658458380472
Epoch 24, Loss: 0.022715698277193403
Epoch 25, Loss: 0.02078732555466039
Epoch 26, Loss: 0.019079701590632634
Epoch 27, Loss: 0.017627021519555933
Epoch 28, Loss: 0.016315578970880734
Epoch 29, Loss: 0

In [11]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRU4Rec(len(item_vocab)).to(device)

# Load best trained weights
model.load_state_dict(torch.load("best_session_rec_model.pth", map_location=device))
model.eval()  # Set model to evaluation mode

  model.load_state_dict(torch.load("best_session_rec_model.pth", map_location=device))


GRU4Rec(
  (embedding): Embedding(109272, 128)
  (gru): GRU(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=109272, bias=True)
)

In [12]:
import torch.nn.functional as F

def recommend_next_items(session, top_k=5):
    """
    Given a session (list of Item_IDs), predict the next top-k recommended items.
    """
    # Convert session items to numerical indices
    session_numeric = [item_vocab[item] for item in session if item in item_vocab]
    
    if not session_numeric:
        print("Error: None of the session items exist in the vocabulary.")
        return []
    
    # Convert to tensor and move to device
    session_tensor = torch.tensor(session_numeric, dtype=torch.long).unsqueeze(0).to(device)

    # Get model predictions
    with torch.no_grad():
        output = model(session_tensor)  # Shape: (1, seq_len, vocab_size)

    # Get the last output prediction
    last_item_logits = output[:, -1, :]  # Shape: (1, vocab_size)

    # Convert logits to probabilities
    probabilities = F.softmax(last_item_logits, dim=-1)

    # Get top-k recommended item indices
    top_k_indices = torch.topk(probabilities, top_k, dim=-1).indices.squeeze(0).tolist()

    # Convert indices back to Item_IDs
    recommended_items = [inv_vocab[idx] for idx in top_k_indices]

    return recommended_items


In [14]:
# Example session (list of previously interacted Item_IDs)
example_session = ["Movie_Creed III", "Movie_Mummies", "Movie_Supercell"]

# Get top 5 recommendations
recommendations = recommend_next_items(example_session, top_k=5)
print("Recommended Items:", recommendations)


Recommended Items: ['Movie_Ford v Ferrari', 'Music_5awljpWNO5TpXCyjpvCBbs', 'Music_29RiulWABWHcTRLkDqVCl1', 'Music_69Jv0CiMlrpfjh9N2WFkr0', 'Music_40o76YIOwDazc0h2QrZhWl']
