In [15]:
#!pip install datasets torch 

In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

In [8]:
# Load the three source datasets by repository id via `datasets.load_dataset`.
# Each result is later indexed by split name (only "train" is used downstream).
movies = load_dataset("ExecuteAutomation/ImdbMovieDataSet")  # movie metadata
music = load_dataset("maharshipandya/spotify-tracks-dataset")  # track metadata
books = load_dataset("Eitanli/goodreads")  # book metadata

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
# Materialise the "train" split of every source as a pandas DataFrame.
movies_df, music_df, books_df = (
    source["train"].to_pandas() for source in (movies, music, books)
)

In [10]:
# Show the raw column index of each source frame, one per line.
print(movies_df.columns, music_df.columns, books_df.columns, sep="\n")

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')
Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')
Index(['Unnamed: 0', 'Book', 'Author', 'Description', 'Genres', 'Avg_Rating',
       'Num_Ratings', 'URL'],
      dtype='object')


In [11]:
# Build one item catalogue with a shared schema from the three sources:
#   Item_ID, Title, Genre, Description, Popularity, Creator, Timestamp, Item_Type
#
# Every step returns a new frame (assign / .loc selection / rename) instead of
# renaming or assigning in place on a column-selected frame, so pandas never has
# to guess whether a view or a copy is being modified (SettingWithCopyWarning).
#
# NOTE(review): movie and book ids are derived from titles, so rows that share a
# title also share an Item_ID -- confirm that collapsing them is intended.
# NOTE(review): "Popularity" mixes different source metrics (movie score, track
# popularity, book average rating); their scales are not harmonised here.

# Placeholder date for sources whose selected columns carry no date.
# (It is a fixed value, not a randomly generated one.)
STATIC_TIMESTAMP = pd.to_datetime("2024-03-17")

# Movies: the only source with a real date column; unparseable dates become NaT.
movies_df = (
    movies_df
    .assign(Item_ID="Movie_" + movies_df["names"].astype(str))
    .loc[:, ["Item_ID", "orig_title", "genre", "overview", "score", "crew", "date_x"]]
    .rename(columns={
        "orig_title": "Title",
        "genre": "Genre",
        "overview": "Description",
        "score": "Popularity",
        "crew": "Creator",
        "date_x": "Timestamp",
    })
    .assign(
        Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], errors="coerce"),
        Item_Type="Movie",
    )
)

# Music: no description column is selected, so Description is NaN after the concat.
music_df = (
    music_df
    .assign(Item_ID="Music_" + music_df["track_id"].astype(str))
    .loc[:, ["Item_ID", "track_name", "track_genre", "popularity", "artists"]]
    .rename(columns={
        "track_name": "Title",
        "track_genre": "Genre",
        "popularity": "Popularity",
        "artists": "Creator",
    })
    .assign(Item_Type="Music", Timestamp=STATIC_TIMESTAMP)
)

# Books: "Description" already has its final name, so it needs no rename entry.
books_df = (
    books_df
    .assign(Item_ID="Book_" + books_df["Book"].astype(str))
    .loc[:, ["Item_ID", "Book", "Genres", "Description", "Avg_Rating", "Author"]]
    .rename(columns={
        "Book": "Title",
        "Genres": "Genre",
        "Avg_Rating": "Popularity",
        "Author": "Creator",
    })
    .assign(Item_Type="Book", Timestamp=STATIC_TIMESTAMP)
)

# Combine datasets (column order follows the first frame, i.e. movies_df)
merged_df = pd.concat([movies_df, music_df, books_df], ignore_index=True)

# Save preprocessed dataset
merged_df.to_csv("session_data.csv", index=False)

print(merged_df.head())


                             Item_ID                        Title  \
0                    Movie_Creed III                    Creed III   
1     Movie_Avatar: The Way of Water     Avatar: The Way of Water   
2  Movie_The Super Mario Bros. Movie  The Super Mario Bros. Movie   
3                      Movie_Mummies                       Momias   
4                    Movie_Supercell                    Supercell   

                                           Genre  \
0                                  Drama, Action   
1             Science Fiction, Adventure, Action   
2  Animation, Adventure, Family, Fantasy, Comedy   
3  Animation, Comedy, Family, Adventure, Fantasy   
4                                         Action   

                                         Description  Popularity  \
0  After dominating the boxing world, Adonis Cree...        73.0   
1  Set more than a decade after the events of the...        78.0   
2  While working underground to fix a water main,...        76.0   


In [12]:
# Simulating user sessions
# Session ids and action types are synthetic: each catalogue row is assigned a
# random session id in [1000, 5000) and one of three action labels.
# A seeded Generator makes the simulated sessions (and the CSV written below)
# identical on every Restart & Run All.
SESSION_SEED = 42
session_rng = np.random.default_rng(SESSION_SEED)

session_ids = session_rng.integers(1000, 5000, size=len(merged_df))  # high bound is exclusive
action_types = session_rng.choice(["Clicked", "Searched", "Scrolled"], size=len(merged_df))

merged_df["Session_ID"] = session_ids
merged_df["Action_Type"] = action_types

# Save session-based data
merged_df.to_csv("session_events.csv", index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Load session data
data = pd.read_csv("session_events.csv")

# Convert Item_IDs into unique numerical values.
# * Index 0 is reserved for a padding entry, because the session tensors built
#   further down are padded with 0; no real item may share that index.
#   The padding entry is part of the vocabulary, so len(item_vocab) is still the
#   number of distinct indices (valid as embedding / output size).
# * Items are numbered in order of first appearance in the file
#   (dict.fromkeys de-duplicates while keeping order). Enumerating a set() would
#   give a different numbering on every kernel restart because string hashing is
#   randomised, making a saved model unusable with a rebuilt vocabulary.
PAD_TOKEN = "<PAD>"
item_vocab = {PAD_TOKEN: 0}
item_vocab.update(
    {item: idx for idx, item in enumerate(dict.fromkeys(data["Item_ID"]), start=1)}
)
inv_vocab = {idx: item for item, idx in item_vocab.items()}

# Convert sessions into numerical sequences (one list of item indices per
# Session_ID, items kept in file order within each session)
sessions = (
    data.groupby("Session_ID")["Item_ID"]
    .apply(lambda item_ids: [item_vocab[item_id] for item_id in item_ids])
    .tolist()
)

# Dataset Class
from torch.nn.utils.rnn import pad_sequence

class SessionDataset(Dataset):
    """Serve sessions as (input, target) pairs for next-item prediction.

    Args:
        sessions: indexable collection of sessions, each a sequence of integer
            item indices.
        max_length: minimum length a session is right-padded to with zeros.
            Sessions that are already at least this long are left unchanged
            (they are not truncated).
    """

    def __init__(self, sessions, max_length=50):
        self.sessions = sessions
        self.max_length = max_length

    def __len__(self):
        """Number of sessions."""
        return len(self.sessions)

    def __getitem__(self, idx):
        """Return ``(items[:-1], items[1:])`` for the (padded) session at ``idx``."""
        items = torch.tensor(self.sessions[idx], dtype=torch.long)

        # Right-pad short sessions with zeros up to max_length.
        shortfall = self.max_length - len(items)
        if shortfall > 0:
            filler = torch.zeros(shortfall, dtype=torch.long)
            items = torch.cat([items, filler])

        # Target is the input shifted left by one position.
        model_input = items[:-1]
        next_items = items[1:]
        return model_input, next_items

# Collate function to pad sequences dynamically in the DataLoader
def collate_fn(batch):
    """Stack variable-length (input, target) pairs into two padded batches.

    Args:
        batch: iterable of ``(input_seq, target_seq)`` pairs. Each sequence may
            be a 1-D tensor or anything ``torch.tensor`` accepts (e.g. a list
            of ints).

    Returns:
        ``(inputs_padded, targets_padded)``: two tensors with the batch
        dimension first, each right-padded with 0 to the longest sequence in
        its group.
    """
    def _as_tensor(seq):
        # Re-wrapping an existing tensor with torch.tensor() makes a needless
        # copy and emits a copy-construct UserWarning; pass tensors through.
        return seq if isinstance(seq, torch.Tensor) else torch.tensor(seq)

    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence([_as_tensor(seq) for seq in inputs], batch_first=True, padding_value=0)
    targets_padded = pad_sequence([_as_tensor(seq) for seq in targets], batch_first=True, padding_value=0)
    return inputs_padded, targets_padded


# Define GRU Model
class GRU4Rec(nn.Module):
    """Embedding -> GRU -> linear scorer over the item vocabulary.

    Args:
        vocab_size: number of distinct item indices; used as both the embedding
            table size and the width of the output layer.
        embed_dim: size of each item embedding.
        hidden_dim: size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which parameters are initialised.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.gru = nn.GRU(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Score every vocabulary item at every position of ``x``.

        ``x`` holds integer item indices; with ``batch_first=True`` the GRU
        treats the first dimension as the batch. The result has one extra
        trailing dimension of size ``vocab_size``.
        """
        embedded = self.embedding(x)
        hidden_states, _final_hidden = self.gru(embedded)  # final state unused
        logits = self.fc(hidden_states)
        return logits

# Training Model
# Index 0 is the value SessionDataset and collate_fn pad with, so it is excluded
# from the loss; otherwise the model is trained to predict padding.
# NOTE: a real item whose vocabulary index is 0 is indistinguishable from
# padding and is therefore masked as a target too.
PAD_INDEX = 0
NUM_EPOCHS = 5
BATCH_SIZE = 64
LEARNING_RATE = 0.001

dataset = SessionDataset(sessions)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

vocab_size = len(item_vocab)
model = GRU4Rec(vocab_size)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    batches_seen = 0
    for inputs, targets in dataloader:
        # A batch whose targets are all padding would average over zero
        # elements and yield a NaN loss; skip it instead of corrupting weights.
        if not (targets != PAD_INDEX).any():
            continue
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten (batch, seq, vocab) -> (batch*seq, vocab) to score every position.
        loss = criterion(outputs.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        batches_seen += 1
    # Report the mean loss over the epoch rather than only the last batch's loss.
    print(f"Epoch {epoch+1}, Loss: {running_loss / max(batches_seen, 1)}")

torch.save(model.state_dict(), "session_rec_model.pth")
