In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [None]:
# Load the track table from a CSV resolved relative to the current working directory.
spotify_data = pd.read_csv("spotify_synthetic_data.csv")

In [None]:
# Preprocessing
def preprocess_data(data):
    """Label-encode the categorical columns and standardise the numeric ones.

    The transformation is applied to a copy, so the DataFrame passed in is
    left untouched and the function can be re-run on the same raw frame.

    Args:
        data: DataFrame containing the categorical columns 'track_name',
            'artist', 'album', 'genre' and the nine numeric audio-feature
            columns listed below.

    Returns:
        A ``(frame, label_encoders, scaler)`` tuple: the transformed copy,
        a dict mapping each categorical column name to its fitted
        ``LabelEncoder``, and the fitted ``StandardScaler``.

    Note:
        Encoders and scaler are fitted on every row that is passed in. If
        this is called before a train/test split, held-out rows contribute
        to the scaling statistics.
    """
    # Work on a copy: assigning columns below would otherwise overwrite the
    # caller's raw values in place.
    data = data.copy()

    # Encode categorical features as contiguous integer ids (0 .. n_classes-1)
    label_encoders = {}
    for col in ['track_name', 'artist', 'album', 'genre']:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

    # Scale numerical features to zero mean / unit variance
    scaler = StandardScaler()
    numerical_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                          'instrumentalness', 'liveness', 'valence', 'tempo']
    data[numerical_features] = scaler.fit_transform(data[numerical_features])

    return data, label_encoders, scaler

# Rebind spotify_data to the returned (encoded + scaled) frame and keep the fitted
# encoders and scaler that preprocess_data returns alongside it.
spotify_data, label_encoders, scaler = preprocess_data(spotify_data)


In [None]:

# Splitting data
# Target first: the encoded genre id is what the model is asked to predict.
y = spotify_data['genre']  # Using genre as the recommendation target

# Keep this column order stable: four id columns first, then the nine numeric
# features. Downstream code reads these columns by position.
X = spotify_data.loc[:, [
    'track_name', 'artist', 'album', 'genre',
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo',
]]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dataset class
class SpotifyDataset(Dataset):
    """Map-style dataset yielding one row as model-ready tensors.

    Columns of ``X`` are read by position: column 0 is the "user" id
    (track_name), column 1 is the "item" id (artist), and every column from
    position 4 onward is treated as a numeric feature.

    Args:
        X: DataFrame of features laid out as described above.
        y: Series of integer labels (genre ids), aligned row-for-row with ``X``.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

        # Convert the whole frame once. Doing ``DataFrame.iloc`` lookups and
        # building fresh tensors for every sample in ``__getitem__`` is far
        # slower than indexing pre-built tensors.
        self._user_ids = torch.tensor(X.iloc[:, 0].to_numpy(), dtype=torch.long)
        self._item_ids = torch.tensor(X.iloc[:, 1].to_numpy(), dtype=torch.long)
        self._numeric = torch.tensor(X.iloc[:, 4:].to_numpy(), dtype=torch.float32)
        self._labels = torch.tensor(y.to_numpy(), dtype=torch.long)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(user_input, item_input, numerical_features, label)`` for row ``idx``.

        ``user_input``, ``item_input`` and ``label`` are 0-dim ``long`` tensors;
        ``numerical_features`` is a 1-D ``float32`` tensor.
        """
        user_input = self._user_ids[idx]  # track_name
        item_input = self._item_ids[idx]  # artist
        numerical_features = self._numeric[idx]  # numerical features
        label = self._labels[idx]  # genre
        return user_input, item_input, numerical_features, label


In [None]:
# Wrap each split in a dataset, then batch it. Only the training batches are
# shuffled; the test loader keeps row order.
train_dataset, test_dataset = (
    SpotifyDataset(X_train, y_train),
    SpotifyDataset(X_test, y_test),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
# Two-Tower Model
def create_embedding_layer(input_dim, output_dim):
    """Build an embedding table with ``input_dim`` rows, each of width ``output_dim``."""
    embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=output_dim)
    return embedding

class TwoTowerRecommender(nn.Module):
    """Two-tower scorer: dot product of a user vector and an item vector.

    The user tower embeds a user id and projects it to 64 dimensions. The
    item tower embeds an item id, concatenates the item's numeric features,
    and projects the result to 64 dimensions.

    Args:
        user_input_dim: Number of distinct user ids (rows of the user embedding).
        item_input_dim: Number of distinct item ids (rows of the item embedding).
        embed_dim: Width of both embedding tables.
        num_numerical_features: Number of numeric features concatenated onto
            the item embedding. Defaults to 9, the value that was previously
            hard-coded.
    """

    def __init__(self, user_input_dim, item_input_dim, embed_dim, num_numerical_features=9):
        super(TwoTowerRecommender, self).__init__()

        # User tower
        self.user_embedding = create_embedding_layer(user_input_dim, embed_dim)
        self.user_fc = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )

        # Item tower: its first layer sees the embedding plus the numeric features
        self.item_embedding = create_embedding_layer(item_input_dim, embed_dim)
        self.item_fc = nn.Sequential(
            nn.Linear(embed_dim + num_numerical_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64)
        )

    def forward(self, user_input, item_input, numerical_features):
        """Score each (user, item) row of the batch.

        Args:
            user_input: Integer user ids, one per row.
            item_input: Integer item ids, one per row.
            numerical_features: Float features per row; concatenated with the
                item embedding along dim 1, so its second dimension must
                equal ``num_numerical_features``.

        Returns:
            One unnormalised similarity score (logit) per row.
        """
        # User tower
        user_embed = self.user_embedding(user_input)
        user_vector = self.user_fc(user_embed)

        # Item tower
        item_embed = self.item_embedding(item_input)
        item_input_combined = torch.cat([item_embed, numerical_features], dim=1)
        item_vector = self.item_fc(item_input_combined)

        # Dot product for similarity
        return torch.sum(user_vector * item_vector, dim=1)

In [None]:
# Model instantiation
# Width shared by both embedding tables.
embed_dim = 50

# One embedding row per distinct encoded id: track_name feeds the "user" tower,
# artist feeds the "item" tower.
user_input_dim, item_input_dim = (
    spotify_data[column].nunique() for column in ('track_name', 'artist')
)


In [None]:
# Build the scorer, its loss and its optimiser.
model = TwoTowerRecommender(
    user_input_dim=user_input_dim,
    item_input_dim=item_input_dim,
    embed_dim=embed_dim,
)
# Binary cross-entropy on raw logits; targets passed to it must lie in [0, 1].
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Training Loop
# BCEWithLogitsLoss requires targets in [0, 1]. The loader yields integer genre
# ids, and using those directly as targets makes the loss unbounded below (it
# diverges to ever larger negative values instead of converging).
#
# Genre is therefore turned into a binary pairwise target: every track in a
# batch is paired with every item in the same batch, and a pair is labelled
# 1.0 when both rows share a genre and 0.0 otherwise. The model and criterion
# are used unchanged.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for user_input, item_input, numerical_features, labels in train_loader:
        n_rows = labels.size(0)

        # Cross product of the batch: pair k = i * n_rows + j couples
        # track i (repeat_interleave) with item j (repeat).
        pair_users = user_input.repeat_interleave(n_rows)
        pair_items = item_input.repeat(n_rows)
        pair_features = numerical_features.repeat(n_rows, 1)
        pair_targets = (labels.repeat_interleave(n_rows) == labels.repeat(n_rows)).float()

        # Forward pass
        outputs = model(pair_users, pair_items, pair_features)
        loss = criterion(outputs, pair_targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

Epoch [1/10], Loss: -197.5664
Epoch [2/10], Loss: -5105.8124
Epoch [3/10], Loss: -47110.4682
Epoch [4/10], Loss: -270066.0439
Epoch [5/10], Loss: -1150083.3306
Epoch [6/10], Loss: -4095013.9475
Epoch [7/10], Loss: -12122658.6100
Epoch [8/10], Loss: -31946520.7800
Epoch [9/10], Loss: -76296702.3200
Epoch [10/10], Loss: -168488197.4400


In [None]:
# Function to recommend songs for a given user track
def recommend_songs(user_track_id, candidate_songs, model, top_k=5, feature_columns=None):
    """Score candidate songs against one track id and return the best ``top_k``.

    Args:
        user_track_id: Encoded track id; it is fed to the model as the user
            input for every candidate row.
        candidate_songs: DataFrame with an 'artist' column of encoded ids and
            the numeric feature columns. It is not modified.
        model: Module called as ``model(user_input, item_input,
            numerical_features)`` that returns one score per row.
        top_k: Number of highest-scoring rows to return.
        feature_columns: Names of the numeric feature columns, in the order
            the model expects them. Defaults to the nine audio features.

    Returns:
        A new DataFrame holding the ``top_k`` candidates with an added
        'score' column, sorted by descending score.
    """
    if feature_columns is None:
        feature_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence', 'tempo']

    model.eval()

    user_input = torch.tensor([user_track_id] * len(candidate_songs), dtype=torch.long)
    item_input = torch.tensor(candidate_songs['artist'].values, dtype=torch.long)
    # Select features by name rather than by position: a positional slice
    # silently picks up any extra column (such as a previously added 'score')
    # and changes the feature width the model receives.
    numerical_features = torch.tensor(
        candidate_songs[list(feature_columns)].values, dtype=torch.float32
    )

    with torch.no_grad():
        scores = model(user_input, item_input, numerical_features)

    # Attach scores to a copy so the caller's frame stays reusable
    scored = candidate_songs.assign(score=scores.numpy())
    recommendations = scored.sort_values(by='score', ascending=False).head(top_k)
    return recommendations

In [None]:
# Demo: rank 20 sampled tracks against one encoded track id.
user_track_id = 1041  # an encoded track_name id, not a raw title
# Fixed seed so a re-run draws the same candidates (same seed as the train/test split).
candidate_songs = spotify_data.sample(20, random_state=42)
recommended_songs = recommend_songs(user_track_id, candidate_songs, model, top_k=5)
print(recommended_songs[['track_name', 'artist', 'score']])

      track_name  artist       score
283         1206      37  83183416.0
882         1871      45  72438920.0
1331         371      93  61379428.0
488         1433      96  61377024.0
7           1778      86  60033172.0
