In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import random
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors


In [3]:
class Encoder(nn.Module):
    """MLP that compresses an input vector into a latent code.

    Pipeline: Linear(input_dim, 512) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(512, 384) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(384, latent_dim), with no activation on the output.
    """

    def __init__(self, input_dim=768, latent_dim=256):
        super().__init__()
        # Attribute names double as state-dict keys, so they stay fc1/bn1/fc2/bn2/fc3.
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 384)
        self.bn2 = nn.BatchNorm1d(384)
        self.fc3 = nn.Linear(384, latent_dim)

    def forward(self, x):
        """Return the latent code for a batch of input vectors ``x``."""
        hidden = x
        # Both hidden stages share the same Linear -> BatchNorm -> LeakyReLU shape.
        for linear, norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            hidden = F.leaky_relu(norm(linear(hidden)), negative_slope=0.2)
        return self.fc3(hidden)

In [4]:
class Decoder(nn.Module):
    """MLP that expands a latent code back to the input dimensionality.

    Pipeline: Linear(latent_dim, 384) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(384, 512) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(512, output_dim) -> sigmoid.
    """

    def __init__(self, latent_dim=256, output_dim=768):
        super().__init__()
        # Attribute names double as state-dict keys, so they stay fc1/bn1/fc2/bn2/fc3.
        self.fc1 = nn.Linear(latent_dim, 384)
        self.bn1 = nn.BatchNorm1d(384)
        self.fc2 = nn.Linear(384, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, output_dim)

    def forward(self, z):
        """Return the reconstruction for a batch of latent codes ``z``."""
        first_stage = F.leaky_relu(self.bn1(self.fc1(z)), negative_slope=0.2)
        second_stage = F.leaky_relu(self.bn2(self.fc2(first_stage)), negative_slope=0.2)
        # NOTE(review): the sigmoid bounds every output to (0, 1) — confirm the
        # reconstruction targets are scaled to that range.
        return torch.sigmoid(self.fc3(second_stage))

In [5]:
class Discriminator(nn.Module):
    """MLP that scores a latent code with a single unnormalised logit.

    Pipeline: Linear(latent_dim, 256) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(256, 128) -> BatchNorm1d -> LeakyReLU(0.2)
              -> Linear(128, 1). No sigmoid is applied here.
    """

    def __init__(self, latent_dim=256):
        super().__init__()
        # Attribute names double as state-dict keys, so they stay fc1/bn1/fc2/bn2/fc3.
        self.fc1 = nn.Linear(latent_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, z):
        """Return one raw logit per latent code in the batch ``z``."""
        features = self.bn1(self.fc1(z))
        features = F.leaky_relu(features, negative_slope=0.2)
        features = self.bn2(self.fc2(features))
        features = F.leaky_relu(features, negative_slope=0.2)
        return self.fc3(features)

In [6]:
class AdversarialAutoencoder:
    """Adversarial autoencoder assembled from ``Encoder``, ``Decoder`` and ``Discriminator``.

    The encoder/decoder pair is trained to reconstruct its input, while the
    encoder is additionally trained to make the discriminator label its latent
    codes as samples from a standard normal prior.

    Args:
        input_dim: Width of the input (and reconstructed) vectors.
        latent_dim: Width of the latent code.
        device: Device the three networks are moved to; defaults to ``'cuda'``
            when available, otherwise ``'cpu'``.
    """

    def __init__(self, input_dim=768, latent_dim=256, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.device = device

        # Initialize networks
        self.encoder = Encoder(input_dim, latent_dim).to(device)
        self.decoder = Decoder(latent_dim, input_dim).to(device)
        self.discriminator = Discriminator(latent_dim).to(device)

        # Initialize optimizers (the discriminator uses a 10x smaller learning rate)
        self.encoder_optimizer = optim.Adam(self.encoder.parameters(), lr=0.001)
        self.decoder_optimizer = optim.Adam(self.decoder.parameters(), lr=0.001)
        self.discriminator_optimizer = optim.Adam(self.discriminator.parameters(), lr=0.0001)

        # Loss functions; the adversarial loss consumes raw discriminator logits
        self.reconstruction_loss = nn.MSELoss()
        self.adversarial_loss = nn.BCEWithLogitsLoss()

    def train_step(self, x_batch):
        """Run one autoencoder update followed by one discriminator update.

        Args:
            x_batch: Batch of input vectors; moved to ``self.device`` here.

        Returns:
            dict with float entries ``'reconstruction_loss'``,
            ``'generator_loss'`` and ``'discriminator_loss'`` for this batch.
        """
        batch_size = x_batch.size(0)
        x_batch = x_batch.to(self.device)

        # Targets are created directly on the device instead of on CPU and copied.
        real_target = torch.ones(batch_size, 1, device=self.device)
        fake_target = torch.zeros(batch_size, 1, device=self.device)

        # ---- Autoencoder (encoder + decoder) update ----
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        z = self.encoder(x_batch)
        x_reconstructed = self.decoder(z)

        recon_loss = self.reconstruction_loss(x_reconstructed, x_batch)

        # The encoder acts as the generator: it is rewarded when the
        # discriminator labels its codes as "real".
        gen_loss = self.adversarial_loss(self.discriminator(z), real_target)

        ae_loss = recon_loss + gen_loss
        ae_loss.backward()
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        # ---- Discriminator update ----
        # zero_grad() also discards the discriminator gradients that the
        # generator loss above accumulated during ae_loss.backward().
        self.discriminator_optimizer.zero_grad()

        # Samples from the standard normal prior count as "real". Sampling stays
        # on CPU followed by a copy so the random stream is the same as before.
        z_prior = torch.randn(batch_size, self.latent_dim).to(self.device)

        # Codes from the just-updated encoder count as "fake"; no graph is
        # needed because the encoder is not trained in this phase.
        with torch.no_grad():
            z_encoded = self.encoder(x_batch)

        real_loss = self.adversarial_loss(self.discriminator(z_prior), real_target)
        fake_loss = self.adversarial_loss(self.discriminator(z_encoded), fake_target)
        d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()
        self.discriminator_optimizer.step()

        return {
            'reconstruction_loss': recon_loss.item(),
            'generator_loss': gen_loss.item(),
            'discriminator_loss': d_loss.item()
        }

    def train(self, data_loader, epochs=100):
        """Train all three networks and print per-epoch average losses.

        Args:
            data_loader: Iterable yielding ``(x_batch, _)`` pairs; the second
                element is ignored.
            epochs: Number of passes over ``data_loader``.

        Returns:
            list with one dict of averaged losses per epoch.

        Raises:
            ValueError: If ``data_loader`` yields no batches in an epoch, since
                the averages would otherwise divide by zero.
        """
        self.encoder.train()
        self.decoder.train()
        self.discriminator.train()

        training_history = []

        for epoch in range(epochs):
            epoch_losses = {'reconstruction_loss': 0, 'generator_loss': 0, 'discriminator_loss': 0}
            batch_count = 0

            for x_batch, _ in data_loader:
                step_losses = self.train_step(x_batch)

                for key in epoch_losses:
                    epoch_losses[key] += step_losses[key]
                batch_count += 1

            if batch_count == 0:
                raise ValueError("data_loader yielded no batches; cannot compute epoch averages")

            # Calculate average losses for the epoch
            for key in epoch_losses:
                epoch_losses[key] /= batch_count

            training_history.append(epoch_losses)

            print(f"Epoch [{epoch+1}/{epochs}] - "
                  f"Recon Loss: {epoch_losses['reconstruction_loss']:.4f}, "
                  f"Gen Loss: {epoch_losses['generator_loss']:.4f}, "
                  f"Disc Loss: {epoch_losses['discriminator_loss']:.4f}")

        return training_history

    def encode(self, x):
        """Return latent codes for ``x`` without tracking gradients.

        Switches the encoder to eval mode and leaves it there. The result
        lives on ``self.device``.
        """
        self.encoder.eval()
        with torch.no_grad():
            x = x.to(self.device)
            z = self.encoder(x)
        return z

    def decode(self, z):
        """Return reconstructions for latent codes ``z`` without tracking gradients.

        Switches the decoder to eval mode and leaves it there. The result
        lives on ``self.device``.
        """
        self.decoder.eval()
        with torch.no_grad():
            z = z.to(self.device)
            x_reconstructed = self.decoder(z)
        return x_reconstructed

    def reconstruct(self, x):
        """Encode then decode ``x`` without tracking gradients.

        Switches encoder and decoder to eval mode and leaves them there. The
        result lives on ``self.device``.
        """
        self.encoder.eval()
        self.decoder.eval()
        with torch.no_grad():
            x = x.to(self.device)
            z = self.encoder(x)
            x_reconstructed = self.decoder(z)
        return x_reconstructed

    def save_model(self, path):
        """Write the three networks' state dicts to ``path`` as a single checkpoint."""
        torch.save({
            'encoder_state_dict': self.encoder.state_dict(),
            'decoder_state_dict': self.decoder.state_dict(),
            'discriminator_state_dict': self.discriminator.state_dict()
        }, path)

    def load_model(self, path):
        """Restore the three networks from a checkpoint written by ``save_model``.

        ``map_location`` remaps stored tensors onto ``self.device`` so that a
        checkpoint saved on a GPU can be loaded on a CPU-only machine.
        """
        checkpoint = torch.load(path, map_location=self.device, weights_only=True)
        self.encoder.load_state_dict(checkpoint['encoder_state_dict'])
        self.decoder.load_state_dict(checkpoint['decoder_state_dict'])
        self.discriminator.load_state_dict(checkpoint['discriminator_state_dict'])

In [7]:
# Report the CUDA device only when one is present: current_device() and
# get_device_name() raise on a machine without CUDA.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

True
0
Tesla P100-PCIE-16GB


In [8]:
# Peek at the checkpoint's top-level keys. weights_only=True restricts
# unpickling to tensors and plain containers (matching
# AdversarialAutoencoder.load_model), and map_location='cpu' lets this
# inspection run on a machine without a GPU.
checkpoint = torch.load(
    '/kaggle/input/autoencoder/pytorch/default/1/models/best_aae_model.pt',
    map_location='cpu',
    weights_only=True,
)
print(checkpoint.keys())

  checkpoint = torch.load('/kaggle/input/autoencoder/pytorch/default/1/models/best_aae_model.pt')


dict_keys(['encoder_state_dict', 'decoder_state_dict', 'discriminator_state_dict'])


In [9]:
# Build the model with its default dimensions, then restore the trained weights.
best_model_path = "/kaggle/input/autoencoder/pytorch/default/1/models/best_aae_model.pt"
bestModel = AdversarialAutoencoder()
bestModel.load_model(best_model_path)

# Basic recommendation on the whole dataset

In [4]:
# Load the movie and song tables (each carries an `embedding` column) from parquet.
movies_df, songs_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "/kaggle/input/recommendation-embeddings-inputs/movies_embeddings.parquet",
        "/kaggle/input/recommendation-embeddings-inputs/song_embeddings.parquet",
    )
)

In [5]:
# Preview the first rows of the movie table.
movies_df.head()

Unnamed: 0,id,title,overview,genres,embedding
0,27205,Inception,Cobb a skilled thief who commits corporate esp...,"Action, Science Fiction, Adventure","[0.01589059643447399, 0.11273891478776932, -0...."
1,157336,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","[0.037922028452157974, -0.005655079614371061, ..."
2,155,The Dark Knight,Batman raises the stakes in his war on crime W...,"Drama, Action, Crime, Thriller","[0.011266704648733139, 0.032755907624959946, -..."
3,19995,Avatar,In the 22nd century a paraplegic Marine is dis...,"Action, Adventure, Fantasy, Science Fiction","[0.01744804158806801, 0.03436880186200142, 0.0..."
4,24428,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure","[0.027801260352134705, -0.019952325150370598, ..."


In [6]:
# Preview the first rows of the song table.
songs_df.head()

Unnamed: 0,title,tag,lyrics,embedding
0,Poor Poor Pitiful Me,country,Well I lay my head on the railroad track\nWait...,"[0.029062896966934204, 0.08223594725131989, -0..."
1,Cuckoos Nest,country,There is a thorn bush\nIn Outcolia\nThere is a...,"[0.0009067401406355202, -0.09515126794576645, ..."
2,Wedding Bells,country,I have the invitation that your sent me\nYou w...,"[0.04617173597216606, 0.013956493698060513, -0..."
3,Could Have Fooled Me,country,Im fading like the taillights\nOf a car that y...,"[0.015817370265722275, -0.0025993576273322105,..."
4,Shot of Glory,country,Its finally payday\nMeeting the boys at my pla...,"[-0.011555955745279789, 0.0511910654604435, 0...."


In [7]:
# Stack the per-row embedding vectors into 2-D arrays (one row per item).
movie_emb = np.vstack(movies_df['embedding'].values)
song_emb  = np.vstack(songs_df['embedding'].values)

# Named `movie_knn` rather than `nn`: the notebook imports `torch.nn as nn`, and
# the model classes look that alias up when they are instantiated, so rebinding
# `nn` here would break any later model construction.
movie_knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
movie_knn.fit(movie_emb)

titles = movies_df['title'].tolist()
# Extract the song columns once; a per-row `.iloc` lookup builds a Series per song.
song_titles = songs_df['title'].tolist()
song_tags   = songs_df['tag'].tolist()
song_lyrics = songs_df['lyrics'].tolist()
recs = []

# Query in chunks so only one chunk's neighbour search is in flight at a time.
batch_size = 500
for start in range(0, song_emb.shape[0], batch_size):
    batch = song_emb[start:start + batch_size]
    dist, idx = movie_knn.kneighbors(batch)

    for offset, (neigh_idxs, neigh_dists) in enumerate(zip(idx, dist)):
        song_pos = start + offset
        row = {
            'song_title': song_titles[song_pos],
            'tag':        song_tags[song_pos],
            'lyrics':     song_lyrics[song_pos],
        }
        for rank, (m_idx, d) in enumerate(zip(neigh_idxs, neigh_dists), start=1):
            row[f'best{rank}_title'] = titles[m_idx]
            # Cosine distance -> cosine similarity.
            row[f'best{rank}_score'] = float(1.0 - d)
        recs.append(row)

rec_df = pd.DataFrame(recs)
rec_df.to_csv("recommendation_with_scores.csv",index=False)
rec_df.head(), rec_df.shape

(             song_title      tag  \
 0  Poor Poor Pitiful Me  country   
 1          Cuckoos Nest  country   
 2         Wedding Bells  country   
 3  Could Have Fooled Me  country   
 4         Shot of Glory  country   
 
                                               lyrics  \
 0  Well I lay my head on the railroad track\nWait...   
 1  There is a thorn bush\nIn Outcolia\nThere is a...   
 2  I have the invitation that your sent me\nYou w...   
 3  Im fading like the taillights\nOf a car that y...   
 4  Its finally payday\nMeeting the boys at my pla...   
 
                                     best1_title  best1_score  \
 0                               Faceless Things     0.526864   
 1                                  Soul Catcher     0.538646   
 2                                  Tainted Love     0.588257   
 3                                  Tainted Love     0.720001   
 4  The White Stripes: Under Nova Scotian Lights     0.592151   
 
                           best2_title  

In [8]:
# Writes `rec_df` to the same file name as the `to_csv` call that ends the previous cell.
rec_df.to_csv("recommendation_with_scores.csv",index=False)


# Autoencoder recommendation on the whole dataset

In [14]:
song_embeddings = np.array(songs_df['embedding'].tolist())
song_embeddings = torch.tensor(song_embeddings, dtype=torch.float32)
movie_embeddings = np.array(movies_df['embedding'].tolist())
movie_embeddings = torch.tensor(movie_embeddings, dtype=torch.float32)


def _replace_with_latent(frame, embeddings, model, label, batch_size=4096):
    """Overwrite ``frame['embedding']`` with ``model``'s latent codes for ``embeddings``.

    This cell replaces the column in place, so running it a second time would
    feed latent-sized vectors back into the encoder. When the vectors already
    have the encoder's output width, the frame is left untouched instead.
    Encoding is done in chunks of ``batch_size`` rows so the whole table never
    has to sit on the device at once; the encoder runs in eval mode, so each
    row is encoded independently of its chunk.
    """
    already_latent = (
        embeddings.ndim == 2
        and model.latent_dim != model.input_dim
        and embeddings.shape[1] == model.latent_dim
    )
    if already_latent:
        print(f"{label}: embeddings already have latent width {model.latent_dim}; skipping re-encode")
        return

    latent_chunks = [
        model.encode(embeddings[start:start + batch_size]).cpu()
        for start in range(0, embeddings.shape[0], batch_size)
    ]
    # Stored as one Python list per row, as before.
    frame["embedding"] = torch.cat(latent_chunks).numpy().tolist()


_replace_with_latent(songs_df, song_embeddings, bestModel, "songs_df")
_replace_with_latent(movies_df, movie_embeddings, bestModel, "movies_df")

In [15]:
# Drop the names bound to the raw embedding tensors; they are not used again below.
del song_embeddings, movie_embeddings

In [23]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Stack the per-row embedding vectors into 2-D arrays (one row per item).
movie_emb = np.vstack(movies_df['embedding'].values)
song_emb  = np.vstack(songs_df['embedding'].values)

# Named `movie_knn` rather than `nn`: the notebook imports `torch.nn as nn`, and
# the model classes look that alias up when they are instantiated, so rebinding
# `nn` here would break any later model construction.
movie_knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute')
movie_knn.fit(movie_emb)

titles = movies_df['title'].tolist()
# Extract the song columns once; a per-row `.iloc` lookup builds a Series per song.
song_titles = songs_df['title'].tolist()
song_tags   = songs_df['tag'].tolist()
song_lyrics = songs_df['lyrics'].tolist()
recs = []

# Query in chunks so only one chunk's neighbour search is in flight at a time.
batch_size = 500
for start in range(0, song_emb.shape[0], batch_size):
    batch = song_emb[start:start + batch_size]
    dist, idx = movie_knn.kneighbors(batch)

    for offset, (neigh_idxs, neigh_dists) in enumerate(zip(idx, dist)):
        song_pos = start + offset
        row = {
            'song_title': song_titles[song_pos],
            'tag':        song_tags[song_pos],
            'lyrics':     song_lyrics[song_pos],
        }
        for rank, (m_idx, d) in enumerate(zip(neigh_idxs, neigh_dists), start=1):
            row[f'best{rank}_title'] = titles[m_idx]
            # Cosine distance -> cosine similarity.
            row[f'best{rank}_score'] = float(1.0 - d)
        recs.append(row)

rec_df = pd.DataFrame(recs)
rec_df.to_csv("autoencoder_recommendation_with_scores.csv",index=False)
rec_df.head()

OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [32]:
# Row and column counts of the recommendation table.
rec_df.shape

(519948, 13)