In [7]:
from datasets import load_dataset

# Load the dataset
ds = load_dataset("bigdata-pw/Pitchfork", split="train")

# Split the dataset into 80% train and 20% test
train_test_split = ds.train_test_split(test_size=0.2)

train_ds = train_test_split['train']
test_ds = train_test_split['test']

In [20]:
print(ds[0:5])

{'alternativeHeadline': ['Each Sunday, Pitchfork takes an in-depth look at a significant album from the past, and any record not in our archives is eligible. Today, we explore the solitary sound of the incomparable Nebraska.', 'The debut album from the meteoric pop star lives in a world of its own: gothic, bass-heavy, at turns daring and quite beautiful.', 'The double-disc second album from all nine rappers can feel bloated and disjointed, but it is also the greatest distillation of the Wu at their most chaotic and functional.', "Tied to a film-related renewed interest in the band, Joy Division's three formative, formidable works get cleaned up and reissued in deluxe form.", 'The Texas singer uses experimental vocal collages set against actual Lynchian backdrops to create a uniquely off-kilter kind of evanescence.'], 'description': ['Each Sunday, Pitchfork takes an in-depth look at a significant album from the past, and any record not in our archives is eligible. Today, we explore the 

In [40]:
import pandas as pd

# Create a DataFrame with titles, descriptions, and images
titles_and_descriptions = pd.DataFrame({
    'title': [item['headline'] for item in ds],
    'description': [item['reviewBody'] for item in ds],
    'image': [item['thumbnailUrl'] for item in ds]
})

results = [split_author_album(x) for x in ds['itemReviewed']]
authors, album_names = zip(*results)  # Unzip the results

# Create the new columns
titles_and_descriptions['author'] = authors
titles_and_descriptions['album_name'] = album_names

# Print the first few rows of the updated DataFrame
print(titles_and_descriptions.head())



                                      title  \
0                                  Nebraska   
1  When We All Fall Asleep, Where Do We Go?   
2                           Wu-Tang Forever   
3        Unknown Pleasures / Closer / Still   
4                       Cellophane Memories   

                                         description  \
0  Bruce Springsteen’s 1982 solo album Nebraska h...   
1  Billie Eilish has suddenly become an obscenely...   
2  In the summer of 1997, the Wu-Tang Clan were i...   
3  Rock history is jammed with messy, stupid, and...   
4  The presence of director David Lynch’s name at...   

                                               image  \
0  https://media.pitchfork.com/photos/5aabf0b32a8...   
1  https://media.pitchfork.com/photos/65f9e40316c...   
2  https://media.pitchfork.com/photos/633c64bf429...   
3                                               None   
4  https://media.pitchfork.com/photos/665f23d6346...   

                      author               

In [41]:
df=titles_and_descriptions

In [42]:
df.head()

Unnamed: 0,title,description,image,author,album_name
0,Nebraska,Bruce Springsteen’s 1982 solo album Nebraska h...,https://media.pitchfork.com/photos/5aabf0b32a8...,Bruce Springsteen,Nebraska
1,"When We All Fall Asleep, Where Do We Go?",Billie Eilish has suddenly become an obscenely...,https://media.pitchfork.com/photos/65f9e40316c...,Billie Eilish,"When We All Fall Asleep, Where Do We Go?"
2,Wu-Tang Forever,"In the summer of 1997, the Wu-Tang Clan were i...",https://media.pitchfork.com/photos/633c64bf429...,Wu-Tang Clan,Wu-Tang Forever
3,Unknown Pleasures / Closer / Still,"Rock history is jammed with messy, stupid, and...",,Joy Division,Unknown Pleasures / Closer / Still
4,Cellophane Memories,The presence of director David Lynch’s name at...,https://media.pitchfork.com/photos/665f23d6346...,Chrystabell / David Lynch,Cellophane Memories


In [43]:
df['ablum_name'] = df['album_name'].str.lower()

In [44]:
df['album_name'].nunique()

13142

In [45]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, 
    BertModel, 
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [49]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

class SimpleAlbumRecommender:
    def __init__(self, model_name='bert-base-uncased'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()
        
        self.df = None
        self.embeddings = None
    
    def fit(self, df, review_col='description', album_col='album_name'):
        print("Initializing recommender with your dataset...")
        self.df = df.copy()
        
        self.df[review_col] = self.df[review_col].astype(str).str.lower()
        
        print("\nDataset Information:")
        print(f"Number of albums: {len(self.df)}")
        print(f"Columns: {self.df.columns.tolist()}")
        
        print("\nComputing BERT embeddings for all reviews...")
        self.embeddings = {}
        
        # Process in batches
        batch_size = 8
        for i in tqdm(range(0, len(self.df), batch_size)):
            batch_reviews = self.df[review_col].iloc[i:i+batch_size].tolist()
            
            # Tokenize
            encoded = self.tokenizer(
                batch_reviews,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )
            
            # Get embeddings
            with torch.no_grad():
                input_ids = encoded['input_ids'].to(self.device)
                attention_mask = encoded['attention_mask'].to(self.device)
                outputs = self.model(input_ids, attention_mask)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            
            # Store embeddings
            for j, embedding in enumerate(batch_embeddings):
                self.embeddings[i+j] = embedding
        
        print("Ready to make recommendations!")
        
    def recommend(self, text, n_recommendations=5):
        """Get album recommendations based on input text"""
        if self.embeddings is None:
            raise ValueError("Please fit the recommender first using .fit()")
            
        # Get embedding for input text
        encoded = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        
        with torch.no_grad():
            input_ids = encoded['input_ids'].to(self.device)
            attention_mask = encoded['attention_mask'].to(self.device)
            outputs = self.model(input_ids, attention_mask)
            input_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()[0]
        
        # Calculate similarities
        similarities = {}
        for idx, embedding in self.embeddings.items():
            similarity = np.dot(input_embedding, embedding) / (
                np.linalg.norm(input_embedding) * np.linalg.norm(embedding)
            )
            similarities[idx] = similarity
        
        # Get top recommendations
        top_indices = sorted(similarities.keys(), 
                           key=lambda x: similarities[x], 
                           reverse=True)[:n_recommendations]
        
        recommendations = self.df.iloc[top_indices]
        return recommendations


recommender = SimpleAlbumRecommender()
recommender.fit(df, review_col='description', album_col='album_name')

# Get recommendations
mood = "I'm feeling sad and need something emotional"
recommendations = recommender.recommend(mood, n_recommendations=5)
print("\nRecommended albums:")
print(recommendations[['album_name']])


Using device: cuda
Initializing recommender with your dataset...

Dataset Information:
Number of albums: 14055
Columns: ['title', 'description', 'image', 'author', 'album_name', 'ablum_name']

Computing BERT embeddings for all reviews...


100%|██████████████████████████████████████████████████████████████████████████████| 1757/1757 [10:02<00:00,  2.92it/s]

Ready to make recommendations!

Recommended albums:
                                           album_name
6494                                    Life Is Yours
13508                                        flounder
1853   Demonstrating Visible Differences of Height EP
11144                         Bark Your Head Off, Dog
94                              Kindness for Weakness





In [51]:
mood = "feeling happy and jpoyful"
recommendations = recommender.recommend(mood, n_recommendations=5)
print("\nRecommended albums:")
print(recommendations[['album_name']])


Recommended albums:
                          album_name
6494                   Life Is Yours
11144        Bark Your Head Off, Dog
3359   Where Polly People Go to Read
12746   Salt and Sugar Look the Same
13960                     Gris Klein
