## Collaborative filtering by user

User --> Songs ID list

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Load the dataset
file_path = 'song_dataset.csv'  # Update the file path if needed
data = pd.read_csv(file_path)

# Preprocessing
## Remove duplicates if any
data = data.drop_duplicates(['user', 'song'])

## Create a pivot table with users as rows and songs as columns
pivot_table = data.pivot(index='user', columns='song', values='play_count').fillna(0)

## Convert the pivot table to a sparse matrix
matrix = csr_matrix(pivot_table.values)
 
# Building the Model - Using K-Nearest Neighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(matrix)

# Function to recommend songs for a user
def recommend_songs_for_user(model, pivot_table, user_id, n_recommendations):
    if user_id not in pivot_table.index:
        return "User ID not found in the dataset."

    user_idx = pivot_table.index.get_loc(user_id)
    distances, indices = model.kneighbors(pivot_table.iloc[user_idx, :].values.reshape(1, -1), n_neighbors=20)
    
    # Get the songs listened to by the user
    listened_songs = set(pivot_table.columns[pivot_table.iloc[user_idx].to_numpy().nonzero()[0]].tolist())

    recommend_list = set()
    for idx in indices.flatten():
        if idx == user_idx:
            continue  # skip the user itself
        # Add songs listened by similar users, excluding those already listened by the user
        user_songs = set(pivot_table.columns[pivot_table.iloc[idx].to_numpy().nonzero()[0]].tolist())
        recommend_list.update(user_songs)

    # Remove already listened songs and limit the number of recommendations
    recommend_list.difference_update(listened_songs)
    recommend_list = list(recommend_list)[:n_recommendations]

    return recommend_list

# Example Usage
user_id = '969cc6fb74e076a68e36a04409cb9d3765757508'
recommendations = recommend_songs_for_user(model_knn, pivot_table, user_id, 10)
print("Recommended Songs for User", user_id, ":")
print(recommendations)

Recommended Songs for User 969cc6fb74e076a68e36a04409cb9d3765757508 :
['SOERLVY12AB01842AA', 'SOFXVLQ12AF72AD42E', 'SOALKBV12A6D4F6EE2', 'SODHPUM12AF72A1DF2', 'SORWULZ12A6D4F5B1E', 'SOPTLQL12AB018D56F', 'SOINKUL12AB0188B02', 'SOETLEX12AF72A3070', 'SOSDIVF12A8C13F067', 'SORUYEG12B0B807430']


In [2]:
# List of song IDs for which you want to find the song names
song_ids = recommendations

# Filter the dataset for the given song IDs and retrieve their names
song_names = data[data['song'].isin(song_ids)]['title'].drop_duplicates().tolist()

# Print the song names
print("Song Names:")
for name in song_names:
    print(name)

Song Names:
Billionaire [feat. Bruno Mars]  (Explicit Album Version)
Queen Of My Double Wide Trailer
Blind
Bumpy's Lament
So Beautiful
Get Up
You Will Always Be The Same
Becoming Insane
Missy
Be My Baby


## By list of songs :

In [3]:
# list = ['SOBFNSP12AF72A0E22', 'SOIAOBY12A8C13BF75','SORJMZL12A8C13AC49']

user_song_ids = ['SOBFNSP12AF72A0E22', 'SOIAOBY12A8C13BF75', 'SORJMZL12A8C13AC49']

## Collaborative filtering

Songs ID list --> Recommended songs ID list

In [4]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Load the dataset
data = pd.read_csv('song_dataset.csv')  # Update with the path to your dataset

# Preprocessing
## Remove duplicates if any
data = data.drop_duplicates(['user', 'song'])

## Aggregate play counts for each user-song pair
data_agg = data.groupby(['user', 'song']).play_count.sum().reset_index()

## Create a pivot table
pivot_table = data_agg.pivot(index='user', columns='song', values='play_count').fillna(0)

## Create a sparse matrix
matrix = csr_matrix(pivot_table.values)

# Compute Cosine Similarity
cosine_sim = cosine_similarity(matrix)

# Function to Recommend Songs
def recommend_songs_collaborative_filtering(input_songs, pivot_table, cosine_sim, top_n=10):
    # Create a pseudo-user vector
    pseudo_user = pd.Series(0, index=pivot_table.columns)
    for song in input_songs:
        if song in pseudo_user.index:
            pseudo_user[song] = 1

    # Find similar users
    pseudo_user_matrix = csr_matrix(pseudo_user.values.reshape(1, -1))
    sim_scores = cosine_similarity(pseudo_user_matrix, matrix).flatten()
    similar_users = sim_scores.argsort()[::-1][1:]  # Excluding the pseudo-user itself

    # Aggregate recommendations from similar users
    song_recs = {}
    for user_idx in similar_users[:20]:  # Consider top 20 similar users
        user_songs = pivot_table.columns[(pivot_table.iloc[user_idx] > 0)].tolist()
        for song in user_songs:
            if song not in input_songs:
                song_recs[song] = song_recs.get(song, 0) + sim_scores[user_idx]

    recommended_songs = sorted(song_recs, key=song_recs.get, reverse=True)[:top_n]
    return recommended_songs

# Example Usage
input_songs = user_song_ids
recommended_song_ids_cf = recommend_songs_collaborative_filtering(input_songs, pivot_table, cosine_sim)
print("Recommended Songs:", recommended_song_ids_cf)

Recommended Songs: ['SOAXGDH12A8C13F8A1', 'SODJWHY12A8C142CCE', 'SOFRQTD12A81C233C0', 'SOLFXKT12AB017E3E0', 'SONYKOW12AB01849C9', 'SOTWNDJ12A8C143984', 'SOUSMXX12AB0185C24', 'SOUVTSM12AC468F6A7', 'SOWCKVR12A8C142411', 'SOAUWYT12A81C206F1']


In [5]:
# List of song IDs for which you want to find the song names
song_ids = recommended_song_ids_cf

# Filter the dataset for the given song IDs and retrieve their names
song_names = data[data['song'].isin(song_ids)]['title'].drop_duplicates().tolist()

# Print the song names
print("Song Names:")
for name in song_names:
    print(name)

Song Names:
Sehr kosmisch
Undo
Dog Days Are Over (Radio Edit)
Hey_ Soul Sister
Fireflies
Secrets
Marry Me
OMG
Drop The World
Use Somebody


## Content-Based Filtering

Songs ID list --> Recommended songs ID list

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
data = pd.read_csv('song_dataset.csv')  # Update with the path to your dataset

# Data Preprocessing
## Combine text features for content-based filtering
data['combined_features'] = data['title'] + ' ' + data['artist_name'] + ' ' + data['release']

# Content-based Features using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

def recommend_songs_content_based(input_songs, data, tfidf_matrix, top_n=10):
    song_indices = [data.index[data['song'] == song_id].tolist()[0] for song_id in input_songs if song_id in data['song'].values]
    
    # Aggregate the similarities of input songs with all songs
    aggregate_sim_scores = sum(cosine_similarity(tfidf_matrix[song_idx], tfidf_matrix) for song_idx in song_indices)

    # Flatten the similarity scores array and get top N indices
    sim_scores_flattened = aggregate_sim_scores.flatten()
    recommended_song_indices = sim_scores_flattened.argsort()[-top_n-len(input_songs):-len(input_songs)][::-1]
    
    # Get recommended song IDs excluding the input songs
    recommended_song_ids = [data.iloc[idx]['song'] for idx in recommended_song_indices if data.iloc[idx]['song'] not in input_songs]

    return recommended_song_ids


# Example Usage
input_songs = user_song_ids
recommended_song_ids_cb = recommend_songs_content_based(input_songs, data, tfidf_matrix)
print("Recommended Songs:", recommended_song_ids_cb)

Recommended Songs: ['SOJXJHW12A6D4F8AE5', 'SOBVFZR12A6D4F8AE3', 'SOBVFZR12A6D4F8AE3', 'SOEOBYG12A6D4F8AE2', 'SOEOBYG12A6D4F8AE2', 'SOEGVZY12A58A7857E', 'SORJNVW12A8C13BF90']


In [7]:
# List of song IDs for which you want to find the song names
song_ids = recommended_song_ids_cb

# Filter the dataset for the given song IDs and retrieve their names
song_names = data[data['song'].isin(song_ids)]['title'].drop_duplicates().tolist()

# Print the song names
print("Song Names:")
for name in song_names:
    print(name)

Song Names:
Ears To The Ground (Album Version)
Nothing Gives Me Pleasure
Ugly Stories
Wonderful (Album Version)
Women And Men (Album Version)


## Hybrid approach

Songs ID list --> Recommended songs ID list

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, vstack

# Load the dataset
data = pd.read_csv('song_dataset.csv')  # Update with your dataset's file path

# Data Preprocessing

## Remove duplicates if any
data = data.drop_duplicates(['user', 'song'])

## Combine text features for content-based filtering
data['combined_features'] = data['title'] + ' ' + data['artist_name'] + ' ' + data['release']

# Content-based Features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

# Data Preprocessing for Collaborative Filtering
## Aggregate play counts for each user-song pair
agg_data = data.groupby(['user', 'song']).agg({'play_count': 'sum'}).reset_index()

## Create pivot table
pivot_table = agg_data.pivot(index='user', columns='song', values='play_count').fillna(0)
matrix = csr_matrix(pivot_table.values)
svd = TruncatedSVD(n_components=20)  # Reduced number of components
latent_matrix = svd.fit_transform(matrix)

def recommend_songs_hybrid(user_songs, all_songs, tfidf_matrix, latent_matrix, top_n=10):
    """
    Recommends songs based on a hybrid approach combining content-based and collaborative filtering.
    
    Args:
    user_songs: List of song IDs the user has already listened to.
    all_songs: DataFrame of all songs.
    tfidf_matrix: TF-IDF matrix for content-based features.
    latent_matrix: Matrix from SVD for collaborative features.
    top_n: Number of top recommendations to return.
    
    Returns:
    DataFrame of recommended songs.
    """
    # Filter out user songs that are not in the dataset
    valid_user_songs = [song for song in user_songs if song in all_songs['song'].values]

    # Get indices for valid user songs in the TF-IDF matrix
    valid_song_indices = [all_songs[all_songs['song'] == song].index[0] for song in valid_user_songs]
    
    # Content-based recommendations
    song_matrix = vstack([tfidf_matrix[i] for i in valid_song_indices])
    cosine_sim = cosine_similarity(song_matrix, tfidf_matrix)
    sim_scores = cosine_sim.mean(axis=0)
    content_based_recommendations = [all_songs['song'].iloc[i] for i in sim_scores.argsort()[-top_n:][::-1]]

    # Collaborative filtering recommendations
    user_vector = latent_matrix.mean(axis=0).reshape(1, -1)
    cosine_sim = cosine_similarity(user_vector, latent_matrix)
    sim_scores = cosine_sim[0]
    collaborative_recommendations = [all_songs['song'].iloc[i] for i in sim_scores.argsort()[-top_n:][::-1]]

    # Combine and filter out songs the user has already listened to
    combined_recommendations = list(set(content_based_recommendations + collaborative_recommendations))
    combined_recommendations = [song for song in combined_recommendations if song not in valid_user_songs]

    return all_songs[all_songs['song'].isin(combined_recommendations)].drop_duplicates('song').head(top_n)

# Example Usage
recommended_songs_hybrid = recommend_songs_hybrid(user_song_ids, data, tfidf_matrix, latent_matrix)
print("Recommended Songs:", recommended_songs_hybrid['song'].tolist())


Recommended Songs: ['SOBVFZR12A6D4F8AE3', 'SOEGVZY12A58A7857E', 'SOEOBYG12A6D4F8AE2', 'SOIQOQT12A8C136F96', 'SOOSIVQ12A6D4F8AE0', 'SOUSMXX12AB0185C24', 'SOGIYND12AB017B10E', 'SOCKYCG12A58A78E37', 'SOBNXJY12A8C13E070', 'SOXTMZY12AB01866D5']


In [9]:
# List of song IDs for which you want to find the song names
song_ids = recommended_songs_hybrid['song'].tolist()

# Filter the dataset for the given song IDs and retrieve their names
song_names = data[data['song'].isin(song_ids)]['title'].drop_duplicates().tolist()

# Print the song names
print("Song Names:")
for name in song_names:
    print(name)

Song Names:
Ears To The Ground (Album Version)
Nothing Gives Me Pleasure
Ugly Stories
Pilgrim
Christmas With Jesus (Album Version)
OMG
Live-In Skin
Hippie Priest Bum-out (Edit)
The Execution Of All Things (Single Version)
Underdog (In The Style of 'You Me At Six') [No Backing Vocals]


## Calculate similarities in the results 

In [10]:
def calculate_overlap_ratio(list1, list2):
    """Calculate the overlap ratio between two lists."""
    common_elements = set(list1).intersection(set(list2))
    return len(common_elements) / min(len(list1), len(list2))

def calculate_jaccard_similarity(list1, list2):
    """Calculate Jaccard similarity between two lists."""
    intersection = set(list1).intersection(set(list2))
    union = set(list1).union(set(list2))
    return len(intersection) / len(union)

recs_collaborative = recommended_song_ids_cf # recommendations from collaborative method
recs_cb = recommended_song_ids_cb # recommendations from Content-Based method
recs_hybrid = recommended_songs_hybrid['song'].tolist()   # recommendations from hybrid method

# Calculate similarities
overlap_ratio = calculate_overlap_ratio(recs_hybrid, recs_collaborative)
jaccard_similarity = calculate_jaccard_similarity(recs_hybrid, recs_collaborative)
print(f"collaborative vs hybrid :")
print(f"Overlap Ratio : {overlap_ratio}")
print(f"Jaccard Similarity : {jaccard_similarity}")

# Calculate similarities
overlap_ratio = calculate_overlap_ratio(recs_hybrid, recs_cb)
jaccard_similarity = calculate_jaccard_similarity(recs_hybrid, recs_cb)
print(f"Content-Based vs hybrid :")
print(f"Overlap Ratio : {overlap_ratio}")
print(f"Jaccard Similarity : {jaccard_similarity}")



collaborative vs hybrid :
Overlap Ratio : 0.1
Jaccard Similarity : 0.05263157894736842
Content-Based vs hybrid :
Overlap Ratio : 0.42857142857142855
Jaccard Similarity : 0.25
