In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('limited_dataset.csv')
df.head()

  df = pd.read_csv('limited_dataset.csv')


Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,...,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59
0,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1.0,-6.746,...,,,,,,,,,,
1,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1.0,-6.746,...,,,,,,,,,,
2,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1.0,-6.746,...,,,,,,,,,,
3,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1.0,-6.746,...,,,,,,,,,,
4,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1.0,-6.746,...,,,,,,,,,,


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from math import sqrt
import re

# Function to clean non-ASCII characters from strings
def clean_string(text):
    if isinstance(text, str):
        # Remove non-ASCII characters
        return re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Load the dataset
data = pd.read_csv("limited_dataset.csv")

# Clean the 'date_added' column to handle invalid dates
data['date_added'] = pd.to_datetime(data['date_added'], errors='coerce')

# Remove rows with invalid 'date_added'
data = data.dropna(subset=['date_added'])

# Count the number of unique playlists
num_playlists = data['playlist_name'].nunique()
print(f"Total number of playlists: {num_playlists}")

# Sort the data by playlist and date_added
data = data.sort_values(by=['playlist_name', 'date_added'])

# Create a list to store the training data (after removing the 3 most recent songs)
train_data = []
removed_songs = []

# Iterate over playlists and remove the last 3 songs
for playlist, group in data.groupby('playlist_name'):
    if len(group) > 3:
        # Remove the 3 most recent songs
        removed = group.tail(3)
        train = group.head(len(group) - 3)

        removed_songs.append(removed)
        train_data.append(train)

# Concatenate training data back into a single DataFrame
train_data = pd.concat(train_data)

# Concatenate removed songs into a single DataFrame
removed_songs = pd.concat(removed_songs)

# Collaborative Filtering: Create a song-playlist interaction matrix
song_playlist_matrix = pd.pivot_table(train_data, index='track_name', columns='playlist_name', aggfunc='size',
                                      fill_value=0)

# Clean track names in the song-playlist matrix
song_playlist_matrix.index = song_playlist_matrix.index.map(clean_string)

# Compute cosine similarity between songs based on the interaction matrix
cf_similarity_matrix = cosine_similarity(song_playlist_matrix)

# Convert CF similarity matrix to a DataFrame for easier handling
cf_similarity_df = pd.DataFrame(cf_similarity_matrix, index=song_playlist_matrix.index,
                                columns=song_playlist_matrix.index)

# Clean song names by stripping extra spaces and handling encoding
cf_similarity_df.index = cf_similarity_df.index.str.strip()
cf_similarity_df.columns = cf_similarity_df.columns.str.strip()

# Content-Based Filtering: Extract song metadata from the dataset
# Use relevant numeric features for content-based filtering
metadata_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'
]

# Create a feature matrix for content-based filtering
feature_matrix = train_data[['track_name'] + metadata_features].drop_duplicates()
feature_matrix.set_index('track_name', inplace=True)

# Clean track names in the feature matrix
feature_matrix.index = feature_matrix.index.map(clean_string)


# Compute cosine similarity between songs based on content features
cbf_similarity_matrix = cosine_similarity(feature_matrix)

# Convert CBF similarity matrix to a DataFrame
cbf_similarity_df = pd.DataFrame(cbf_similarity_matrix, index=feature_matrix.index,
                                 columns=feature_matrix.index)

# Hybrid Similarity: Combine CF and CBF similarity matrices
alpha = 0.7  # Weight for CF similarity
hybrid_similarity_df = alpha * cf_similarity_df + (1 - alpha) * cbf_similarity_df

# Clean track names in the hybrid similarity DataFrame
hybrid_similarity_df.index = hybrid_similarity_df.index.map(clean_string)
hybrid_similarity_df.columns = hybrid_similarity_df.columns.map(clean_string)

# # Print the hybrid similarity DataFrame
# print("Hybrid Similarity DataFrame:")
# display(hybrid_similarity_df)
# print(hybrid_similarity_df.columns[hybrid_similarity_df.columns.duplicated()])
hybrid_similarity_df = hybrid_similarity_df.loc[:, ~hybrid_similarity_df.columns.duplicated()]

# Function to generate recommendations for each playlist
def get_hybrid_recommendations(playlist_songs, hybrid_similarity_df, top_n=1):
    recommended_songs = {}
    for song in playlist_songs:
        if song in hybrid_similarity_df.index:
            # Sort the similarity scores in descending order and exclude the song itself
            # print(type(hybrid_similarity_df[song]))
            similar_songs = hybrid_similarity_df[song].sort_values(ascending=False)[1:top_n + 1]
            recommended_songs[song] = similar_songs.index.tolist()
        else:
            recommended_songs[song] = []  # If song not found, return empty recommendation
    return recommended_songs

# Function to compute the hybrid similarity between two songs
def compute_hybrid_similarity(song_a, song_b, hybrid_similarity_df):
    if song_a in hybrid_similarity_df.index and song_b in hybrid_similarity_df.columns:
        similarity = hybrid_similarity_df.loc[song_a, song_b]
        # Ensure similarity is a single numeric value
        if isinstance(similarity, pd.Series):
            similarity = similarity.iloc[0]  # Take the first value if it's a Series
        return similarity
    else:
        return 0  # Return 0 if the song is not found in the similarity matrix

# Function to calculate RMSE
def calculate_rmse(recommended_songs, removed_songs, hybrid_similarity_df):
    all_errors = []
    for _, removed_group in removed_songs.groupby('playlist_name'):
        playlist_songs = removed_group['track_name'].tolist()

        for song in playlist_songs:
            recommended_song = recommended_songs.get(song, None)
            if recommended_song:
                # Calculate the similarity between the recommended song and the removed song(s)
                distances = [compute_hybrid_similarity(song, recommended, hybrid_similarity_df) for recommended in
                             recommended_song]
                # Ensure distances contains only numeric values
                if all(isinstance(d, (int, float)) for d in distances):
                    all_errors.append(min(distances))  # Use the minimum distance as the error
                else:
                    print(f"Warning: Non-numeric distances found for song '{song}'. Skipping this song.")
    # Calculate RMSE from the distances (convert distances to errors)
    if all_errors:
        rmse = sqrt(mean_squared_error(np.ones(len(all_errors)), all_errors))  # Using ones as true values (ideal similarity)
    else:
        rmse = 0  # If no valid distances, return RMSE as 0
    return rmse

# Generate hybrid recommendations for all playlists
all_recommended_songs = {}

# Iterate over playlists and generate recommendations for each
for playlist, group in data.groupby('playlist_name'):
    playlist_songs = group['track_name'].tolist()
    all_recommended_songs[playlist] = get_hybrid_recommendations(playlist_songs, hybrid_similarity_df, top_n=1)

# Calculate RMSE
rmse_value = calculate_rmse(all_recommended_songs, removed_songs, hybrid_similarity_df)
print(f"Root Mean Squared Error (RMSE): {rmse_value}")

  data = pd.read_csv("limited_dataset.csv")


Total number of playlists: 6060
Root Mean Squared Error (RMSE): 0.681935792756894
