# Spotify Music Recommendation

In [41]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Recommendations using 'data.csv' file only

In [22]:
# Load the CSV file
df = pd.read_csv('data.csv')

In [7]:
# Drop unnecessary columns
df = df.drop(['id', 'name', 'release_date'], axis=1, errors='ignore')

# Handle missing values
df = df.dropna()

# Automatically select numerical columns
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Initialize the scaler
scaler = MinMaxScaler()

# Apply scaling to numerical columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [9]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(df[numerical_columns])

# Function to get recommendations
def recommend_songs(song_index, num_recommendations=5):
    similarity_scores = list(enumerate(similarity_matrix[song_index]))
    sorted_songs = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_songs = sorted_songs[1:num_recommendations + 1]
    return df.iloc[[i[0] for i in top_songs]]

In [10]:
# Test the recommender
song_index = 10  # Index of a song to base recommendations on
recommended_songs = recommend_songs(song_index)
print(recommended_songs)

     danceability    energy       key  loudness  mode  speechiness  \
54       0.285539  0.997987  0.727273  0.913713   1.0     0.201484   
189      0.116422  0.983897  0.545455  0.917595   1.0     0.224912   
153      0.278186  0.983897  0.545455  0.887314   1.0     0.099961   
75       0.397059  0.971820  0.545455  0.900989   1.0     0.099375   
144      0.000000  0.950684  0.727273  0.911033   1.0     0.131199   

     acousticness  instrumentalness  liveness   valence     tempo  \
54       0.000034          0.600619  0.289882  0.109770  0.399166   
189      0.000003          0.887513  0.053676  0.073780  0.458065   
153      0.000049          0.869969  0.072845  0.352175  0.306995   
75       0.000014          0.348813  0.023504  0.528951  0.365361   
144      0.000022          0.379773  0.020337  0.316185  0.003838   

     duration_ms  time_signature  liked  
54      0.294737            0.75    0.0  
189     0.361073            0.75    0.0  
153     0.286426            0.75    0.

# Recommendations using 'data.csv' and json files 

In [15]:
# Load the JSON files
with open('good.json', 'r') as file:
    good_data = json.load(file)

with open('dislike.json', 'r') as file:
    disliked_data = json.load(file)

# Display the content to understand its structure
print(good_data)
print(disliked_data)

{'audio_features': [{'danceability': 0.749, 'energy': 0.839, 'key': 6, 'loudness': -4.847, 'mode': 1, 'speechiness': 0.297, 'acousticness': 0.0867, 'instrumentalness': 0, 'liveness': 0.204, 'valence': 0.804, 'tempo': 172.068, 'type': 'audio_features', 'id': '55mcupbf7cIsuCEVAuTJVk', 'uri': 'spotify:track:55mcupbf7cIsuCEVAuTJVk', 'track_href': 'https://api.spotify.com/v1/tracks/55mcupbf7cIsuCEVAuTJVk', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/55mcupbf7cIsuCEVAuTJVk', 'duration_ms': 111000, 'time_signature': 4}, {'danceability': 0.573, 'energy': 0.581, 'key': 10, 'loudness': -9.026, 'mode': 0, 'speechiness': 0.339, 'acousticness': 0.753, 'instrumentalness': 1.39e-06, 'liveness': 0.13, 'valence': 0.351, 'tempo': 76.506, 'type': 'audio_features', 'id': '57RtLWT7IpugV0yi5bsxJk', 'uri': 'spotify:track:57RtLWT7IpugV0yi5bsxJk', 'track_href': 'https://api.spotify.com/v1/tracks/57RtLWT7IpugV0yi5bsxJk', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/57RtLWT7IpugV0yi5

In [24]:
# Convert JSON data into DataFrames
good_df = pd.json_normalize(good_data)
disliked_df = pd.json_normalize(disliked_data)

# Combine both DataFrames
combined_df = pd.concat([good_df, disliked_df], ignore_index=True)

In [26]:
# Load the CSV file
data_csv = pd.read_csv('data.csv')

In [37]:
# Normalize the 'audio_features' column from combined_df
audio_features_df = pd.json_normalize(combined_df['audio_features'])

# Combine the normalized audio features with the original CSV data
final_df = pd.concat([data_csv, audio_features_df], axis=1)

# Normalize all the columns that have nested dictionaries
for col in final_df.columns:
    if isinstance(final_df[col].iloc[0], dict):  # Check if the column contains nested dictionaries
        final_df = pd.concat([final_df.drop(columns=[col]), pd.json_normalize(final_df[col])], axis=1)

final_df = final_df.dropna(axis=1, thresh=int(0.5 * len(final_df)))

# Check the final DataFrame
print(final_df.head())

   danceability  energy  key  loudness  mode  speechiness  acousticness  \
0         0.803  0.6240    7    -6.764     0       0.0477         0.451   
1         0.762  0.7030   10    -7.951     0       0.3060         0.206   
2         0.261  0.0149    1   -27.528     1       0.0419         0.992   
3         0.722  0.7360    3    -6.994     0       0.0585         0.431   
4         0.787  0.5720    1    -7.516     1       0.2220         0.145   

   instrumentalness  liveness  valence    tempo  duration_ms  time_signature  \
0          0.000734    0.1000   0.6280   95.968       304524               4   
1          0.000000    0.0912   0.5190  151.329       247178               4   
2          0.897000    0.1020   0.0382   75.296       286987               4   
3          0.000001    0.1230   0.5820   89.860       208920               4   
4          0.000000    0.0753   0.6470  155.117       179413               4   

   liked  
0      0  
1      1  
2      0  
3      1  
4      1  


In [47]:
# Compute pairwise cosine similarity between songs based on audio features
cosine_sim = cosine_similarity(final_df.drop(['liked'], axis=1))

# Function to recommend songs based on a given song index
def recommend_song(song_index, top_n=5):
    # Get similarity scores for the song
    sim_scores = list(enumerate(cosine_sim[song_index]))
    
    # Sort songs by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N most similar songs (excluding the song itself)
    sim_scores = sim_scores[1:top_n+1]
    
    song_indices = [i[0] for i in sim_scores]
    similarity_scores = [i[1] for i in sim_scores]
    
    recommended_songs = final_df.iloc[song_indices]
    
    # Add similarity scores to the output
    recommended_songs['similarity_score'] = similarity_scores
    
    return recommended_songs[['similarity_score', 'danceability', 'energy', 'loudness']]


In [48]:
# Example: recommend songs similar to the first song
recommended_songs = recommend_song(10, top_n=5)
print(recommended_songs)

     similarity_score  danceability  energy  loudness
40                1.0         0.358   0.977    -8.179
94                1.0         0.908   0.610    -5.735
114               1.0         0.256   0.952    -8.631
24                1.0         0.729   0.533   -10.104
149               1.0         0.829   0.708    -5.050


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_songs['similarity_score'] = similarity_scores
