### ***First : importing the libraries***

In [18]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

### ***second : introducing the dataset***

In [19]:

file_path = '/kaggle/input/spotify-1million-tracks/spotify_data.csv'
music_data = pd.read_csv(file_path)

print(music_data.head())


   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0429      

In [20]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  

In [21]:
columns_to_drop = ['Unnamed: 0', 'track_id', 'year']
music_data = music_data.drop(columns_to_drop, axis=1)

In [22]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159749 non-null  object 
 1   track_name        1159763 non-null  object 
 2   popularity        1159764 non-null  int64  
 3   genre             1159764 non-null  object 
 4   danceability      1159764 non-null  float64
 5   energy            1159764 non-null  float64
 6   key               1159764 non-null  int64  
 7   loudness          1159764 non-null  float64
 8   mode              1159764 non-null  int64  
 9   speechiness       1159764 non-null  float64
 10  acousticness      1159764 non-null  float64
 11  instrumentalness  1159764 non-null  float64
 12  liveness          1159764 non-null  float64
 13  valence           1159764 non-null  float64
 14  tempo             1159764 non-null  float64
 15  duration_ms       1159764 non-null  int64  
 16  

In [23]:
# Replacing the missing numerical values with the mean of each - to prevent outliers in the data
numerical_columns = music_data.select_dtypes(include=['float64', 'int64']).columns
music_data[numerical_columns] = music_data[numerical_columns].apply(lambda x: x.fillna(x.mean()), axis=0)
# Fill missing values for 'artist_name' and 'track_name' with a placeholder 
music_data[['artist_name', 'track_name']] = music_data[['artist_name', 'track_name']].fillna('Unknown')

In [24]:
music_data.info()
#Sweet, we have a perfect line of 1159764s!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159764 non-null  object 
 1   track_name        1159764 non-null  object 
 2   popularity        1159764 non-null  int64  
 3   genre             1159764 non-null  object 
 4   danceability      1159764 non-null  float64
 5   energy            1159764 non-null  float64
 6   key               1159764 non-null  int64  
 7   loudness          1159764 non-null  float64
 8   mode              1159764 non-null  int64  
 9   speechiness       1159764 non-null  float64
 10  acousticness      1159764 non-null  float64
 11  instrumentalness  1159764 non-null  float64
 12  liveness          1159764 non-null  float64
 13  valence           1159764 non-null  float64
 14  tempo             1159764 non-null  float64
 15  duration_ms       1159764 non-null  int64  
 16  

In [25]:
music_data.shape

(1159764, 17)

In [26]:
music_data['genre'].value_counts()

genre
black-metal       21852
gospel            21621
ambient           21389
acoustic          21097
alt-rock          20918
                  ...  
chicago-house      5170
dubstep            4774
detroit-techno     3920
rock               3319
songwriter          589
Name: count, Length: 82, dtype: int64

In [27]:
music_data.head()

Unnamed: 0,artist_name,track_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,68,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,50,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,57,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,58,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,54,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


### ***Third : Making a train & test set***

In [28]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(music_data, test_size=0.2, random_state=42)

print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

Train set shape: (927811, 17)
Test set shape: (231953, 17)


In [29]:
music_data = train_set.copy()

### ***Fourth : Feature normalization using Min-Max Scalar***

In [30]:
from sklearn.preprocessing import MinMaxScaler

#numerical features for scaling
numerical_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'popularity' ,
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
]

# Initialize the scaler
scaler = MinMaxScaler()

# Normalize the numerical features
music_data[numerical_features] = scaler.fit_transform(music_data[numerical_features])

# Check the normalized values
music_data.head()

Unnamed: 0,artist_name,track_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
931854,Kenny Bee,別恨離愁,0.0,cantopop,0.638469,0.254,8,0.721652,1,0.028115,0.831325,1e-06,0.112,0.33,0.331437,0.037298,4
892251,Demarkus Lewis,Don't Test Me,0.0,deep-house,0.853978,0.526,10,0.691514,0,0.091864,3.3e-05,0.0693,0.0847,0.238,0.503966,0.064765,4
486917,Sam Tallent,Money,0.03,comedy,0.460222,0.816,11,0.781569,1,0.654995,0.929719,0.00012,0.943,0.697,0.506306,0.007592,3
1126450,London Elektricity,Yikes!,0.14,drum-and-bass,0.495468,0.968,2,0.845143,1,0.126674,0.001918,0.835,0.202,0.355,0.692019,0.068603,4
171470,Steve Hofstetter,Seeing Red,0.06,comedy,0.443102,0.523,2,0.599608,0,0.930999,0.860442,0.0,0.951,0.304,0.305833,0.027909,4


### ***Fifth : Defining the recommendation system***
#### ***Content-based recommendation system using cosine similarity***
##### with artists

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def get_song_features(index, data):
    # Select the relevant feature columns for the song (numerical features)
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
    ]
    return data.loc[index, feature_columns].values.reshape(1, -1)

def recommend_songs_based_on_artist_popularity_and_features_by_name(song_name, data, top_n=5):
    # Find the song index by name
    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    # Get the first index (in case there are multiple songs with the same name)
    index = song_index[0]

    # Get the artist and popularity of the song at the specified index
    target_artist = data.loc[index, 'artist_name']
    target_popularity = data.loc[index, 'popularity']
    
    # Get all songs by the same artist
    artist_songs = data[data['artist_name'] == target_artist]
    
    # Exclude the target song from recommendations
    artist_songs = artist_songs[artist_songs.index != index]
    
    # Sort the songs by popularity in descending order
    artist_songs = artist_songs.sort_values(by='popularity', ascending=False)
    
    # Get the feature vector of the input song
    target_features = get_song_features(index, data)
    
    # Initialize list to store similarity scores
    similarity_scores = []
    
    # Calculate cosine similarity for each song by the same artist
    for i, row in artist_songs.iterrows():
        song_features = get_song_features(i, data)
        similarity = cosine_similarity(target_features, song_features)[0][0]
        similarity_scores.append((i, similarity, row['popularity'], row['track_name']))
    
    # Sort by similarity first and popularity second
    similarity_scores.sort(key=lambda x: (x[1], x[2]), reverse=True)
    
    # Get the top N recommendations based on the similarity and popularity
    top_recommendations = similarity_scores[:top_n]
    
    # Return the recommended songs by track name
    recommended_songs = []
    for song_idx, sim, _, track_name in top_recommendations:
        recommended_songs.append({
            'track_name': track_name,
            'artist_name': data.loc[song_idx, 'artist_name'],
            'popularity': data.loc[song_idx, 'popularity']
        })
    
    return pd.DataFrame(recommended_songs)

# Example usage:
song_name = 'Shape of You'  # Replace with the song name you're interested in
recommendations = recommend_songs_based_on_artist_popularity_and_features_by_name(song_name, music_data)
print(recommendations)


            track_name artist_name  popularity
0        Ass Back Home     Secrets        0.43
1            Let Me In     Secrets        0.19
2              The End     Secrets        0.18
3      Fragile Figures     Secrets        0.18
4  Sleep Well, Darling     Secrets        0.40


##### without artist recommendation

#### ***Content-based recommendation system using Kmeans***

In [32]:
from sklearn.cluster import KMeans

# Apply K-means clustering on the scaled numerical features
kmeans = KMeans(n_clusters=10, random_state=42)  # You can change the number of clusters as needed

# Fit the model and assign the cluster labels to the music_data
music_data['cluster'] = kmeans.fit_predict(music_data[numerical_features])

# Check the first few rows to ensure the cluster assignments are added
print(music_data[['track_name', 'cluster']].head())




            track_name  cluster
931854            別恨離愁        5
892251   Don't Test Me        8
486917           Money        7
1126450         Yikes!        1
171470      Seeing Red        7


In [33]:
music_data[['track_name', 'cluster']].head()

Unnamed: 0,track_name,cluster
931854,別恨離愁,5
892251,Don't Test Me,8
486917,Money,7
1126450,Yikes!,1
171470,Seeing Red,7


In [36]:
def recommend_songs_from_cluster(song_name, data, top_n=5):
    # Find the index of the song by name (case insensitive)
    song_indices = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_indices) == 0:
        return f"Song '{song_name}' not found in the dataset."
    
    # Pick the first matching song
    target_index = song_indices[0]
    
    # Get the cluster of the target song
    target_cluster = data.loc[target_index, 'artist_name']
    
    # Get all songs in the same cluster
    cluster_songs = data[data['artist_name'] == target_cluster]
    
    # Exclude the target song from recommendations
    cluster_songs = cluster_songs[cluster_songs['track_name'] != song_name]

    recommended_songs = cluster_songs.sort_values(by='popularity', ascending=False)
    
  
    return recommended_songs[['track_name', 'artist_name', 'popularity']].head(top_n)

# User Input - MR:
song_name = "Strangers in The Night"  # Replace with the song name you're interested in - DM
recommendations = recommend_songs_from_cluster(song_name, music_data)
print(recommendations)


                                         track_name artist_name  popularity
97161      Pennies From Heaven (with Michael Bublé)   Paul Anka        0.48
97234   It's Hard To Say Goodbye (with Céline Dion)   Paul Anka        0.46
653027                           Puppy Love - Remix   Paul Anka        0.44
97236                   My Way (with Frank Sinatra)   Paul Anka        0.43
519560                           You Are My Destiny   Paul Anka        0.41
