### ***First : importing the libraries***

In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

### ***second : introducing the dataset***

In [3]:

file_path = '../Spotify_music_dataset/spotify_data.csv'
music_data = pd.read_csv(file_path)

print(music_data.head())


   Unnamed: 0    artist_name        track_name                track_id  \
0           0     Jason Mraz   I Won't Give Up  53QF56cjZA9RTuuMZDrSA6   
1           1     Jason Mraz  93 Million Miles  1s8tP3jP4GZcyHDsjvw218   
2           2  Joshua Hyslop  Do Not Let Me Go  7BRCa8MPiyuvr2VU3O9W0F   
3           3   Boyce Avenue          Fast Car  63wsZUhUZLlh1OsyrZq7sz   
4           4   Andrew Belle  Sky's Still Blue  6nXIYClvJAfi6ujLiKqEq8   

   popularity  year     genre  danceability  energy  key  loudness  mode  \
0          68  2012  acoustic         0.483   0.303    4   -10.058     1   
1          50  2012  acoustic         0.572   0.454    3   -10.286     1   
2          57  2012  acoustic         0.409   0.234    3   -13.711     1   
3          58  2012  acoustic         0.392   0.251   10    -9.845     1   
4          54  2012  acoustic         0.430   0.791    6    -5.419     0   

   speechiness  acousticness  instrumentalness  liveness  valence    tempo  \
0       0.0429      

In [4]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  

In [5]:
columns_to_drop = ['Unnamed: 0', 'track_id', 'year']
music_data = music_data.drop(columns_to_drop, axis=1)

In [6]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159749 non-null  object 
 1   track_name        1159763 non-null  object 
 2   popularity        1159764 non-null  int64  
 3   genre             1159764 non-null  object 
 4   danceability      1159764 non-null  float64
 5   energy            1159764 non-null  float64
 6   key               1159764 non-null  int64  
 7   loudness          1159764 non-null  float64
 8   mode              1159764 non-null  int64  
 9   speechiness       1159764 non-null  float64
 10  acousticness      1159764 non-null  float64
 11  instrumentalness  1159764 non-null  float64
 12  liveness          1159764 non-null  float64
 13  valence           1159764 non-null  float64
 14  tempo             1159764 non-null  float64
 15  duration_ms       1159764 non-null  int64  
 16  

In [7]:
# Replacing the missing numerical values with the mean of each - to prevent outliers in the data
numerical_columns = music_data.select_dtypes(include=['float64', 'int64']).columns
music_data[numerical_columns] = music_data[numerical_columns].apply(lambda x: x.fillna(x.mean()), axis=0)
# Fill missing values for 'artist_name' and 'track_name' with a placeholder 
music_data[['artist_name', 'track_name']] = music_data[['artist_name', 'track_name']].fillna('Unknown')

In [8]:
music_data.info()
#Sweet, we have a perfect line of 1159764s!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   artist_name       1159764 non-null  object 
 1   track_name        1159764 non-null  object 
 2   popularity        1159764 non-null  int64  
 3   genre             1159764 non-null  object 
 4   danceability      1159764 non-null  float64
 5   energy            1159764 non-null  float64
 6   key               1159764 non-null  int64  
 7   loudness          1159764 non-null  float64
 8   mode              1159764 non-null  int64  
 9   speechiness       1159764 non-null  float64
 10  acousticness      1159764 non-null  float64
 11  instrumentalness  1159764 non-null  float64
 12  liveness          1159764 non-null  float64
 13  valence           1159764 non-null  float64
 14  tempo             1159764 non-null  float64
 15  duration_ms       1159764 non-null  int64  
 16  

In [9]:
music_data.shape

(1159764, 17)

In [10]:
music_data['genre'].value_counts()

genre
black-metal       21852
gospel            21621
ambient           21389
acoustic          21097
alt-rock          20918
                  ...  
chicago-house      5170
dubstep            4774
detroit-techno     3920
rock               3319
songwriter          589
Name: count, Length: 82, dtype: int64

In [11]:
music_data.head()

Unnamed: 0,artist_name,track_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,68,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,50,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,57,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,58,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,54,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


### ***Third : Making a train & test set***

In [12]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(music_data, test_size=0.2, random_state=42)

print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

Train set shape: (927811, 17)
Test set shape: (231953, 17)


In [13]:
music_data = train_set.copy()

### ***Fourth : Feature normalization using Min-Max Scalar***

In [14]:
from sklearn.preprocessing import MinMaxScaler

#numerical features for scaling
numerical_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'popularity' ,
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
]

# Initialize the scaler
scaler = MinMaxScaler()

# Normalize the numerical features
music_data[numerical_features] = scaler.fit_transform(music_data[numerical_features])

# Check the normalized values
music_data.head()

Unnamed: 0,artist_name,track_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
931854,Kenny Bee,別恨離愁,0.0,cantopop,0.638469,0.254,8,0.721652,1,0.028115,0.831325,1e-06,0.112,0.33,0.331437,0.037298,4
892251,Demarkus Lewis,Don't Test Me,0.0,deep-house,0.853978,0.526,10,0.691514,0,0.091864,3.3e-05,0.0693,0.0847,0.238,0.503966,0.064765,4
486917,Sam Tallent,Money,0.03,comedy,0.460222,0.816,11,0.781569,1,0.654995,0.929719,0.00012,0.943,0.697,0.506306,0.007592,3
1126450,London Elektricity,Yikes!,0.14,drum-and-bass,0.495468,0.968,2,0.845143,1,0.126674,0.001918,0.835,0.202,0.355,0.692019,0.068603,4
171470,Steve Hofstetter,Seeing Red,0.06,comedy,0.443102,0.523,2,0.599608,0,0.930999,0.860442,0.0,0.951,0.304,0.305833,0.027909,4


### ***Fifth : Defining the recommendation system***
#### ***Content-based recommendation system using cosine similarity***
##### with artists

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def get_song_features(index, data):
    # Select the relevant feature columns for the song (numerical features)
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
    ]
    return data.loc[index, feature_columns].values.reshape(1, -1)

def recommend_songs_based_on_artist_popularity_and_features_by_name(song_name, data, top_n=5):
    # Find the song index by name
    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    # Get the first index (in case there are multiple songs with the same name)
    index = song_index[0]

    # Get the artist and popularity of the song at the specified index
    target_artist = data.loc[index, 'artist_name']
    target_popularity = data.loc[index, 'popularity']
    
    # Get all songs by the same artist
    artist_songs = data[data['artist_name'] == target_artist]
    
    # Exclude the target song from recommendations
    artist_songs = artist_songs[artist_songs.index != index]
    
    # Sort the songs by popularity in descending order
    artist_songs = artist_songs.sort_values(by='popularity', ascending=False)
    
    # Get the feature vector of the input song
    target_features = get_song_features(index, data)
    
    # Initialize list to store similarity scores
    similarity_scores = []
    
    # Calculate cosine similarity for each song by the same artist
    for i, row in artist_songs.iterrows():
        song_features = get_song_features(i, data)
        similarity = cosine_similarity(target_features, song_features)[0][0]
        similarity_scores.append((i, similarity, row['popularity'], row['track_name']))
    
    # Sort by similarity first and popularity second
    similarity_scores.sort(key=lambda x: (x[1], x[2]), reverse=True)
    
    # Get the top N recommendations based on the similarity and popularity
    top_recommendations = similarity_scores[:top_n]
    
    # Return the recommended songs by track name
    recommended_songs = []
    for song_idx, sim, _, track_name in top_recommendations:
        recommended_songs.append({
            'track_name': track_name,
            'artist_name': data.loc[song_idx, 'artist_name'],
            'popularity': data.loc[song_idx, 'popularity']
        })
    
    return pd.DataFrame(recommended_songs)

# Example usage:
song_name = 'Shape of You'  # Replace with the song name you're interested in
recommendations = recommend_songs_based_on_artist_popularity_and_features_by_name(song_name, music_data)
recommendations


Unnamed: 0,track_name,artist_name,popularity
0,Ass Back Home,Secrets,0.43
1,Let Me In,Secrets,0.19
2,The End,Secrets,0.18
3,Fragile Figures,Secrets,0.18
4,"Sleep Well, Darling",Secrets,0.4


##### without artist recommendation

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Helper function to normalize data and create an overall evaluation column
def preprocess_data(data):
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity'
    ]
    # Normalize features
    scaler = MinMaxScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    
    # Create a weighted overall evaluation column (optional)
    data['overall_evaluation'] = (
        0.2 * data['danceability'] +
        0.2 * data['energy'] +
        0.15 * data['valence'] +
        0.1 * data['popularity'] +
        0.1 * data['acousticness'] +
        0.1 * data['instrumentalness'] +
        0.1 * data['tempo']
    )
    return data, feature_columns

# Function to recommend songs using cosine similarity
def recommend_songs(song_name, data, feature_columns, top_n=5):
    # Find the song index by name
    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    # Get the first index (in case there are duplicates)
    index = song_index[0]

    # Extract features for similarity calculation
    song_features = data[feature_columns].values

    # Get the feature vector of the target song
    target_features = song_features[index].reshape(1, -1)

    # Compute cosine similarity for all songs at once
    similarity_scores = cosine_similarity(target_features, song_features)[0]

    # Add similarity scores to the dataframe
    data['similarity_score'] = similarity_scores

    # Exclude the target song from recommendations
    recommendations = data[data.index != index]

    # Sort by similarity and popularity
    recommendations = recommendations.sort_values(
        by=['similarity_score', 'popularity'], ascending=[False, False]
    )

    # Get the top N recommendations
    top_recommendations = recommendations.head(top_n)

    # Return relevant details of the recommended songs
    return top_recommendations[['track_name', 'artist_name', 'popularity', 'similarity_score']]

# Preprocess the dataset
music_data, feature_columns = preprocess_data(music_data)


In [17]:
# Example usage
song_name = 'Legends never die'  # Replace with your song
recommendations = recommend_songs(song_name, music_data, feature_columns)
recommendations

Unnamed: 0,track_name,artist_name,popularity,similarity_score
349194,Mary Had A Little Lamb,HooplaKidz,0.09,1.0
461122,Mary Had a Little Lamb,HooplaKidz,0.01,0.998808
416229,A Song Call Marley - Recorded at Spotify Studi...,Toots & The Maytals,0.12,0.99784
637557,"Madre Mía, Perdóname",El Super Trio,0.11,0.997779
497267,A Loba,Raquel dos Teclados,0.11,0.997613


#### ***Content-based recommendation system using Kmeans***

In [18]:
from sklearn.cluster import KMeans

# Apply K-means clustering on the scaled numerical features
kmeans = KMeans(n_clusters=10, random_state=42)  # You can change the number of clusters as needed

# Fit the model and assign the cluster labels to the music_data
music_data['cluster'] = kmeans.fit_predict(music_data[numerical_features])

# Check the first few rows to ensure the cluster assignments are added
music_data[['track_name', 'cluster']].head()


Unnamed: 0,track_name,cluster
931854,別恨離愁,0
892251,Don't Test Me,8
486917,Money,4
1126450,Yikes!,5
171470,Seeing Red,4


In [19]:
music_data[['track_name', 'cluster']].head()

Unnamed: 0,track_name,cluster
931854,別恨離愁,0
892251,Don't Test Me,8
486917,Money,4
1126450,Yikes!,5
171470,Seeing Red,4


##### With artist recommendations

In [20]:
def recommend_songs_from_cluster(song_name, data, top_n=5):
    # Find the index of the song by name (case insensitive)
    song_indices = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_indices) == 0:
        return f"Song '{song_name}' not found in the dataset."
    
    # Pick the first matching song
    target_index = song_indices[0]
    
    # Get the cluster of the target song
    target_cluster = data.loc[target_index, 'artist_name']
    
    # Get all songs in the same cluster
    cluster_songs = data[data['artist_name'] == target_cluster]
    
    # Exclude the target song from recommendations
    cluster_songs = cluster_songs[cluster_songs['track_name'] != song_name]

    recommended_songs = cluster_songs.sort_values(by='popularity', ascending=False)
    
  
    return recommended_songs[['track_name', 'artist_name', 'popularity']].head(top_n)

In [21]:
# User Input - MR:
song_name = "Strangers in The Night"  # Replace with the song name you're interested in - DM
recommendations = recommend_songs_from_cluster(song_name, music_data)
recommendations


Unnamed: 0,track_name,artist_name,popularity
97161,Pennies From Heaven (with Michael Bublé),Paul Anka,0.48
97234,It's Hard To Say Goodbye (with Céline Dion),Paul Anka,0.46
653027,Puppy Love - Remix,Paul Anka,0.44
97236,My Way (with Frank Sinatra),Paul Anka,0.43
519560,You Are My Destiny,Paul Anka,0.41


##### without artist recommendation

In [22]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [23]:
# Step 1: Preprocess the data
def preprocess_data(data):
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity'
    ]
    # Normalize the features
    scaler = MinMaxScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data, feature_columns

In [24]:
# Step 2: Apply K-means clustering
def apply_kmeans(data, feature_columns, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['cluster'] = kmeans.fit_predict(data[feature_columns])
    return kmeans, data

In [25]:
# Step 3: Recommend songs based on K-means clustering
def recommend_songs_kmeans_only(song_name, data, top_n=5):
    # Find the song index by name
    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    # Get the first index (in case there are duplicates)
    index = song_index[0]
    
    # Identify the cluster of the selected song
    target_cluster = data.loc[index, 'cluster']
    
    # Filter the data to include only songs in the same cluster
    cluster_data = data[data['cluster'] == target_cluster]
    
    # Exclude the selected song from recommendations
    cluster_data = cluster_data[cluster_data.index != index]
    
    # Sort songs by popularity (descending) and select the top N
    recommendations = cluster_data.sort_values(by='popularity', ascending=False).head(top_n)
    
    # Return recommended songs
    return recommendations[['track_name', 'artist_name', 'popularity', 'cluster']]

# Preprocess the dataset
music_data, feature_columns = preprocess_data(music_data)

# Apply K-means clustering
kmeans_model, music_data = apply_kmeans(music_data, feature_columns, n_clusters=10)

In [26]:
# Example usage
song_name = 'Strangers in The Night'  # Replace with the name of the song you're interested in
recommendations = recommend_songs_kmeans_only(song_name, music_data)

print("Recommended Songs:")
recommendations

Recommended Songs:


Unnamed: 0,track_name,artist_name,popularity,cluster
612503,Flowers,Miley Cyrus,1.0,8
605178,"Shakira: Bzrp Music Sessions, Vol. 53",Bizarrap,0.96,8
612504,Die For You - Remix,The Weeknd,0.95,8
569184,Kill Bill,SZA,0.94,8
560200,"Quevedo: Bzrp Music Sessions, Vol. 52",Bizarrap,0.92,8


#### ***A combination of kMeans and cosine similartiy***

In [27]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [28]:
# Step 1: Preprocess the data
def preprocess_data(data):
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity'
    ]
    # Normalize the features
    scaler = MinMaxScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data, feature_columns

In [29]:
# Step 2: Apply K-means clustering
def apply_kmeans(data, feature_columns, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['cluster'] = kmeans.fit_predict(data[feature_columns])
    return kmeans, data

In [30]:




# Step 3: Recommendation system combining K-means and cosine similarity
def recommend_songs_combined(song_name, data, feature_columns, kmeans, top_n=5):
    # Find the song index by name
    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    # Get the first index (in case there are duplicates)
    index = song_index[0]

    # Get the cluster of the target song
    target_cluster = data.loc[index, 'cluster']

    # Filter data to only include songs in the same cluster
    cluster_data = data[data['cluster'] == target_cluster]

    # Get feature vector of the target song
    target_features = data.loc[index, feature_columns].values.reshape(1, -1)

    # Compute cosine similarity within the same cluster
    cluster_features = cluster_data[feature_columns].values
    similarity_scores = cosine_similarity(target_features, cluster_features)[0]

    # Add similarity scores to the cluster data
    cluster_data = cluster_data.copy()
    cluster_data['similarity_score'] = similarity_scores

    # Exclude the target song from recommendations
    cluster_data = cluster_data[cluster_data.index != index]

    # Sort by similarity and popularity
    recommendations = cluster_data.sort_values(
        by=['similarity_score', 'popularity'], ascending=[False, False]
    ).head(top_n)

    # Get input song information
    input_song_info = data.loc[index, ['track_name', 'artist_name', 'popularity', 'cluster'] + feature_columns].to_dict()

    return input_song_info, recommendations[['track_name', 'artist_name', 'popularity', 'similarity_score', 'cluster']]

# Preprocess the dataset
music_data, feature_columns = preprocess_data(music_data)

# Apply K-means clustering
kmeans_model, music_data = apply_kmeans(music_data, feature_columns, n_clusters=10)

In [31]:
# Example usage
song_name = 'shape of you'  # Replace with your desired song
input_song_info, recommendations = recommend_songs_combined(song_name, music_data, feature_columns, kmeans_model)

print("Input Song Information:")
pd.DataFrame([input_song_info]) # Display the input song details

Input Song Information:


Unnamed: 0,track_name,artist_name,popularity,cluster,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Shape Of You,Secrets,0.3,9,0.555891,0.925,0.8523,0.070031,0.001596,0.0,0.164,0.695,0.383979,0.041531


In [32]:

print("\nRecommended Songs:")
recommendations  # Display the recommended songs


Recommended Songs:


Unnamed: 0,track_name,artist_name,popularity,similarity_score,cluster
56907,Beautiful,Pop Evil,0.31,0.999705,9
552262,The Devil,Twiddle,0.28,0.99966,9
177578,Choked Out,The Mountain Goats,0.32,0.999378,9
953190,"Grammy Family (feat. DJ Khaled, Kanye West & J...",Consequence,0.34,0.999328,9
571928,Hang a Cross on Me,Pond,0.31,0.999312,9


## ***Test code***

In [35]:
music_data = test_set.copy()
music_data.head()

Unnamed: 0,artist_name,track_name,popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
882616,Cabas,Amor De Mis Amores,38,alt-rock,0.725,0.553,6,-6.319,0,0.034,0.276,7e-06,0.185,0.729,90.009,206653,4
621408,Ketil Bjørnstad,Første sang,11,swedish,0.277,0.164,9,-16.743,0,0.0373,0.878,0.000181,0.335,0.184,89.308,459733,4
927704,Project 86,Evil (A Chorus Of Resistance),38,alt-rock,0.486,0.927,2,-4.845,0,0.0428,3e-06,0.0145,0.0952,0.377,135.54,183373,4
439351,Ital Tek,Open Heart,18,dubstep,0.411,0.442,1,-12.745,0,0.027,0.485,0.926,0.191,0.172,174.019,347610,3
266036,I-Roy,Irie Right,18,dancehall,0.748,0.66,10,-4.648,0,0.271,0.125,0.0,0.0783,0.4,75.583,196179,4


In [36]:
def preprocess_data(data):
    feature_columns = [
        'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'popularity'
    ]

    scaler = MinMaxScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])
    return data, feature_columns

In [37]:
def apply_kmeans(data, feature_columns, n_clusters=10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['cluster'] = kmeans.fit_predict(data[feature_columns])
    return kmeans, data


In [38]:
def recommend_songs_combined(song_name, data, feature_columns, kmeans, top_n=5):

    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."

In [39]:

def recommend_songs_combined(song_name, data, feature_columns, kmeans, top_n=5):

    song_index = data[data['track_name'].str.lower() == song_name.lower()].index
    
    if len(song_index) == 0:
        return "Song not found in the dataset."
    
    index = song_index[0]

    target_cluster = data.loc[index, 'cluster']

    cluster_data = data[data['cluster'] == target_cluster]

    target_features = data.loc[index, feature_columns].values.reshape(1, -1)

    cluster_features = cluster_data[feature_columns].values
    similarity_scores = cosine_similarity(target_features, cluster_features)[0]

    cluster_data = cluster_data.copy()
    cluster_data['similarity_score'] = similarity_scores

    cluster_data = cluster_data[cluster_data.index != index]

    recommendations = cluster_data.sort_values(
        by=['similarity_score', 'popularity'], ascending=[False, False]
    ).head(top_n)

    input_song_info = data.loc[index, ['track_name', 'artist_name', 'popularity', 'cluster'] + feature_columns].to_dict()

    return input_song_info, recommendations[['track_name', 'artist_name', 'popularity', 'similarity_score', 'cluster']]

music_data, feature_columns = preprocess_data(music_data)

kmeans_model, music_data = apply_kmeans(music_data, feature_columns, n_clusters=10)

song_name = 'shape of you'
input_song_info, recommendations = recommend_songs_combined(song_name, music_data, feature_columns, kmeans_model)

print("Input Song Information:")
print(pd.DataFrame([input_song_info]))


print("\nRecommended Songs:")
recommendations

Input Song Information:
     track_name    artist_name  popularity  cluster  danceability  energy  \
0  Shape of You  Peter Gergely    0.096774        1      0.550403   0.421   

   loudness  speechiness  acousticness  instrumentalness  liveness   valence  \
0  0.796067     0.033953      0.651606             0.148    0.0944  0.771772   

      tempo  duration_ms  
0  0.381954     0.035867  

Recommended Songs:


Unnamed: 0,track_name,artist_name,popularity,similarity_score,cluster
894231,Mark My Word,Marcia Griffiths,0.107527,0.995364,1
726621,"Baby, Baby Don't Cry - Album Version / Stereo",Smokey Robinson & The Miracles,0.086022,0.994687,1
1020278,Brand New,Emilie Mover,0.129032,0.994664,1
785060,Hey Mr Policeman,Family,0.086022,0.99457,1
970810,Turfistica,Trio Hugo Diaz,0.053763,0.994308,1
