In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import pickle


In [2]:
# Load the track table from the CSV file into `df`.
df = pd.read_csv('spotify_tracks_data.csv')

In [3]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,7FPbxfwWa2gzB9ePX19I5z,"Arijit Singh, Neeti Mohan",Boss,Har Kisi Ko,57,337046,False,0.553,0.705,3,-6.898,1,0.0308,0.478,0.0,0.278,0.324,95.007,4,"filmi, modern bollywood"
1,6pGhyosvAt0ke0XH3vfFAr,Jazzy B,Romeo,Dil Luteya,39,261760,False,0.812,0.786,2,-5.939,1,0.205,0.254,0.0,0.25,0.885,89.984,4,"classic bhangra, punjabi pop, bhangra"
2,6O1RAYwmOqkcoJQr7J9SFJ,"Manan Bhardwaj, Arijit Singh",Yaariyan 2,Oonchi Oonchi Deewarein,58,251109,False,0.263,0.616,5,-5.17,1,0.0398,0.533,0.0,0.086,0.298,78.353,4,"desi pop, filmi, modern bollywood"
3,6ZBPTNro2APVK8mDBc9L30,"Atif Aslam, Monali Thakur",Tu Mohabbat Hai (Valentine Special),"Tu Mohabbat Hai (From ""Tere Naal Love Ho Gaya"")",0,322757,False,0.637,0.669,10,-6.333,1,0.0361,0.355,6.2e-05,0.0963,0.523,118.967,4,"filmi, classic pakistani pop, sufi, modern bol..."
4,4KSHc0ATBNg29sFqUmcfNq,"Jatin-Lalit, Babul Supriyo, Mahalakshmi Iyer, ...",Fanaa,Chanda Chamke,46,228388,False,0.894,0.687,5,-8.651,1,0.119,0.366,0.0,0.111,0.645,113.992,4,"filmi, classic bollywood, afghan pop, modern b..."


In [4]:
# Discard rows with a missing title or artist, keep one row per
# (title, artist) pair, then renumber the surviving rows from 0.
df = (
    df.dropna(subset=['track_name', 'artists'])
      .drop_duplicates(subset=['track_name', 'artists'])
      .reset_index(drop=True)
)
df.head()


Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,7FPbxfwWa2gzB9ePX19I5z,"Arijit Singh, Neeti Mohan",Boss,Har Kisi Ko,57,337046,False,0.553,0.705,3,-6.898,1,0.0308,0.478,0.0,0.278,0.324,95.007,4,"filmi, modern bollywood"
1,6pGhyosvAt0ke0XH3vfFAr,Jazzy B,Romeo,Dil Luteya,39,261760,False,0.812,0.786,2,-5.939,1,0.205,0.254,0.0,0.25,0.885,89.984,4,"classic bhangra, punjabi pop, bhangra"
2,6O1RAYwmOqkcoJQr7J9SFJ,"Manan Bhardwaj, Arijit Singh",Yaariyan 2,Oonchi Oonchi Deewarein,58,251109,False,0.263,0.616,5,-5.17,1,0.0398,0.533,0.0,0.086,0.298,78.353,4,"desi pop, filmi, modern bollywood"
3,6ZBPTNro2APVK8mDBc9L30,"Atif Aslam, Monali Thakur",Tu Mohabbat Hai (Valentine Special),"Tu Mohabbat Hai (From ""Tere Naal Love Ho Gaya"")",0,322757,False,0.637,0.669,10,-6.333,1,0.0361,0.355,6.2e-05,0.0963,0.523,118.967,4,"filmi, classic pakistani pop, sufi, modern bol..."
4,4KSHc0ATBNg29sFqUmcfNq,"Jatin-Lalit, Babul Supriyo, Mahalakshmi Iyer, ...",Fanaa,Chanda Chamke,46,228388,False,0.894,0.687,5,-8.651,1,0.119,0.366,0.0,0.111,0.645,113.992,4,"filmi, classic bollywood, afghan pop, modern b..."


In [5]:
# Switch to the column names used by the later cells of this notebook.
df = df.rename(columns={'track_name': 'song_name', 'artists': 'singer'})


In [6]:
# Audio attributes that describe each track for the similarity computation.
feature_cols = [
    'danceability', 'energy', 'valence', 'acousticness',
    'instrumentalness', 'liveness', 'tempo',
]

# Separate feature table; gaps in a column are replaced by that column's mean.
df_features = (
    df.loc[:, feature_cols]
      .pipe(lambda frame: frame.fillna(frame.mean()))
)

df_features.head()


Unnamed: 0,danceability,energy,valence,acousticness,instrumentalness,liveness,tempo
0,0.553,0.705,0.324,0.478,0.0,0.278,95.007
1,0.812,0.786,0.885,0.254,0.0,0.25,89.984
2,0.263,0.616,0.298,0.533,0.0,0.086,78.353
3,0.637,0.669,0.523,0.355,6.2e-05,0.0963,118.967
4,0.894,0.687,0.645,0.366,0.0,0.111,113.992


In [7]:
# Fit the min-max scaler on the feature table, then rescale that same table.
scaler = MinMaxScaler().fit(df_features)
scaled_features = scaler.transform(df_features)

# Persist the fitted scaler to disk.
joblib.dump(scaler, "scaler.pkl")

scaled_features[:5]


array([[5.16706444e-01, 7.03968639e-01, 3.10162128e-01, 4.80798082e-01,
        0.00000000e+00, 2.71158245e-01, 3.14866556e-01],
       [8.25775656e-01, 7.85806660e-01, 9.04630709e-01, 2.55408102e-01,
        0.00000000e+00, 2.42046163e-01, 2.84357196e-01],
       [1.70644391e-01, 6.14047850e-01, 2.82610999e-01, 5.36139372e-01,
        0.00000000e+00, 7.15325431e-02, 2.13711294e-01],
       [6.16945107e-01, 6.67596185e-01, 5.21034227e-01, 3.57034834e-01,
        6.39260021e-05, 8.22416303e-02, 4.60397964e-01],
       [9.23627685e-01, 6.85782412e-01, 6.50312599e-01, 3.68103092e-01,
        0.00000000e+00, 9.75254731e-02, 4.30180153e-01]])

In [8]:
# Pairwise cosine similarity between all rows of `scaled_features`;
# entry [i, j] compares row i with row j.
similarity = cosine_similarity(scaled_features)
similarity.shape


(3598, 3598)

In [9]:
# Lookups between song titles and row positions in `df` (and therefore in
# `similarity`, whose rows follow the same order).
#
# NOTE: titles are NOT unique -- rows were de-duplicated on (title, artist),
# so the same title can appear with different singers. In `song_to_index` a
# repeated title resolves to its LAST row. That behaviour is kept unchanged
# for existing consumers of index_mapping.pkl.
song_to_index = {name: idx for idx, name in enumerate(df['song_name'])}
index_to_song = {idx: name for idx, name in enumerate(df['song_name'])}

# Unambiguous lookup keyed by (song_name, singer), the pair the rows were
# de-duplicated on. Added alongside the title-only mapping.
song_singer_to_index = {
    key: idx for idx, key in enumerate(zip(df['song_name'], df['singer']))
}

index_mapping = {
    "song_to_index": song_to_index,
    "index_to_song": index_to_song,
    "song_singer_to_index": song_singer_to_index,
}

with open("index_mapping.pkl", "wb") as f:
    pickle.dump(index_mapping, f)


In [10]:
# Write the similarity matrix and the cleaned table to disk.
np.save("similarity.npy", similarity)
df.to_csv("cleaned_dataset.csv", index=False)

# Report every artifact produced by the notebook up to this point.
for line in (
    "Artifacts saved:",
    "✔ similarity.npy",
    "✔ index_mapping.pkl",
    "✔ scaler.pkl",
    "✔ cleaned_dataset.csv",
):
    print(line)


Artifacts saved:
✔ similarity.npy
✔ index_mapping.pkl
✔ scaler.pkl
✔ cleaned_dataset.csv


In [11]:
def assign_mood(row):
    """Return a mood label for one track based on fixed audio-feature thresholds.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[...]`` lookups for the keys ``'valence'``,
        ``'energy'``, ``'tempo'`` and ``'acousticness'``.

    Returns
    -------
    str
        The first matching label, checked in this order: ``"Happy"``,
        ``"Sad"``, ``"Energetic"``, ``"Calm"``; ``"Surprise"`` when no rule
        matches.
    """
    valence, energy = row['valence'], row['energy']
    tempo, acousticness = row['tempo'], row['acousticness']

    # Rules are tried top to bottom; the first hit wins.
    if valence > 0.65 and energy > 0.6:
        return "Happy"
    if valence < 0.35 and acousticness > 0.5:
        return "Sad"
    if energy > 0.75 or tempo > 120:
        return "Energetic"
    if energy < 0.4 and acousticness > 0.4:
        return "Calm"
    return "Surprise"


In [17]:
# Label every row of `df` by applying `assign_mood` row-wise.
df['mood'] = df.apply(assign_mood, axis=1)


In [18]:
# Count how many tracks received each mood label.
df['mood'].value_counts()


mood
Energetic    1064
Surprise     1054
Happy         933
Sad           402
Calm          145
Name: count, dtype: int64

In [12]:
# NOTE(review): this cell repeats the mood assignment and the label count that
# the two preceding cells already perform; it recomputes the same column.
df['mood'] = df.apply(assign_mood, axis=1)
df['mood'].value_counts()


mood
Energetic    1064
Surprise     1054
Happy         933
Sad           402
Calm          145
Name: count, dtype: int64

In [13]:
# Rewrite cleaned_dataset.csv so the file on disk includes the `mood` column.
df.to_csv("cleaned_dataset.csv", index=False)
print("Updated dataset saved with mood labels.")


Updated dataset saved with mood labels.


In [14]:
def mood_based_recommend(mood, top_n=10, random_state=None):
    """Recommend songs of a given mood, ranked by similarity to a random seed.

    A seed track is drawn at random from the rows of the global ``df`` whose
    ``mood`` matches ``mood`` (case-insensitive). All other rows with that
    mood are then ordered by their score in the seed's row of the global
    ``similarity`` matrix, highest first.

    The seed is identified by its row position, not by its title. Titles are
    not unique in ``df``, so a title lookup could land on a different row
    (possibly of another mood) and leave the real seed in the results.

    Parameters
    ----------
    mood : str
        Mood label to match, e.g. ``"calm"``.
    top_n : int, default 10
        Maximum number of rows to return.
    random_state : optional
        Passed to pandas ``sample`` when drawing the seed. Leave as ``None``
        for the previous (unseeded) behaviour; pass an int for a
        reproducible result.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with the columns ``song_name``, ``singer``,
        ``valence``, ``energy`` and ``mood``. The seed itself is excluded.
        The frame is empty (with those columns) when no row has the
        requested mood.

    Notes
    -----
    Relies on row ``i`` of ``similarity`` describing row ``i`` of ``df``.
    """
    columns = ['song_name', 'singer', 'valence', 'energy', 'mood']
    wanted = mood.lower()

    # Boolean mask and row positions of every track with the requested mood.
    mood_mask = (df['mood'].str.lower() == wanted).to_numpy(dtype=bool)
    mood_positions = np.flatnonzero(mood_mask)

    if mood_positions.size == 0:
        return pd.DataFrame(columns=columns)

    # Draw the seed's row position directly so it always points at the sampled row.
    seed_pos = int(
        pd.Series(mood_positions).sample(1, random_state=random_state).iloc[0]
    )

    # Rank all rows by feature similarity to the seed, most similar first.
    ranked = np.argsort(similarity[seed_pos])[::-1]

    # Keep only same-mood rows and drop the seed itself.
    keep = mood_mask[ranked] & (ranked != seed_pos)

    return df.iloc[ranked[keep]].head(top_n)[columns]


In [16]:
# Demo: request ten recommendations for the "calm" mood. The printed label
# names the mood that is actually requested below.
print("Calm mood recommendations:")
mood_based_recommend("calm", 10)


Happy mood recommendations:


Unnamed: 0,song_name,singer,valence,energy,mood
1072,Chura Liya Hai Tum Ne,Mohammed Rafi,0.706,0.301,Calm
3131,"Chura Liya Hai Tumne Jo Dil Ko - From ""Yaadon ...","Asha Bhosle, Mohammed Rafi",0.684,0.318,Calm
3091,"Chura Liya Hai Tumne Jo Dil Ko (From ""Yaadon K...","Asha Bhosle, Mohammed Rafi",0.684,0.318,Calm
3024,Chura Liya Hai Tumne Jo Dil Ko,"Asha Bhosle, Mohammed Rafi, R. D. Burman",0.684,0.318,Calm
2573,Chura Liya Hai Tumne,"Asha Bhosle, Kishore Kumar, R. D. Burman, Mohd...",0.692,0.371,Calm
1463,Chal Dariya Mein Doob Jayen,"Laxmikant–Pyarelal, Kishore Kumar, Lata Manges...",0.683,0.377,Calm
1939,"Teri Bindiya Re - From ""Abhimaan""","Mohammed Rafi, Lata Mangeshkar",0.683,0.378,Calm
2127,Teri Bindiya Re,"Lata Mangeshkar, Mohammed Rafi, S. D. Burman",0.681,0.378,Calm
2614,Chura Liya Hai Tumne,R. D. Burman,0.684,0.368,Calm
1541,Aaj Mausam Bada Beimaan Hai,Mohammed Rafi,0.604,0.382,Calm


In [19]:
# NOTE(review): writes the same file as the earlier save cells; no cell in
# between appears to modify `df`, so this save looks redundant -- confirm.
df.to_csv("cleaned_dataset.csv", index=False)
print("Dataset saved with mood column.")


Dataset saved with mood column.
