In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spotify_millsongdata.csv')

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.shape

(57650, 4)

In [5]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
df.drop('link', axis=1).reset_index(drop=True)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...


In [7]:
 df = df.sample(5000)

In [8]:
df.shape

(5000, 4)

## Text Cleaning and Processing

In [9]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [10]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [11]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [14]:
similarity[0]

array([1.        , 0.02015486, 0.03075532, ..., 0.03738152, 0.02070937,
       0.03751019])

In [15]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,link,text


In [16]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [17]:
import pickle
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(df,open('df.pkl','wb'))

In [18]:
pickle.load(open('df.pkl', 'rb'))

Unnamed: 0,artist,song,link,text
49400,Queen Adreena,Princess Carwash,/q/queen+adreena/princess+carwash_20607846.html,hang it up sailor your flag ship it burn and i...
57343,Yngwie Malmsteen,The Bogeyman,/y/yngwie+malmsteen/the+bogeyman_20531782.html,late at night i like to hide . i dwell insid t...
15925,Peter Tosh,Till Your Well Runs Dry,/p/peter+tosh/till+your+well+runs+dry_20809367...,you said you love me and then you left you bro...
17879,Rod Stewart,Lady Luck,/r/rod+stewart/lady+luck_10242430.html,ladi luck here i am on time proppin ' up the b...
6450,Freddie King,Same Old Blues,/f/freddie+king/same+old+blues_20825545.html,morn rain keep on fall like the tear that fall...
...,...,...,...,...
5850,Faces,Maybe I'm Amazed,/f/faces/maybe+im+amazed_20052423.html,mayb i 'm amaz at the way you love me all the ...
32098,Evanescence,The Other Side,/e/evanescence/the+other+side_20979987.html,"make me whole again , open your eye taunt by t..."
26487,Britney Spears,Little Me,/b/britney+spears/little+me_20897705.html,"just yesterday you were look up to me and , yo..."
871,Arrogant Worms,Me Like Hockey!,/a/arrogant+worms/me+like+hockey_20256048.html,me work hard 5 day a week sweep garbag from th...


In [19]:
import os
print("similarity.pkl:", os.path.getsize("similarity.pkl"), "bytes")
print("df.pkl:", os.path.getsize("df.pkl"), "bytes")

similarity.pkl: 200000164 bytes
df.pkl: 5861608 bytes


In [20]:
# Load your full similarity matrix
with open('similarity.pkl', 'rb') as f:  # or np.load('similarity.npy')
    similarity = pickle.load(f)

print("Shape of similarity matrix:", np.array(similarity).shape)


Shape of similarity matrix: (5000, 5000)


In [21]:
# Convert to NumPy array if not already
similarity = np.array(similarity)

# Find midpoint
mid = len(similarity) // 2

# Split into two parts
similarity_part1 = similarity[:mid, :]
similarity_part2 = similarity[mid:, :]


In [22]:
with open('similarity_part_1.pkl', 'wb') as f:
    pickle.dump(similarity_part1, f)

with open('similarity_part_2.pkl', 'wb') as f:
    pickle.dump(similarity_part2, f)

print("Similarity matrix successfully split into 2 parts!")


Similarity matrix successfully split into 2 parts!


In [23]:
with open('similarity_part_1.pkl', 'rb') as f:
    part1 = pickle.load(f)
print("Part 1 shape:", np.array(part1).shape)

with open('similarity_part_2.pkl', 'rb') as f:
    part2 = pickle.load(f)
print("Part 2 shape:", np.array(part2).shape)


Part 1 shape: (2500, 5000)
Part 2 shape: (2500, 5000)


In [24]:
with open("similarity.pkl", "wb") as f:
    pickle.dump(similarity, f)

with open("df.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ Saved: df.pkl and similarity.pkl")

✅ Saved: df.pkl and similarity.pkl
