In [47]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('spotify_millsongdata.csv')

In [6]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [7]:
df.shape

(57650, 4)

In [8]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [9]:
df.drop('link', axis=1).reset_index(drop=True)

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...


In [11]:
 df = df.sample(5000)

In [12]:
df.shape

(5000, 4)

## Text Cleaning and Processing

In [15]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [16]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [17]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [21]:
similarity[0]

array([1.        , 0.0193098 , 0.01134571, ..., 0.        , 0.02590268,
       0.00121626])

In [22]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,link,text
19891,UB40,Crying Over You,/u/ub40/crying+over+you_20141696.html,cri over you in the morn cri over you in the e...


In [32]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [36]:
import pickle
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(df,open('df.pkl','wb'))

In [46]:
pickle.load(open('df.pkl', 'rb'))

Unnamed: 0,artist,song,link,text
54986,Venom,Sacrifice,/v/venom/sacrifice_20175433.html,"hood figur , cloud sky fire burn within their ..."
46372,Oasis,Bring It On Down,/o/oasis/bring+it+on+down_20102324.html,what wa that sound ring around your brain ? to...
46361,Oasis,All Around The World,/o/oasis/all+around+the+world_20102312.html,it 's a bit earli in the midnight hour for me ...
42809,Marianne Faithfull,Once I Had A Sweetheart,/m/marianne+faithfull/once+i+had+a+sweetheart_...,onc i had a sweetheart and now i have none onc...
6068,Fleetwood Mac,Although The Sun Is Shining,/f/fleetwood+mac/although+the+sun+is+shining_2...,although the sun is shine high abov there 's o...
...,...,...,...,...
31709,Erasure,Can't Help Falling In Love,/e/erasure/cant+help+falling+in+love_20503533....,wise men say onli fool rush in but i ca n't he...
53185,Tim McGraw,For A Little While,/t/tim+mcgraw/for+a+little+while_20137346.html,"hot sun , dancin ' on the river we 'd sit on t..."
9253,Jim Croce,Big Fat Woman,/j/jim+croce/big+fat+woman_20153345.html,you big fat woman get your fat leg off of me y...
40848,Kris Kristofferson,Darby's Castle,/k/kris+kristofferson/darbys+castle_20080516.html,"see the ruin on the hill , where the smoke is ..."


In [39]:
import os
print("similarity.pkl:", os.path.getsize("similarity.pkl"), "bytes")
print("df.pkl:", os.path.getsize("df.pkl"), "bytes")

similarity.pkl: 200000164 bytes
df.pkl: 5792477 bytes


In [48]:
# Load your full similarity matrix
with open('similarity.pkl', 'rb') as f:  # or np.load('similarity.npy')
    similarity = pickle.load(f)

print("Shape of similarity matrix:", np.array(similarity).shape)


Shape of similarity matrix: (5000, 5000)


In [49]:
# Convert to NumPy array if not already
similarity = np.array(similarity)

# Find midpoint
mid = len(similarity) // 2

# Split into two parts
similarity_part1 = similarity[:mid, :]
similarity_part2 = similarity[mid:, :]


In [50]:
with open('similarity_part_1.pkl', 'wb') as f:
    pickle.dump(similarity_part1, f)

with open('similarity_part_2.pkl', 'wb') as f:
    pickle.dump(similarity_part2, f)

print("Similarity matrix successfully split into 2 parts!")


Similarity matrix successfully split into 2 parts!


In [51]:
with open('similarity_part_1.pkl', 'rb') as f:
    part1 = pickle.load(f)
print("Part 1 shape:", np.array(part1).shape)

with open('similarity_part_2.pkl', 'rb') as f:
    part2 = pickle.load(f)
print("Part 2 shape:", np.array(part2).shape)


Part 1 shape: (2500, 5000)
Part 2 shape: (2500, 5000)


In [40]:
with open("similarity.pkl", "wb") as f:
    pickle.dump(similarity, f)

with open("df.pkl", "wb") as f:
    pickle.dump(df, f)

print("✅ Saved: df.pkl and similarity.pkl")

✅ Saved: df.pkl and similarity.pkl
