In [None]:
!pip install spotipy




In [None]:
import pandas as pd
import kagglehub
import nltk
from nltk.stem.porter import PorterStemmer
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
# Download dataset
dataset_path = kagglehub.dataset_download("notshrirang/spotify-million-song-dataset")

# Read CSV file
csv_file_path = f"{dataset_path}/spotify_millsongdata.csv"
df = pd.read_csv(csv_file_path)

# Display first few rows
print(df.head())


  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [55]:
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [56]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [57]:
df['text'][0]

"Trust to your instincts  \r\nIf it's safely restrained  \r\nLightning reactions  \r\nMust be carefully trained  \r\n  \r\nHeat of the moment  \r\nCurse of the young  \r\nSpit out your anger  \r\nDon't swallow your tongue  \r\n  \r\nStick it out  \r\n  \r\nDon't swallow the poison  \r\nSpit it out  \r\nDon't swallow your pride  \r\nStick it out  \r\nDon't swallow your anger  \r\nSpit it out  \r\nDon't swallow the lies  \r\n  \r\nNatural reflex  \r\nPendulum swing  \r\nYou might be too dizzy  \r\nTo do the right thing  \r\n  \r\nTrial under fire  \r\nUltimate proof  \r\nMoment of crisis  \r\nDon't swallow the truth  \r\n  \r\nStick it out  \r\n  \r\nEach time we bathe our reactions  \r\nIn artificial light  \r\nEach time we alter the focus  \r\nTo make the wrong moves seem right  \r\n  \r\nYou get so used to deception  \r\nYou make yourself a nervous wreck  \r\nYou get so used to surrender  \r\nRunning back to cover your neck  \r\n  \r\nStick it out\r\n\r\n"

In [58]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [59]:
def fix_nltk_for_colab():
    nltk_data_dir = '/content/nltk_data'
    os.makedirs(nltk_data_dir, exist_ok=True)
    nltk.data.path.insert(0, nltk_data_dir)

    print(f"NLTK will look for data in: {nltk.data.path}")

    # Clean install of punkt
    print("\nStep 2: Fresh download of punkt resource...")
    nltk.download('punkt', download_dir=nltk_data_dir)

    # Try to handle punkt_tab issue
    print("\nStep 3: Handling punkt_tab dependency...")
    try:
        nltk.download('punkt_tab', download_dir=nltk_data_dir)
        print("Successfully downloaded punkt_tab!")
    except:
        print("Direct download of punkt_tab failed (expected).")

    print("\nStep 4: Verifying punkt installation...")
    try:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        print("Successfully loaded punkt tokenizer!")
        test = nltk.word_tokenize("Test sentence for Colab.")
        print(f"Tokenization test successful: {test}")
        return True
    except Exception as e:
        print(f"Tokenizer verification failed: {e}")
        return False


In [60]:
# Create stemmer
stemmer = PorterStemmer()

In [61]:
# Fix NLTK in Colab
success = fix_nltk_for_colab()

NLTK will look for data in: ['/content/nltk_data', '/content/nltk_data', '/content/nltk_data', '/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']

Step 2: Fresh download of punkt resource...

Step 3: Handling punkt_tab dependency...
Successfully downloaded punkt_tab!

Step 4: Verifying punkt installation...


[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /content/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Successfully loaded punkt tokenizer!
Tokenization test successful: ['Test', 'sentence', 'for', 'Colab', '.']


In [62]:
def tokenization(txt):
    if not isinstance(txt, str):
        return ""

    try:
        tokens = nltk.word_tokenize(txt)
        stemming = [stemmer.stem(w) for w in tokens]
        return " ".join(stemming)
    except Exception as e:
        print(f"Tokenization error: {e}")
        # Fallback to basic tokenization if nltk fails
        tokens = txt.lower().split()
        stemming = [stemmer.stem(w) for w in tokens]
        return " ".join(stemming)


In [64]:
if success:
    sample_text = "Testing the tokenization function with multiple words in Google Colab."
    result = tokenization(sample_text)
    print(f"\nOriginal: {sample_text}")
    print(f"Tokenized and stemmed: {result}")

    print("\nYou can now use this function with your dataframe:")
    print("df['text'] = df['text'].apply(lambda x: tokenization(x))")
else:
    print("\nCould not fully fix NLTK resources. The function includes a fallback method.")
    print("df['text'] = df['text'].apply(lambda x: tokenization(x))")


Original: Testing the tokenization function with multiple words in Google Colab.
Tokenized and stemmed: test the token function with multipl word in googl colab .

You can now use this function with your dataframe:
df['text'] = df['text'].apply(lambda x: tokenization(x))


In [63]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [65]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)


In [66]:
similarity[0]

array([1.        , 0.00803735, 0.00936442, ..., 0.        , 0.00297763,
       0.05252255])

In [67]:
df.head()

Unnamed: 0,artist,song,text
0,Rush,Stick It Out,trust to your instinct if it 's safe restrain ...
1,The Jam,A Solid Bond In Your Heart,feel is a word i ca n't explain at least not i...
2,Adam Sandler,The Taliban Can,"hey , alright , gather round , the taliban is ..."
3,David Bowie,I Keep Forgetting,[ choru : x2 ] i keep forget you do n't love m...
4,Kelly Clarkson,How I Feel,look like i made a mess again heartbreak every...


In [68]:
df[df['song'] == 'I Had A Dream']


Unnamed: 0,artist,song,text
2263,John Prine,I Had A Dream,i had a dream last night last night i had a dr...


In [69]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

In [70]:
recommendation('I Had A Dream')

['Good Night',
 'The Dream Is Over',
 'Dream Of Mirrors',
 'Dream',
 'Dreams',
 'Dream',
 "I Can't Make My Dreams Understand",
 'See You In Your Dreams',
 "All Night's All Right",
 'Do You Still Dream?',
 'Goodnight',
 'Last Night I Dreamed Of Heaven',
 'Keep On Dreaming',
 'In The Still Of The Night',
 'If I Was Your Lover',
 'I Dreamed About Mama Last Night',
 'Own The Night',
 'Do You Dream Of Me',
 'The Night Is Yours, The Night Is Mine',
 'Turn On Your Light']

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))
