In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('/content/archive (6).zip')

In [3]:
df.shape

(57650, 4)

In [4]:
df.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/z/zwan/heartsong_20148991.html,I've got sunshine on a cloudy day \nWhen it's...
freq,191,35,1,6


In [5]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [6]:
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [9]:
df.shape

(5000, 3)

In [10]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [12]:
df['text'][0]

"you need meat,   go to the market.   you need bread,   try the bakery.   you need lovin',   look here, baby.   i got just what you need.   i gotta love somebody.   i gotta love somebody.   i gotta love somebody.   somebody gonna love me.   you need money,   go to the bank, dear.   you need honey,   look to the bees.   you need lovin',   come on here, woman.   well, i got just what you need.   i wanna love somebody.   i gotta love somebody.   i gotta love somebody.   somebody gonna love me.   oh, play it now.   you need meat,   go to the market.   you need bread,   try the bakery.   if you need lovin',   come on here, baby.   well, i got just what you need.   i wanna love somebody.   yeah, love somebody.   i gotta love somebody.   somebody got to love me.   play your guitar.   you need money,   go to the bank, dear.   if you need honey,   go find a bee.   if need lovin',   well i'm your doctor.   i've got people sick   on what you need.   i gotta love somebody.   whoa, love.   i wanna 

In [13]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [14]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [15]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [16]:
similarity[0]

array([1.        , 0.01012974, 0.05233525, ..., 0.05265143, 0.02375014,
       0.04188861])

In [26]:
# After your df is loaded and 'song' column exists
print(f"Raw string from df.loc[2, 'song']: {repr(df.loc[2, 'song'])}")

Raw string from df.loc[2, 'song']: 'Tough Lover'


In [28]:
# df[df['song']==''].index[0]
# --- Option 1: Direct Match (if you're sure of exact case and no hidden chars) ---
# This will work if 'As Good As New' is *exactly* as it appears in the DataFrame
target_song_title = 'Tough Lover'
if target_song_title in df['song'].values:
    # 'df['song']' refers to the DataFrame column
    # 'target_song_title' refers to the string variable holding "As Good As New"
    song_index = df[df['song'] == target_song_title].index[0]
    print(f"Index of '{target_song_title}': {song_index}")
else:
    print(f"'{target_song_title}' not found in the DataFrame (exact match).")

Index of 'Tough Lover': 2


In [34]:
def clean_song_title(title):
  """Cleans and normalizes a song title."""
  # You can add more cleaning steps here if needed,
  # e.g., removing punctuation, extra spaces, etc.
  return title.strip()

def recommendation(song_title_input): # Renamed parameter for clarity
    # Clean and normalize the input song title
    cleaned_input_title = clean_song_title(song_title_input).lower()

    # Find the index using the cleaned column
    if cleaned_input_title not in df['song'].str.lower().values: # Check against lowercased song titles
        print(f"Error: Song '{song_title_input}' not found in the dataset.")
        return [] # Return an empty list or raise a more specific error

    # Find the index based on the original case for potential later use,
    # but the similarity is based on the text column, which is lowercased and stemmed
    idx = df[df['song'].str.lower() == cleaned_input_title].index[0]


    # Calculate distances based on the similarity matrix
    # Ensure 'similarity' matrix is already computed globally (as seen in your screenshot)
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x:x[1])

    songs = []
    # Loop from 1 to 21 to get 20 recommendations (excluding the song itself)
    for m_id in distances[1:21]:
        # Use .iloc for integer-location based indexing
        songs.append(df.iloc[m_id[0]].song) # Access the original 'song' column for output

    return songs

# --- AFTER defining the function AND verifying 'Alma Mater' exists (or picking a new song) ---

# Example call if 'Alma Mater' was found:
# recommended_songs = recommendation('Alma Mater')
# print(recommended_songs)

# Example call if 'Alma Mater' was NOT found, but 'Tough Lover' IS found:
recommended_songs = recommendation('Tough Lover')
print(f"Recommendations for 'Tough Lover':\n{recommended_songs}")

Recommendations for 'Tough Lover':
["She's Too Tough", 'Lovers', 'Yeah Yeah Yeah Yeah', 'Yeah,yeah,yeah,yeah,yeah', 'Lover For Life', 'It All Comes Down To Love', "She's Too Tough", 'Friends And Lovers', "You're On Your Own", 'Mr Loverboy', 'To Be A Star', 'How You Remind Me', 'Just A Fool', 'Daybreak', 'All That Glitters', "Ladies' Night", "It's Too Bad Things Are Going So Tough", 'School Teacher', 'In My Tree', 'Freakin It']


In [36]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))