In [5]:
import pandas as pd
import numpy as np

# Define the error handling behavior
error_bad_lines = False  # Skip rows with errors instead of raising an error
warn_bad_lines = True  # Print a warning for rows with errors

try:
  df = pd.read_csv(
      "spotify_millsongdata.csv",
      on_bad_lines='skip',  # or 'warn' or a custom function
      engine='python',  # or 'c'
      encoding='utf-8',  # or 'latin-1' or other encoding
      quotechar='"',  # or other quote character
      escapechar='\\',  # or other escape character
  )
  df.head(5)
except pd.errors.ParserError as e:
  print(f"Error parsing CSV: {e}")
  print("Inspecting the file around line 6651 for potential errors...")
  # Add logic here to inspect the file around line 6651
  # You can open the file and print lines around 6651 to see the issue
  with open("spotify_millsongdata.csv", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
      if i >= 6646 and i <= 6656:
        print(line.strip())

In [6]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [7]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [8]:
df.shape


(57650, 4)

In [9]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [10]:
# Try dropping the 'link' column only if it exists
if 'link' in df.columns:
  df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)
else:
  print("Column 'link' not found in DataFrame. Skipping drop operation.")
  df = df.sample(5000).reset_index(drop=True) # Sample and reset index without dropping 'link'
df.head(10)

Unnamed: 0,artist,song,text
0,Judas Priest,Thunder Road,"Red light, green light \r\nI'm coming home to..."
1,Cheap Trick,Let Go,"I took your photographs, I threw them all away..."
2,Ray Boltz,Take Up Your Cross,Take Up Your Cross \r\nWords by Steve Millika...
3,Mazzy Star,She Hangs Brightly,She hangs brightly from the tree \r\nWonders ...
4,Howard Jones,What Is Love?,I love you whether or not you love me \r\nI l...
5,Fiona Apple,Pale September,Pale September \r\nI wore the time like a dre...
6,Vybz Kartel,Badda Dan Dem,Me nuh care where yuh bad from \r\nOr the end...
7,Incognito,Got To Know,(Jean-Paul Maunick) \r\nWe've got so little t...
8,Jimi Hendrix,Somewhere,"Oh uh \r\nI see fingers, hands and shades of ..."
9,Alice Cooper,Millie And Billie,Billie I wonder why are we insane \r\nWill we...


In [11]:

df['text'][0]


"Red light, green light  \r\nI'm coming home tonight  \r\nBurning the freeway  \r\nOut of control  \r\n  \r\nRed light, dead lines  \r\nWe streak from town to town  \r\nI's too much, I need your touch  \r\nI've been away too long  \r\n  \r\nOut again upon the thunder road  \r\nDriving back to you where I belong  \r\nI've had enough  \r\nDreams can wait  \r\nI'm coming home  \r\n  \r\nSpotlights, wild nights  \r\nI know it wrong from right  \r\nPlaces, the faces  \r\nAll look the same  \r\n  \r\nHot wired, so tired  \r\nLiving from song to song  \r\nThe madness, the badness  \r\nIt's just a game  \r\n  \r\nWatch the sunrise  \r\nFrom coast to coast  \r\nThat's when I need your loving  \r\nOh! That's when I need it most  \r\nCan't explain, it's something in the blood  \r\nWouldn't change it even if I could\r\n\r\n"

In [12]:
df.shape


(5000, 3)

In [13]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [14]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [15]:
import nltk
nltk.download('punkt_tab') # Download the 'punkt_tab' resource for tokenization
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)


In [18]:
similarity[0]

array([1.00000000e+00, 6.52874054e-03, 3.56961215e-02, ...,
       2.57433104e-03, 1.47510726e-02, 7.82570247e-04])

In [19]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text


In [20]:
def recommendation(song_df):
    # Check if the song exists in the DataFrame
    if df[df['song'] == song_df].empty:
        print(f"Song '{song_df}' not found in the dataset.")
        return []  # Return an empty list if song not found

    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

In [21]:
recommendation('Clear As The Driven Snow')

Song 'Clear As The Driven Snow' not found in the dataset.


[]

In [22]:

import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))
