In [1]:
import csv
import pandas as pd

In [4]:
import pandas as pd

# Try to read the CSV file, handling potential errors
try:
    df = pd.read_csv("/content/spotify_millsongdata.csv")
except pd.errors.ParserError as e:
    # If a ParserError occurs, print the error and problematic line
    print(f"Error: {e}")
    print(f"Problematic line (around line 39906):")

    # Open the file and print the line around 39906
    with open("/content/spotify_millsongdata.csv", 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 39903 and i <= 39909:  # Print lines around 39906
                print(f"Line {i+1}: {line.strip()}")

    # Suggest potential solutions
    print("\nPotential solutions:")
    print("1. Check the file for unclosed quotes around line 39906 and fix them manually.")
    print("2. Try using the 'error_bad_lines=False' argument in pd.read_csv to skip problematic lines.")
    print("   However, this might result in data loss.")
    print("3. Experiment with different encodings using the 'encoding' argument in pd.read_csv, such as 'latin-1' or 'iso-8859-1'.")

In [5]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [6]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [7]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [10]:
nltk.download('punkt_tab')
df['text'] = df['text'].apply(lambda x: tokenization(x))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [13]:
similarity[0]

array([1.        , 0.03218783, 0.14576972, ..., 0.0095121 , 0.03651197,
       0.00699855])

In [14]:
df[df['song'] == 'Crying Over You']

Unnamed: 0,artist,song,text


In [15]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

In [17]:
def recommendation(song_df):
    # Check if the song exists in the DataFrame
    if song_df not in df['song'].values:
        print(f"Song '{song_df}' not found in the dataset.")
        return []  # Return an empty list if the song is not found

    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x: x[1])

    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)

    return songs

In [19]:
recommendation('Bang')

['Solsbury Hill',
 'Afterparty',
 'Cars Hiss By My Window',
 'Alice In Wonderland',
 'I Saw Her Standing',
 "It's So Cool",
 'Walk Away',
 'Who Will Love Me Now',
 'You',
 'Love',
 'A Lovely Way To Spend An Evening',
 "I've Returned",
 'Love To Love',
 'Crazy Love',
 'Baby Love',
 'All Love',
 "You're So Good To Me",
 'Love It, Love It',
 'Just One Way To Say I Love You',
 'Love Or Money']

In [20]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))