# **`1. Data Preparation`**

### ***Import Libraries***

In [3]:
import pandas as pd
import numpy as np

## ***Read and Inspect data***

In [4]:
data=pd.read_csv('/content/drive/MyDrive/Song recommendation system/songs_2000_2020_50k.csv')

In [5]:
data.head()

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34


In [6]:
data.shape

(50000, 7)

In [7]:
data.isnull().sum()

Unnamed: 0,0
Title,0
Artist,0
Album,0
Genre,0
Release Date,0
Duration,0
Popularity,0


### ***Reforming Data***

In [8]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Title,Artist,Album,Genre,Release Date,Duration,Popularity
0,Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71
1,Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37
2,Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58
3,Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59
4,Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34


In [9]:
newDF = df.drop(['Album', 'Duration'], axis=1)
newDF.head()

Unnamed: 0,Title,Artist,Genre,Release Date,Popularity
0,Include name this.,Patrick Anderson,R&B,2008-01-09,71
1,Manage west energy.,Eric Miller,Jazz,2011-08-20,37
2,Evening court painting.,Richard Curry,Electronic,2010-05-30,58
3,Section turn hour.,James Smith,Hip-Hop,2014-10-12,59
4,Five agreement teach.,Amy Rodriguez,Blues,2005-06-09,34


In [10]:
newDF['Release Date'] = pd.to_datetime(newDF['Release Date'])
newDF['Release Year'] = newDF['Release Date'].dt.year
newDF.head()

Unnamed: 0,Title,Artist,Genre,Release Date,Popularity,Release Year
0,Include name this.,Patrick Anderson,R&B,2008-01-09,71,2008
1,Manage west energy.,Eric Miller,Jazz,2011-08-20,37,2011
2,Evening court painting.,Richard Curry,Electronic,2010-05-30,58,2010
3,Section turn hour.,James Smith,Hip-Hop,2014-10-12,59,2014
4,Five agreement teach.,Amy Rodriguez,Blues,2005-06-09,34,2005


In [11]:
newDF.isnull().sum()

Unnamed: 0,0
Title,0
Artist,0
Genre,0
Release Date,0
Popularity,0
Release Year,0


In [12]:
newDF.drop('Release Date', axis=1, inplace=True)

In [13]:
newDF.head()

Unnamed: 0,Title,Artist,Genre,Popularity,Release Year
0,Include name this.,Patrick Anderson,R&B,71,2008
1,Manage west energy.,Eric Miller,Jazz,37,2011
2,Evening court painting.,Richard Curry,Electronic,58,2010
3,Section turn hour.,James Smith,Hip-Hop,59,2014
4,Five agreement teach.,Amy Rodriguez,Blues,34,2005


# **`2. Feature Extraction`**

### ***Import Libraries***

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [15]:
# Title and Artist TF-IDF vector
title_artist_vectorizer = TfidfVectorizer()
title_artist_tfidf = title_artist_vectorizer.fit_transform(newDF['Title'] + ' ' + newDF['Artist'])

In [16]:
# Genre MultiLabelBinarizer
genre_vectorizer = MultiLabelBinarizer()
genre_binary = genre_vectorizer.fit_transform(newDF['Genre'].apply(lambda x: x.split(',')))

In [17]:
# Release year Scalar
scaler = StandardScaler()
release_year_scaled = scaler.fit_transform(newDF[['Release Year']])

In [18]:
# Popularity Scalar
popularity_scaled = scaler.fit_transform(newDF[['Popularity']])

# **`3. Feature Engineering`**

### ***Import Library***

In [19]:
import scipy.sparse as sp

In [20]:
# Combine features into a sparse matrix
combined_features = sp.hstack([title_artist_tfidf,
                              sp.csr_matrix(genre_binary),
                              sp.csr_matrix(release_year_scaled),
                              sp.csr_matrix(popularity_scaled)])

In [21]:
combined_features

<50000x2548 sparse matrix of type '<class 'numpy.float64'>'
	with 377357 stored elements in Compressed Sparse Row format>

# **`4. Recommendation Engine`**

### ***Import Library***

In [22]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m43.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552448 sha256=2abdde11d7e3233a3a89eff810a7044cd3413e22a85a1988ee2dc5ca79af7f43
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [23]:
from annoy import AnnoyIndex

In [24]:
# Define number of features
num_features = combined_features.shape[1]

In [25]:
# Create Annoy Index
annoy_index = AnnoyIndex(num_features, 'angular')

In [26]:
# Add item to Annoy Index
for i in range(combined_features.shape[0]):
    annoy_index.add_item(i, combined_features[i].toarray()[0])

In [27]:
# Build the index with a specified number of trees
annoy_index.build(10)

True

In [28]:
# Function to get Song recommendation using Annoy
def get_song_recommendations(song_index, annoy_index,newDF,top_n=5):
    recommended_song_indices = annoy_index.get_nns_by_item(song_index, top_n+1)
    return newDF.iloc[recommended_song_indices]

In [29]:
# Giving choice to search through Title or Artist
type_of_search = input("Search by Title or Artist? (Title/Artist): ")
if type_of_search.lower() == "title":
    song_title = input("Enter the song title: ")
    song_index = newDF[newDF['Title'].str.lower() == song_title.lower()].index[0]
elif type_of_search.lower() == "artist":
    artist_name = input("Enter the artist name: ")
    song_index = newDF[newDF['Artist'].str.lower() == artist_name.lower()].index[0]
else:
    print("Invalid choice. Please enter 'Title' or 'Artist'.")

Search by Title or Artist? (Title/Artist): artist
Enter the artist name: James Smith


In [30]:
# Get top 5 song recommendations
recommendations = get_song_recommendations(song_index, annoy_index, newDF)
print ("Song Recommendations:\n",recommendations)

Song Recommendations:
                     Title          Artist    Genre  Popularity  Release Year
3      Section turn hour.     James Smith  Hip-Hop          59          2014
34258               View.     James Smith  Hip-Hop          70          2012
6719       Hour him very.     Peter Weiss  Hip-Hop          58          2019
19842               Upon.  Jonathan Smith  Hip-Hop          63          2015
15450                Its.   James Johnson  Hip-Hop          61          2014
20019      Above quickly.   Gregory Smith  Hip-Hop          68          2018


# **`5. Save the model`**

### ***Import Libraries***

In [32]:
import pickle

In [33]:
annoy_index.save('annoy_index.ann')
newDF.to_pickle('newDF.pkl')