In [7]:
!pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re

In [9]:
playlistDF = pd.read_csv("data.csv")
print(playlistDF.columns)
playlistDF.head()

Index(['id', 'title', 'all_artists', 'popularity', 'release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature'],
      dtype='object')


Unnamed: 0,id,title,all_artists,popularity,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,429hKgsnnsIVn9ohgjj0II,Medicine,James Arthur,57,2021-11-05,0.496,0.732,10,-4.529,1,0.236,0.0,0.0905,0.325,143.981,208982,4
1,5nujrmhLynf4yMoMtj8AQF,Levitating (feat. DaBaby),DaBaby,84,2020-03-27,0.702,0.825,6,-3.787,0,0.00883,0.0,0.0674,0.915,102.977,203064,4
2,0ClPIeT6MSgfSgQ9ZrJbAq,Cheating on You,Charlie Puth,75,2019-10-01,0.749,0.535,4,-6.785,1,0.0131,3.6e-05,0.0982,0.601,123.042,196607,4
3,62tHYGbBOvOkpgmEXDlgu8,Who's In Your Head,Jonas Brothers,62,2021-10-29,0.704,0.778,5,-5.331,1,0.00447,0.0,0.0731,0.821,101.025,183537,4
4,6ZuahEctZD6w75peme58hm,Wonder,Shawn Mendes,71,2020-12-04,0.442,0.631,1,-5.096,0,0.136,2.3e-05,0.133,0.129,139.774,172693,4


TO DROP DUPLICATES

In [10]:
def drop_duplicates(df):
    '''
    Drop duplicate songs
    '''
    df['artists_song'] = df.apply(lambda row: row['all_artists']+row['title'],axis = 1)
    return df.drop_duplicates('artists_song')

songDF = drop_duplicates(playlistDF)
print("Are all songs unique: ",len(pd.unique(songDF.artists_song))==len(songDF))

Are all songs unique:  True


Dataframe with only useful columns

In [11]:
def select_cols(df):
       '''
       Select useful columns
       '''
       return df[['all_artists' ,'id', 'title', 'popularity', 'release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo']]
songDF = select_cols(songDF)
songDF.head()

Unnamed: 0,all_artists,id,title,popularity,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo
0,James Arthur,429hKgsnnsIVn9ohgjj0II,Medicine,57,2021-11-05,0.496,0.732,10,-4.529,1,0.236,0.0,0.0905,0.325,143.981
1,DaBaby,5nujrmhLynf4yMoMtj8AQF,Levitating (feat. DaBaby),84,2020-03-27,0.702,0.825,6,-3.787,0,0.00883,0.0,0.0674,0.915,102.977
2,Charlie Puth,0ClPIeT6MSgfSgQ9ZrJbAq,Cheating on You,75,2019-10-01,0.749,0.535,4,-6.785,1,0.0131,3.6e-05,0.0982,0.601,123.042
3,Jonas Brothers,62tHYGbBOvOkpgmEXDlgu8,Who's In Your Head,62,2021-10-29,0.704,0.778,5,-5.331,1,0.00447,0.0,0.0731,0.821,101.025
4,Shawn Mendes,6ZuahEctZD6w75peme58hm,Wonder,71,2020-12-04,0.442,0.631,1,-5.096,0,0.136,2.3e-05,0.133,0.129,139.774


Pipeline used for feature generation: Sentiment Analysis, One Hot Encoding, Normalization.

SENTIMENT ANALYSIS using subjectivity and polarity:

Subjectivity: The amount of personal opinion and factual information contained in the text. (0,1)

Polarity: The degree of strong or clearly defined sentiment accounting for negation. (-1,1)

In [12]:
def getSubjectivity(text):
  return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
  return TextBlob(text).sentiment.polarity

def getAnalysis(score, task="polarity"):
  if task == "subjectivity":
    if score < 1/3:
      return "low"
    elif score > 1/3:
      return "high"
    else:
      return "medium"
  else:
    if score < 0:
      return 'Negative'
    elif score == 0:
      return 'Neutral'
    else:
      return 'Positive'

def sentimentAnalysis(df, text_col):
  df['subjectivity'] = df[text_col].apply(getSubjectivity).apply(lambda x: getAnalysis(x,"subjectivity"))
  df['polarity'] = df[text_col].apply(getPolarity).apply(getAnalysis)
  return df

In [15]:
sentiment = sentimentAnalysis(songDF, "title")
sentiment.head()

Unnamed: 0,all_artists,id,title,popularity,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,subjectivity,polarity
0,James Arthur,429hKgsnnsIVn9ohgjj0II,Medicine,57,2021-11-05,0.496,0.732,10,-4.529,1,0.236,0.0,0.0905,0.325,143.981,low,Neutral
1,DaBaby,5nujrmhLynf4yMoMtj8AQF,Levitating (feat. DaBaby),84,2020-03-27,0.702,0.825,6,-3.787,0,0.00883,0.0,0.0674,0.915,102.977,low,Neutral
2,Charlie Puth,0ClPIeT6MSgfSgQ9ZrJbAq,Cheating on You,75,2019-10-01,0.749,0.535,4,-6.785,1,0.0131,3.6e-05,0.0982,0.601,123.042,low,Neutral
3,Jonas Brothers,62tHYGbBOvOkpgmEXDlgu8,Who's In Your Head,62,2021-10-29,0.704,0.778,5,-5.331,1,0.00447,0.0,0.0731,0.821,101.025,low,Neutral
4,Shawn Mendes,6ZuahEctZD6w75peme58hm,Wonder,71,2020-12-04,0.442,0.631,1,-5.096,0,0.136,2.3e-05,0.133,0.129,139.774,low,Neutral


In [16]:
sentiment['subjectivity'].value_counts()

low       917
high      194
medium      6
Name: subjectivity, dtype: int64

In [22]:
sentiment['polarity'].value_counts()

Neutral     895
Positive    146
Negative     76
Name: polarity, dtype: int64

ONE HOT ENCODING

In [20]:
def ohe(df, column, new_name):
  ohe_df = pd.get_dummies(df[column])
  feature_names = ohe_df.columns
  ohe_df.columns = [new_name + "|" + str(i) for i in feature_names]
  ohe_df.reset_index(drop = True, inplace = True)    
  return ohe_df

One hot encoding for subjectivity

There are only 6 samples with medium so we can take them to be the same class as high

In [21]:
subject_ohe = ohe(sentiment, 'subjectivity','subject')
subject_ohe.iloc[0]

subject|high      0
subject|low       1
subject|medium    0
Name: 0, dtype: uint8

NORMALIZATION

Popularity is not between 0 and 1 so it will cause problems while we're finding cosine similarity.

Audio features are also not normalized.

In [24]:
print(songDF['popularity'].describe())
pop = songDF[['popularity']].reset_index(drop = True)
scaler = MinMaxScaler()
pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns)
pop_scaled.head()

count    1117.000000
mean       49.017905
std        21.889336
min         0.000000
25%        40.000000
50%        53.000000
75%        64.000000
max        90.000000
Name: popularity, dtype: float64


Unnamed: 0,popularity
0,0.633333
1,0.933333
2,0.833333
3,0.688889
4,0.788889


Now, we get all these features into one single dataframe

In [64]:
def create_feature_set(df, float_cols): #float_cols contains the list of columns that will be scaled

  df = sentimentAnalysis(df, "title")

  #one-hot-encoding
  subject_ohe = ohe(df, 'subjectivity','subject') * 0.3
  polar_ohe = ohe(df, 'polarity','polar') * 0.5
  key_ohe = ohe(df, 'key','key') * 0.5
  mode_ohe = ohe(df, 'mode','mode') * 0.5

  #normalization
  pop = df[["popularity"]].reset_index(drop = True)
  scaler = MinMaxScaler()
  pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns) * 0.2
  
  # Scale audio columns
  floats = df[float_cols].reset_index(drop = True)
  scaler = MinMaxScaler()
  floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

  final = pd.concat([floats_scaled, pop_scaled, subject_ohe, polar_ohe, key_ohe, mode_ohe], axis = 1)

  final['id']=df['id'].values
    
  return final

In [65]:
float_cols = songDF.dtypes[songDF.dtypes == 'float64'].index.values
songDF.to_csv("allsong_data.csv", index = False)

complete_feature_set = create_feature_set(songDF, float_cols=float_cols)
complete_feature_set.to_csv("complete_feature.csv", index = False)
complete_feature_set.head()

Unnamed: 0,danceability,energy,loudness,acousticness,instrumentalness,liveness,valence,tempo,popularity,subject|high,...,key|5,key|6,key|7,key|8,key|9,key|10,key|11,mode|0,mode|1,id
0,0.100745,0.147228,0.18038,0.047436,0.0,0.015802,0.063448,0.133483,0.126667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,429hKgsnnsIVn9ohgjj0II
1,0.147966,0.16596,0.183848,0.001774,0.0,0.01028,0.18894,0.084937,0.186667,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,5nujrmhLynf4yMoMtj8AQF
2,0.158739,0.107548,0.169833,0.002632,7e-06,0.017643,0.122153,0.108693,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0ClPIeT6MSgfSgQ9ZrJbAq
3,0.148424,0.156493,0.17663,0.000898,0.0,0.011642,0.168946,0.082626,0.137778,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,62tHYGbBOvOkpgmEXDlgu8
4,0.088367,0.126885,0.177729,0.027336,5e-06,0.025962,0.021759,0.128502,0.157778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,6ZuahEctZD6w75peme58hm


Time to do content-based filterng recommendation.

To do this, we need to concatenate all songs in a playlist into one summary vector. Then we find the similarity between this vector and all songs in our database (except the songs from the playlist).

Then, we use the similarity measure retrieved the most relevant song that is not in the playlist to recommend it.

So, our steps are:

Choose Playlist => Extract Features => Find Similarity

Choosing Playlist

In [67]:
playlistDF_test = pd.read_csv('testData.csv')

Extracting Features

In [68]:
def generate_playlist_feature(complete_feature_set, playlist_df):
  complete_feature_set_playlist = complete_feature_set[complete_feature_set['id'].isin(playlist_df['id'].values)]
  complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['id'].isin(playlist_df['id'].values)]
  complete_feature_set_playlist_final = complete_feature_set_playlist.drop(columns = "id")
  return complete_feature_set_playlist_final.sum(axis = 0), complete_feature_set_nonplaylist

In [69]:
complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_playlist_feature(complete_feature_set, playlistDF_test)

In [70]:
complete_feature_set_nonplaylist.head()

Unnamed: 0,danceability,energy,loudness,acousticness,instrumentalness,liveness,valence,tempo,popularity,subject|high,...,key|5,key|6,key|7,key|8,key|9,key|10,key|11,mode|0,mode|1,id
0,0.100745,0.147228,0.18038,0.047436,0.0,0.015802,0.063448,0.133483,0.126667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,429hKgsnnsIVn9ohgjj0II
1,0.147966,0.16596,0.183848,0.001774,0.0,0.01028,0.18894,0.084937,0.186667,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,5nujrmhLynf4yMoMtj8AQF
2,0.158739,0.107548,0.169833,0.002632,7e-06,0.017643,0.122153,0.108693,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0ClPIeT6MSgfSgQ9ZrJbAq
3,0.148424,0.156493,0.17663,0.000898,0.0,0.011642,0.168946,0.082626,0.137778,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,62tHYGbBOvOkpgmEXDlgu8
4,0.088367,0.126885,0.177729,0.027336,5e-06,0.025962,0.021759,0.128502,0.157778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,6ZuahEctZD6w75peme58hm


In [71]:
complete_feature_set_playlist_vector

danceability        0.172722
energy              0.131719
loudness            0.169861
acousticness        0.025527
instrumentalness    0.000017
liveness            0.019268
valence             0.166394
tempo               0.140612
popularity          0.184444
subject|high        0.300000
subject|low         0.000000
subject|medium      0.000000
polar|Negative      0.000000
polar|Neutral       0.000000
polar|Positive      0.500000
key|0               0.000000
key|1               0.000000
key|2               0.000000
key|3               0.000000
key|4               0.000000
key|5               0.000000
key|6               0.000000
key|7               0.000000
key|8               0.500000
key|9               0.000000
key|10              0.000000
key|11              0.000000
mode|0              0.500000
mode|1              0.000000
dtype: float64

Finding Similarity

In [73]:
def generate_playlist_recos(df, features, nonplaylist_features):
  
  #nonplaylist_features contains features of the songs that are not a part of the playlist

  non_playlist_df = df[df['id'].isin(nonplaylist_features['id'].values)]
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('id', axis = 1).values, features.values.reshape(1, -1))[:,0]
  non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
  return non_playlist_df_top_40


Our test playlist

In [78]:
playlistDF_test[["all_artists","title"]][:20]

Unnamed: 0,all_artists,title
0,M.I.A.,The One
1,Kevin Abstract,DEAR MISS HOLLOWAY
2,Lil Baby,Sleazy Flow (with Lil Baby) - Remix
3,XXXTENTACION,True Love
4,Moneybagg Yo,See Wat I’m Sayin
5,Young Thug,Potion (with Dua Lipa & Young Thug)
6,Chance the Rapper,A Bar About a Bar
7,Takeoff,HOTEL LOBBY (Unc & Phew)
8,Lil Durk,Computer Murderers
9,Lil Yachty,Congratulations


Recommendations based on test playlist

In [77]:
x = input("Enter number of recommendations needed: ")


recommend = generate_playlist_recos(songDF, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)
recommend.head(int(x))

Enter number of recommendations needed: 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,all_artists,id,title,popularity,release_date,danceability,energy,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,subjectivity,polarity,sim
491,Shruti Haasan,2MwCoo4GeXpi8soWn9EiPo,Kannazhaga - The Kiss of Love,63,2011-12-23,0.574,0.496,8,-7.153,0,0.431,0.0,0.399,0.61,109.935,high,Positive,0.990726
1065,Grant,3LoyPU4kSmEw1T9pwCT9BV,Better Off Alone,43,2020-06-30,0.63,0.7,8,-4.803,0,0.184,0.000178,0.105,0.204,139.976,high,Positive,0.986745
30,SZA,3DarAbFujv6eYNliUTyqtz,Kiss Me More (feat. SZA),83,2021-06-25,0.764,0.705,8,-3.463,1,0.259,8.9e-05,0.12,0.781,110.97,high,Positive,0.747221
662,The Weeknd,0k4d5YPDr1r7FX77VdqWez,You Right,84,2021-06-25,0.828,0.621,8,-6.414,1,0.0164,0.00233,0.0845,0.436,128.986,high,Positive,0.742564
602,Sid Sriram,6kekHWEQ4SyMe5EjYAlveP,High On Love,59,2018-02-14,0.627,0.779,4,-4.944,0,0.214,9e-06,0.112,0.619,149.929,high,Positive,0.742226
6,Shawn Mendes,5LZtB6nxvjIhUoElp3Zqk0,Teach Me How To Love,67,2020-12-04,0.592,0.885,9,-4.944,0,0.0505,2e-06,0.0843,0.731,98.991,high,Positive,0.740913
454,Shweta Mohan,0xttqqTj7ZAGyVkfOYxQ4F,Nee Paartha Vizhigal - The Touch of Love,64,2011-12-23,0.665,0.596,9,-6.969,0,0.293,3.7e-05,0.247,0.644,110.055,high,Positive,0.739732
332,John K,2FwP6d7xG5Hch65LBsa7Zi,cheap sunglasses,56,2020-11-12,0.801,0.499,8,-5.89,1,0.239,0.0,0.189,0.674,99.018,high,Positive,0.739026
815,Chani Nattan,7HWMIRSmUrhlbveXerIEWk,Unforgettable,61,2021-12-24,0.929,0.78,0,-3.525,0,0.311,0.0,0.119,0.344,114.992,high,Positive,0.738282
164,Panic! At The Disco,7wOmQJeVX6qjNXqqsKOkPx,Lying Is the Most Fun a Girl Can Have Without ...,53,2005-09-27,0.66,0.799,8,-4.644,1,0.106,0.0,0.154,0.475,125.966,high,Positive,0.737706
