# Objective: To Build a song Recommender

# Dataset used: Million Songs Dataset 
    
Source: http://labrosa.ee.columbia.edu/millionsong/ 

# Q1. Load Required Libraries and music data

In [2]:
%matplotlib inline

import pandas
from sklearn.model_selection import train_test_split
import numpy as np
import time
from sklearn.externals import joblib

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load Music data

In [7]:
#Read userid-songid-listen_count triplets
#This step might take time to download data from external sources
triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

song_df_1 = pandas.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

#Read song  metadata
song_df_2 =  pandas.read_csv(songs_metadata_file)

#Merge the two dataframes above to create input dataframe for recommender systems
song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left") 

In [12]:
print(song_df_1.head())
len(song_df_1)

                                    user_id             song_id  listen_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1


2000000

In [13]:
song_df_2.head()
len(song_df_2)
print(song_df_2.head(5))
len(song_df)
print(song_df.head(5))

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


1000000

              song_id              title  \
0  SOQMMHC12AB0180CB8       Silent Night   
1  SOVFVAK12A8C1350D9        Tanssi vaan   
2  SOGTUKN12AB017F4F1  No One Could Ever   
3  SOBNYVR12A8C13558C      Si Vos Querés   
4  SOHSBXH12A8C13B0DF   Tangle Of Aspens   

                                release       artist_name  year  
0                 Monster Ballads X-Mas  Faster Pussy cat  2003  
1                           Karkuteillä  Karkkiautomaatti  1995  
2                                Butter    Hudson Mohawke  2006  
3                               De Culo       Yerba Brava  2003  
4  Rene Ablaze Presents Winter Sessions        Der Mystic     0  


2000000

                                    user_id             song_id  listen_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1   

             title                        release    artist_name  year  
0         The Cove             Thicker Than Water   Jack Johnson     0  
1  Entre Dos Aguas            Flamenco Para Niños  Paco De Lucia  1976  
2         Stronger                     Graduation     Kanye West  2007  
3   Constellations              In Between Dreams   Jack Johnson  2005  
4     Learn To Fly  There Is Nothing Left To Lose   Foo Fighters  1999  


# Q2. Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [14]:
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


# Length of the dataset

In [15]:
len(song_df)
song_df = song_df.head(10000)
len(song_df)

2000000

10000

# Q3. Create a subset of the dataset

a. Show the most popular songs in the dataset

b. Count number of unique users in the dataset

c. Count the number of unique songs in the dataset

In [16]:
song_df = song_df.head(10000)
#Merge song title and artist_name columns to make a merged column
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

In [17]:
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch - Harmonia,45,0.45
4678,Undo - Björk,32,0.32
5105,You're The One - Dwight Yoakam,32,0.32
1071,Dog Days Are Over (Radio Edit) - Florence + Th...,28,0.28
3655,Secrets - OneRepublic,28,0.28
4378,The Scientist - Coldplay,27,0.27
4712,Use Somebody - Kings Of Leon,27,0.27
3476,Revelry - Kings Of Leon,26,0.26
1387,Fireflies - Charttraxx Karaoke,24,0.24
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.23


# Count number of unique users in the dataset

In [None]:
users = song_df['user_id'].unique()
len(users)

# Count the number of unique songs in the dataset

In [19]:
###Fill in the code here
songs = song_df['song'].unique()
len(songs)

5151

# Q4. Create a song recommender

In [20]:
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
print(train_data.head(5))

                                       user_id             song_id  \
7389  94d5bdc37683950e90c56c9b32721edb5d347600  SOXNZOW12AB017F756   
9275  1012ecfd277b96487ed8357d02fa8326b13696a5  SOXHYVQ12AB0187949   
2995  15415fa2745b344bce958967c346f2a89f792f63  SOOSZAZ12A6D4FADF8   
5316  ffadf9297a99945c0513cd87939d91d8b602936b  SOWDJEJ12A8C1339FE   
356   5a905f000fc1ff3df7ca807d57edb608863db05d  SOAMPRJ12A8AE45F38   

      listen_count                 title  \
7389             2      Half Of My Heart   
9275             1  The Beautiful People   
2995             1     Sanctify Yourself   
5316             4     Heart Cooks Brain   
356             20                 Rorol   

                                                release      artist_name  \
7389                                     Battle Studies       John Mayer   
9275             Antichrist Superstar (Ecopac Explicit)   Marilyn Manson   
2995                             Glittering Prize 81/92     Simple Minds   
5316  Ever

# Q5. Build Popularity Recommender model. (Non-personalised)

a. Count of user_id for each unique song as recommendation score 

b. Sort the songs on recommendation score 

c. Get the top 5 recommendations

In [29]:
#Count of user_id for each unique song as recommendation score 
train_data_grouped = train_data.groupby('song_id').agg({'user_id': 'count'}).reset_index()
train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
train_data_grouped.head()

Unnamed: 0,song_id,score
0,SOAAFAC12A67ADF7EB,2
1,SOAATLI12A8C13E319,1
2,SOAAUKC12AB017F868,1
3,SOAAVUV12AB0186646,4
4,SOAAWEE12A6D4FBEC8,2


In [30]:
#Sort the songs on recommendation score 
train_data_sort = train_data_grouped.sort_values(['score', 'song_id'], ascending = [0,1]) 
      
#Generate a recommendation rank based upon score 
train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first') 
          
#Get the top 5 recommendations 
popularity_recommendations = train_data_sort.head(5) 
popularity_recommendations 

Unnamed: 0,song_id,score,Rank
1006,SOFRQTD12A81C233C0,37,1.0
146,SOAUWYT12A81C206F1,27,2.0
166,SOAXGDH12A8C13F8A1,24,3.0
286,SOBONKR12A58A7A7E0,24,4.0
2481,SONYKOW12AB01849C9,21,5.0


# Q6. Use popularity based recommender model to make predictions and find recommendations for random list of users with inferences

In [31]:
# Use popularity based recommender model to make predictions
def recommend(user_id):     
    user_recommendations = popularity_recommendations 
          
    #Add user_id column for which the recommendations are being generated 
    user_recommendations['userID'] = user_id 
      
    #Bring user_id column to the front 
    cols = user_recommendations.columns.tolist() 
    cols = cols[-1:] + cols[:-1] 
    user_recommendations = user_recommendations[cols] 
          
    return user_recommendations 

In [32]:
find_recom = [15,121,53]   # This list is user choice.
for i in find_recom:
    print("Here is the recommendation for the userId: %d\n" %(i))
    print(recommend(i))    
    print("\n") 

Here is the recommendation for the userId: 15

      userID             song_id  score  Rank
1006      15  SOFRQTD12A81C233C0     37   1.0
146       15  SOAUWYT12A81C206F1     27   2.0
166       15  SOAXGDH12A8C13F8A1     24   3.0
286       15  SOBONKR12A58A7A7E0     24   4.0
2481      15  SONYKOW12AB01849C9     21   5.0


Here is the recommendation for the userId: 121



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


      userID             song_id  score  Rank
1006     121  SOFRQTD12A81C233C0     37   1.0
146      121  SOAUWYT12A81C206F1     27   2.0
166      121  SOAXGDH12A8C13F8A1     24   3.0
286      121  SOBONKR12A58A7A7E0     24   4.0
2481     121  SONYKOW12AB01849C9     21   5.0


Here is the recommendation for the userId: 53

      userID             song_id  score  Rank
1006      53  SOFRQTD12A81C233C0     37   1.0
146       53  SOAUWYT12A81C206F1     27   2.0
166       53  SOAXGDH12A8C13F8A1     24   3.0
286       53  SOBONKR12A58A7A7E0     24   4.0
2481      53  SONYKOW12AB01849C9     21   5.0




Since this is a popularity-based recommender model, recommendations remain the same for all users

We predict the products based on the popularity. It is not personalized to particular user