# Build a song recommender system - Exercise

In [23]:
import turicreate

In [29]:
import pandas as pd

In [136]:
# The same CSV is loaded twice: as a pandas DataFrame (`songs`) for the pandas
# cells, and as a Turi Create SFrame (`song_data`) for the Turi Create cells.
songs = pd.read_csv('song_data.csv')
song_data = turicreate.SFrame('song_data.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,int,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


## 1. Counting unique users

`.unique()`

In [154]:
def unique_listeners(plays, artist):
    """Select one artist's rows and the distinct users who listened to them.

    Parameters
    ----------
    plays : pandas.DataFrame
        Listening records; must contain the columns 'artist' and 'user_id'.
    artist : str
        Exact artist name to match against the 'artist' column.

    Returns
    -------
    tuple
        (artist_plays, listener_ids): the rows of `plays` for `artist`, and
        the array of distinct 'user_id' values in those rows.
    """
    artist_plays = plays[plays['artist'] == artist]
    return artist_plays, artist_plays['user_id'].unique()


# The per-artist frames (kanye, taylor, gaga, fighters) are kept as notebook
# variables because later cells group them by user_id.
kanye, kanye_userCount = unique_listeners(songs, 'Kanye West')
taylor, taylor_userCount = unique_listeners(songs, 'Taylor Swift')
gaga, gaga_userCount = unique_listeners(songs, 'Lady GaGa')
fighters, fighters_userCount = unique_listeners(songs, 'Foo Fighters')

# Number of distinct listeners per artist.
for artist_name, listener_ids in [
    ('Kanye West', kanye_userCount),
    ('Taylor Swift', taylor_userCount),
    ('Lady GaGa', gaga_userCount),
    ('Foo Fighters', fighters_userCount),
]:
    print(artist_name + " listeners:" + str(len(listener_ids)))

Kanye West listeners:2522
Taylor Swift listeners:3246
Lady GaGa listeners:2928
Foo Fighters listeners:2055


`.groupby()` + `.size()` + `.count()`

In [81]:
# Cross-check of the unique-user count: one row per user_id after grouping,
# then count the rows.
(
    kanye
    .groupby('user_id')
    .size()
    .reset_index(name='counts')
    .count()
)

user_id    2522
counts     2522
dtype: int64

In [82]:
## artists == 'Taylor Swift'
# One row per user_id after grouping; counting the rows gives the number of
# distinct listeners.
(
    taylor
    .groupby('user_id')
    .size()
    .reset_index(name='counts')
    .count()
)

user_id    3246
counts     3246
dtype: int64

In [129]:
## artists == 'Lady GaGa'
# groupby(...).size() is a pandas Series (see the printed type), which is why
# reset_index can name the new column via `name=`.
print(type(gaga.groupby('user_id').size()))
# Rows per user, largest first.
(
    gaga
    .groupby('user_id')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

<class 'pandas.core.series.Series'>


Unnamed: 0,user_id,counts
1179,6689dcba81745af8bdbf3cc4cd9ccfd5a738f5ae,11
1760,99c5f153e78b44db6fd6ad38ed5ede5fb018b888,10
2632,e6c062e89adaeae1220d5157c0db88d0c4395a0e,10
2385,d2723be72aa3fa35bfb83d51f56be55cf4b07d8a,10
209,12de7874c733af9090d9f51d1ec527ef05df2033,9
...,...,...
1081,5cf5fe4d291428d4df2a1dab3327c947de8da182,1
1084,5d283483f27c27f92ed426773b3b20f52f96f9ee,1
1085,5d298f47576edfcaf76ed62d734367dea6505599,1
1087,5d32d8a3f63cee1fbeb0a5851e5134418920c390,1


## 2. Find the most popular and least popular artist
`.groupby()`

*Here, `.agg('sum')` is equivalent to `.sum()`.*

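*`.reset_index(name = 'total_counts')` does not work here: the aggregation returns a DataFrame, and the `name` argument is only accepted by `Series.reset_index`, so the column is renamed with `.rename()` instead.*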

In [134]:
# Total listen_count per artist.
# Only `listen_count` is selected before aggregating: summing the whole frame
# would also aggregate the string columns, which newer pandas versions no
# longer drop silently. The double brackets keep the result a DataFrame.
popularity = (
    songs
    .groupby('artist')[['listen_count']]
    .agg('sum')
    .reset_index()
    .rename(columns={"listen_count": "total_counts"})
)
# Most listened-to artists first, least listened-to last.
popularity.sort_values(by='total_counts', ascending=False)

Unnamed: 0,artist,total_counts
1649,Kings Of Leon,43218
913,Dwight Yoakam,40619
344,Björk,38889
648,Coldplay,35362
1106,Florence + The Machine,33387
...,...,...
401,Boggle Karaoke,30
856,Diplo,30
308,Beyoncé feat. Bun B and Slim Thug,26
2410,Reel Feelings,24


### Using Turicreate package with groupby()

In [145]:
# Same per-artist aggregation on the SFrame: sum listen_count into
# `total_count`, then sort with the largest total first.
count_table_turi = song_data.groupby(
    key_column_names='artist',
    operations={'total_count': turicreate.aggregate.SUM('listen_count')},
).sort('total_count', ascending=False)
count_table_turi


artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387
Justin Bieber,29715
Alliance Ethnik,26689
OneRepublic,25754
Train,25402
The Black Keys,22184


## 3. Find the most recommended songs

In [146]:
# Random train/test split with fraction 0.8; the seed is fixed so the split
# is the same on every run.
train_data, test_data = song_data.random_split(0.8, seed=0)

In [147]:
# Item-similarity recommender trained on the training split, with users
# identified by `user_id` and items by `song`.
similarity_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)


In [148]:
# Work on a subset of the test users: the first 10,000 entries of the
# distinct user_ids in the test split.
subset_test_users = test_data['user_id'].unique()[:10000]

#### compute one recommended song for each of these test users

In [150]:
# k=1: request a single recommendation per user in the subset.
recommend_table = similarity_model.recommend(users=subset_test_users, k=1)

#### Find the most recommended song

In [153]:
# Count how many times each song appears in the recommendations and list the
# most frequently recommended songs first.
recommend_table.groupby(
    key_column_names='song',
    operations={'count': turicreate.aggregate.COUNT()},
).sort('count', ascending=False)



song,count
Undo - Björk,437
Secrets - OneRepublic,375
Revelry - Kings Of Leon,222
You're The One - Dwight Yoakam ...,162
Fireflies - Charttraxx Karaoke ...,111
Hey_ Soul Sister - Train,99
Sehr kosmisch - Harmonia,94
Horn Concerto No. 4 in E flat K495: II. Romance ...,90
OMG - Usher featuring will.i.am ...,62
Bigger - Justin Bieber,43


## Create a very simple popularity recommender

In [13]:
# Popularity recommender trained on the training split, with users
# identified by `user_id` and items by `song`.
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

## Use the popularity model to make some predictions

In [14]:
# NOTE(review): `users` is not defined in any visible cell, so this cell (and
# the later ones indexing `users`) raised NameError on a fresh kernel.
# Defining it as the distinct user_ids of the full data set is an assumption
# - confirm it matches the original session (the element order of unique()
# may differ from the recorded output).
users = song_data['user_id'].unique()

# Popularity-based recommendations for the first user in `users`.
popularity_model.recommend(users=[users[0]])

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sehr kosmisch - Harmonia,4754.0,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Undo - Björk,4227.0,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,You're The One - Dwight Yoakam ...,3781.0,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Revelry - Kings Of Leon,3527.0,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Secrets - OneRepublic,3148.0,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Hey_ Soul Sister - Train,2538.0,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Tive Sim - Cartola,2521.0,10


In [15]:
# Popularity-based recommendations for the second user in `users`.
popularity_model.recommend(
    users=[users[1]],
)

user_id,song,score,rank
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Sehr kosmisch - Harmonia,4754.0,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Undo - Björk,4227.0,2
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,You're The One - Dwight Yoakam ...,3781.0,3
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Revelry - Kings Of Leon,3527.0,5
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Secrets - OneRepublic,3148.0,7
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Hey_ Soul Sister - Train,2538.0,8
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Tive Sim - Cartola,2521.0,10


# Build a recommender with personalization

In [16]:
# Item-similarity recommender on the training split; the arguments are the
# same as those used for `similarity_model` earlier in the notebook.
personalized_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

## Apply personalized model to make song recommendations

In [17]:
# Personalized recommendations for the first user in `users`.
personalized_model.recommend(
    users=[users[0]],
)

user_id,song,score,rank
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Riot In Cell Block Number Nine - Dr Feelgood ...,0.0374999940395355,1
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Sei Lá Mangueira - Elizeth Cardoso ...,0.0331632643938064,2
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,The Stallion - Ween,0.0322580635547637,3
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Rain - Subhumans,0.0314159244298934,4
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,West One (Shine On Me) - The Ruts ...,0.0306771993637084,5
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Back Against The Wall - Cage The Elephant ...,0.0301204770803451,6
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Life Less Frightening - Rise Against ...,0.0284431129693985,7
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,A Beggar On A Beach Of Gold - Mike And The ...,0.023002490401268,8
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Audience Of One - Rise Against ...,0.0193938463926315,9
279292bb36dbfc7f505e36ebf 038c81eb1d1d63e ...,Blame It On The Boogie - The Jacksons ...,0.0189873427152633,10


In [18]:
# Personalized recommendations for the second user in `users`.
personalized_model.recommend(
    users=[users[1]],
)

user_id,song,score,rank
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Grind With Me (Explicit Version) - Pretty Ricky ...,0.0459424376487731,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,There Goes My Baby - Usher ...,0.0331920742988586,2
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Panty Droppa [Intro] (Album Version) - Trey ...,0.031856620311737,3
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Nobody (Featuring Athena Cage) (LP Version) - ...,0.0278467655181884,4
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Youth Against Fascism - Sonic Youth ...,0.0262914180755615,5
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Nice & Slow - Usher,0.0239639401435852,6
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Making Love (Into The Night) - Usher ...,0.0238176941871643,7
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Naked - Marques Houston,0.0228925704956054,8
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,I.nner Indulgence - DESTRUCTION ...,0.0220767498016357,9
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Love Lost (Album Version) - Trey Songz ...,0.0204497694969177,10


# Apply model to find similar songs in the data set

In [19]:
# Songs the item-similarity model ranks as most similar to the given song.
personalized_model.get_similar_items(
    ['With Or Without You - U2'],
)

song,similar,score,rank
With Or Without You - U2,I Still Haven't Found What I'm Looking For ...,0.0428571701049804,1
With Or Without You - U2,Hold Me_ Thrill Me_ Kiss Me_ Kill Me - U2 ...,0.033734917640686,2
With Or Without You - U2,Window In The Skies - U2,0.032835841178894,3
With Or Without You - U2,Vertigo - U2,0.030075192451477,4
With Or Without You - U2,Sunday Bloody Sunday - U2,0.0271317958831787,5
With Or Without You - U2,Bad - U2,0.0251798629760742,6
With Or Without You - U2,A Day Without Me - U2,0.0237154364585876,7
With Or Without You - U2,Another Time Another Place - U2 ...,0.0203251838684082,8
With Or Without You - U2,Walk On - U2,0.0202020406723022,9
With Or Without You - U2,Get On Your Boots - U2,0.0196850299835205,10


In [20]:
# Songs the item-similarity model ranks as most similar to the given song.
personalized_model.get_similar_items(
    ['Chan Chan (Live) - Buena Vista Social Club'],
)

song,similar,score,rank
Chan Chan (Live) - Buena Vista Social Club ...,Murmullo - Buena Vista Social Club ...,0.1881188154220581,1
Chan Chan (Live) - Buena Vista Social Club ...,La Bayamesa - Buena Vista Social Club ...,0.1871921420097351,2
Chan Chan (Live) - Buena Vista Social Club ...,Amor de Loca Juventud - Buena Vista Social Club ...,0.1848341226577758,3
Chan Chan (Live) - Buena Vista Social Club ...,Diferente - Gotan Project,0.0214592218399047,4
Chan Chan (Live) - Buena Vista Social Club ...,Mistica - Orishas,0.0205761194229125,5
Chan Chan (Live) - Buena Vista Social Club ...,Hotel California - Gipsy Kings ...,0.0193049907684326,6
Chan Chan (Live) - Buena Vista Social Club ...,Nací Orishas - Orishas,0.0191571116447448,7
Chan Chan (Live) - Buena Vista Social Club ...,Gitana - Willie Colon,0.0187969803810119,8
Chan Chan (Live) - Buena Vista Social Club ...,Le Moulin - Yann Tiersen,0.0187969803810119,9
Chan Chan (Live) - Buena Vista Social Club ...,Criminal - Gotan Project,0.0187793374061584,10


# Compare the models quantitatively
We now formally compare the popularity and the personalized models using precision-recall curves. 

In [22]:
%matplotlib inline
model_performance = turicreate.recommender.util.compare_models(test_data, 
                                                               [popularity_model, personalized_model], 
                                                               user_sample=.05)

compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.020470829068577272 | 0.005466198762002244 |
|   2    | 0.018082565677243277 | 0.009870445002000173 |
|   3    | 0.017855112020925767 | 0.013747856683915457 |
|   4    | 0.016461958375980898 | 0.017092512331539387 |
|   5    | 0.015557830092118741 | 0.019950850399703454 |
|   6    | 0.014784487660639138 | 0.02341236765344179  |
|   7    | 0.014280840278793224 | 0.026594067532715253 |
|   8    | 0.013775162060730132 | 0.029744514783499685 |
|   9    | 0.012964858410098947 | 0.03185104580507674  |
|   10   | 0.012760150119413189 | 0.034539154352960094 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.023541453428863865 | 0.008750627097607652 |
|   2    | 0.020129648584101007 |  0.0132676694841786  |
|   3    | 0.01842374616171956  | 0.01810255274792574  |
|   4    | 0.017314909587171596 | 0.021637532306446754 |
|   5    | 0.01630842715796658  | 0.02538945913923334  |
|   6    | 0.015296258387353556 | 0.02915856807722664  |
|   7    | 0.014183360140371394 |  0.0310957065292064  |
|   8    | 0.013476629136813374 | 0.03352239959049522  |
|   9    | 0.012699495811061832 | 0.035596018757256695 |
|   10   | 0.01218014329580349  |  0.0376098082582519  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]



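The tables show that the personalized model (M1) achieves higher recall than the popularity model (M0) at every cutoff and higher precision at cutoffs 1–6, while the popularity model has slightly higher precision at cutoffs 7–10. Overall, the personalized model performs better, but the margin is modest.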