In [403]:
import pandas
import numpy as np

# Preparing data

In [404]:
# The data is a subset of the Million Song Dataset: http://labrosa.ee.columbia.edu/millionsong
# Remote alternatives (commented out; the local paths below are what the notebook reads):
# triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
# metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

triplets_file = 'data/10000.txt'
metadata_file = 'data/song_data.csv'

In [405]:
# Listening history: the file has no header row, so the column names are supplied here.
song_df_1 = pandas.read_table(
    triplets_file,
    header=None,
    names=['user_id', 'song_id', 'listen_count'],
)

# Song metadata. Only one row per song_id is kept before joining, so the left
# join attaches metadata without multiplying the listening-history rows.
song_df_2 = pandas.read_csv(metadata_file)
song_df = song_df_1.merge(
    song_df_2.drop_duplicates(subset=['song_id']),
    on="song_id",
    how="left",
)

# Observing data

In [406]:
# Preview the first rows of the merged listening-history / metadata frame.
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [407]:
# Number of rows in the merged frame.
len(song_df)

2000000

In [408]:
# Keep only the first 10,000 rows so the rest of the notebook runs quickly.
# .copy() gives an independent frame: adding a column to the bare result of
# .head() writes into a slice of the merged frame and triggers pandas'
# SettingWithCopyWarning.
song_df = song_df.head(10000).copy()

# Human-readable song label: "<title> - <artist_name>".
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

In [409]:
# Popularity table: for each song, the number of listening-history rows that
# mention it (a row count, not the sum of the listen_count values).
song_grouped = (
    song_df.groupby('song')['listen_count']
    .count()
    .reset_index()
)

# Each song's share of all counted rows, as a percentage.
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = (song_grouped['listen_count'] / grouped_sum) * 100

# Most-counted songs first; ties are ordered alphabetically by song label.
song_grouped.sort_values(by=['listen_count', 'song'], ascending=[False, True])


Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch - Harmonia,45,0.45
4678,Undo - Björk,32,0.32
5105,You're The One - Dwight Yoakam,32,0.32
1071,Dog Days Are Over (Radio Edit) - Florence + Th...,28,0.28
3655,Secrets - OneRepublic,28,0.28
4378,The Scientist - Coldplay,27,0.27
4712,Use Somebody - Kings Of Leon,27,0.27
3476,Revelry - Kings Of Leon,26,0.26
1387,Fireflies - Charttraxx Karaoke,24,0.24
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.23


In [410]:
# Distinct user ids present in song_df.
users = song_df['user_id'].unique()

In [411]:
# Number of distinct users.
len(users)

365

In [412]:
# Split the rows into train and test parts with test_size=0.20 and a fixed random_state.
# NOTE(review): train_test_split is not imported in any cell shown here, so this
# cell raises NameError on a fresh kernel. Presumably it is
# sklearn.model_selection.train_test_split - confirm and add it to the import cell.
train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)

In [413]:
# Preview the first five rows of the training part.
train_data.head(5)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
7389,94d5bdc37683950e90c56c9b32721edb5d347600,SOXNZOW12AB017F756,2,Half Of My Heart,Battle Studies,John Mayer,0,Half Of My Heart - John Mayer
9275,1012ecfd277b96487ed8357d02fa8326b13696a5,SOXHYVQ12AB0187949,1,The Beautiful People,Antichrist Superstar (Ecopac Explicit),Marilyn Manson,0,The Beautiful People - Marilyn Manson
2995,15415fa2745b344bce958967c346f2a89f792f63,SOOSZAZ12A6D4FADF8,1,Sanctify Yourself,Glittering Prize 81/92,Simple Minds,1985,Sanctify Yourself - Simple Minds
5316,ffadf9297a99945c0513cd87939d91d8b602936b,SOWDJEJ12A8C1339FE,4,Heart Cooks Brain,Everything Is Nice: The Matador Records 10th A...,Modest Mouse,1997,Heart Cooks Brain - Modest Mouse
356,5a905f000fc1ff3df7ca807d57edb608863db05d,SOAMPRJ12A8AE45F38,20,Rorol,Identification Parade,Octopus Project,2002,Rorol - Octopus Project


# Item similarity model implementation

In [414]:
class item_similarity_model:
    """Item-based song recommender built on Jaccard similarity of listener sets.

    Two songs are considered similar when largely the same users listened to
    both. A candidate song's score for a user is its average similarity to
    the songs that user has in the training data.

    The training frame must provide the columns ``user_id`` and ``song``. It
    is stored by reference and is only read, never modified.
    """

    # Message printed whenever no recommendation row can be produced.
    _NO_RECOMMENDATIONS_MESSAGE = (
        "The current user has no songs for training the item similarity "
        "based recommendation model."
    )

    def __init__(self, train_data):
        """Remember the training frame; no computation happens here."""
        self.train_data = train_data

    def get_user_songs(self, user):
        """Return the distinct songs of ``user`` as a list, in order of first appearance."""
        user_rows = self.train_data[self.train_data['user_id'] == user]
        return list(user_rows['song'].unique())

    def get_song_listeners(self, song):
        """Return the set of distinct user ids that have a row for ``song``."""
        song_rows = self.train_data[self.train_data['song'] == song]
        return set(song_rows['user_id'].unique())

    def get_all_songs(self):
        """Return every distinct song of the training data as a list."""
        return list(self.train_data['song'].unique())

    def _listeners_by_song(self):
        """Map each song in the training data to the set of its listeners.

        One grouped pass replaces a full-frame boolean filter per song.
        Rows whose song is missing (NaN) are left out by ``groupby``; callers
        treat an absent song as having no listeners, which matches what
        ``get_song_listeners`` returns for such a value.
        """
        grouped = self.train_data.groupby('song', sort=False)['user_id'].unique()
        return {song: set(listeners) for song, listeners in grouped.items()}

    def build_cooccurence_matrix(self, user_songs, all_songs):
        """Compute Jaccard similarities between the user's songs and all songs.

        Parameters
        ----------
        user_songs : list
            Songs of the user (one matrix row each).
        all_songs : list
            Candidate songs (one matrix column each).

        Returns
        -------
        numpy.matrix
            Shape ``(len(user_songs), len(all_songs))``. Entry ``[j, i]`` is
            ``|L(user_songs[j]) & L(all_songs[i])| / |L(user_songs[j]) | L(all_songs[i])|``
            where ``L(s)`` is the listener set of ``s``; 0 when the sets are
            disjoint.
        """
        listeners = self._listeners_by_song()
        nobody = frozenset()
        user_listener_sets = [listeners.get(song, nobody) for song in user_songs]

        scores = np.zeros((len(user_songs), len(all_songs)), dtype=float)
        for col, candidate in enumerate(all_songs):
            candidate_listeners = listeners.get(candidate, nobody)
            if not candidate_listeners:
                # No listeners means every intersection is empty: column stays 0.
                continue
            for row, song_listeners in enumerate(user_listener_sets):
                shared = len(candidate_listeners & song_listeners)
                if shared:
                    # |A | B| = |A| + |B| - |A & B| avoids materialising the union.
                    union_size = len(candidate_listeners) + len(song_listeners) - shared
                    scores[row, col] = shared / float(union_size)

        # The return type stays numpy.matrix for callers that rely on it.
        return np.matrix(scores)

    def get_top_recommendations(self, user, matrix, all_songs, user_songs, top_n=10):
        """Turn a similarity matrix into a ranked recommendation table.

        Parameters
        ----------
        user : object
            User id copied into the ``user_id`` column of the result.
        matrix : numpy.matrix or 2-D array-like
            Similarities with one row per user song and one column per entry
            of ``all_songs``.
        all_songs : list
            Candidate songs, aligned with the matrix columns.
        user_songs : list
            Songs the user already has; these are never recommended.
        top_n : int, optional
            Maximum number of rows to return (default 10).

        Returns
        -------
        pandas.DataFrame or int
            Columns ``user_id``, ``song``, ``score``, ``rank`` (rank starts at
            1, best first). If no row can be produced, the message above is
            printed and ``-1`` is returned.
        """
        scores = np.asarray(matrix, dtype=float)
        if scores.shape[0] == 0:
            # The user has no songs: averaging over zero rows would divide by
            # zero and yield only NaN scores, so report it directly.
            print(self._NO_RECOMMENDATIONS_MESSAGE)
            return -1

        # Average similarity of each candidate to the user's songs.
        mean_scores = (scores.sum(axis=0) / float(scores.shape[0])).tolist()

        # Best score first; equal scores are ordered by descending column index.
        ranked = sorted(
            ((score, idx) for idx, score in enumerate(mean_scores)),
            reverse=True,
        )

        already_known = set(user_songs)
        rows = []
        for score, idx in ranked:
            if len(rows) >= top_n:
                break  # enough recommendations, no need to scan the rest
            candidate = all_songs[idx]
            if np.isnan(score) or candidate in already_known:
                continue
            rows.append([user, candidate, score, len(rows) + 1])

        if not rows:
            print(self._NO_RECOMMENDATIONS_MESSAGE)
            return -1

        # Build the frame once instead of appending row by row.
        return pandas.DataFrame(rows, columns=['user_id', 'song', 'score', 'rank'])

    def recommend(self, user, top_n=10):
        """Recommend up to ``top_n`` songs the user does not have yet.

        Returns the DataFrame described in ``get_top_recommendations``, or
        ``-1`` when nothing can be recommended.
        """
        user_songs = self.get_user_songs(user)
        all_songs = self.get_all_songs()
        matrix = self.build_cooccurence_matrix(user_songs, all_songs)
        return self.get_top_recommendations(user, matrix, all_songs, user_songs, top_n)
    
    

# Creating a model and using it

In [415]:
# Creating the model only stores a reference to train_data.
model = item_similarity_model(train_data)
# Recommendations for the first user id in `users`.
model.recommend(users[0])

Unnamed: 0,user_id,song,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Meet Virginia - Train,0.041223,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Are You In? - Incubus,0.041223,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Oil And Water - Incubus,0.041223,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Not For You - Pearl Jam,0.041223,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Id Die Without You - P.M. Dawn,0.041223,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Misled - Céline Dion,0.041223,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Heard Them Stirring - Fleet Foxes,0.041223,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,St. Elsewhere - Dave Grusin,0.041223,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Meadowlarks - Fleet Foxes,0.041223,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Tiger Mountain Peasant Song - Fleet Foxes,0.041223,10


# Dataset citation

Thierry Bertin-Mahieux, Daniel P.W. Ellis, Brian Whitman, and Paul Lamere. 
The Million Song Dataset. In Proceedings of the 12th International Society
for Music Information Retrieval Conference (ISMIR 2011), 2011.