# **Music Recommendation System, final submission**

## Preliminaries

### **Libraries**

In [2]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Used to ignore the warning given as output of the code
import warnings
warnings.filterwarnings('ignore')

# Basic libraries of python for numeric and dataframe computations
import numpy as np
import pandas as pd
import random # to pick items or users randomly (sometimes useful)

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import the required library to compute the cosine similarity between two vectors
from sklearn.metrics.pairwise import cosine_similarity

# For label encoding
from sklearn.preprocessing import LabelEncoder

# Import defaultdict from collections A dictionary output that does not raise a key error
from collections import defaultdict

# Linear regression and other useful tools
from sklearn.linear_model import LinearRegression
#from sklearn.model_selection import train_test_split (clash)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [4]:
# install surprise
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357265 sha256=9e9d6c1cbbdbd2ef7dc2ca6438cdf7c6f3e122295622222c0c0241c04bcfca95
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [83]:
# Import necessary libraries for Recommender Systems

# To compute the accuracy of models
from surprise import accuracy

# This class is used to parse a file containing play_counts, data should be in structure - user; item; play_count
from surprise import Reader

# Class for loading datasets
from surprise import Dataset

# For tuning model hyperparameters
from surprise.model_selection import GridSearchCV

# For splitting the data in train and test dataset
from surprise.model_selection import train_test_split

# For implementing similarity-based recommendation system
from surprise.prediction_algorithms.knns import KNNBasic

# For implementing matrix factorization based recommendation system
from surprise.prediction_algorithms.matrix_factorization import SVD

# For implementing KFold cross-validation
from surprise.model_selection import KFold

# For implementing clustering-based recommendation system
from surprise import CoClustering

### Useful functions for later

#### show_duplicates

In [6]:
# function that displays all duplicates of a dataframe based on a subset of columns

def show_duplicates(df,subset):
  """
  Identify all duplicated entries and put them in a dataframe.

  Args:
  - df: Pandas dataframe
  - subset: subset of columns to check for duplicates

  Returns:
  - dataframe with all duplicated entries displayed
  """

  duplicated_entries = pd.concat(g for _, g in df.groupby(subset) if len(g) > 1)

  print('There are ' + str(duplicated_entries.shape[0]) + ' duplicated values in the dataframe \
  for the columns {}, \
  corresponding to {} different values \n'.format(subset,df.duplicated(subset=subset).sum()))

  return duplicated_entries

#### get_top

In [7]:
# Build a function to find top n songs
def get_top(df,n=10, min_count=200):
  """
   - returns the top n songs based on average playcount in a dataframe
   - displays number of interactions, as well as total play count of any song in this list
  """

  # define groupby object G
  G=df.groupby(['song_id'])

  # aggregate G with:
  ## 1) 'count' to compute number of interactions
  ## 2) 'sum' to compute total play count
  df1=G.agg(nb_interactions=('play_count','sum'),total_count=('play_count','count'))

  # filter those entries where play_count is larger than threshold
  df1=df1[df1.total_count>=min_count]

  # aggregate G with mean of playcounts
  df2=G.agg(average_play_count=('play_count','mean'))

  # merge the two dataframes
  df_merged=df1.merge(df2, on='song_id').sort_values(by='average_play_count', ascending=False)

  # return first n values
  return df_merged.head(n)

#### precision_recall_at_k

In [8]:
def precision_recall_at_k(model, testset, k=30, threshold=1.5):
    """
    Returns precision and recall at k metrics for each user
    """

    # First map the predictions to each user.
    user_est_true = defaultdict(list)

    #Making predictions on the test data
    predictions = model.test(testset)

    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, playing_count in user_est_true.items():

        # Sort play count by estimated value
        playing_count.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in playing_count)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in playing_count[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in playing_count[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set Precision to 0 when n_rec_k is 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set Recall to 0 when n_rel is 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    #Mean of all the predicted precisions are calculated.
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)),3)
    #Mean of all the predicted recalls are calculated.
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)),3)

    accuracy.rmse(predictions)
    print('Precision: ', precision) #Command to print the overall precision
    print('Recall: ', recall) #Command to print the overall recall
    print('F_1 score: ', round((2*precision*recall)/(precision+recall),3)) # Formula to compute the F-1 score.

#### get_recommendations

In [9]:
def get_recommendations(data, user_id, n, algo):
    """
    Yields the top n items recommended for the given user_id by a given algorithm
    Args:
    - data: the dataset
    - user_id: the user_id for which we want the recommendations
    - n: the number of recommendations to return
    - algo: the algorithm we want to use for predicting the play_count
    Returns:
    - a set of top n items, along with the corresponding play_count
    """

    # Creating an empty list to store the recommended song ids
    L=[]

    # Creating an user item interactions matrix
    M = data.pivot(index='user_id', columns='song_id', values='play_count')

    # Extracting those song ids which the user_id has not played yet
    S=M.loc[user_id][M.loc[user_id].isnull()].index.tolist()

    # Looping through each of the song ids which user_id has not interacted yet
    for iid in S:

        # Predicting the users for those non played song ids by this user
        ## For ease of readability we only take the 2 first decimals
        est=np.round(algo.predict(user_id, iid).est, decimals=2)

        # Appending the predicted play_counts
        L.append((iid, est))

    # Sorting the predicted play_counts in descending order
    L.sort(key=lambda x: x[1], reverse=True)

    return L[:n] # Returning top n highest predicted play_count songs for this user

#### ranking_songs

In [10]:
def ranking_songs(recommendations, playing_count, correction=None):
  """
  - returns the top songs based on average playcount in a dataframe
  - displays number of interactions, as well as total play count of any song in this list

  Args:
  1) recommendations is a dataframe with the following columns
  - song_id
  - predicted_play_count

  2) final_play is a dataframe with the following columns:
  - average_play_count
  - frequency
  - nb_interactions
  - total_count

  3) correction takes the two values 'total_count' and 'nb_interactions'
  It gives two different ways to correct the predicted play_count
  If set to None, there is no correction.

  Returns:
  - a dataframe with the following columns:
  - song_id
  - average_play_count
  - predicted_play_count
  - corrected_play_count (if correction applied)
  """

  # Sort the songs based on play counts
  playing_count.sort_values(by='total_count', ascending=False,inplace=True)

  # Merge with the recommended songs to get predicted play_counts
  recommendations=recommendations.merge(playing_count, on='song_id')

  # Calculate the correction factor
  if correction=='nb_interactions':
    penalty = 1/np.sqrt(recommendations['nb_interactions'])
  elif correction=='total_count':
    penalty = 1/np.sqrt(recommendations['total_count'])
  else:
    penalty=0

  # Correct the predicted play_counts and put in new column
  recommendations['corrected_play_count']=np.round(recommendations['predicted_play_count'] - penalty ,3)

  # Rank the songs based on corrected play_counts
  recommendations.sort_values(by='corrected_play_count', ascending=False, inplace=True)

  # returns dataframe of recommendations
  # 1) case when a correction is applied
  if correction!=None:
    print('The songs have been corrected by subtracting the inverse square-root of ' + str(correction) + ': \n')
    return recommendations[['song_id','average_play_count', 'predicted_play_count', 'corrected_play_count']].set_index('song_id')

  # 2) case when no correction is applied
  else:
    print('No correction has been applied: \n')
  return recommendations[['song_id','average_play_count', 'predicted_play_count']].set_index('song_id')

#### true_rating

In [11]:
# FUNCTION THAT GIVES THE TRUE RATING OF A GIVEN PAIR

def true_rating(df,uid,iid):
  """
  - returns the true rating of a user-item pair if it has interacted in the past
  - otherwise returns None
  """

  # filter data to catch interaction
  data=df[(df['user_id']==uid) & (df['song_id']==iid)]

  # if no interaction return None
  if data['play_count'].tolist()==[]:
    print("This pair has not interacted yet")
    return None

  # if interaction return play_count
  else:
    actual_value=data.play_count.tolist()[0]
    print("This pair has interacted before and the true play count is {0}".format(actual_value))
    return actual_value

## **The dataset**

### Loading

In [12]:
# Importing the datasets and:
## - drop duplicated columns in song_df
## - Drop the column 'Unnamed: 0' in count_df

# song_df
song_df=pd.read_csv('/content/drive/MyDrive/MIT/projects/song_data.csv')
d1=song_df.duplicated().sum()
print('The dataframe song_df has been loaded.')
song_df.drop_duplicates(inplace=True)
print('A number of {} duplicates of the value song_id has been succesfully removed'.format(d1))
song_df.loc[song_df['year']==0,'year']=np.nan
print('The value 0 in year column has been replaced by NaN')
print('The dataframe song_df has {} rows and {} columns'.format(song_df.shape[0], song_df.shape[1]))

print('\n' + '-'*35 + '\n')

# count_df
count_df=pd.read_csv('/content/drive/MyDrive/MIT/projects/count_data.csv')
print('The dataframe count_df has been loaded.')
count_df.drop('Unnamed: 0', axis=1, inplace=True)
print('The column "Unnamed: 0" has been dropped')
print('The dataframe count_df has {} rows and {} columns'.format(count_df.shape[0], count_df.shape[1]))

The dataframe song_df has been loaded.
A number of 498 duplicates of the value song_id has been succesfully removed
The value 0 in year column has been replaced by NaN
The dataframe song_df has 999502 rows and 5 columns

-----------------------------------

The dataframe count_df has been loaded.
The column "Unnamed: 0" has been dropped
The dataframe count_df has 2000000 rows and 3 columns


**Observation :** There are 484424 entries (approximately a half of the songs) whose year attribute is 0. To avoid misinterpretation, we replaced these values with 'np.NaN'.

### Cleaning

We can test our function on the column 'song_id':

In [13]:
# check duplicates on 'song_id'
show_duplicates(song_df, 'song_id') # call function previously defined

There are 882 duplicated values in the dataframe   for the columns song_id,   corresponding to 446 different values 



Unnamed: 0,song_id,title,release,artist_name,year
304966,SOAAEFC12AB01852F1,De Tongbreker (Tineke Schouten & Linda de Mol),16 Liedjes Uit De Tineke Schouten Shows,Tineke Schouten,
963681,SOAAEFC12AB01852F1,De Tongbreker,Alle 40 Goed - Hollandse Duetten,Tineke Schouten/Linda De Mol/Franklin Brown,
347698,SOAGUAI12A8C143EAE,A Cruz,Via Brasil vol.1 (Cristal),Tania Maria,
688094,SOAGUAI12A8C143EAE,A Cruz,Tania Maria vol1,Tania Maria,
143863,SOAHLGV12AF72A6DFC,Adam's Song,Enema Of The State,Blink-182,1999.0
...,...,...,...,...,...
615491,SOZUPDR12A8C137FB6,42,Viva La Vida - Prospekt's March Edition,Coldplay,2008.0
126015,SOZWLJH12AF72A15CD,Lover Man,Live at Woodstock,Jimi Hendrix,1971.0
526778,SOZWLJH12AF72A15CD,Lover Man,Live At Woodstock,Jimi Hendrix,1971.0
689237,SOZYUXF12A6701F7B9,Addicted,Greatest Hits,Enrique Iglesias,2003.0


In [14]:
N=song_df.duplicated(subset='song_id').sum() # number of distinct duplicated values

# drop the duplicates
song_df.drop_duplicates(subset='song_id',inplace=True)
print('A number of {} duplicates of the value song_id have been succesfully removed'.format(N))

A number of 446 duplicates of the value song_id have been succesfully removed


In [15]:
# Left merge count_df and song_df on "song_id" (no duplicated value)

df=count_df.merge(song_df, on='song_id', how='left')

# remove rows with NaN value in column  'year'
df.dropna(subset=['year'], inplace=True)

# reset index
df.reset_index(drop=True, inplace=True)

# display for convenience
df.info()
print(df.head(7))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1627878 entries, 0 to 1627877
Data columns (total 7 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1627878 non-null  object 
 1   song_id      1627878 non-null  object 
 2   play_count   1627878 non-null  int64  
 3   title        1627878 non-null  object 
 4   release      1627878 non-null  object 
 5   artist_name  1627878 non-null  object 
 6   year         1627878 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 86.9+ MB
                                    user_id             song_id  play_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0           1   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D           1   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273           1   
4  b80344d063b5ccb3212f76538f3d9e4

We now address the label encoding.

In [16]:
# Apply label encoding for "user_id" and "song_id"

le=LabelEncoder() # introduce label encoder

# create a copy of the original dataframe
## and redefine column values with encoding
df_le=df.copy()
df_le['user_id']=le.fit_transform(df['user_id'])
df_le['song_id']=le.fit_transform(df['song_id'])

### Filtering

Let us first filter the users.

In [17]:
# Get the column containing the users
users = df_le.user_id

# Create a dictionary that maps users(listeners) to the number of songs that they have listened to
playing_count = dict()

for user in users:
    # If we already have the user, just add 1 to their playing count
    if user in playing_count:
        playing_count[user] += 1

    # Otherwise, set their playing count to 1
    else:
        playing_count[user] = 1

In [18]:
# We want our users to have listened at least 90 songs
SONG_COUNT_CUTOFF = 90

# Create a list of users who need to be removed
remove_users = []

for user, num_songs in playing_count.items():

    if num_songs < SONG_COUNT_CUTOFF:
        remove_users.append(user)

df1 = df_le.loc[ ~ df_le.user_id.isin(remove_users)]

Next, we filter the songs.

In [19]:
# Get the column containing the songs
songs = df1.song_id

# Create a dictionary that maps songs to its number of users(listeners)
playing_count = dict()

for song in songs:
    # If we already have the song, just add 1 to their playing count
    if song in playing_count:
        playing_count[song] += 1

    # Otherwise, set their playing count to 1
    else:
        playing_count[song] = 1

In [20]:
# We want our song to be listened by at least 120 users to be considered
LISTENER_COUNT_CUTOFF = 120

remove_songs = []

for song, num_users in playing_count.items():
    if num_users < LISTENER_COUNT_CUTOFF:
        remove_songs.append(song)

df2= df1.loc[ ~ df1.song_id.isin(remove_songs)] # call it df2 for now (df_final is defined in the next cells)
df2.info() # display information for convenience

<class 'pandas.core.frame.DataFrame'>
Index: 62360 entries, 281 to 1627663
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      62360 non-null  int64  
 1   song_id      62360 non-null  int64  
 2   play_count   62360 non-null  int64  
 3   title        62360 non-null  object 
 4   release      62360 non-null  object 
 5   artist_name  62360 non-null  object 
 6   year         62360 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 3.8+ MB


Third filter: treat songs with high playcount (>10)

In [21]:
# show rows of df_final that have more than 10 playcounts
print(df2[df2['play_count']>10].sort_values(by='play_count', ascending=False))

print('This represents {}% of the data'.format(round(df2[df2['play_count']>10].shape[0]/df2.shape[0]*100,2)))

         user_id  song_id  play_count                          title  \
87868      31571      502         271                 You're The One   
294335     32241       80         215  Harder Better Faster Stronger   
291195     62519     6099         194                        Revelry   
294346     32241      972         175                   Learn To Fly   
1323273     3224     5849         156                      Alejandro   
...          ...      ...         ...                            ...   
1022140    23245     5247          11             Sayonara-Nostalgia   
1026615    56035     6099          11                        Revelry   
139063     48379     7127          11                  Una Confusion   
1032990    55049     6099          11                        Revelry   
1626680    71549     1466          11                 Help I'm Alive   

                               release     artist_name    year  
87868               If There Was A Way   Dwight Yoakam  1990.0  
29433

In [22]:
# Replace higher values by putting 10 indifferently

df_final = df2.copy()
df_final.loc[df_final['play_count']>10,'play_count']=10

In [23]:
# Check the shape of the data, number of unique users etc.

print('The dataframe df_final that we are going to work with has {} rows and {} columns'.format(df_final.shape[0], df_final.shape[1]))
print('It consists in {}% of the original data'.format(round(df_final.shape[0]/df.shape[0]*100,2)))
print('\n' + '-'*35 + '\n')
print('The total number of unique user_id is', df_final.user_id.nunique())
print('It consists in', round(df_final.user_id.nunique()/df.user_id.nunique()*100,2), '% of the original set of users')
print('\n' + '-'*35 + '\n')
print('The total number of unique song_id is', df_final.song_id.nunique())
print('It consists in', round(df_final.song_id.nunique()/df.song_id.nunique()*100,2), '% of the original set of songs')
print('\n' + '-'*35 + '\n')
print('The total number of unique artists is', df_final.artist_name.nunique())
print('It consists in', round(df_final.artist_name.nunique()/df.artist_name.nunique()*100,2), '% of the original set of artists')

The dataframe df_final that we are going to work with has 62360 rows and 7 columns
It consists in 3.83% of the original data

-----------------------------------

The total number of unique user_id is 2050
It consists in 2.7 % of the original set of users

-----------------------------------

The total number of unique song_id is 300
It consists in 3.65 % of the original set of songs

-----------------------------------

The total number of unique artists is 141
It consists in 5.35 % of the original set of artists


Most interacted songs and users

In [24]:
# Display the 10 most interacted songs
## and the 10 most interacted users

song_interactions = df_final.song_id.value_counts()

user_interactions = df_final.user_id.value_counts()

print(song_interactions.head(10))
print('\n' + '-'*35 + '\n')

print(user_interactions.head(10))

song_id
7079    603
4516    555
914     531
3395    531
6099    507
5082    492
3625    491
576     479
5171    475
253     445
Name: count, dtype: int64

-----------------------------------

user_id
61240    176
3224     170
15669    165
36906    162
26509    153
42874    153
62519    149
9065     149
23245    148
61233    147
Name: count, dtype: int64


## A numerical feature for songs: analysis of the 'date' variable

In [25]:
song_df_used= df_final[['song_id','title','artist_name','release','year']].drop_duplicates().set_index('song_id')
song_df_used

Unnamed: 0_level_0,title,artist_name,release,year
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
80,Harder Better Faster Stronger,Daft Punk,Discovery,2007.0
156,Rorol,Octopus Project,Identification Parade,2002.0
589,Monkey Man,Amy Winehouse,You Know I'm No Good,2007.0
631,Love,Simian Mobile Disco,Attack Decay Sustain Release,2007.0
914,Clocks,Coldplay,A Rush Of Blood To The Head,2002.0
...,...,...,...,...
5946,Master Of Puppets,Metallica,Master Of Puppets,1986.0
8183,Greece 2000,Three Drives,Greece 2000,1997.0
1991,Señorita,Justin Timberlake,Justified,2002.0
7557,Sala De Recepção,Cartola,O Mundo E Um Moinho,1976.0


Songs released on yearly basis

In [26]:
# Find out the number of songs released in a year, use the songs_df
  # Hint: Use groupby function on the 'year' column

released_counts = song_df_used.groupby('year').count().song_id.rename('songs_released')

# drop first row
released_counts.drop(released_counts.index[0], inplace=True)
released_counts

AttributeError: 'DataFrame' object has no attribute 'song_id'

In [None]:
# Set the figure size
plt.figure(figsize=(12,5))

# Create a barplot plot with y label as "number of titles played" and x -axis year
sns.barplot(x=released_counts.index, y=released_counts.values)

# Set the y label of the plot
plt.ylabel('Number of titles played')

# Set the x label of the plot
plt.xlabel('Year')

# Display fewer labels for readability
ax = plt.gca()
ax.set_xticks(ax.get_xticks()[::7])  # Show every 10th label
plt.xticks(rotation=45)  # Rotate for better readability

# Show the plot
plt.show()

In [None]:
# regplot of play_count vs year
sns.regplot(x='year', y='play_count', data=df_final)

## Main model

### **Popularity-Based Recommendation Systems**

Let's take the count and sum of play counts of the songs and build the popularity recommendation systems based on the sum of play counts.

In [None]:
# Calculating average play_count using groupby function on the song_id column
songs_grouped=df_final.groupby(['song_id','title']).agg(average_play_count=('play_count','mean')).sort_values(by='average_play_count', ascending=False)

# Calculating the frequency a song is played using groupby and agg()
songs_grouped_freq=df_final.groupby(['song_id','title']).agg(total_count=('play_count','sum')).sort_values(by='total_count', ascending=False)

# Add frequency column.
## The frequency is understood here as f=(total count of given song)/(total count of all songs)
songs_grouped_freq[['frequency']]=(songs_grouped_freq[['total_count']]/songs_grouped_freq['total_count'].sum())

print('Total number of songs :' + str(df_final.shape[0]) + '\n')
print('Total number of plays (counting repetitions): ' + str(songs_grouped_freq['total_count'].sum()) + '\n')
print('\n'+'-'*45 + '\n')
print('Songs grouped with average playcount: \n\n',songs_grouped)
print('\n'+'-'*45 + '\n')
print('Songs grouped with frequency: \n\n',songs_grouped_freq)

In [None]:
 # Making a dataframe 'final_play' with the average_count, play_freq (and year)

final_play = songs_grouped.merge(songs_grouped_freq, on=['song_id','title'])

# add the number of interactions for later use
final_play = final_play.merge(song_interactions.rename('nb_interactions'), on='song_id')

# add the year for later use
final_play = final_play.merge(song_df_used[['song_id','year']], on='song_id',how='left')

# re-ordering columns for readability
final_play = final_play[['song_id','average_play_count','frequency','nb_interactions','total_count','year']]
final_play.set_index('song_id', inplace=True)
final_play

In [None]:
# Recommend top 10 songs using get_top function

top10=get_top(df_final)

# add song information for convenience and readability
top10_with_info=top10.merge(song_df_used, on='song_id', how='left').set_index('song_id')
top10_with_info

### Model Based Collaborative Filtering - Matrix Factorization

In [69]:
# Instantiating Reader scale with expected rating scale
 #use rating scale (0, 10)
reader = Reader(rating_scale=(0,10))

# Loading the dataset
 # Take only "user_id","song_id", and "play_count"
data=Dataset.load_from_df(df_final[['user_id','song_id','play_count']], reader)

# Splitting the data into train and test dataset
 # Take test_size = 0.4, random_state = 42
trainset, testset = train_test_split(data, test_size=0.4, random_state=42)

#### Baseline SVD

We use **latent features** to find recommendations for each user:
L_ij= f_latent(u_i,u_j)

In [70]:
# Build baseline model using svd
svd = SVD(random_state=1)

# Fit the model
svd.fit(trainset)

# Evaluate the model performance
precision_recall_at_k(svd,testset)

RMSE: 1.9581
Precision:  0.446
Recall:  0.736
F_1 score:  0.555


#### SVD optimized

We tune hyperparameters to improve performance.

In [None]:
# Set the parameter space to tune and include default values
grid_params = {'n_epochs': [5, 10],
               'lr_all': [0.002, 0.005],
               'reg_all': [0.4, 0.6]}

# Perform 3-fold grid-search cross-validation
gs_svd=GridSearchCV(SVD, grid_params, measures=['rmse'], cv=3)

# Fitting data
gs_svd.fit(data);

# Best RMSE score
print('The best RMSE score is: \n',
      gs_svd.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
best_svd=gs_svd.best_params['rmse']
print('It is obtained for the following combination of parameters :\n',best_svd)

In [None]:
# Building the optimized SVD model using optimal hyperparameters
svd_optimized = SVD(**best_svd, random_state=1)

# Fit the model
svd_optimized.fit(trainset)

# Evaluate the model performance
precision_recall_at_k(svd_optimized, testset)

In [None]:
# Select a user at random
uid = np.random.choice(df_final['user_id'].unique())
print('The user selected has user_id :', uid)
print('It has interacted with {} songs'.format(user_interactions[uid]))

In [None]:
song_df_used

In [None]:
# Getting top 5 recommendations for user uid using "svd_optimized" algorithm
n=5
top_svd=pd.DataFrame(get_recommendations(df_final, uid, n, svd_optimized),columns=['song_id','predicted_play_count'])
top_svd.set_index('song_id', inplace=True)


# add song information for convenience and readability
top_svd.merge(song_df_used, on='song_id', how='left')

### Content-based meets SVD: improving matrix factorization

We are going to test a model of the form
  L_ij = f_obs(xj) + f_latent(u_i,v_j)

where x_j is the release year.

Here
- f_obs is obtained by linear regression
- f_latent is the SVD of the remainder matrix Ldiff_ij


#### Linear regression

In [28]:
# introduce a linear regression object
lreg=LinearRegression()

# set 'year' as the rescaled independent variable
X=df_final[['year']]

# set 'average_playcount' as the output variable
y=df_final[['play_count']]

In [34]:
# fit the model
lreg.fit(X,y)

# predict the output
yhat=lreg.predict(X)

# compute intercept, coefficient, mse, r2
mse=mean_squared_error(y,yhat)
r2=r2_score(y,yhat)
b0=float(lreg.intercept_)
b1=float(lreg.coef_)

# print values
print('Intercept: ', b0)
print('Coefficient: ', b1)
print('Mean squared error: ', mse)
print('R squared: ', r2)

Intercept:  50.302736436911054
Coefficient:  -0.023958443333568606
Mean squared error:  4.745871757199367
R squared:  0.0053397733750065735


In [36]:
# correct playcount for (b0,b1)
L_diff['play_count'] = L['play_count'] - b0 - b1*df_final['year']

m1=df_diff.play_count.min()
m2=df_diff.play_count.max()

print('The minimum and maximum values for play_count are {} and {}'.format(m1, m2))

The minimum and maximum values for play_count are -2.032727739780192 and 7.853734663561845


#### Applying SVD to Ldiff

In [103]:
# create lambda function that maps song_id to year
Year = lambda x: song_df_used.loc[x, 'year']

for iid in song_df_used.index[:9]:
  print(iid, Year(iid))
#  L_diff.loc[iid, 'year'] = Year(iid)

print(Year([80,156]))

80 2007.0
156 2002.0
589 2007.0
631 2007.0
914 2002.0
1026 2008.0
1194 2006.0
1375 2005.0
1485 2009.0
song_id
80     2007.0
156    2002.0
Name: year, dtype: float64


In [112]:
# Extract the raw ratings
raw = data.raw_ratings.copy()
Raw=pd.DataFrame(raw, columns=['user_id', 'song_id', 'play_count', 'timestamp'])
Raw.drop('timestamp', axis=1, inplace=True)

# correct  playcount for b0,b1
new_value= Raw['play_count'] - b0
new_value2=Year(Raw['song_id'])
new_value2

#Raw['play_count'] = b1*Year(Raw['song_id'])
#Raw
# Create a new Dataset object from the raw ratings
#reader = Reader(rating_scale=(m1, m2))  # Use your rating scale here
#data_diff = Dataset.load_from_df(RAW, reader)
#Dataset.load_from_df(raw_diff, reader)

Unnamed: 0_level_0,year
song_id,Unnamed: 1_level_1
80,2007.0
156,2002.0
589,2007.0
631,2007.0
914,2002.0
...,...
6943,2009.0
7300,2009.0
7490,2006.0
7557,1976.0


In [None]:
# prompt: in data_diff, replace playcount by playcount - f(song_id) for a certain map f, so as to have again a dataset autofold

def f(intercept, coeff, song_id):
  return song_id: intercept - coeff*song_id

fobs = fobs(b0, b1)

# Iterate over the data and apply the transformation
for user, song, rating in data_diff.raw_ratings:
    data_diff.raw_ratings['user_id', 'song_id', 'play_count'] = rating - f(song)

# Rebuild the Dataset object
data_diff = Dataset.load_from_raw_ratings(data_diff.raw_ratings, reader)


In [None]:
for user, song, rating, _ in data.raw_ratings:
    data_diff

#data.raw_ratings

In [73]:
# store predictions of baseline model 'svd'
svd.fit(trainset)

predictions_svd=svd.test(testset)

# Evaluate the model performance
metrics_k(predictions=predictions_svd)

Precision:  0.178
Recall:  0.13
F_1 score:  0.15


In [48]:
predictions = svd.test(testset)
predictions[:7]

[Prediction(uid=39287, iid=7999, r_ui=-0.5056419864416881, est=1.4066458274004447, details={'was_impossible': False}),
 Prediction(uid=62061, iid=951, r_ui=-1.290015996439564, est=-1.5144696845185237, details={'was_impossible': False}),
 Prediction(uid=68903, iid=3922, r_ui=-0.3379328831067028, est=1.6512033109246644, details={'was_impossible': False}),
 Prediction(uid=16734, iid=1026, r_ui=0.8058177768947061, est=-0.10908126543572766, details={'was_impossible': False}),
 Prediction(uid=36906, iid=3782, r_ui=-0.17022377977172454, est=0.6795087053772468, details={'was_impossible': False}),
 Prediction(uid=7238, iid=5068, r_ui=-1.1462653364381552, est=-0.026926084877099077, details={'was_impossible': False}),
 Prediction(uid=47416, iid=6846, r_ui=-1.3379328831067028, est=-0.612570106719276, details={'was_impossible': False})]

In [63]:
def compute_rmse(predictions):
    l=[]
    for uid, iid, true_r, est, _ in predictions:
      l.append((true_r- est)**2)
    return np.sqrt(sum(l)/len(l))

compute_rmse(predictions)


1.9583806690198282

In [67]:
def metrics_k(predictions, k=30, threshold=1.5):
    """
    Returns precision and recall at k metrics for each user
    """

    # First map the predictions to each user.
    user_est_true = defaultdict(list)

    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, playing_count in user_est_true.items():

        # Sort play count by estimated value
        playing_count.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in playing_count)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in playing_count[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in playing_count[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set Precision to 0 when n_rec_k is 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set Recall to 0 when n_rel is 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    #Mean of all the predicted precisions are calculated.
    precision = round((sum(prec for prec in precisions.values()) / len(precisions)),3)
    #Mean of all the predicted recalls are calculated.
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)),3)

    compute_rmse(predictions)
    print('Precision: ', precision) #Command to print the overall precision
    print('Recall: ', recall) #Command to print the overall recall
    print('F_1 score: ', round((2*precision*recall)/(precision+recall),3)) # Formula to compute the F-1 score.

In [68]:
predictions=svd.test(testset)
metrics_k(predictions=predictions)

Precision:  0.178
Recall:  0.13
F_1 score:  0.15


# Report


The final submission should focus on the key takeaways from the project. This is the final proposal to solve the problem that a business leader or a decision-maker should consume.

This final report should have three key parts

- Executive Summary - What are the most important findings from the analysis done in Milestone? Describe the final proposed model specifications?

- Problem and solution summary - Provide the summary of the problem?  State the reasons for the proposed solution design? How it would affect the problem/business?

- Recommendations for implementation - What are some key recommendations to implement the solution? What are the key actionables for stakeholders? What is the expected benefit and/or costs (List the benefits of the solution. State some rational assumptions to put forward some numbers on costs/benefits for stakeholders)? What are the potential risks or challenges of the proposed solution design? What further analysis needs to be done or what other associated problems need to be solved?


## Executive summary

## Problem and solution summary

## Recommendations for implementation