In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import pymssql
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
#Allows me to see all of the columns

pd.set_option('display.max_columns', None)

# Connect to SQL Database

In [8]:
database = 'landodatalakes-group4'
Track_table = 'dbo.Track'
Artist_table = 'dbo.Artist'
PlaylistTrack_table = 'dbo.PlaylistTrack'
Playlist_table = 'dbo.Playlist'

user = 'spotify'
password  = 'T35TPA55W0RD!'
server = 'gen10-data-fundamentals-22-07-sql-server.database.windows.net'

In [9]:
conn = pymssql.connect(server,user,password,database)
cursor = conn.cursor()


# Get the Tracks Table 

In [10]:
#Select the tracks table and join with the artists table to get genres
Tracks = pd.read_sql(f'SELECT Track.TrackID, TrackName, Popularity,DurationMS, ReleaseYear,\
    Danceability,Energy,MusicalKey,Loudness,Mode, Speechiness,Acousticness,\
        Instrumentalness, Liveness, Valence,Tempo,TimeSignature,Genres FROM {Track_table} INNER JOIN {Artist_table} ON Artist.ArtistID = Track.ArtistID', conn)



# Get the PlaylistTracks table

In [11]:
# Select all from PlaylistTrack table
Playlist_Tracks = pd.read_sql(f'SELECT * FROM {PlaylistTrack_table}' ,conn)



In [12]:
# Select the playlist 
Playlist_Tracks.head()

Unnamed: 0,PlaylistID,TrackID
0,433,04u0BTFwWhVIeRWSUe7Jr3
1,473,11aQvMQ8Rk0KWcSLd3lkVq
2,2053,2z2LNks0FO7Q4tWmZrwbgH
3,11702,1fDjCFlAuTWNSfPueyATTX
4,5180,0u56clbegXVe6daER5ZHm4


# Get the Playlist Table

In [13]:
Playlist = pd.read_sql(f'SELECT * FROM {Playlist_table}' ,conn)



# Functions to create content based cosine similarity model

In [14]:
#Function to scale down popularity by the number of groups you'd like (popularity ranges from 0-100)
def scale_popularity(df,number_of_buckets):
    df['Popularity-Grouped'] = df['Popularity'].apply(lambda x: int(x/number_of_buckets))

    return df

In [15]:
#Function to create dummies for columns that we want to represent as categorical

def get_dummies_for_column(df, current_column_name, new_name):   
    
    tf_df = pd.get_dummies(df[current_column_name])
    feature_names = tf_df.columns
    tf_df.columns = [new_name + "|" + str(i) for i in feature_names]
    tf_df.reset_index(drop = True, inplace = True)    
    return tf_df


In [16]:
#Function to build the entire feature set for all of the tracks

def create_feature_set(df):

    # Convert the dataframe into a pivot table with only track id and genres
    df_1 = df[['TrackID','Genres']]
    df_1['Values'] = 1

    pivot = df_1.pivot_table(index='TrackID', #these are the rows
                                columns='Genres', #these are the columns
                                values='Values') #bianary if the Genre is a characteristic of that song (0 = no, 1 = yes)
    pivot.fillna(0, inplace=True)
    genre_df = pd.DataFrame(pivot)

    #For each column rename it to genre|genre-name

    genre_df.columns = ['genre' + "|" + i for i in genre_df.columns]
    genre_df.reset_index(drop = True, inplace=True)

    #scale float columns using the minmaxscalar
    float_cols = df.dtypes[(df.dtypes == 'float64')].index.values
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) 
    
    # Scale down features because they are not as relevant to recommending a song as other features
    df = scale_popularity(df, 5)

    ReleaseYear_scaled = get_dummies_for_column(df, 'ReleaseYear','ReleaseYear') * 0.5 #Scale to 0.5
    TimeSignature_scaled = get_dummies_for_column(df, 'TimeSignature','TimeSignature') * 0.3 #Scale to 1/3 of its value
    Mode_scaled = get_dummies_for_column(df, 'Mode','Mode') * 0.3 #Scale to 1/3 of its value
    MusicalKey_scaled = get_dummies_for_column(df, 'MusicalKey','MusicalKey') * 0.3 #Scale to 1/3 of its value
    Popularity_scaled = get_dummies_for_column(df, 'Popularity-Grouped', 'Popularity-Grouped') * 0.15 #Scale to 0.15

    #concanenate all features
    final = pd.concat([genre_df, floats_scaled,ReleaseYear_scaled,TimeSignature_scaled, Mode_scaled,MusicalKey_scaled, Popularity_scaled], axis = 1)
     
    #Add the TrackID and fill in any na values with zeros
    final['TrackID']=df['TrackID'].values
    final.fillna(0, inplace=True)
    
    return final

In [17]:
#Function to create an eigenvector for the playlist and also get the vectors for all tracks not on the playlist

def generate_playlist_feature(complete_feature_set, playlist_df):

    
    #Get the track features for the songs that are on the playlist
    complete_feature_set_playlist = complete_feature_set[complete_feature_set['TrackID'].isin(playlist_df['TrackID'].values)]
    complete_feature_set_playlist = complete_feature_set_playlist.merge(playlist_df[['TrackID']], how = 'inner', left_on = 'TrackID', right_on='TrackID')

    #Get the track features that are not on someone's playlist
    complete_feature_set_nonplaylist = complete_feature_set[~complete_feature_set['TrackID'].isin(playlist_df['TrackID'].values)]

    #Drop TrackID 
    complete_feature_set_playlist.drop(columns=['TrackID'],inplace=True)

    #Sum the feature playlist features, since it's technically an eigenvector
    return complete_feature_set_playlist.sum(axis = 0), complete_feature_set_nonplaylist

In [18]:
#Function that uses cosine similarity to generate recommendations

def generate_playlist_recommendations(df, features, nonplaylist_features):

    non_playlist_df = df[df['TrackID'].isin(nonplaylist_features['TrackID'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('TrackID', axis = 1).values, features.values.reshape(1, -1))[:,0]
    non_playlist_df_top_10 = non_playlist_df.sort_values('sim',ascending = False).head(10)
    return non_playlist_df_top_10


In [19]:
# Combines all the function and only has an input of playlist number

def display_recommendations(playlist_number):
    playlist_selection= Playlist_Tracks[Playlist_Tracks['PlaylistID']==playlist_number]
    complete_feature_set = create_feature_set(Tracks)
    complete_feature_set_playlist_vector, complete_feature_set_nonplaylist = generate_playlist_feature(complete_feature_set, playlist_selection)
    top_10_recoomendations= generate_playlist_recommendations(Tracks, complete_feature_set_playlist_vector, complete_feature_set_nonplaylist)
    return top_10_recoomendations


# Select Playlist Number and Get Recommendations

In [20]:
Playlist

Unnamed: 0,PlaylistID,UserID,PlaylistTitle
0,14,7,Rock 'N Roll
1,5017,1592,The Vangelis Playlist
2,2404,842,"grunge , garage and graffiti"
3,16,9,Feel The Machine
4,16244,6113,много хорошей музыки
...,...,...,...
2627,4957,1571,1
2628,4960,1574,rock bottom
2629,5004,1583,Fey-Folken’s playlist
2630,5005,1584,Payne's Awesome Playlist


In [21]:
# Select Desired Playlist Number

playlist_number = 4960

display_recommendations(playlist_number)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Values'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_playlist_df['sim'] = cosine_similarity(nonplaylist_features.drop('TrackID', axis = 1).values, features.values.reshape(1, -1))[:,0]


Unnamed: 0,TrackID,TrackName,Popularity,DurationMS,ReleaseYear,Danceability,Energy,MusicalKey,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,TimeSignature,Genres,Popularity-Grouped,sim
8280,1JcGNoiwifg0MdJMVgJQYx,Get Free,63,126573,2002,0.502,0.978,7,-3.278,1,0.0649,0.000163,1e-06,0.287,0.603,138.811005,4,alternative rock,12,0.919833
8253,22yk17jzpiaWLvCQxA3U0A,Spokesman,51,152773,2002,0.541,0.903,4,-5.903,1,0.0526,0.000147,0.0,0.25,0.643,102.442001,4,pop punk,10,0.914828
8212,11I3KKHv7g2pXiuqRDabRA,Give Me Novacaine / She's a Rebel,58,326200,2004,0.312,0.949,9,-3.533,1,0.0784,0.00185,1.8e-05,0.0569,0.65,154.794998,4,permanent wave,11,0.911863
8288,1u0YEo1iVcper7ok5kJ6Ii,Drogado,27,214960,2004,0.604,0.873,5,-4.94,1,0.0461,0.0277,0.0,0.106,0.871,176.975998,4,argentine rock,5,0.910832
8231,1O7ZpUgCW71ss5fZIfgp5u,Mas Es Amar (Sad Eyes),33,253600,1999,0.633,0.906,3,-3.891,1,0.0372,0.137,2.7e-05,0.307,0.566,114.987999,4,dance pop,6,0.910441
8233,6ns2oVPYMdlftelbSHlzvO,Finally Found You,48,220920,2014,0.622,0.886,4,-4.828,1,0.0883,0.0107,0.0,0.354,0.603,127.969002,4,dance pop,9,0.909199
8205,10lT3pp9QERGOWiIzLx4We,Jesus of Suburbia,58,548253,2004,0.3,0.929,1,-3.214,1,0.0636,0.000259,2e-06,0.329,0.59,147.115997,4,permanent wave,11,0.907728
8232,0ds5D8mJ5K2jOxCdKFJHEb,Sad Eyes,35,248027,1999,0.604,0.895,3,-4.852,1,0.0302,0.135,7.8e-05,0.369,0.616,114.751999,4,dance pop,7,0.9072
8236,2QyGSbGdT9suKuDKwnjwH9,The Touch,51,236560,1985,0.594,0.919,2,-6.387,1,0.0572,0.0441,9e-06,0.178,0.529,126.57,4,melodic hard rock,10,0.904636
8200,49NGNNggN0R8OmsedNGY0g,Walking Alone,43,165013,1997,0.566,0.863,0,-3.415,1,0.0302,0.000307,0.00376,0.274,0.799,112.888,4,permanent wave,8,0.903187
