## Load Libriries

In [30]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

import os
import sys

### Load dataset

In [16]:
df = pd.read_csv('data/df_prepare.csv')

In [17]:
df.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,2,0.346,-1.828,1,0.0525,166.969,4,0.814
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,5,0.151,-5.559,0,0.0868,174.003,4,0.816
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,2,0.103,-13.879,0,0.0362,99.488,5,0.368
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,2,0.0985,-12.178,1,0.0395,171.758,4,0.227
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,5,0.202,-21.15,1,0.0456,140.576,4,0.39


In [78]:
df.artist_name.value_counts()[:10]

Giuseppe Verdi             1312
Giacomo Puccini            1095
Kimbo Children's Music      971
Wolfgang Amadeus Mozart     800
Richard Wagner              778
Nobuo Uematsu               773
Juice Music                 684
Georges Bizet               677
Randy Newman                667
Johann Sebastian Bach       646
Name: artist_name, dtype: int64

In [77]:
df.dtypes

artist_name          object
track_name           object
track_id             object
popularity             int8
acousticness        float16
danceability        float16
duration_ms           int32
energy              float16
instrumentalness    float16
key                    int8
liveness            float16
loudness            float16
mode                   int8
speechiness         float16
tempo               float16
time_signature         int8
valence             float16
dtype: object

In [95]:
df['danceability'].unique().sum()

514.0

### Create reduce memory function

In [18]:
def reduce_mem_usage(df, verbose=True):
    """ Function iterates through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        Credit to: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
        Parameters
        ----------
        df : Pandas DataFrame
        verbose: (True) by default, prints out before and after memory usage
        Returns
        -------
        df : Reduced Memory Pandas DataFrame
    """

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(
            100 * (start_mem - end_mem) / start_mem))

    return df

### Reduce memory

In [19]:
df = reduce_mem_usage(df)  # Reduces Memory Usage

Memory usage of dataframe is 24.75 MB
Memory usage after optimization is: 9.97 MB
Decreased by 59.7%


In [20]:
df.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61084,0.388916,99373,0.910156,0.0,2,0.345947,-1.828125,1,0.05249,167.0,4,0.813965
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.245972,0.589844,137373,0.736816,0.0,5,0.151001,-5.558594,0,0.086792,174.0,4,0.815918
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952148,0.663086,170267,0.130981,0.0,2,0.103027,-13.882812,0,0.036194,99.5,5,0.36792
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703125,0.23999,152427,0.325928,0.0,2,0.098511,-12.179688,1,0.03949,171.75,4,0.227051
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.950195,0.331055,82625,0.224976,0.122986,5,0.202026,-21.15625,1,0.045593,140.625,4,0.389893


### Wrangle the data

In [21]:
def wrangle(df):

    # Set to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Drop Duplicates
    df = df.drop_duplicates(keep='first')
    
    # isolating values with a high level of 'speechiness' as they are generally not music but other types of recordings.
    # converting unwanted values to null then removing those rows
    df['speechiness'] = df['speechiness'].where(df['speechiness'] < .66)
    
    # Drop missing values
    df = df.dropna()

    return df

In [22]:
df.shape

(180223, 17)

## Create Model

In [23]:
# wrangle data
wrangled = wrangle(df)

In [26]:
# print data shape after wrangling
wrangled.shape

(170867, 17)

In [27]:
wrangled.head()

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61084,0.388916,99373,0.910156,0.0,2,0.345947,-1.828125,1,0.05249,167.0,4,0.813965
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.245972,0.589844,137373,0.736816,0.0,5,0.151001,-5.558594,0,0.086792,174.0,4,0.815918
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952148,0.663086,170267,0.130981,0.0,2,0.103027,-13.882812,0,0.036194,99.5,5,0.36792
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703125,0.23999,152427,0.325928,0.0,2,0.098511,-12.179688,1,0.03949,171.75,4,0.227051
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.950195,0.331055,82625,0.224976,0.122986,5,0.202026,-21.15625,1,0.045593,140.625,4,0.389893


In [28]:
# select numerical features, drop string columns
wrangled_numeric = wrangled.drop(columns= ['artist_name', 'track_name', 'track_id'])

In [31]:
# Apply Standard Scaler
# Create a numpy array where the numerical features are scaled to a mean of 0 and a standard deviation of 1.
wrangled_scaled = StandardScaler().fit_transform(wrangled_numeric)

In [32]:
# Apply the model
tree_model = KDTree(wrangled_scaled)

## Pickle Model

In [35]:
model_path = 'models/kdtree_model.joblib'

In [36]:
dump(tree_model, model_path)

['models/kdtree_model.joblib']

### Get nearest neighbors distance and indices

In [37]:
# Query the KDTree model for k nearest neighbors.
# (Alternatively) we can use query_radius
# Check sklearn docs 
# https://scikit-learn.org/stable/modules/generated/
# sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
num_of_songs=5
dist, indx = tree_model.query(wrangled_scaled, k=num_of_songs+1)

### Check distance of k=5 similar songs

In [38]:
# Create column names first
column_names = ['Searched_Song']
for i in range(1, num_of_songs+1):
    column_names.append(f'Nearest_Song_{i}')
    
# Calculate scores from distances
scores = (1 - ((dist - dist.min()) / (dist.max() - dist.min()))) * 100
dist_df = pd.DataFrame(scores.tolist(), columns=column_names)

# Limit decimal place output to 1
dist_df = dist_df.round(decimals=1)

# Replace Searched Song score with the track_id
dist_df['Searched_Song'] = df.track_id

# Print the shape of the dataframe
print(f'df.shape = {dist_df.shape}')

# Check distances of similar top 10 nearest songs
dist_df.head()

df.shape = (170867, 6)


Unnamed: 0,Searched_Song,Nearest_Song_1,Nearest_Song_2,Nearest_Song_3,Nearest_Song_4,Nearest_Song_5
0,0BRjO6ga9RKCKjfDqeFgWV,83.8,83.7,83.6,83.4,83.2
1,0BjC1NfoEOOusryehmNudP,90.2,88.3,87.0,86.1,86.1
2,0CoSDzoNIKCRs124s9uTVy,84.5,84.2,82.7,82.0,81.1
3,0Gc6TVm52BwZD07Ki6tIvf,89.5,88.8,88.7,88.6,88.5
4,0IuslXpMROHdEPvSl1fTQK,91.9,91.3,89.2,88.5,88.2


### Save Similar Songs

In [40]:
# Making an array of the Track IDs
track_ids = np.array(df.track_id)

# A function that creates list of the each song with its nearest neighbors
def find_similars(track_ids, indxs):
    """For each track in the dataset find similar songs"""
    similars = []
    for row in indx:
        ids = [track_ids[i] for i in row]
        similars.append(ids)
    return similars

# The first item in each row is a song in the original database.  
# The next 5 rows are its closest neighbors in the KDTree.
nearest_neighbors = find_similars(track_ids, indx)

# Convert it to dataframe
nearest_neighbors_df = pd.DataFrame(nearest_neighbors, columns=column_names)
nearest_neighbors_df.head()

Unnamed: 0,Searched_Song,Nearest_Song_1,Nearest_Song_2,Nearest_Song_3,Nearest_Song_4,Nearest_Song_5
0,0BRjO6ga9RKCKjfDqeFgWV,0E5vyUbKoQzGtYVUSSLTNY,5vnuEyEfl4mEJO0Gc0n0Oz,1HeBGMouZrkglUFdtLwGuN,1mz9ZrRYu3EPVg9ZHFtjjf,5DEoPzI4ZzHVbMJwPR9VKF
1,0BjC1NfoEOOusryehmNudP,6BgDdlr4mdML2ws4DUE7ao,3l44IqBPrUx2KNv09bqumH,7m7Ubq8L3QGK5bRFfdmoqf,5CXBgDA0UEIpgXUxEtuLcA,2kM3ckYysOoQJ37vpHLe9H
2,0CoSDzoNIKCRs124s9uTVy,1GBgNFmIOCFvCvx3vCK107,2EophOpSBiQkq8tKm5Mva0,3fEk4Tw7LWo3TMR8KnJqnI,2PphKjc6zTMqkGBZ6YFAbQ,37RAmgIV3dMKQL5sgBykq3
3,0Gc6TVm52BwZD07Ki6tIvf,6OpiPBmgBTTzxVwLyTrmiV,06HhlTB8s3dTVAGYxgQqIC,6Xxn6VjtGKrPyOOJVqGKPk,3g77KoEs1P5WurlrMWGIV6,4KtHxcTlIXEKinToSNzPPO
4,0IuslXpMROHdEPvSl1fTQK,76XU74fn8ePn7Qqb2yelfA,5CQHU35O0vIjjIragKMXSz,3uFGK2sp47ltsVXQsGItkX,1bywY09aQZ61pz8bTBBl2F,5Ku5VjYhM5G4Oybf0vdzMm


In [41]:
nearest_neighbors_df.shape

(170867, 6)

### Drop duplicates

In [42]:
nearest_neighbors_df[nearest_neighbors_df.duplicated()].sample(1, random_state=42)

Unnamed: 0,Searched_Song,Nearest_Song_1,Nearest_Song_2,Nearest_Song_3,Nearest_Song_4,Nearest_Song_5
70786,632gDlnZ9q07F41jDQmHJ7,6lUY6MoqGgPnA27PHYxem5,5UfJ752VKXn9B6r4fVSQb9,5p3NP9mQblDsVJLdfNZJ3I,5jsDxDkJ1PqyYUWhDMr86B,6o2g1BJvtYQssH84kBYs7y


In [43]:
nearest_neighbors_df.query('Searched_Song == "632gDlnZ9q07F41jDQmHJ7"')

Unnamed: 0,Searched_Song,Nearest_Song_1,Nearest_Song_2,Nearest_Song_3,Nearest_Song_4,Nearest_Song_5
1434,632gDlnZ9q07F41jDQmHJ7,6lUY6MoqGgPnA27PHYxem5,5UfJ752VKXn9B6r4fVSQb9,5p3NP9mQblDsVJLdfNZJ3I,5jsDxDkJ1PqyYUWhDMr86B,6o2g1BJvtYQssH84kBYs7y
70786,632gDlnZ9q07F41jDQmHJ7,6lUY6MoqGgPnA27PHYxem5,5UfJ752VKXn9B6r4fVSQb9,5p3NP9mQblDsVJLdfNZJ3I,5jsDxDkJ1PqyYUWhDMr86B,6o2g1BJvtYQssH84kBYs7y


In [44]:
final_nearest_neighbors_df = nearest_neighbors_df.drop_duplicates(subset=['Searched_Song'], keep='first')
final_nearest_neighbors_df.shape

(170780, 6)

### Save it as csv file

In [45]:
def save_data_frame_as_csv(df=None, filename=None):
    """
    Saves data frame to csv format
    
    Parameters
    ----------
    df: Pandas DataFrame
    filename: File path or name
    
    Returns
    -------
    csv file
    """
    try:
        if not filename.endswith('.csv'):
            filename += '.csv'
        df.to_csv(filename, index=False)
        print(f"Data Frame saved @:{filename}")
    except Exception as e:
        print("Data Frame couldn't be saved: ", sys.exc_info()[0])
        raise

In [47]:
save_data_frame_as_csv(final_nearest_neighbors_df, 'data/spotify_dataset_recommendations.csv')

Data Frame saved @:data/spotify_dataset_recommendations.csv


### Save it as JSON file

In [49]:
def save_data_frame_as_json(df=None, filename=None, orient="records"):
    """
    Saves data frame to JSON format
    
    Parameters
    ----------
    df: Pandas DataFrame
    filename: File path or name
    
    Returns
    -------
    JSON file
    """
    try:
        if not filename.endswith('.json'):
            filename += '.json'
        df.to_json(filename, orient=orient)
        print(f"Data Frame saved @:{filename}")
    except Exception as e:
        print("Data Frame couldn't be saved: ", sys.exc_info()[0])
        raise

In [50]:
save_data_frame_as_json(final_nearest_neighbors_df, 'data/spotify_dataset_recommendations.json')

Data Frame saved @:data/spotify_dataset_recommendations.json


In [51]:
json_df = pd.read_json('data/spotify_dataset_recommendations.json')
json_df.head()

Unnamed: 0,Searched_Song,Nearest_Song_1,Nearest_Song_2,Nearest_Song_3,Nearest_Song_4,Nearest_Song_5
0,0BRjO6ga9RKCKjfDqeFgWV,0E5vyUbKoQzGtYVUSSLTNY,5vnuEyEfl4mEJO0Gc0n0Oz,1HeBGMouZrkglUFdtLwGuN,1mz9ZrRYu3EPVg9ZHFtjjf,5DEoPzI4ZzHVbMJwPR9VKF
1,0BjC1NfoEOOusryehmNudP,6BgDdlr4mdML2ws4DUE7ao,3l44IqBPrUx2KNv09bqumH,7m7Ubq8L3QGK5bRFfdmoqf,5CXBgDA0UEIpgXUxEtuLcA,2kM3ckYysOoQJ37vpHLe9H
2,0CoSDzoNIKCRs124s9uTVy,1GBgNFmIOCFvCvx3vCK107,2EophOpSBiQkq8tKm5Mva0,3fEk4Tw7LWo3TMR8KnJqnI,2PphKjc6zTMqkGBZ6YFAbQ,37RAmgIV3dMKQL5sgBykq3
3,0Gc6TVm52BwZD07Ki6tIvf,6OpiPBmgBTTzxVwLyTrmiV,06HhlTB8s3dTVAGYxgQqIC,6Xxn6VjtGKrPyOOJVqGKPk,3g77KoEs1P5WurlrMWGIV6,4KtHxcTlIXEKinToSNzPPO
4,0IuslXpMROHdEPvSl1fTQK,76XU74fn8ePn7Qqb2yelfA,5CQHU35O0vIjjIragKMXSz,3uFGK2sp47ltsVXQsGItkX,1bywY09aQZ61pz8bTBBl2F,5Ku5VjYhM5G4Oybf0vdzMm


In [52]:
def save_data_as_sqlite_database(df, sql_uri, table_name, index=True, index_label=None, verbose=True):
    """
    Saves data frame to SQLite database
    
    Parameters
    ----------
    df: Pandas DataFrame
    filename: File path or name
    
    Returns
    -------
    db.sqlite3
    """
    try:
        if not sql_uri.endswith('.sqlite3'):
            sql_uri += '.sqlite3'
        if not sql_uri.startswith('sqlite:///'):
            raise ValueError('Database URL should start with "sqlite:///"')
        
        from sqlalchemy import create_engine
        engine = create_engine(sql_uri)
        print(f"Data Frame saved @:{sql_uri[10:]}")
        df.to_sql(table_name, con=engine, index=index, index_label=index_label, if_exists='replace')
        
        if verbose:
            sql = f"SELECT * FROM {table_name} LIMIT 10;"
            print(pd.read_sql(sql, con=engine))
    except Exception as e:
        print("Data Frame couldn't be saved: ", sys.exc_info()[0])
        raise

In [55]:
sql_uri = "sqlite:///data/db.sqlite3"
table_name = "recommendations"
save_data_as_sqlite_database(final_nearest_neighbors_df, sql_uri, table_name, index=True, index_label=None, verbose=True)

Data Frame saved @:data/db.sqlite3
   index           Searched_Song          Nearest_Song_1  \
0      0  0BRjO6ga9RKCKjfDqeFgWV  0E5vyUbKoQzGtYVUSSLTNY   
1      1  0BjC1NfoEOOusryehmNudP  6BgDdlr4mdML2ws4DUE7ao   
2      2  0CoSDzoNIKCRs124s9uTVy  1GBgNFmIOCFvCvx3vCK107   
3      3  0Gc6TVm52BwZD07Ki6tIvf  6OpiPBmgBTTzxVwLyTrmiV   
4      4  0IuslXpMROHdEPvSl1fTQK  76XU74fn8ePn7Qqb2yelfA   
5      5  0Mf1jKa8eNAf1a4PwTbizj  4FlNoh0omZYXNyTLPdsw5J   
6      6  0NUiKYRd6jt1LKMYGkUdnZ  2TMRrRWVDRKN7IRUEJb1IC   
7      7  0PbIF9YVD505GutwotpB5C  2PBeZKKVTlEWXoP7Cqas2I   
8      8  0ST6uPfvaPpJLtQwhE6KfC  3ieKPhI7eEOtIeVJshRCvD   
9      9  0VSqZ3KStsjcfERGdcWpFO  3Knohqfb9jeYzL6wMZiWLM   

           Nearest_Song_2          Nearest_Song_3          Nearest_Song_4  \
0  5vnuEyEfl4mEJO0Gc0n0Oz  1HeBGMouZrkglUFdtLwGuN  1mz9ZrRYu3EPVg9ZHFtjjf   
1  3l44IqBPrUx2KNv09bqumH  7m7Ubq8L3QGK5bRFfdmoqf  5CXBgDA0UEIpgXUxEtuLcA   
2  2EophOpSBiQkq8tKm5Mva0  3fEk4Tw7LWo3TMR8KnJqnI  2PphKjc6zTMqkGBZ6YFAbQ