# Spotify Exploratory Data Analysis and Recommendation System

Today, Spotify is the world's most popular audio streaming subscription service with 433m active users, including 188m subscribers, across 183 markets. <p>
    
We will perform Exploratory Data Analysis (EDA) in Python using Pandas, NumPy, Matplotlib, and Seaborn on Spotify’s dataset, covering steps such as sampling, checking for missing values, plotting, and building a correlation heat map. <p>

We will explore and quantify the music data to gain insights into the correlations among the different song attributes, using heatmaps and regression plots built with Python libraries and functions. 

After the EDA, we will build a Music Recommendation System using the Spotify dataset.   

<img src="https://storage.googleapis.com/pr-newsroom-wp/1/2018/11/Spotify_Logo_RGB_Green-768x231.png" width="450" align="left"/>

### Importing Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
#%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

### Reading Spotify Dataset

In [None]:
# Folder that holds the four CSV files. Set the SPOTIFY_DATA_DIR environment
# variable to point somewhere else; the default is the original author's path.
DATA_DIR = os.environ.get(
    "SPOTIFY_DATA_DIR",
    "D:/DATA ANALYST PROJECTS/Spotify EDA and Recommendation System/Dataset",
)

# Track-level table plus three tables keyed (per later cells) by genre, year and artist.
data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))
year_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))
artist_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_artist.csv"))

In [None]:
data.head()

In [None]:
genre_data.head()

In [None]:
year_data.head()

In [None]:
artist_data.head()

In [None]:
# checking size of data

data.shape

In [None]:
# checking null value

pd.isnull(data).sum()

In [None]:
# concise summary of the DataFrame

data.info()

In [None]:
# concise summary of the DataFrame

genre_data.info()

In [None]:
# concise summary of the DataFrame

year_data.info()

In [None]:
# concise summary of the DataFrame

artist_data.info()

In [None]:
#### Descriptive statistics of the numerical variables present in columns

data.describe().transpose()

### Top10 least popular songs

In [None]:
least_popular = data.sort_values('popularity', ascending = True).head(10) 
least_popular[['year','name','release_date','artists']]

### Top10 most popular songs

In [None]:
most_popular = data.sort_values('popularity', ascending = False).head(10) 
most_popular[['year','name','release_date','artists']]

### Top10 most popular artists (not based on monthly listeners)

One might mistake this for monthly listeners, but it is a different metric. The higher your popularity index, the more likely the algorithm is to recommend you to new listeners and place you in algorithmic playlists like Release Radar and Discover Weekly.

In [None]:
popular_artists = artist_data.sort_values('popularity', ascending=False).head(10)
popular_artists[['popularity','energy','artists','danceability']]

#### Converting song duration from milliseconds to minutes

In [None]:
# Add a 'duration' column (minutes, 2 decimals) to every table.
# 60000 ms = 1 minute. The division is done on the whole column at once
# instead of calling a Python lambda for each row.
for frame in (data, genre_data, year_data, artist_data):
    frame['duration'] = (frame['duration_ms'] / 60000).round(2)
data.duration.head()

### Visualizing through a correlation map

In [None]:
from yellowbrick.target import FeatureCorrelation

# Track attributes whose correlation with popularity is charted below.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration', 'explicit',
]

# Predictors and target for the visualizer.
X = data[feature_names]
y = data['popularity']

# Labels shown on the chart, as a NumPy array.
features = np.array(feature_names)

# Build the visualizer, fit it on the data and render the figure.
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show();

In [None]:
# Pearson correlation is only defined for numeric columns. The track table
# also carries text columns (e.g. name, artists), and recent pandas versions
# raise instead of silently skipping them, so restrict to numeric dtypes first.
corr_df = (
    data.drop(['key', 'mode', 'explicit', 'duration_ms'], axis=1)
    .select_dtypes(include=np.number)
    .corr(method='pearson')
)
plt.figure(figsize=(14, 8))
# Fixed colour range [-1, 1] centred on 0 so positive/negative correlations
# map symmetrically onto the diverging palette.
heatmap = sns.heatmap(
    corr_df, annot=True, fmt='.1g', vmin=-1, vmax=1, center=0,
    cmap="BrBG", linewidths=2, linecolor="Black",
)
heatmap.set_title('Correlation Heatmap Between Variable')
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90);

### Regression plot between Loudness and Energy

In [None]:
sns.regplot(data=data, y='loudness', x='energy', scatter_kws={"color": "violet", 's':2}, marker='2', line_kws={"color": "black", 'linewidth':1.5}).set(title='Loudness Vs Energy Correlation');

### Regression plot between Popularity and Acousticness

In [None]:
sns.regplot(data=data, y='popularity', x='acousticness', scatter_kws={"color": "c", 's':3}, marker='X', line_kws={"color": "black", 'linewidth':1.5}).set(title='Popularity Vs Acousticness Correlation');

### Regression plot between Speechiness and Acousticness

In [None]:
sns.regplot(data=data, y='speechiness', x='acousticness', scatter_kws={"color": "goldenrod", 's':3}, marker='+', line_kws={"color": "black", 'linewidth':1.5}).set(title='Speechiness Vs Acousticness Correlation');

### Regression plot between Popularity and Danceability

In [None]:
sns.regplot(data=data, y='popularity', x='danceability', scatter_kws={"color": "red", 's':3}, marker='*', line_kws={"color": "black", 'linewidth':1.5}).set(title='Popularity Vs Danceability Correlation');

### Bar plot to visualize the correlation between the duration of songs and their different genres


In [None]:
# The 20 genres with the highest popularity score.
top_genres = genre_data.nlargest(20, 'popularity')
plt.title("Duration of the Songs in Different Genres")
sns.barplot(y='genres', x='duration', data=top_genres, color='lightsalmon')
# 'duration' is duration_ms / 60000, i.e. minutes, so label the axis accordingly.
plt.xlabel("Duration in minutes")
plt.ylabel("Genres");

## *Music Over Time*

### Bar plot to visualize the duration of songs over the years

In [None]:
# Wide figure: the x-axis carries one bar per distinct year.
plt.figure(figsize=(20,6))
# One bar per year showing seaborn's aggregate of 'duration' for that year.
# NOTE(review): errwidth=False presumably aims to hide the error bars (it is
# passed where a line width is expected). Newer seaborn releases deprecate
# errwidth -- confirm against the installed version before changing it.
sns.barplot(x="year",y="duration", errwidth=False, data=data).set(title='Year Vs Duration')
# Rotate the year labels so they do not overlap.
plt.xticks(rotation=90);

### Distribution plot to visualize the total number of songs in each year since 1921 in our Spotify database

In [None]:
fig = px.histogram(data, x="year")
fig.show()

### *How the overall sound of music has changed from 1921 to 2020...*

In [None]:
def extract_decade(year):
    """Return the decade label for a year, e.g. 1987 -> '1980s'."""
    # int() drops the fractional part of year/10, leaving the decade's
    # leading digits; multiplying by 10 restores the decade's first year.
    decade_start = 10 * int(year / 10)
    return f"{decade_start}s"

# Label every track with its decade ('1920s', '1930s', ...).
data['decade'] = data['year'].apply(extract_decade)

sns.set(rc={'figure.figsize': (10, 5)})
# Pass the column by keyword: on recent seaborn versions the first positional
# argument is `data`, not `x`, so a bare Series would not be counted per decade.
sns.countplot(data=data, x='decade');

In [None]:
sound_features = ['acousticness', 'liveness', 'instrumentalness', 'energy', 'danceability', 'valence']
fig = px.line(year_data, x='year', y=sound_features)
fig.show()

### Characteristics of various genres over the years
Using the audio features for different genres we can compare them and their unique differences in music.

In [None]:
top_genres = genre_data.nlargest(10, 'popularity')

fig = px.bar(top_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()

### Clustering Genres with K-Means

We use a simple K-means clustering algorithm to divide the genres in this dataset into ten clusters based on the numerical audio features of each genre.

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then group the genres into 10 clusters.
# A fixed random_state makes the cluster labels reproducible between runs.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42)),
])

# Use every numeric column as a feature, except a 'cluster' column left over
# from a previous run of this cell -- otherwise old labels would be fed back
# in as an input feature.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [None]:
# Visualizing the Clusters with t-SNE
# Embeds the feature matrix X in two dimensions and colours each point by the
# label stored in genre_data['cluster'].


from sklearn.manifold import TSNE

# Standardize the features, then reduce them to 2 components with t-SNE.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=False))])
# NOTE: X is whatever the notebook last assigned to it; this cell assumes the
# genre-clustering cell has just been run -- TODO confirm execution order.
genre_embedding = tsne_pipeline.fit_transform(X)
# Collect the 2-D coordinates with the genre name and cluster label for plotting.
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

# Interactive scatter plot; hovering a point shows its coordinates and genre.
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show();

### Clustering Songs with K-Means

In [None]:
# Numeric track attributes used as clustering features. Both this list and the
# fitted pipeline below are read again by the recommendation functions.
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

# Standardize the features, then group the tracks into 20 clusters.
# A fixed random_state makes the cluster labels reproducible between runs.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, random_state=42, verbose=False)),
    ],
    verbose=False,
)

X = data[number_cols].select_dtypes(np.number)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels

In [None]:
# Visualizing the Clusters with PCA
# Projects the feature matrix X onto two principal components and colours
# each point by the label stored in data['cluster_label'].

from sklearn.decomposition import PCA

# Standardize the features, then keep the first 2 principal components.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
# NOTE: X is whatever the notebook last assigned to it; this cell assumes the
# song-clustering cell has just been run -- TODO confirm execution order.
song_embedding = pca_pipeline.fit_transform(X)
# Collect the 2-D coordinates with the track title and cluster label for plotting.
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

# Interactive scatter plot; hovering a point shows its coordinates and title.
fig = px.scatter(projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show();

### Building the Recommendation System

Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.

This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. 

We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and then recommending songs corresponding to nearby data points.

We will use Spotipy. It is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs.

In [None]:
!pip install spotipy

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# API credentials are read from the environment so they never live in the
# notebook. Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running;
# a missing variable fails here with a KeyError.
# SECURITY: any client id/secret that was ever committed in plain text should
# be treated as leaked and rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    )
)

def find_song(name, year):
    """Look a track up on Spotify and return its attributes as a DataFrame.

    Args:
        name: Track title to search for.
        year: Release year used to narrow the search.

    Returns:
        A one-row ``pandas.DataFrame`` holding the given name and year, the
        track's explicit flag, duration and popularity, plus every field of
        its audio-features record; or ``None`` when the search returns no
        track or no audio features are available for it.
    """
    # Spotify field filters are written as field:value with no space after
    # the colon; with a space the words are treated as plain keywords.
    results = sp.search(q='track:{} year:{}'.format(name, year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    audio_features = sp.audio_features(track['id'])[0]
    if audio_features is None:
        # The features list can hold a null entry for a track without
        # audio features; treat that the same as "not found".
        return None

    # The list-valued entries give the frame its single row; the scalar
    # audio-feature values added below are broadcast onto that row.
    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    song_data.update(audio_features)

    return pd.DataFrame(song_data)

In [None]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib


def get_song_data(song, spotify_data):
    """Fetch the attributes of one song.

    Returns the first row of ``spotify_data`` whose 'name' and 'year' equal
    those of ``song``. When no row matches, the lookup is delegated to
    ``find_song`` and its result is returned unchanged.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    if len(matches) > 0:
        return matches.iloc[0]

    # Not in the local dataset: fall back to the Spotify lookup.
    return find_song(song['name'], song['year'])
        

def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Args:
        song_list: Iterable of dicts, each with a 'name' and a 'year' key.
        spotify_data: DataFrame searched first for each song.

    Returns:
        1-D float array with one entry per column in ``number_cols``: the
        element-wise mean over all songs that could be resolved.

    Raises:
        ValueError: If none of the songs could be resolved, since there is
            nothing to average.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # A dataset hit is a Series (shape (k,)) while an API hit is a one-row
        # DataFrame (shape (1, k)); flatten both to (k,) so they stack cleanly.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found, so no mean vector can be computed')

    return np.mean(np.vstack(song_vectors), axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a dict of lists.

    Every key seen in any dict maps to the list of values found under that
    key, in input order, e.g.
    ``[{'a': 1}, {'a': 2}] -> {'a': [1, 2]}``.

    Args:
        dict_list: Sequence of dicts; may be empty, and the dicts need not
            share the same keys.

    Returns:
        A ``defaultdict`` of lists. Its default factory is cleared before
        returning, so looking up a missing key raises ``KeyError``.
    """
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # Turn auto-creation off again so callers cannot add empty entries
    # by accident just by reading a key that was never present.
    flattened_dict.default_factory = None
    return flattened_dict


def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend the tracks closest to the average of the given songs.

    Args:
        song_list: List of dicts, each with a 'name' and a 'year' key.
        spotify_data: DataFrame of candidate tracks; must contain the
            columns in ``number_cols`` plus 'name', 'year' and 'artists'.
        n_songs: Number of recommendations to return.

    Returns:
        Up to ``n_songs`` dicts with keys 'name', 'year' and 'artists',
        ordered from most to least similar (cosine distance on the
        standardized features). Tracks whose name matches one of the input
        songs are never returned.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)

    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted in the song-clustering pipeline so the centre
    # and the candidates are standardized the same way.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')

    # Rank every candidate, drop the input songs, and only then keep the top
    # n_songs -- truncating first would return fewer results whenever an
    # input song is among its own nearest neighbours.
    ranking = np.argsort(distances[0])
    is_seed = spotify_data['name'].isin(song_dict['name']).to_numpy()
    index = ranking[~is_seed[ranking]][:n_songs]

    rec_songs = spotify_data.iloc[index]
    return rec_songs[metadata_cols].to_dict(orient='records')

#### Recommended Songs:

In [None]:
recommend_songs([{'name': 'Come As You Are', 'year':1991},
                {'name': 'Smells Like Teen Spirit', 'year': 1991},
                {'name': 'Lithium', 'year': 1992},
                 {'name': 'All Apologies', 'year': 1993},
                  {'name': 'Stay Away', 'year': 1993}], data)