In [1]:
import json
import pickle
import re
import time
from pathlib import Path

import lightgbm as lgbm
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import scikitplot as skplt
import seaborn as sns
from catboost import CatBoostClassifier, cv, Pool
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, tpe, partial

import itertools
from itertools import combinations

import scipy as sp
from scipy.stats import pearsonr, chi2_contingency

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
from matplotlib import rc

import datetime
from dateutil import relativedelta

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

import statsmodels.api as sm 
from statsmodels.graphics.api import abline_plot # For visualling evaluating predictions.
from statsmodels.stats.proportion import proportion_confint

import warnings # For handling error messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings('ignore')

import sklearn.metrics as met
from sklearn import linear_model, preprocessing, model_selection, svm, datasets
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, RFE
from sklearn.linear_model import LassoCV, LogisticRegression, Lasso
from sklearn.metrics import plot_confusion_matrix, auc, confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score, plot_roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale, StandardScaler, LabelEncoder, MinMaxScaler, Binarizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

## The dataset is split into 1,000 JSON files, each about 35MB.  Each contains 1,000 complete playlists.  
## Each playlist has 4<x<251 tracks (i.e. 5 to 250), with at least 3 unique artists (x>2) and at least 2 unique albums (x>1).  

number of playlists 1000000  
number of tracks 66346428  
number of unique tracks 2262292  
number of unique albums 734684  
number of unique artists 295860  
mean playlist length 66.346428  
mode playlist length 20   
top track: HUMBLE. by Kendrick Lamar in 46574 playlists  
top artist: 847160 tracks by Drake   

# Here's the ReadMe

## Detailed description
The Million Playlist Dataset consists of 1,000 slice files. These files have the naming convention of:

mpd.slice._STARTING\_PLAYLIST\_ID\_-\_ENDING\_PLAYLIST\_ID_.json

For example, the first 1,000 playlists in the MPD are in a file called 
`mpd.slice.0-999.json` and the last 1,000 playlists are in a file called
`mpd.slice.999000-999999.json`.

Each slice file is a JSON dictionary with two fields:
*info* and *playlists*.

### `info` Field
The info field is a dictionary that contains general information about the particular slice:

   * **slice** - the range of playlist IDs that are in this particular file - such as 0-999
   * ***version*** - the current version of the MPD (which should be v1)
   * ***description*** - a description of the MPD
   * ***license*** - licensing info for the MPD
   * ***generated_on*** - a timestamp indicating when the slice was generated.

### `playlists` field 
This is an array that typically contains 1,000 playlists. Each playlist is a dictionary that contains the following fields:


* ***pid*** - integer - playlist id - the MPD ID of this playlist. This is an integer between 0 and 999,999.
* ***name*** - string - the name of the playlist 
* ***description*** - optional string - if present, the description given to the playlist.  Note that user-provided playlist descriptions are a relatively new feature of Spotify, so most playlists do not have descriptions.
* ***modified_at*** - seconds - timestamp (in seconds since the epoch) when this playlist was last updated. Times are rounded to midnight GMT of the date when the playlist was last updated.
* ***num_artists*** - the total number of unique artists for the tracks in the playlist.
* ***num_albums*** - the number of unique albums for the tracks in the playlist
* ***num_tracks*** - the number of tracks in the playlist
* ***num_followers*** - the number of followers this playlist had at the time the MPD was created. (Note that the follower count does not include the playlist creator)
* ***num_edits*** - the number of separate editing sessions. Tracks added in a two hour window are considered to be added in a single editing session.
* ***duration_ms*** - the total duration of all the tracks in the playlist (in milliseconds)
* ***collaborative*** -  boolean - if true, the playlist is a collaborative playlist. Multiple users may contribute tracks to a collaborative playlist.
* ***tracks*** - an array of information about each track in the playlist. Each element in the array is a dictionary with the following fields:
   * ***track_name*** - the name of the track
   * ***track_uri*** - the Spotify URI of the track
   * ***album_name*** - the name of the track's album
   * ***album_uri*** - the Spotify URI of the album
   * ***artist_name*** - the name of the track's primary artist
   * ***artist_uri*** - the Spotify URI of track's primary artist
   * ***duration_ms*** - the duration of the track in milliseconds
   * ***pos*** - the position of the track in the playlist (zero-based)

# Trying out the first 1,000 playlists

In [4]:
# Load the first MPD slice file (playlists 0-999) as a dict with 'info' and 'playlists' fields.
# The encoding is given explicitly: without it open() falls back to the platform's
# locale encoding, which can mis-decode non-ASCII names such as 'Beyoncé'.
with open('data/mpd.slice.0-999.json', encoding='utf-8') as f:
  df = json.load(f)

In [7]:
# Keep only the list of playlist dicts from the loaded slice.
# Guarded so that re-running this cell (when `df` is already that list) does not
# raise "TypeError: list indices must be integers or slices, not str".
if isinstance(df, dict):
    df = df['playlists']
len(df)

1000

# Here's the first playlist

In [8]:
# Inspect the first playlist dict: its metadata fields plus its 'tracks' list.
df[0]

{'name': 'Throwbacks',
 'collaborative': 'false',
 'pid': 0,
 'modified_at': 1493424000,
 'num_tracks': 52,
 'num_albums': 47,
 'num_followers': 1,
 'tracks': [{'pos': 0,
   'artist_name': 'Missy Elliott',
   'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
   'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
   'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
   'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
   'duration_ms': 226863,
   'album_name': 'The Cookbook'},
  {'pos': 1,
   'artist_name': 'Britney Spears',
   'track_uri': 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
   'artist_uri': 'spotify:artist:26dSoYclwsYLMAKD3tpOr4',
   'track_name': 'Toxic',
   'album_uri': 'spotify:album:0z7pVBGOD7HCIB7S8eLkLI',
   'duration_ms': 198800,
   'album_name': 'In The Zone'},
  {'pos': 2,
   'artist_name': 'Beyoncé',
   'track_uri': 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv',
   'artist_uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m',
   'track_name': 'Crazy In Love',
   'alb

## I guess the best way to find similar songs is by finding which songs are on the same playlists.  I'll try making a list of tracks first.  Tracks by different artists could have the same name, so I'll use the Spotify track URI (Uniform Resource Identifier) as the unique label.  We end up with 67,503 track entries from the first 1,000 playlists (a track that appears on several playlists is counted once per playlist).

In [9]:
# Flatten the playlists into one row per track occurrence:
# [track URI, artist name, track name, id of the playlist it came from].
allTracks = [
    [track['track_uri'], track['artist_name'], track['track_name'], playlist['pid']]
    for playlist in df
    for track in playlist['tracks']
]

In [11]:
# Wrap the collected rows in a labelled DataFrame, then show the row count.
track_columns = ['Spotify Track URI', 'Artist Name', 'Track Name', 'Playist ID #']
allTracks = pd.DataFrame(data=allTracks, columns=track_columns)
allTracks.shape[0]

67503

In [12]:
# Preview the first rows of the track table.
allTracks.head()

Unnamed: 0,Spotify Track URI,Artist Name,Track Name,Playist ID #
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0


In [13]:
# Preview the last rows of the track table.
allTracks.tail()

Unnamed: 0,Spotify Track URI,Artist Name,Track Name,Playist ID #
67498,spotify:track:5uCax9HTNlzGybIStD3vDh,James Arthur,Say You Won't Let Go,999
67499,spotify:track:0P1oO2gREMYUCoOkzYAyFu,Big Words,The Answer,999
67500,spotify:track:2oM4BuruDnEvk59IvIXCwn,Allan Rayman,25.22,999
67501,spotify:track:4Ri5TTUgjM96tbQZd5Ua7V,Jon Jason,Good Feeling,999
67502,spotify:track:5RVuBrXVLptAEbGJdSDzL5,Grizfolk,Cosmic Angel - Acoustic From Capitol Studios,999


## So let's make a way that we could add more data.  Let's start with the first 2 parts of the dataset.  We end up getting 134,125 tracks from 2,000 playlists.

In [14]:
# Collect one row per track occurrence from the first N_SLICES slice files.
# Each slice file covers 1,000 playlist ids and is named mpd.slice.<low>-<high>.json.
N_SLICES = 2  # how many slice files to read; raise this to pull in more data

everything = []
for x in range(N_SLICES):
    low = 1000 * x
    high = low + 999
    path = f"data/mpd.slice.{low}-{high}.json"
    # Explicit UTF-8 so non-ASCII artist/track names decode the same on every platform.
    with open(path, encoding="utf-8") as f:
        # NOTE: this rebinds `df` to the current slice's playlist list, as the original cell did.
        df = json.load(f)['playlists']
    # The file is already closed here; flattening only needs the parsed playlists.
    for y in df:
        for z in y['tracks']:
            everything.append([z['track_uri'], z['artist_name'], z['track_name'], y['pid']])

In [15]:
# Wrap the rows gathered from all loaded slices in a labelled DataFrame, then show the row count.
everything_columns = ['Spotify Track URI', 'Artist Name', 'Track Name', 'Playist ID #']
everything = pd.DataFrame(data=everything, columns=everything_columns)
everything.shape[0]

134125

In [17]:
# Preview the first rows of the combined track table.
everything.head()

Unnamed: 0,Spotify Track URI,Artist Name,Track Name,Playist ID #
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0


In [18]:
# Preview the last rows of the combined track table.
everything.tail()

Unnamed: 0,Spotify Track URI,Artist Name,Track Name,Playist ID #
134120,spotify:track:2G8gLafzmoHwa6RBih74J3,Seether,Broken - Featuring Amy Lee,1999
134121,spotify:track:1bPRhOz3nOfEIoLjB1YYT3,Three Days Grace,No More,1999
134122,spotify:track:1o4Pf2GIMDCD7ifIM2yI77,Three Days Grace,Last to Know,1999
134123,spotify:track:0nNVR2iDM3eVzEgMi78vQm,3 Doors Down,Let Me Go,1999
134124,spotify:track:6XzJXUQpwPkp8g5KW9TRoP,Tech N9ne Collabos,Actin Like You Know,1999


In [19]:
# Summary statistics for every column (include='all' covers the string columns too),
# transposed so that each original column becomes one row.
everything.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Spotify Track URI,134125,57884.0,spotify:track:1xznGGDReH1oQq0xzbwXa3,98.0,,,,,,,
Artist Name,134125,14914.0,Drake,1929.0,,,,,,,
Track Name,134125,48774.0,Closer,136.0,,,,,,,
Playist ID #,134125,,,,1000.73,572.94,0.0,503.0,992.0,1497.0,1999.0


## So... if I have only the Playlist ID # and the Spotify Track URI, I could... pivot the dataframe?  And create a giant df with tracks as the dummies?

In [20]:
# Keep only the track URI and playlist id, and add a constant `value` column of 1
# to serve as the cell value when the table is pivoted.
playlistPivot = (
    everything
    .drop(columns=['Artist Name', 'Track Name'])
    .assign(value=1)
)

In [23]:
# Preview the long-format (track URI, playlist id, value) table before pivoting.
playlistPivot.head()

Unnamed: 0,Spotify Track URI,Playist ID #,value
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0,1
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0,1
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,0,1
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,0,1
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,0,1


In [24]:
# Reshape to a playlist x track indicator matrix: one row per playlist id,
# one column per track URI.
#   aggfunc='max'  -> a track listed more than once in a playlist still yields 1
#   fill_value=0   -> tracks absent from a playlist get 0 instead of NaN
# The string 'max' replaces the builtin `max`; newer pandas deprecates passing
# builtin callables as aggfunc, and the result is unchanged.
# NOTE: this rebinds `playlistPivot`, so the previous cell must be re-run before
# this one can be executed again.
playlistPivot = pd.pivot_table(
    playlistPivot,
    index='Playist ID #',
    columns='Spotify Track URI',
    values='value',
    aggfunc='max',
    fill_value=0,
)

In [None]:
# playlistPivot.head()
# I can't display this since it takes 42MB on disk to save!

# So now there are 57,884 tracks (as columns) with 2000 playlists (as rows).  How should I compare how similar these tracks are?  Once I figure that out, I can maybe mark the tracks from the most similar playlist with .5 points, the next most similar with .25 points, and so on...

In [27]:
# Report the pivoted frame's index range, column count, dtypes and memory usage.
playlistPivot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Columns: 57884 entries, spotify:track:000mA0etY38nKdvf1N04af to spotify:track:7zzBEZBTJejWeL6EqWmCD9
dtypes: int64(57884)
memory usage: 883.3 MB
