This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [1]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from random import sample as rand_sample
from sklearn.cluster import KMeans

In [2]:
def read_data(file):
    '''
    description

    :param file:        file we're reading in with the data (.csv)

    :return             a 'songs' dictionary
                        key: track_id
                        value(s): a 'data' dictionary, which contains:
                            metadata,
                            label,
                            features,
                            genres
    '''
    
    songs = {}

    with open(file, 'r') as f:

        for song in f:
            
            line = song.split(',')
                        
            # ignore first line + ensure that label exists
            if line[0] != 'label' and line[0]:
                label = int(line[0])
                metadata, data = {}, {}
                genres = []
                track_id = line[1]
                
                features = {}
                features['artist_popularity'] = line[4]
                features['artist_followers'] = line[5]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]
                
                # check if genre field has multiple genres or just one
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # if '"' present or next value is a string only containing alphabets,
                # then add to genres list. set count to i+1 when you reach last genre
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # single or no genres, get all other features
                if count == 0:
                    features['instrumentalness'] = float(line[7])
                    features['duration_ms'] = float(line[8])
                    features['time_signature'] = float(line[9])
                    features['acousticness'] = float(line[10])
                    features['speechiness'] = float(line[11])
                    features['energy'] = float(line[12])
                    features['loudness'] = float(line[13])
                    features['tempo'] = float(line[14])
                    features['key'] = float(line[15])
                    features['valence'] = float(line[16])
                    features['danceability'] = float(line[17])
                    features['liveness'] = float(line[18][:-1])
                
                # multiple genres, get all other features using count
                else:
                    features['instrumentalness'] = float(line[count])
                    features['duration_ms'] = float(line[count+1])
                    features['time_signature'] = float(line[count+2])
                    features['acousticness'] = float(line[count+3])
                    features['speechiness'] = float(line[count+4])
                    features['energy'] = float(line[count+5])
                    features['loudness'] = float(line[count+6])
                    features['tempo'] = float(line[count+7])
                    features['key'] = float(line[count+8])
                    features['valence'] = float(line[count+9])
                    features['danceability'] = float(line[count+10])
                    features['liveness'] = float(line[count+11][:-1])
                
                # add metadata, features, genres, and label to data
                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres
                data['label'] = label
                
                # add data to songs by track_id
                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [3]:
def isFloat(string):
    '''
    description

    :param string:      the string we're testing

    :return             True if the string is a float, False otherwise
    '''
    try:
        float(string)
        return True
    except ValueError:
        return False

def read_genres(file):
    '''
    description
    
    :param file:        file we're reading in with the data (.csv)
    
    :return             a 'genres' dictionary
                        key: genre
                        value: unique genre_id
    '''
    genre_mapping = {}
    genre_id = 0
    
    with open(file, 'r') as f:

        for song in f:

            line = song.split(',')

            if line[0]:

                genres = []
                
                # check if genres field is empty or contains one/multiple genres
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # account for multiple genres
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # for genres in the genres list, ensure that the genre is not a float, create unique genre_id
                for genre in genres:
                    if (genre not in genre_mapping) and ('"' not in genre) and (not isFloat(genre)):
                        genre_mapping[genre] = genre_id
                        genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [4]:
song_data = read_data('data.csv')
genres = read_genres('data.csv')
feature_names = tuple(next(iter(song_data.values()))['features'].keys())

In [5]:
song_data

{'4R3BvvUm5fe8SCPs6287dY': {'features': {'acousticness': 0.112,
   'artist_followers': '0.416491',
   'artist_popularity': '0.65',
   'danceability': 0.607,
   'duration_ms': 0.077205218,
   'energy': 0.507,
   'instrumentalness': 0.0,
   'key': 0.909090909,
   'liveness': 0.103,
   'loudness': 0.831764572,
   'speechiness': 0.0242,
   'tempo': 0.386050232,
   'time_signature': 0.8,
   'valence': 0.275},
  'genres': ['contemporary country', 'country', 'traditional country'],
  'label': 0,
  'metadata': {'artist_id': '6rJqqRce0Kvo2dJUXoHleC',
   'artist_name': 'Alabama'}},
 '5mqzhMuUpvnMfwNz6iepmO': {'features': {'acousticness': 0.114,
   'artist_followers': '1',
   'artist_popularity': '0.89',
   'danceability': 0.618,
   'duration_ms': 0.050386913,
   'energy': 0.845,
   'instrumentalness': 0.0,
   'key': 0.181818182,
   'liveness': 0.415,
   'loudness': 0.991834883,
   'speechiness': 0.132,
   'tempo': 0.781395159,
   'time_signature': 0.8,
   'valence': 0.486},
  'genres': ['bmore',

In [6]:
# Sanity check
l, dl = 0, 0
for x in song_data.values():
    if x['label'] == 1: l += 1
    else: dl += 1
print(' likes:', l, '\n', 'dislikes:', dl)

 likes: 1919 
 dislikes: 3790


In [7]:
genres

{'abstract': 246,
 'abstract beats': 132,
 'abstract hip hop': 440,
 'acid house': 127,
 'acid idm': 51,
 'acid jazz': 111,
 'acid techno': 203,
 'acoustic pop': 397,
 'acousticness': 2,
 'adoracao': 407,
 'adult standards': 67,
 'afro house': 32,
 'afrobeat': 115,
 'afropop': 116,
 'alaska indie': 348,
 'alberta country': 347,
 'album rock': 173,
 'alternative ccm': 410,
 'alternative country': 336,
 'alternative dance': 29,
 'alternative hip hop': 92,
 'alternative metal': 236,
 'alternative pop': 276,
 'alternative rock': 103,
 'ambient': 42,
 'ambient fusion': 368,
 'ambient idm': 271,
 'anthem worship': 390,
 'art pop': 75,
 'artist_genres': 0,
 'aussietronica': 58,
 'austindie': 128,
 'australian alternative rock': 170,
 'australian country': 324,
 'australian dance': 186,
 'australian indie': 100,
 'australian indigenous': 345,
 'australian pop': 315,
 'australian rock': 346,
 'baile pop': 490,
 'balearic': 73,
 'ballet class': 163,
 'bass music': 22,
 'bass trap': 250,
 'bass t

In [8]:
feature_names

('valence',
 'loudness',
 'liveness',
 'duration_ms',
 'artist_followers',
 'energy',
 'key',
 'danceability',
 'tempo',
 'acousticness',
 'time_signature',
 'artist_popularity',
 'speechiness',
 'instrumentalness')

In [9]:
class Classifier(object):
    def __init__(self, algorithm, train_data):
        '''
        description
        
        :param algorithm:   the name of the learning algorithm to use
        :param train_data:  training data, dict with following relevant keys: {
                                    features: { feature_name: feature_value},
                                    label: label_value
                            }
        
        :return             Classifier instance
        '''
        
        x_train, y_train = [x['features'] for x in train_data.values()], [x['label'] for x in train_data.values()]
     
        self.algorithm = algorithm
        
        if algorithm == 'svm':
            
            self.vectorizer = DictVectorizer(sort=True)
            
            self.svm = LinearSVC(penalty='l2', loss='hinge')
            self.svm.fit(self.vectorizer.fit_transform(x_train), y_train)
        
        else: raise ValueError('unsupported algorithm: ' + algorithm)
    
    def validate(self, validation_data):
        '''
        Predicts on all instances in validation_data. Returns accuracy.
        '''
        num_instances = len(validation_data)
        x_test, y_test = [x['features'] for x in validation_data.values()], [x['label'] for x in validation_data.values()]
        
        correct = 0
        for i in range(num_instances):
            if self.predict(x_test[i]) == y_test[i]: correct += 1
        
        return correct / num_instances
                
    def predict(self, x):
        if self.algorithm == 'svm': return self.predict_SVM(x)
        raise ValueError('not implemented')
    
    def predict_SVM(self, x):
        x = self.vectorizer.transform([x])
        return self.svm.predict(x)[0]
    

In [10]:
def filter_features(data, discard):
    '''
    Filters out features from data. Does not modify passed-in object (creates a copy).
    
    :param data:        dict of data, same format as song_data
    :param discard:     feature names to discard
                        
    :return             copy of data, with filtered features
    '''
    out = data.copy()
    
    for id_ in data:
        out[id_]['features'] = { k:v for k, v in out[id_]['features'].items() if k not in discard }
    
    return out

def split_data(data, p):
    '''
    Splits data into training and validation sets for simple classification.
    
    :param data:    complete labeled data
    :param p:       proportion of data to use for validation
                    
    :return         train_data, validation_data
    '''
    validation_labels = rand_sample(list(data), int(len(data) * p))
    
    validation_data = {k:data[k] for k in validation_labels}
    train_data = {k:v for k, v in data.items() if k not in validation_data}

    return train_data, validation_data


In [11]:
def run_basic_svm(data, p, discard=None):
    '''
    Trains svm on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier('svm', train_data)
    
    return clf

In [12]:
clf = run_basic_svm(song_data, 0.2)



In [13]:
def test_cluster_size(data, max_cluster):
    for i in range(2, max_cluster+1):
        km = KMeans(i, init='random', max_iter=300, random_state=0, n_init=30)
        km.fit(data)
        print(i, km.inertia_)

# test_cluster_size(x_train, 10)

In [14]:
def get_kmeans_clusters(data, n_clusters):
    train_data = {k:v for k, v in data.items()}
    track_ids = list(data)

    x_train = np.array([list(x['features'].values()) for x in train_data.values()])
    y_train = [x['label'] for x in train_data.values()]

    km = KMeans(n_clusters, init='random', max_iter=300, random_state=0, n_init=30)
    km.fit(x_train)

    cluster_map = pd.DataFrame()

    cluster_map['data'], cluster_map['cluster'] = x_train.tolist(), km.labels_
    cluster_map['label'], cluster_map['track_id'] = y_train, track_ids
    
    for track_id in track_ids:
        new_cluster_map = cluster_map[cluster_map['track_id'] == track_id]
        track_id_data = data[track_id]
        track_id_data['cluster'] = new_cluster_map['cluster'].tolist()[0]
        data[track_id] = track_id_data
    
    return data

get_kmeans_clusters(song_data, 10)

{'4R3BvvUm5fe8SCPs6287dY': {'cluster': 1,
  'features': {'acousticness': 0.112,
   'artist_followers': '0.416491',
   'artist_popularity': '0.65',
   'danceability': 0.607,
   'duration_ms': 0.077205218,
   'energy': 0.507,
   'instrumentalness': 0.0,
   'key': 0.909090909,
   'liveness': 0.103,
   'loudness': 0.831764572,
   'speechiness': 0.0242,
   'tempo': 0.386050232,
   'time_signature': 0.8,
   'valence': 0.275},
  'genres': ['contemporary country', 'country', 'traditional country'],
  'label': 0,
  'metadata': {'artist_id': '6rJqqRce0Kvo2dJUXoHleC',
   'artist_name': 'Alabama'}},
 '5mqzhMuUpvnMfwNz6iepmO': {'cluster': 8,
  'features': {'acousticness': 0.114,
   'artist_followers': '1',
   'artist_popularity': '0.89',
   'danceability': 0.618,
   'duration_ms': 0.050386913,
   'energy': 0.845,
   'instrumentalness': 0.0,
   'key': 0.181818182,
   'liveness': 0.415,
   'loudness': 0.991834883,
   'speechiness': 0.132,
   'tempo': 0.781395159,
   'time_signature': 0.8,
   'valence