In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read and written through that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm

import os
import ast

In [4]:
def load(filepath):
    """Load one of the dataset CSV files into a DataFrame.

    The kind of file is inferred from the base name of ``filepath``; the
    checks run in this order and the first match wins:

    * name contains ``'features'`` or ``'echonest'``: read with a three-row
      column header;
    * name contains ``'genres'``: read with a single header row;
    * name contains ``'tracks'``: read with a two-row column header and then
      post-processed: list-valued columns stored as Python literals are
      parsed with ``ast.literal_eval``, date columns are converted with
      ``pd.to_datetime``, ``('set', 'subset')`` becomes an ordered
      categorical (``small < medium < large``) and a few text columns
      become plain categoricals.

    In every case the first CSV column is used as the index.

    Parameters
    ----------
    filepath : str
        Path to the CSV file. Only its base name is used to pick the parser.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the base name matches none of the known kinds. (Previously the
        function fell off the end and returned ``None``, which only
        surfaced later as an unrelated ``AttributeError``.)
    """
    filename = os.path.basename(filepath)

    # Both files share the same three-level column layout.
    if 'features' in filename or 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # Columns whose cells hold the text of a Python literal (e.g. a list).
        literal_columns = [('track', 'tags'), ('album', 'tags'),
                           ('artist', 'tags'), ('track', 'genres'),
                           ('track', 'genres_all')]
        for column in literal_columns:
            tracks[column] = tracks[column].map(ast.literal_eval)

        date_columns = [('track', 'date_created'), ('track', 'date_recorded'),
                        ('album', 'date_created'), ('album', 'date_released'),
                        ('artist', 'date_created'),
                        ('artist', 'active_year_begin'),
                        ('artist', 'active_year_end')]
        for column in date_columns:
            tracks[column] = pd.to_datetime(tracks[column])

        # Ordered categorical so that comparisons such as ``<= 'small'`` work.
        # ``astype('category', categories=..., ordered=...)`` was removed from
        # pandas, so build the dtype explicitly instead of relying on that
        # call failing and being caught on every run.
        subsets = ('small', 'medium', 'large')
        tracks['set', 'subset'] = tracks['set', 'subset'].astype(
            pd.CategoricalDtype(categories=subsets, ordered=True))

        category_columns = [('track', 'genre_top'), ('track', 'license'),
                            ('album', 'type'), ('album', 'information'),
                            ('artist', 'bio')]
        for column in category_columns:
            tracks[column] = tracks[column].astype('category')

        return tracks

    raise ValueError(
        "cannot infer the dataset kind from file name {!r}; expected it to "
        "contain one of 'features', 'echonest', 'genres' or 'tracks'"
        .format(filename))


In [5]:
# Read the track metadata table and the audio feature table from the mounted
# Drive folder; `load` selects the parser from each file's name.
tracks, features = map(load, (
    '/content/drive/MyDrive/DataEng/Demo/tracks.csv',
    '/content/drive/MyDrive/DataEng/Demo/features.csv',
))

In [6]:
# Show the (rows, columns) shape of both tables. The row counts are expected
# to match, because boolean masks built from `tracks` are used further down
# to select rows of `features`.
features.shape, tracks.shape

((106574, 518), (106574, 52))

In [7]:
# Boolean row masks over `tracks`. The subset column is an ordered
# categorical, so `<= 'small'` keeps the rows whose subset is 'small'.
small = tracks[('set', 'subset')] <= 'small'

# One mask per predefined split.
train, val, test = (
    tracks[('set', 'split')] == part
    for part in ('training', 'validation', 'test')
)

# Labels: the top-level genre of each selected track.
y_train, y_test = (
    tracks.loc[small & part, ('track', 'genre_top')]
    for part in (train, test)
)

# Inputs: every column under the 'mfcc' feature group for the same rows.
X_train, X_test = (
    features.loc[small & part, 'mfcc']
    for part in (train, test)
)

print(f'{y_train.size} training examples, {y_test.size} testing examples')
print(f'{X_train.shape[1]} features, {np.unique(y_train).size} classes')

6400 training examples, 800 testing examples
140 features, 8 classes


In [8]:
# Be sure training samples are shuffled (features and labels are permuted
# together; the fixed seed keeps the order reproducible).
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split.
#
# Keep the returned arrays: `copy=False` only *tries* to scale in place and
# is not guaranteed to do so when the input is not a NumPy array (here the
# inputs are DataFrame slices), so discarding the return values could leave
# X_train / X_test unscaled for the classifier below.
scaler = skl.preprocessing.StandardScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display the standardized test matrix as a quick sanity check.
X_test

array([[ 2.02407870e+00,  7.28341156e-01,  2.04241274e+00, ...,
        -5.21931263e-01, -8.24199676e-01, -7.80570294e-01],
       [ 1.40072301e-02, -3.89110770e-01, -2.80515927e-01, ...,
         2.49215243e-01,  1.43903422e-01,  7.42925519e-01],
       [-3.27751541e-02, -4.64044606e-01,  1.57717431e-01, ...,
         4.74054870e-02, -6.77014218e-03, -1.44250996e-01],
       ...,
       [-3.04122154e-01, -4.97753390e-01, -8.27489341e-06, ...,
         7.09121936e-01,  2.35810789e+00,  1.56903540e+00],
       [-2.59806394e-01, -2.07064389e-01,  1.37721511e-01, ...,
        -4.98516891e-01,  1.47795198e-01,  2.76447211e-01],
       [-2.95374053e-01, -7.25148255e-01,  3.32040621e-01, ...,
         4.31275266e-02, -1.65205994e-01,  8.37615946e-01]])

In [None]:


# Support vector classification: fit an SVC with scikit-learn's default
# settings on the training split, then report its accuracy on the test split.
clf = skl.svm.SVC().fit(X_train, y_train)
score = clf.score(X_test, y_test)
print(f'Accuracy: {score:.2%}')

Accuracy: 46.38%


In [None]:
import pickle

# Persist the fitted classifier to Drive so it can be reloaded later.
# NOTE(review): only `clf` is written; any preprocessing applied to the
# features before fitting is not part of this file.
picklepath = '/content/drive/MyDrive/DataEng/Demo/model.pkl'
with open(picklepath, mode='wb') as f:
    pickle.dump(clf, f)