# Laboratorio 3 
Hacemos uso de un dataset sobre generos musicales basado en datos como (volumen, popularidad, duracion, etc.) 

In [1]:

# Cálculo científico y vectorial para python
import numpy as np

# Libreria para graficos
from matplotlib import pyplot as plt

# Modulo de optimizacion en scipy
from scipy import optimize

import pandas as pd

from sklearn.preprocessing import LabelEncoder

import warnings

# modulo para cargar archivos en formato MATLAB
# from scipy.io import loadmat

# le dice a matplotlib que incruste gráficos en el cuaderno
%matplotlib inline

Importamos el dataset y eliminamos las columnas que no son necesarias

In [2]:
warnings.filterwarnings('ignore')

# importar dataset

data = pd.read_csv('music_genre.csv')

data = data.drop_duplicates(keep=False)

# separar las características de los datos

data = data.drop('instance_id', axis=1)
data = data.drop('artist_name', axis=1)
data = data.drop('track_name', axis=1)
data = data.drop('obtained_date', axis=1)
input_layer_size = 14



Aqui mostramos la distribucion de las clases, que en este caso resultan tener las mismas cantidades de datos

In [3]:
display(data.groupby(["music_genre"])["music_genre"].count())   

music_genre
Alternative    5000
Anime          5000
Blues          5000
Classical      5000
Country        5000
Electronic     5000
Hip-Hop        5000
Jazz           5000
Rap            5000
Rock           5000
Name: music_genre, dtype: int64

Validamos que no existan datos repetidos ni nulos

In [4]:
not (data.isnull().values.any() and data.duplicated())

True

Vemos una lista de llos generos existentes en el dataset

In [5]:
# generamos una lista con los generos existentes en el dataset
generos = sorted(list(pd.unique(data["music_genre"])))

num_labels = len(generos)
generos


['Alternative',
 'Anime',
 'Blues',
 'Classical',
 'Country',
 'Electronic',
 'Hip-Hop',
 'Jazz',
 'Rap',
 'Rock']

In [6]:
modo = sorted(list(pd.unique(data["mode"])))
modo

['Major', 'Minor']

In [7]:
clave = sorted(list(pd.unique(data["key"])))
clave

['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#']

transformamos los datos de Y a valores numericos usando LabelEncoder

In [8]:
data["music_genre"] = LabelEncoder().fit_transform(data["music_genre"])
data

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,5
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,0.531,5
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,5
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,5
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,0.330,6
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,0.113,6
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,6
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,0.354,6


Lo mismo para "mode" y "key"

In [9]:
data["mode"] = LabelEncoder().fit_transform(data["mode"])
data

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,1,0.0748,100.889,0.759,5
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,1,0.0300,115.00200000000001,0.531,5
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,0,0.0345,127.994,0.333,5
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,0,0.2390,128.014,0.270,5
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,0,0.0413,145.036,0.323,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,0,0.2980,98.02799999999999,0.330,6
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,0,0.0550,122.04299999999999,0.113,6
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,0,0.1460,131.079,0.395,6
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,1,0.0441,75.88600000000001,0.354,6


In [10]:
data["key"] = LabelEncoder().fit_transform(data["key"])
data

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.79200,1,0.115,-5.201,1,0.0748,100.889,0.759,5
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,5,0.124,-7.043,1,0.0300,115.00200000000001,0.531,5
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,11,0.534,-4.617,0,0.0345,127.994,0.333,5
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,4,0.157,-4.498,0,0.2390,128.014,0.270,5
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,9,0.157,-6.266,0,0.0413,145.036,0.323,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,-1.0,0.574,0.00000,4,0.119,-7.022,0,0.2980,98.02799999999999,0.330,6
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,2,0.109,-9.814,0,0.0550,122.04299999999999,0.113,6
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,5,0.143,-5.443,0,0.1460,131.079,0.395,6
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,10,0.106,-5.016,1,0.0441,75.88600000000001,0.354,6


Eliminamos las filas con los valores "?" ya que podrian variar los resultados de los calculos

In [11]:
# tempo "?" a 0
data.drop(data[data['tempo'] == '?'].index , inplace = True)
data["tempo"] = data["tempo"].astype(float)
data


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,-1.0,0.941,0.79200,1,0.115,-5.201,1,0.0748,100.889,0.759,5
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,5,0.124,-7.043,1,0.0300,115.002,0.531,5
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,11,0.534,-4.617,0,0.0345,127.994,0.333,5
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,4,0.157,-4.498,0,0.2390,128.014,0.270,5
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,9,0.157,-6.266,0,0.0413,145.036,0.323,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,-1.0,0.574,0.00000,4,0.119,-7.022,0,0.2980,98.028,0.330,6
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,2,0.109,-9.814,0,0.0550,122.043,0.113,6
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,5,0.143,-5.443,0,0.1460,131.079,0.395,6
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,10,0.106,-5.016,1,0.0441,75.886,0.354,6


lo mismo para los valores -1 en la columna de duracion ya que no representan un valor util

In [12]:

data.drop(data[data['duration_ms'] == -1].index , inplace = True)
data["duration_ms"] = data["duration_ms"].astype(float)
data


Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
1,31.0,0.01270,0.622,218293.0,0.890,0.950000,5,0.124,-7.043,1,0.0300,115.002,0.531,5
2,28.0,0.00306,0.620,215613.0,0.755,0.011800,11,0.534,-4.617,0,0.0345,127.994,0.333,5
3,34.0,0.02540,0.774,166875.0,0.700,0.002530,4,0.157,-4.498,0,0.2390,128.014,0.270,5
4,32.0,0.00465,0.638,222369.0,0.587,0.909000,9,0.157,-6.266,0,0.0413,145.036,0.323,5
6,46.0,0.02890,0.572,214408.0,0.803,0.000008,2,0.106,-4.294,0,0.3510,149.995,0.230,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49999,56.0,0.13300,0.849,237667.0,0.660,0.000008,3,0.296,-7.195,0,0.0516,99.988,0.629,6
50001,72.0,0.15700,0.709,251860.0,0.362,0.000000,2,0.109,-9.814,0,0.0550,122.043,0.113,6
50002,51.0,0.00597,0.693,189483.0,0.763,0.000000,5,0.143,-5.443,0,0.1460,131.079,0.395,6
50003,65.0,0.08310,0.782,262773.0,0.472,0.000000,10,0.106,-5.016,1,0.0441,75.886,0.354,6


In [13]:
train_data = data.sample(frac=0.8, random_state=200)
test_data = data.drop(train_data.index)

In [14]:
X = train_data.drop("music_genre", axis=1)

# separar las etiquetas de los datos
Y = train_data["music_genre"]
X.shape, Y.shape


((32448, 13), (32448,))

In [15]:
def  featureNormalize(X):
    X_norm = X.copy()
    mu = np.zeros(X.shape[1])
    sigma = np.zeros(X.shape[1])

    mu = np.mean(X, axis = 0)
    sigma = np.std(X, axis = 0)
    X_norm = (X - mu) / sigma

    return X_norm, mu, sigma

In [16]:
# llama featureNormalize con los datos cargados
X_norm, mu, sigma = featureNormalize(X)
X_norm

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
22815,0.945870,-0.590160,0.270198,-0.502657,0.813100,-0.558867,-0.675431,-0.078241,0.798472,-0.750989,-0.644424,-0.351113,0.319387
41944,-0.595827,2.001689,-1.491933,-0.585280,-2.045329,2.204818,1.343555,-0.363516,-3.092732,-0.750989,-0.547580,0.577077,-0.113753
15357,0.753158,0.774588,0.359703,-0.479591,-0.213022,-0.558801,1.055128,-0.208475,0.461217,-0.750989,-0.102884,-1.295860,0.327483
44897,-0.531589,1.811328,-1.827577,0.385187,-1.842745,1.999305,-0.098578,0.225640,-2.098105,-0.750989,-0.531768,-1.091461,-1.065041
15345,1.652481,-0.884167,-0.333961,-0.125395,0.869687,-0.558867,1.055128,-0.338710,0.644557,1.331577,-0.509039,-0.992244,-0.012552
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8134,-1.559387,-0.664548,0.376485,-0.231443,0.990407,-0.558867,-0.098578,-0.594838,0.749646,-0.750989,-0.167118,1.896888,1.088514
18980,0.367734,-0.657226,0.533119,-1.079127,-0.824168,-0.558644,-0.387004,-0.115451,0.028574,-0.750989,-0.419112,0.661529,-1.307923
16869,-0.338877,1.383746,0.678565,0.254019,-1.363637,-0.558542,0.189849,-0.468944,-0.478440,-0.750989,-0.625648,-0.286804,-1.324115
29058,0.688920,-0.615932,1.215595,-0.552026,-0.563865,-0.558867,-0.387004,-0.555767,0.063334,-0.750989,0.181721,-1.297164,0.396300


In [17]:
# Configurar la matriz adecuadamente, y agregar una columna de unos que corresponde al termino de intercepción.
m, n = X.shape
# Agraga el termino de intercepción a A

X = X_norm

In [18]:
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

In [19]:
def lrCostFunction(theta, X, y, lambda_):
    m = y.size
    if y.dtype == bool:
        y = y.astype(int)

    J = 0
    grad = np.zeros(theta.shape)

    h = sigmoid(X.dot(theta.T))

    temp = theta
    temp[0] = 0
    J = (1 / m) * np.sum(-y.dot(np.log(h)) - (1 - y).dot(np.log(1 - h)))
    grad = (1 / m) * (h - y).dot(X)

    return J, grad

In [20]:
def oneVsAll(X, y, num_labels, lambda_):
    # algunas variables utiles
    m, n = X.shape

    all_theta = np.zeros((num_labels, n + 1))

    # Agrega unos a la matriz X
    X = np.concatenate([np.ones((m, 1)), X], axis=1)

    for c in np.arange(num_labels):
        initial_theta = np.zeros(n + 1)
        options = {'maxiter': 50}
        res = optimize.minimize(lrCostFunction,
                                initial_theta,
                                (X, (y == c), lambda_),
                                jac=True,
                                method='CG',
                                options=options)

        all_theta[c] = res.x

    return all_theta

In [21]:
lambda_ = 0.1
all_theta = oneVsAll(X, Y, num_labels, lambda_)
print(all_theta.shape)

(10, 14)


In [22]:
print(all_theta)

[[-2.56512391e+00  4.08468588e-01 -2.29823530e-01 -3.92550486e-01
  -1.77716339e-01  2.40814563e-01 -4.05035986e-01  2.26850304e-02
  -6.81384747e-02  3.14783283e-01  5.83347565e-02 -1.91353280e-01
  -9.60896698e-02 -1.96442717e-01]
 [-3.92865505e+00 -2.14600354e+00  1.31617997e-01 -4.72789700e-01
  -3.91179483e-01  2.35837702e-01  1.94580717e-01  3.34247564e-02
  -2.43564720e-01  1.24736028e+00  8.19144967e-02 -5.75020279e-01
   1.25186554e-01 -1.04798902e-01]
 [-2.84747657e+00 -7.94115361e-01 -1.84820312e-01 -4.59657915e-01
   1.20982210e-01 -5.11187621e-01 -6.45821340e-01 -5.95248565e-02
   2.00303901e-01  2.23524077e-01 -8.73405534e-02 -4.15816658e-01
  -6.39569729e-02  9.11040006e-01]
 [-4.50349038e+00 -4.25507517e-01  1.10139632e+00 -1.22087875e+00
   2.14885467e-01  9.60711869e-01  5.33864335e-02  4.93048279e-02
  -6.31161068e-02 -1.48808324e+00 -1.29817543e-01 -8.52404342e-02
  -7.22649617e-02  1.79630746e-02]
 [-4.28178151e+00 -5.31069049e-02  5.06085629e-02  5.26237837e-02
  

In [23]:
def predictOneVsAll(all_theta, X):

    m = X.shape[0]
    num_labels = all_theta.shape[0]

    p = np.zeros(m)

    # Add ones to the X data matrix
    X = np.concatenate([np.ones((m, 1)), X], axis=1)
    p = np.argmax(sigmoid(X.dot(all_theta.T)), axis = 1)

    return p

In [24]:
print(X.shape)
pred = predictOneVsAll(all_theta, X)
print('Precision del conjuto de entrenamiento: {:.2f}%'.format(np.mean(pred == Y) * 100))
X_train = test_data.drop("music_genre", axis=1)
Y_train = test_data["music_genre"]
print(X_train.shape)

X_train = np.concatenate([np.ones((X_train.shape[0], 1)), X_train], axis=1)
p = np.argmax(sigmoid(X_train.dot(all_theta.T)), axis = 1)
print(p)

print('Precision del conjuto de prueba: {:.2f}%'.format(np.mean(p == Y_train) * 100))

(32448, 13)
Precision del conjuto de entrenamiento: 51.11%
(8112, 13)
[2 2 2 ... 2 2 2]
Precision del conjuto de prueba: 10.38%


# No se observan cambios con respecto al metodo anterior