In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import classification_report
import lightgbm as lgbm
import xgboost as xgb

In [3]:
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from imblearn.over_sampling import SMOTE, ADASYN

In [5]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
import torch.nn.functional as F

In [7]:
from sklearn.model_selection import train_test_split

df_data = pd.read_csv('data/recleaned_data_stdscle_v2.csv')


In [8]:
labels = df_data.decade

df_data = df_data.drop(['decade','year','popularity'], axis=1)
df_data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1.290829,0.34309,-0.586049,-0.946776,0,-0.578414,5,0.929594,-0.152122,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.277648,-1.888467,-0.431375,-0.909987,0,1.401622,8,0.123664,-0.645386,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.298737,-0.83563,-0.620337,-1.299951,0,-0.117816,2,-0.630575,-0.608907,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.293465,-1.110283,-0.554578,-1.049785,0,-0.566669,0,-0.304312,-0.388445,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.027206,-0.732635,-0.434248,-0.681894,1,-0.579976,0,-0.671149,-0.372584,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_data.columns

Index(['acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'valence', 'key_0', 'key_1', 'key_2', 'key_3',
       'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10',
       'key_11'],
      dtype='object')

In [10]:
# label_bin = LabelBinarizer()
# labels = label_bin.fit_transform(labels)

label_enc = LabelEncoder()
labels = label_enc.fit_transform(labels)

In [11]:
labels

array([ 0,  0,  0, ..., 10, 10, 10], dtype=int64)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df_data, labels, random_state=2, test_size=0.20)

In [13]:
X_train.shape, y_train.shape, y_test.shape

((125683, 25), (125683,), (31421,))

In [14]:
model = nn.Sequential(nn.Linear(25,1000),
                      nn.ReLU(),
                      nn.Linear(1000,100),
                      nn.ReLU(),
                      nn.Linear(100,11))

In [15]:
model

Sequential(
  (0): Linear(in_features=25, out_features=1000, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1000, out_features=100, bias=True)
  (3): ReLU()
  (4): Linear(in_features=100, out_features=11, bias=True)
)

In [16]:
optimizer = optim.SGD(model.parameters(), lr=3e-2)

In [17]:
loss = nn.CrossEntropyLoss()

In [18]:
X_train, X_test, y_train, y_test = map(torch.tensor, (X_train.to_numpy(dtype=np.float32),
                                                      X_test.to_numpy(dtype=np.float32),
                                                      y_train,
                                                      y_test)
                                      )

In [19]:
train_dataset = TensorDataset(X_train, y_train)
test_dataset  = TensorDataset(X_test, y_test)

In [20]:
train_dl = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_dl  = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [21]:
epochs = 10

In [22]:
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_normal_(m.weight)

model.apply(init_weights);

In [23]:
torch.tensor(3).item()

3

In [24]:
from tqdm import tqdm

In [25]:
train_loss = []
val_loss = []
acc = []
for epoch in range(epochs):
    epoch_train_loss = []
    for X,y in train_dl:
        model_loss = loss(model(X), y)
        model_loss.backward()
        epoch_train_loss.append(model_loss.item())
        optimizer.step()
        optimizer.zero_grad()
        
    epoch_val_loss = []
    with torch.no_grad():
        for X,y in test_dl:
            epoch_val_loss.append(loss(model(X), y).item())
            
            
    
    train_loss.append(sum(epoch_train_loss)/len(epoch_train_loss))
    val_loss.append(sum(epoch_val_loss)/len(epoch_val_loss))
    
    print(f'train loss: {train_loss[-1]:.3f}\n validation loss: {val_loss[-1]:.3f}')
        
    
        
        

train loss: 1.855
 validation loss: 1.783
train loss: 1.746
 validation loss: 1.735
train loss: 1.712
 validation loss: 1.702
train loss: 1.690
 validation loss: 1.685
train loss: 1.673
 validation loss: 1.679
train loss: 1.661
 validation loss: 1.669
train loss: 1.651
 validation loss: 1.655
train loss: 1.642
 validation loss: 1.652
train loss: 1.635
 validation loss: 1.646
train loss: 1.628
 validation loss: 1.635


In [26]:
def accuracy(y_hat, y):  #@save
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=-1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(torch.float).mean())

In [27]:
acc = []

with torch.no_grad():
    for X,y in test_dl:
        acc.append(accuracy(model(X), y))

In [28]:
y_pred = []
with torch.no_grad():
    for X,y in test_dl:
        y_pred.append(torch.argmax(model(X), -1).numpy())

In [29]:
sum(acc)/len(acc)

0.3866152670648363

In [76]:
len(y_pred)

315

In [30]:
y_pred1 = np.zeros_like(y_test)

In [34]:
for j,i in enumerate(range(0,y_test.shape[0], 100)):
    y_pred1[i:i+100] = np.array(y_pred[j])

In [35]:
from sklearn.metrics import precision_recall_fscore_support, recall_score

In [36]:
recall_score(y_test, y_pred1, average='weighted')

0.3862703287610197

In [37]:
precision_recall_fscore_support(y_test,y_pred1, average='weighted')

(0.40020522130704594, 0.3862703287610197, 0.37198412745646553, None)