In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import sys, os
import matplotlib.pyplot as plt
import random

sys.path.append(os.path.abspath('../util'))
# sys.path.append(os.path.abspath('../'))

%matplotlib inline

In [2]:
# Location of the prepared dataset, relative to this notebook.
data_path = Path('../data/')
# data_genre = data_path / Path('data_w_genres.csv')
df_path = data_path.joinpath('df_cleaned__by_artist.csv')

The CSV file contains float feature columns, with the artist used as the label for multiclass prediction.
If a song was produced by more than one artist, the row has been split into multiple rows so that each row
has exactly one artist.

In [3]:
# Load the prepared table; the CSV's first column is used as the row index.
df = pd.read_csv(df_path,index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,artists
0,0.998996,0.716599,0.028442,0.195,0.0,0.563,0.909091,0.151,0.745,1.0,0.0,0.052219,0.485348,0.779,Carl Woitschach
1,0.997992,0.383603,0.051316,0.0135,0.0,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Robert Schumann
2,0.997992,0.383603,0.051316,0.0135,0.0,0.901,0.727273,0.0763,0.494026,1.0,0.0,0.047678,0.344019,0.0767,Vladimir Horowitz
3,0.606426,0.758097,0.018374,0.22,0.0,0.0,0.454545,0.119,0.627609,0.0,0.0,0.95872,0.439086,0.88,Seweryn Goszczyński
4,0.998996,0.790486,0.032538,0.13,0.0,0.887,0.090909,0.111,0.708887,0.0,0.0,0.095562,0.44247,0.72,Francisco Canaro


In [4]:
# Keep the artist column (the prediction label) before it is dropped from df.
artists_names = df['artists']

In [5]:
# Two-way lookup between integer ids and artist names; ids follow the order
# in which each artist first appears in the data.
index_artist = dict(enumerate(artists_names.unique()))
artist_index = {name: idx for idx, name in index_artist.items()}

In [6]:
# Remove the label column so that only the numeric features remain.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['artists'], errors='ignore')

In [7]:
# Sanity check: the id -> name and name -> id lookups should be inverses.
index_artist[2176] , artist_index['Queen']

('Queen', 2176)

# Model

- Create a generator that yields true and fake samples
    - Fake samples: choose a random feature vector and pair it with an artist that is not its true artist (labels: 1 = true, -1 = fake)
- Create an embedding layer for the artists and pass the remaining features to a fully connected (FC) layer
- Train the model to see if it can detect fakes (binary cross-entropy, BCE)
- User inputs a song
    - Featurize the song using the Spotify API
    - Input the features into the model (artist in dict | artist not in dict)
    - This works when the artist is in the dict; how to handle an artist that is not in the dict is still an open question

In [8]:
# Spot-check one row: its artist name, that name's id, and the id mapped back.
artists_names[5000], artist_index[artists_names[5000]], index_artist[2032]

('Paul McCartney', 2032, 'Paul McCartney')

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [10]:
class Dataset(torch.utils.data.Dataset):
    """Real or synthetic (features, artist id, label) samples.

    With a truthy ``is_truth``, item ``i`` is row ``i`` (by position) of
    ``df[cols]`` paired with that row's artist id and the label ``1``.
    Otherwise every item is a uniformly random feature vector paired with the
    artist id of a randomly chosen row and the label ``-1``.

    Note: ``nn.BCELoss`` expects targets in [0, 1]; map the ``-1`` label to
    ``0`` before feeding these labels to a BCE loss.

    Args:
        df: DataFrame holding the feature columns.
        cols: Names of the columns of ``df`` used as features.
        is_truth: Truthy for real samples, falsy for synthetic ones.
        artists: pandas Series of artist names aligned row-for-row with
            ``df``. Defaults to the notebook-level ``artists_names``.
        artist_to_index: Mapping from artist name to integer id. Defaults to
            the notebook-level ``artist_index``.
    """

    TRUE_LABEL = 1
    FAKE_LABEL = -1

    def __init__(self, df, cols, is_truth = 1, artists=None, artist_to_index=None):
        """Store the data sources; see the class docstring for the arguments."""
        self.df = df
        self.cols = cols
        self.is_truth = is_truth
        # Fall back to the notebook globals only when nothing was injected, so
        # existing three-argument calls keep working unchanged.
        self.artists = artists_names if artists is None else artists
        self.artist_to_index = artist_index if artist_to_index is None else artist_to_index
        # float64 copy of df[cols], built on first real-sample access. Later
        # changes to ``df`` are not reflected once this cache exists.
        self._features = None

    def __len__(self):
        """Return the number of rows in ``df``."""
        return len(self.df)

    def _feature_matrix(self):
        """Return the cached (rows, len(cols)) float64 feature tensor."""
        if self._features is None:
            # Slice the frame once instead of on every item.
            self._features = torch.tensor(self.df[self.cols].to_numpy(dtype='float64'))
        return self._features

    def __getitem__(self, index):
        """Return ``(features, artist_id, label)`` for one sample.

        ``features`` is a float64 tensor of shape (1, len(cols)), ``artist_id``
        is the integer id from ``artist_to_index`` and ``label`` is an integer
        tensor of shape (1,).
        """
        if self.is_truth:
            # clone() so callers cannot mutate the cached matrix.
            features = self._feature_matrix()[index].clone().unsqueeze(0)
            # Positional lookup, matching the positional feature row above.
            artist = self.artists.iloc[index]
            label = self.TRUE_LABEL
        else:
            # Width comes from this dataset's own columns, not a global frame.
            features = torch.rand(len(self.cols)).unsqueeze(0).double()
            artist = self.artists.iloc[random.randrange(len(self.df))]
            label = self.FAKE_LABEL
        return features, self.artist_to_index[artist], torch.tensor([label])

In [39]:
class DeepEmbed(nn.Module):
    """Scores how plausible an (audio features, artist) pairing is.

    The artist id is embedded and projected to 128 units, the feature vector
    is projected to 128 units, and the two halves are concatenated and reduced
    to a single sigmoid output.

    Args:
        feature_size: Width of each feature vector.
        lookup_size: Number of rows in the artist embedding table.
        embedding_dim: Width of each artist embedding.
        batch_size: Stored on the module for backward compatibility;
            ``forward`` infers the batch size from its inputs instead.
    """

    def __init__(self, feature_size, lookup_size, embedding_dim, batch_size):
        super(DeepEmbed, self).__init__()
        self.embeddings = nn.Embedding(lookup_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.head = nn.Linear(feature_size, 128)
        self.op_layer = nn.Linear(256, 1)
        self.batch_size = batch_size

    def forward(self, feature_input, lookup_input):
        """Return sigmoid scores for a batch.

        Args:
            feature_input: Float tensor whose last dimension is
                ``feature_size``; the notebook passes (batch, 1, feature_size).
            lookup_input: Integer tensor of artist ids, one per sample.

        Returns:
            Tensor of scores in [0, 1]; (batch, 1) for the shapes above.
        """
        embeds = self.embeddings(lookup_input)
        # Infer the batch dimension from the data rather than the configured
        # batch size, so a final, smaller batch from a DataLoader still works.
        embeds = embeds.view(-1, 1, embeds.shape[-1])
        out1 = F.relu(self.linear1(embeds))
        out2 = F.relu(self.head(feature_input))
        output_concat = torch.cat([out1, out2], dim=-1)
        out = torch.sigmoid(self.op_layer(output_concat))
        return out.squeeze(1)

In [40]:
# Pick the compute device: the first GPU when CUDA is available, else the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
# Turn on cuDNN's benchmark mode.
torch.backends.cudnn.benchmark = True

In [41]:
# Build one dataset of real rows and one of synthetic rows over the same
# frame (so both have the same length), then pool them together.
truth_set = Dataset(df, df.columns.tolist(), is_truth=True)
fake_set = Dataset(df, df.columns.tolist(), is_truth=False)
full_ds = torch.utils.data.ConcatDataset((truth_set, fake_set))

In [54]:
# Model and optimisation hyper-parameters.
lookup_size = len(artists_names.unique())  # one embedding row per distinct artist
emb_dim = 50
feature_size = df.shape[1]  # number of feature columns
learning_rate = 1e-4
batch_size = 256
epochs = 50

# Binary cross-entropy on the model's sigmoid output.
criterion = nn.BCELoss()

In [55]:
# Shuffled mini-batches over the pooled real + synthetic samples.
train_dl = torch.utils.data.DataLoader(
    full_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
# Manual iterator, used to inspect individual batches.
_iter = iter(train_dl)

In [56]:
# Pull one batch to check the shape the DataLoader collates features into.
features, artist, label = next(_iter)
# (features.shape), artist, index_artist[int(artist)] , label
features.shape

torch.Size([256, 1, 14])

In [57]:
# Instantiate the network and an Adam optimizer over all of its parameters.
model = DeepEmbed(
    feature_size=feature_size,
    lookup_size=lookup_size,
    embedding_dim=emb_dim,
    batch_size=batch_size,
)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(model)

DeepEmbed(
  (embeddings): Embedding(31465, 50)
  (linear1): Linear(in_features=50, out_features=128, bias=True)
  (head): Linear(in_features=14, out_features=128, bias=True)
  (op_layer): Linear(in_features=256, out_features=1, bias=True)
)


In [58]:
def train(model, device, train_loader, optimizer, criterion,
          num_epochs=None, max_batches=1):
    """Run the optimisation loop, printing the loss of every processed batch.

    Args:
        model: Module called as ``model(feature, artist)``.
        device: ``torch.device`` that the model and every batch are moved to.
        train_loader: Iterable yielding ``(feature, artist, label)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss called as ``criterion(output, target)``.
        num_epochs: Number of epochs; defaults to the notebook-level ``epochs``.
        max_batches: Batches processed per epoch. The default of 1 keeps the
            original quick-check behaviour (only the first batch of each
            epoch); pass ``None`` to consume the whole loader.
    """
    from itertools import islice

    if num_epochs is None:
        num_epochs = epochs

    # Honour the requested device instead of assuming a GPU is present.
    model.to(device)
    model.train()
    for epoch in range(num_epochs):
        print(f"--EPOCH {epoch}--")
        # islice stops after max_batches items without fetching an extra one.
        batches = train_loader if max_batches is None else islice(train_loader, max_batches)
        for feature, artist, label in batches:
            feature = feature.to(device).float()
            artist = artist.to(device)
            # BCE targets must lie in [0, 1]: map the fake label (-1) to 0 and
            # keep the true label (1) as 1.
            target = (label.to(device) > 0).float()

            output = model(feature, artist)
            loss = criterion(output, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(f"{loss.item()}")

In [59]:
# Move the model to the selected device rather than unconditionally to CUDA,
# so this call does not itself fail on a machine without a GPU.
train(model.to(device), device, train_dl, optimizer, criterion)

--EPOCH 0--
0.6565296649932861
--EPOCH 1--
0.6468492746353149
--EPOCH 2--
0.6615175008773804
--EPOCH 3--
0.6654900908470154
--EPOCH 4--
0.6526414155960083
--EPOCH 5--
0.664181113243103
--EPOCH 6--
0.6336917281150818
--EPOCH 7--
0.6464664340019226
--EPOCH 8--
0.6458895206451416
--EPOCH 9--
0.6296395659446716
--EPOCH 10--
0.6242449283599854
--EPOCH 11--
0.6332712173461914
--EPOCH 12--
0.6397718191146851
--EPOCH 13--
0.6118695735931396
--EPOCH 14--
0.6287440657615662
--EPOCH 15--
0.5983577966690063
--EPOCH 16--
0.6065517663955688
--EPOCH 17--
0.5939013957977295
--EPOCH 18--
0.6081289052963257
--EPOCH 19--
0.6013389825820923
--EPOCH 20--
0.5984784364700317
--EPOCH 21--
0.6073423624038696
--EPOCH 22--
0.5866132974624634
--EPOCH 23--
0.5968610644340515
--EPOCH 24--
0.5926480293273926
--EPOCH 25--
0.585694432258606
--EPOCH 26--
0.5940492153167725
--EPOCH 27--
0.5955565571784973
--EPOCH 28--
0.5734363198280334
--EPOCH 29--
0.5417684316635132
--EPOCH 30--
0.5950939655303955
--EPOCH 31--
0.57295