# Setup

In [31]:
#import libraries and functions
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = 2

#import dataset as a dataframe from the combined csv
if (dataset == 0):
    df = pd.read_csv('../../Project1/combined0_17.csv')
elif (dataset == 1):
    df = pd.read_csv('../../Project1/completeData0.csv')
else: 
    df = pd.read_csv('../../Project1/completeData17.csv')

#print first 5 rows of df
print(df.head(), "\n", df.shape)

   user_id              artist_name                  release_name  \
0    16493     Greg MacPherson Band  Good Times Coming Back Again   
1     8793  Wolfgang Amadeus Mozart     The World of Sacred Music   
2     6263                    Japan                      Tin Drum   
3     5838                   Enigma          The Cross of Changes   
4     1061               Paul Simon                     Graceland   

                                      recording_name        date      time  
0                                            Numbers  2006-11-29  13:19:10  
1                                   Ave Verum Corpus  2006-11-29  13:52:16  
2                                             Ghosts  2006-11-29  13:59:42  
3                   Age of Loneliness (Carly’s Song)  2006-11-29  13:55:42  
4  All Around the World or the Myth of Fingerprin...  2006-11-29  14:04:29   
 (66936, 6)


In [32]:
#encode the categorical variables:
#artist_name, release_name, recording_name

df['artist_name'] = df['artist_name'].astype('category').cat.codes
df['release_name'] = df['release_name'].astype('category').cat.codes
df['recording_name'] = df['recording_name'].astype('category').cat.codes

#print new head and shape of df
print(df.head(), "\n", df.shape)

#print the translation from code to artists, so we can see it worked

print(dict(zip(df['artist_name'].astype('category').cat.categories, df['artist_name'].astype('category').cat.codes)))

   user_id  artist_name  release_name  recording_name        date      time
0    16493         3665          4240            9463  2006-11-29  13:19:10
1     8793         9899         10764            1240  2006-11-29  13:52:16
2     6263         4286         10945            5040  2006-11-29  13:59:42
3     5838         2913          9918             676  2006-11-29  13:55:42
4     1061         6645          4260             759  2006-11-29  14:04:29 
 (66936, 6)
{0: 3665, 1: 9899, 2: 4286, 3: 2913, 4: 6645, 5: 2913, 6: 6797, 7: 6645, 8: 6593, 9: 7122, 10: 3747, 11: 8978, 12: 6257, 13: 8978, 14: 1138, 15: 3956, 16: 5510, 17: 1620, 18: 6203, 19: 7122, 20: 7716, 21: 9445, 22: 2813, 23: 5709, 24: 499, 25: 8599, 26: 1855, 27: 8599, 28: 9553, 29: 3024, 30: 5815, 31: 7122, 32: 5310, 33: 9060, 34: 2528, 35: 196, 36: 5472, 37: 3361, 38: 6715, 39: 342, 40: 1138, 41: 5449, 42: 6715, 43: 8297, 44: 9606, 45: 3252, 46: 6445, 47: 950, 48: 6645, 49: 10082, 50: 10154, 51: 7, 52: 9535, 53: 4289, 54: 6

In [33]:
#drop tall columns except artist since we will only be looking at artists first
dfNoTime = df.drop(['date', 'time', 'release_name', 'recording_name'], axis = 1)

#drop duplicate results (user/artist interactions)
dfNoTime = dfNoTime.drop_duplicates()

print(dfNoTime.head(), "\n", dfNoTime.shape)

   user_id  artist_name
0    16493         3665
1     8793         9899
2     6263         4286
3     5838         2913
4     1061         6645 
 (30266, 2)


In [34]:
#train/test split (randomized every time!)
#80/20
train, test = train_test_split(dfNoTime, test_size = 0.2)

#print head of each set followed by maxs of each set
print(train.head(), "\n\n", test.head(), "\n")

print(f"\tSizes:\n\ntrain:\t{train.shape[0]}\t\ttest:\t{test.shape[0]}")

       user_id  artist_name
5412      7956         5746
34118      700         6603
9533      5287         8228
20338      694         7140
8523     14446         4384 

        user_id  artist_name
7048     22536         8480
49225     3540         4665
33375    16189         1876
20046     5544         6858
30225     1039         7790 

	Sizes:

train:	24212		test:	6054


In [35]:
#will make a series of item interaction matrix factorizations and store them with the appropriate names for only artists
#First obtains the interaction matrix between all userse and artists by count
#then changes the mapping to 0,1 with 1 indicating there was an interaction between the user and the artist

#train user item interaction matrices
user_artist_train = train.pivot_table(index = 'user_id', columns = 'artist_name', aggfunc = 'size', fill_value = 0).map(lambda x: 1 if x != 0 else 0)

print(f"user_artist_train: {user_artist_train.shape}")
print("\n\n")

#test user item interaction matrices
user_artist_test = test.pivot_table(index = 'user_id', columns = 'artist_name', aggfunc = 'size', fill_value = 0).map(lambda x: 1 if x != 0 else 0)

print(f"user_artist_test: {user_artist_test.shape}")

user_artist_train: (1608, 9069)



user_artist_test: (1330, 3405)


In [36]:
#lets see a sample of the user interaction matrix (pivot table)
print(user_artist_train)

#save to csv for viewing in excel
# import numpy as np
# np.savetxt("test.csv", user_artist_train, delimiter=',')

artist_name  0      1      2      3      4      5      6      7      8      \
user_id                                                                      
5                0      0      0      0      0      0      0      0      0   
6                0      0      0      0      0      0      0      0      0   
10               0      0      0      0      0      0      0      0      0   
11               0      0      0      0      0      0      0      0      0   
14               0      0      0      0      0      0      0      0      0   
...            ...    ...    ...    ...    ...    ...    ...    ...    ...   
29635            0      0      0      0      0      0      0      0      0   
29839            0      0      0      0      0      0      0      0      0   
29854            0      0      0      0      0      0      0      0      0   
29886            0      0      0      0      0      0      0      0      0   
29911            0      0      0      0      0      0      0    

In [37]:
#print the values for the rows (user_id's) and the columns (artists)
print(user_artist_train.index.to_numpy())
print()
print(user_artist_train.columns.to_numpy())

[    5     6    10 ... 29854 29886 29911]

[    0     1     2 ... 10499 10500 10501]


In [38]:
#decompose the train matrix into parts
import numpy as np

#flatten user item interaction matrix into 2 parts: 
#   user_ids_flat = for all user_id's, repeats it the number of times per column => each user gets a column for each artist
#   item_ids_flat = for all artists, repeats it the number of times per row => each artist gets a row for each user
user_ids_flat = np.repeat(user_artist_train.index.to_numpy(), len(user_artist_train.columns.to_numpy()))
item_ids_flat = np.tile(user_artist_train.columns.to_numpy(), len(user_artist_train.index.to_numpy()))

#get flattened labels for entire matrix
labels = user_artist_train.values.flatten()

#verify length of labels = number of rows * number of cols
# i.e. it holds all the data in a flattened array
print(len(labels) == len(user_artist_train.index.to_numpy()) * len(user_artist_train.columns.to_numpy()))

True


# Algorithms on Artists

## NCF algorithm

Make sure to change the interpreter!

In [9]:
import tensorflow as tf

if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

# List the available GPUs and their memory information
gpus = tf.config.get_visible_devices('GPU')
for gpu in gpus:
    memory_info = tf.config.experimental.get_memory_info('GPU:0')
    print(f"GPU: {gpu.name}")
    print(f"Current memory usage: {memory_info['current']} bytes")
    print(f"Peak memory usage: {memory_info['peak']} bytes")

#ensure we're able to use the GPU for processing the stuff
#I'm utilizing my home PC which has an NVIDIA RTX 3070

GPU is available
GPU: /physical_device:GPU:0
Current memory usage: 0 bytes
Peak memory usage: 0 bytes


In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Dense, Flatten, Dropout

#create NCF model structure

#input layers for user and item
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

#embedding layers to flatten the information from user-item interaction matrix
user_embedding = Embedding(input_dim=len(user_ids_flat), output_dim=5, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=len(item_ids_flat), output_dim=5, name='item_embedding')(item_input)

#flatten embedding layers for dot product
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

#dot product between user and item vectors (matrices)
dot_product = Dot(axes=1)([user_vec, item_vec])


#hidden layer with 512 neurons
h1 = Dense(2048, activation = 'relu') (dot_product)

#add 20% dropout to reduce overfitting
h1 = Dropout(0.2)(h1)


#hidden layer with 512 neurons
h2 = Dense(1024, activation = 'relu')(h1)

#add 20% dropout to reduce overfitting
h2 = Dropout(0.2)(h2)


#hidden layer with 512 neurons
h3 = Dense(512, activation = 'relu')(h2)

#add 20% dropout to reduce overfitting
h3 = Dropout(0.2)(h3)

#output with sigmoid activation function for binary classification (user likes artist)
output = Dense(1, activation='sigmoid')(h3)

#defining the model
ncf_model = Model(inputs=[user_input, item_input], outputs=output)

In [11]:
from tensorflow.keras import metrics

#model compilation
ncf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', metrics.Precision(), metrics.Recall()])

#print summary of model too
ncf_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 5)         77220560    ['user_input[0][0]']             
                                                                                                  
 item_embedding (Embedding)     (None, 1, 5)         77220560    ['item_input[0][0]']             
                                                                                              

In [12]:
#insert data as a single table into the model
user_item_data = [user_ids_flat, item_ids_flat]

#train the model on the combined data
ncf_model.fit(user_item_data, labels, epochs=5, batch_size=32768) # 65536 32768

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26ec90c20a0>

In [13]:
#My notes upon changing the model for each significant change:

#Using the CPU

#111 minutes for 5 epochs with batch size 32768 for binary cross entropy loss function

#221 minutes for 10 epochs with batch size 32768 for MSE

#dataset 0: 424 mins for 5 epochs with batch size 16384



#Using the GPU

#dataset 17: 5 minutes for 5 epochs with batch size 32768
#5 minutes with adjusted neuron counts for same epochs and batch size

In [14]:
#decompose the test matrix into parts
import numpy as np

#flatten user item interaction matrix into 2 parts: 
#   user_ids_flat = for all user_id's, repeats it the number of times per column => each user gets a column for each artist
#   item_ids_flat = for all artists, repeats it the number of times per row => each artist gets a row for each user
user_ids_flat = np.repeat(user_artist_test.index.to_numpy(), len(user_artist_test.columns.to_numpy()))
item_ids_flat = np.tile(user_artist_test.columns.to_numpy(), len(user_artist_test.index.to_numpy()))

#get flattened labels for entire matrix
labels = user_artist_test.values.flatten()

#verify length of labels = number of rows * number of cols
# i.e. it holds all the data in a flattened array
print(len(labels) == len(user_artist_test.index.to_numpy()) * len(user_artist_test.columns.to_numpy()))

True


In [15]:
#model prediction on test data
pred = ncf_model.predict([user_ids_flat, item_ids_flat])

#takes around 15 mins on parquet 17

#5 minutes with adjusted neuron counts for hidden layers



In [16]:
#reshape the prediction so we can see the results
pred = pred.reshape(len(user_artist_test.index.to_numpy()),len(user_artist_test.columns.to_numpy()))
print(pred.shape)
print(pred)

#save to csv for viewing on machine
np.savetxt("test.csv", pred, delimiter=',')

(1422, 4595)
[[6.41770894e-05 1.50387571e-03 1.14942246e-04 ... 3.81117308e-04
  6.41770894e-05 6.85204053e-03]
 [3.93285554e-05 4.01007431e-03 6.11971179e-03 ... 3.90591379e-03
  2.04896159e-03 2.10656915e-02]
 [3.27590038e-03 1.05575717e-03 2.87834046e-05 ... 7.01974437e-04
  1.40420525e-04 1.94723054e-03]
 ...
 [1.51610207e-02 3.65556101e-03 9.83655264e-05 ... 3.49230453e-04
  5.85206086e-04 4.67651896e-03]
 [4.98800888e-04 1.44600589e-02 9.85409133e-04 ... 1.17245328e-03
  2.76998297e-04 9.75499395e-03]
 [1.29936016e-04 8.16146086e-04 8.52018013e-04 ... 2.89623014e-04
  2.62918533e-04 8.34032334e-03]]


## Collaborative Filtering

Make sure to change the interpreter!

In [39]:
#we will be using the same user-artist interaction matrix from before: See `train` earlier in the code
print(train.head())

#append a new column with a 1 in each value indicating an interaction between the user and artist
train['interaction'] = 1

print("\n", train.head())

       user_id  artist_name
5412      7956         5746
34118      700         6603
9533      5287         8228
20338      694         7140
8523     14446         4384

        user_id  artist_name  interaction
5412      7956         5746            1
34118      700         6603            1
9533      5287         8228            1
20338      694         7140            1
8523     14446         4384            1


In [40]:
#convert from a dataframe to a csr_matrix (compressed sparse row matrix)
from scipy.sparse import csr_matrix

row = train['user_id']
col = train['artist_name']
interaction = train['interaction']

user_artist_train_matrix = csr_matrix((interaction, (row, col)))
print(user_artist_train_matrix)

  (5, 3617)	1
  (5, 4580)	1
  (5, 8272)	1
  (5, 10094)	1
  (6, 344)	1
  (6, 1396)	1
  (6, 2571)	1
  (6, 2685)	1
  (6, 3431)	1
  (6, 4560)	1
  (6, 5746)	1
  (6, 6133)	1
  (6, 6346)	1
  (6, 6380)	1
  (6, 6595)	1
  (6, 6799)	1
  (6, 6994)	1
  (6, 7180)	1
  (6, 8284)	1
  (6, 8294)	1
  (6, 8964)	1
  (6, 9173)	1
  (6, 9549)	1
  (6, 9764)	1
  (10, 165)	1
  :	:
  (29854, 4592)	1
  (29854, 4907)	1
  (29854, 6983)	1
  (29854, 8190)	1
  (29886, 409)	1
  (29886, 1010)	1
  (29886, 1358)	1
  (29886, 2456)	1
  (29886, 3129)	1
  (29886, 3584)	1
  (29886, 4171)	1
  (29886, 6432)	1
  (29886, 7769)	1
  (29886, 7812)	1
  (29886, 7822)	1
  (29886, 8194)	1
  (29886, 8929)	1
  (29886, 9439)	1
  (29886, 9600)	1
  (29886, 10204)	1
  (29911, 1092)	1
  (29911, 3893)	1
  (29911, 9815)	1
  (29911, 9964)	1
  (29911, 10246)	1


In [49]:
import implicit

#model fitting
model = implicit.als.AlternatingLeastSquares(factors = 150, regularization = 0.001, iterations = 100)
model.fit(user_artist_train_matrix)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:44<00:00,  2.26it/s]


In [50]:
#adjust the test data to be a csr

#append a new column with a 1 in each value indicating an interaction between the user and artist
test['interaction'] = 1

#prep for turning test into a csr
row = test['user_id']
col = test['artist_name']
interaction = test['interaction']

#creating the csr
user_artist_test_matrix = csr_matrix((interaction, (row, col)))

In [51]:
#model predictions
import implicit.evaluation

implicit.evaluation.AUC_at_k(model, train_user_items = user_artist_train_matrix, test_user_items = user_artist_test_matrix, K = 10, show_progress = True, num_threads = 4)

#ROC score of 0.5210884671362229 ~ random === guessing

100%|██████████| 1330/1330 [00:00<00:00, 10472.59it/s]


0.5202834576915786

In [56]:
#perform grid search to find the best params:

#define param grid to search
param_grid = {
    'factors' : [8, 10, 15],
    'regularization' : [0.15, 0.18, 0.2, 0.23, 0.27, 0.3],
    'iterations' : [60, 65, 75, 80, 85],
    'alphas' : [10]
}

#define best AUC and params found
best_AUC = -1
best_params = None

#perform Grid Search
for factor in param_grid['factors']:
    for reg in param_grid['regularization']:
        for iter in param_grid['iterations']:
            for alp in param_grid['alphas']:
                #define model with given params
                #utilizing 4 cores on my CPU to speed up process
                model = implicit.als.AlternatingLeastSquares(factors = factor, regularization = reg, iterations = iter, alpha = alp, num_threads = 4)
                
                #model training
                model.fit(user_artist_train_matrix)

                #obtain AUC (eval)
                #k = 10 => model bases performance on recommendation for top 10 artists
                auc = implicit.evaluation.AUC_at_k(model, train_user_items = user_artist_train_matrix, test_user_items = user_artist_test_matrix, K = 10, show_progress = True, num_threads = 4)

                #compare AUC score and if auc is better, we update current stored bests
                if auc > best_AUC:
                    best_AUC = auc
                    best_params = {'factor' : factor, 'reg' : reg, 'iter' : iter, 'alpha' : alp}

print(f"Best hyperparams via grid search: {best_params}\nBest AUC: {best_AUC}")
#order for best_params is: factor, reg, iter, alpha

100%|██████████| 60/60 [00:08<00:00,  7.12it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13999.89it/s]
100%|██████████| 65/65 [00:09<00:00,  6.70it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13854.31it/s]
100%|██████████| 75/75 [00:11<00:00,  6.78it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13711.36it/s]
100%|██████████| 80/80 [00:11<00:00,  6.84it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13996.38it/s]
100%|██████████| 85/85 [00:12<00:00,  6.98it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13964.11it/s]
100%|██████████| 60/60 [00:08<00:00,  6.72it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13571.49it/s]
100%|██████████| 65/65 [00:09<00:00,  6.72it/s]
100%|██████████| 1330/1330 [00:00<00:00, 14176.54it/s]
100%|██████████| 75/75 [00:11<00:00,  6.60it/s]
100%|██████████| 1330/1330 [00:00<00:00, 14061.08it/s]
100%|██████████| 80/80 [00:11<00:00,  7.08it/s]
100%|██████████| 1330/1330 [00:00<00:00, 13999.33it/s]
100%|██████████| 85/85 [00:12<00:00,  6.69it/s]
100%|██████████| 1330/133

Best hyperparams via grid search: {'factor': 8, 'reg': 0.2, 'iter': 75, 'alpha': 10}
Best AUC: 0.5472981589956554





In [None]:
#documenting best performance

#params: 10, 0.1, 25
#auc: 0.5384916964587049

# Best hyperparams via grid search: {'factor': 10, 'reg': 0.05, 'iter': 30}
# Best AUC: 0.5373458879176055

In [None]:
#added alpha to grid search

# Best hyperparams via grid search: {'factor': 10, 'reg': 0.2, 'iter': 75, 'alpha': 10}
# Best AUC: 0.5467240334957757

# Best hyperparams via grid search: {'factor': 8, 'reg': 0.2, 'iter': 75, 'alpha': 10}
# Best AUC: 0.5472981589956554