# Setup

In [1]:
#import libraries and functions
import pandas as pd
from sklearn.model_selection import train_test_split

#import dataset as a dataframe from a csv
df = pd.read_csv('../../Project1/completeData17.csv')

#print first 5 rows of df
print(df.head(), "\n", df.shape)

   user_id              artist_name                  release_name  \
0    16493     Greg MacPherson Band  Good Times Coming Back Again   
1     8793  Wolfgang Amadeus Mozart     The World of Sacred Music   
2     6263                    Japan                      Tin Drum   
3     5838                   Enigma          The Cross of Changes   
4     1061               Paul Simon                     Graceland   

                                      recording_name        date      time  
0                                            Numbers  2006-11-29  13:19:10  
1                                   Ave Verum Corpus  2006-11-29  13:52:16  
2                                             Ghosts  2006-11-29  13:59:42  
3                   Age of Loneliness (Carly’s Song)  2006-11-29  13:55:42  
4  All Around the World or the Myth of Fingerprin...  2006-11-29  14:04:29   
 (66936, 6)


In [2]:
#encode the categorical variables:
#artist_name, release_name, recording_name

df['artist_name'] = df['artist_name'].astype('category').cat.codes
df['release_name'] = df['release_name'].astype('category').cat.codes
df['recording_name'] = df['recording_name'].astype('category').cat.codes

#print new head and shape of df
print(df.head(), "\n", df.shape)

   user_id  artist_name  release_name  recording_name        date      time
0    16493         3665          4240            9463  2006-11-29  13:19:10
1     8793         9899         10764            1240  2006-11-29  13:52:16
2     6263         4286         10945            5040  2006-11-29  13:59:42
3     5838         2913          9918             676  2006-11-29  13:55:42
4     1061         6645          4260             759  2006-11-29  14:04:29 
 (66936, 6)


In [3]:
df2 = df.drop(['date', 'time'], axis = 1)

In [4]:
#train/test split
#80/20
train, test = train_test_split(df2, test_size = 0.2)

#print head of each set followed by sizes of each set
print(train.head(), "\n", test.head())

print("Sizes:\t train\t\t test")
print("\t", train.shape[0], "\t", test.shape[0])

       user_id  artist_name  release_name  recording_name
25722      366         1165          4698           12845
63742     2891         7625           819           11501
43332      677          599          4970           12926
41491     9441          134           440           11053
37914     1953         4657         11353            2414 
        user_id  artist_name  release_name  recording_name
8707       261         8817          8721           11601
43439    12040         9266          2025            2530
25257    28549         4581          6824            7951
42095     2967         4940          9597            7280
11149      642         7023         12248            9059
Sizes:	 train		 test
	 53548 	 13388


In [5]:
#will make a series of matrix factorizations and store them with the appropriate names

user_artist = df.pivot_table(index = 'user_id', columns = 'artist_name', aggfunc = 'size', fill_value = 0)
user_release = df.pivot_table(index = 'user_id', columns = 'release_name', aggfunc = 'size', fill_value = 0)
user_recording = df.pivot_table(index = 'user_id', columns = 'recording_name', aggfunc = 'size', fill_value = 0)

print(user_artist.head())
print("\n\n")
print(user_release.head())
print("\n\n")
print(user_recording.head())

artist_name  0      1      2      3      4      5      6      7      8      \
user_id                                                                      
5                0      0      0      0      0      0      0      0      0   
6                0      0      0      0      0      0      0      0      0   
10               0      0      0      0      0      0      0      0      0   
11               0      0      0      0      0      0      0      0      0   
14               0      0      0      0      0      0      0      0      0   

artist_name  9      ...  10493  10494  10495  10496  10497  10498  10499  \
user_id             ...                                                    
5                0  ...      0      0      0      0      0      0      0   
6                0  ...      0      0      0      0      0      0      0   
10               0  ...      0      0      0      0      0      0      0   
11               0  ...      0      0      0      0      0      0      0 

# Algorithms

In [3]:
# PARQUET 0

#import dataset as a dataframe from a csv
df = pd.read_csv('../../Project1/completeData0.csv')

df['artist_name'] = df['artist_name'].astype('category').cat.codes
df['release_name'] = df['release_name'].astype('category').cat.codes
df['recording_name'] = df['recording_name'].astype('category').cat.codes

df2 = df.drop(['date', 'time'], axis = 1)

#train/test split
#80/20
train, test = train_test_split(df2, test_size = 0.2)

## DBSCAN

In [5]:
# PARQUET 0

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Define a range of eps and minPts values to search over
eps_range = [200000, 500000] #eps_range = [0.1, 0.5, 1.0, 1.5]
minPts_range = [5, 10, 15, 20] #minPts_range = [5, 10, 15, 20]

best_score = -1
best_eps = None
best_minPts = None

for eps in eps_range:
    for minPts in minPts_range:
        print(f"Current eps is: {eps} and current minPts is: {minPts}")
        dbscan = DBSCAN(eps=eps, min_samples=minPts)
        labels = dbscan.fit_predict(train)
        silhouette = silhouette_score(train, labels)
        if silhouette > best_score:
            best_score = silhouette
            best_eps = eps
            best_minPts = minPts

print("Best eps:", best_eps)
print("Best minPts:", best_minPts)
print("Best Score is:", best_score)

print("Previous bests were: eps = 2000, minPts = 5, score = 0.30843316542674687")

ModuleNotFoundError: No module named 'cuml'

In [89]:
# PARQUET 17

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Define a range of eps and minPts values to search over
eps_range = [2000, 3000, 4000] #eps_range = [0.1, 0.5, 1.0, 1.5]
minPts_range = [5, 10, 15, 20] #minPts_range = [5, 10, 15, 20]

best_score = -1
best_eps = None
best_minPts = None

for eps in eps_range:
    for minPts in minPts_range:
        print(f"Current eps is: {eps} and current minPts is: {minPts}")
        dbscan = DBSCAN(eps=eps, min_samples=minPts)
        labels = dbscan.fit_predict(train)
        silhouette = silhouette_score(train, labels)
        if silhouette > best_score:
            best_score = silhouette
            best_eps = eps
            best_minPts = minPts

print("Best eps:", best_eps)
print("Best minPts:", best_minPts)
print("Best Score is:", best_score)

print("Previous bests were: eps = 2000, minPts = 5, score = 0.30843316542674687")

Current eps is: 2000 and current minPts is: 5
checking validity
Current eps is: 2000 and current minPts is: 10
checking validity
Current eps is: 2000 and current minPts is: 15
checking validity
Current eps is: 2000 and current minPts is: 20
checking validity
Current eps is: 3000 and current minPts is: 5
checking validity


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

## NCF algorithm

### New attempt on user item interactions data

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Dense, Flatten, Dropout
import numpy as np

#flatten user item interaction matrix
user_ids_flat = np.repeat(user_artist.index.to_numpy(), len(user_artist.columns.to_numpy()))
item_ids_flat = np.tile(user_artist.columns.to_numpy(), len(user_artist.index.to_numpy()))

# Flattened labels
labels = user_artist.values.flatten()




In [7]:
#create NCF model structure

#input layers for user and item
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

#embedding layers to flatten the information from user-item interaction matrix
user_embedding = Embedding(input_dim=len(user_ids_flat), output_dim=16, name='user_embedding')(user_input)
item_embedding = Embedding(input_dim=len(item_ids_flat), output_dim=16, name='item_embedding')(item_input)

#flatten embedding layers for dot product
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)

#dot product between user and item vectors (matrices)
dot_product = Dot(axes=1)([user_vec, item_vec])

#add 1 hidden layer with 512 neurons
x = Dense(512, activation='relu')(dot_product)

#add 20% dropout to reduce overfitting
x = Dropout(0.2)(x)

#output with sigmoid activation function for binary classification (user likes artist)
output = Dense(1, activation='sigmoid')(x)

#defining the model
ncf_model = Model(inputs=[user_input, item_input], outputs=output)




In [8]:
#model compilation
ncf_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])




In [10]:
#insert data as a single table into the model
user_item_data = [user_ids_flat, item_ids_flat]

#train the model on the combined data
ncf_model.fit(user_item_data, labels, epochs=10, batch_size=32768)

#111 minutes for 5 epochs with batch size 32768 for binary cross entropy loss function

#221 minutes for 10 epochs with batch size 32768 for MSE

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x17b6f00dbd0>

In [11]:
#model eval
ncf_model.evaluate(user_item_data, labels)



[0.0314859077334404, 0.9981666803359985]

In [44]:
#why is my loss negative?
#does the data not have only binary values??

#Binary Cross-Entropy Loss = − (y_actual * log(y_pred) + (1 - y_actual) * log(1 - y_pred))

print(np.unique(labels))

#it does lol

#will change the loss function from BCE to MSE

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  40  41  42  43  44  45  46  47  51  52  53  56  58  59  60
  61  62  63  64  66  72  76  78  86  87  91  94  96  99 118 198]


### Previous attempt

In [50]:
#input layers
user_id_input = Input(shape=(1,), name='user_id_input')
artist_name_input = Input(shape=(1,), name='artist_name_input')
release_name_input = Input(shape=(1,), name='release_name_input')
recording_name_input = Input(shape=(1,), name='recording_name_input')

In [51]:
#embedding size for embedding layers
embeddingSize = 50

#embedding layers
user_embedding = Embedding(input_dim=len(df['user_id'].unique()), output_dim=embeddingSize)(user_id_input)
artist_embedding = Embedding(input_dim=len(df['artist_name'].unique()), output_dim=embeddingSize)(artist_name_input)
release_embedding = Embedding(input_dim=len(df['release_name'].unique()), output_dim=embeddingSize)(release_name_input)
recording_embedding = Embedding(input_dim=len(df['recording_name'].unique()), output_dim=embeddingSize)(recording_name_input)

In [52]:
#concatenate embeddings then flatten them
concatenated = Concatenate()([user_embedding, artist_embedding, release_embedding, recording_embedding])
flatten = Flatten()(concatenated)

In [53]:
#add dense layers for learning
hidden1 = Dense(128, activation = 'relu')(flatten)
hidden2 = Dense(64, activation = 'relu')(hidden1)
output = Dense(1)(hidden2)

In [56]:
#create model
model = Model(inputs = [user_id_input, artist_name_input, release_name_input, recording_name_input], outputs = output)

#compile model
model.compile(optimizer = 'adam', loss = 'mse')

In [58]:
#train model
model.fit([train['user_id'], train['artist_name'], train['release_name'], train['recording_name']], epochs = 10)

Epoch 1/10


ValueError: in user code:

    File "c:\Users\derpi\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\derpi\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\derpi\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\derpi\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1152, in train_step
        self._validate_target_and_loss(y, loss)
    File "c:\Users\derpi\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1106, in _validate_target_and_loss
        raise ValueError(

    ValueError: Target data is missing. Your model was compiled with loss=mse, and therefore expects target data to be provided in `fit()`.


## Collaborative Filtering

In [None]:
# idk how to do it yet but i will figure it out