In [1]:
import pandas as pd

# Load the dataset
file_path = './seleccted_users.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()


Unnamed: 0.1,Unnamed: 0,user,song,time_zone,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
0,0,user_007XIjOr,DaTQ53TUmfP93FSr,0.5,0.578947,0.997018,0.364372,0.257,0.636364,1.0,0.130261,0.316884,0.041089
1,1,user_007XIjOr,DaTQ53TUmfP93FSr,0.5,0.578947,0.997018,0.364372,0.257,0.636364,1.0,0.130261,0.316884,0.041089
2,2,user_02DWuQOR,DaTQ53TUmfP93FSr,0.5,0.578947,0.997018,0.364372,0.257,0.636364,1.0,0.130261,0.316884,0.041089
3,3,user_02DWuQOR,DaTQ53TUmfP93FSr,0.5,0.578947,0.997018,0.364372,0.257,0.636364,1.0,0.130261,0.316884,0.041089
4,4,user_02DWuQOR,DaTQ53TUmfP93FSr,0.5,0.578947,0.997018,0.364372,0.257,0.636364,1.0,0.130261,0.316884,0.041089


In [2]:
# Remove the 'Unnamed: 0' column
data = data.drop(columns=['Unnamed: 0'])

# Convert 'user' and 'song' identifiers to categorical variables and then to integer indices
data['user_id'] = data['user'].astype('category').cat.codes
data['song_id'] = data['song'].astype('category').cat.codes

# Check the number of unique users and songs
num_users = data['user_id'].nunique()
num_songs = data['song_id'].nunique()

# Display the transformation and the number of unique users and songs
transformed_head = data.head()
transformed_head, num_users, num_songs


(            user              song  time_zone  popularity   release  \
 0  user_007XIjOr  DaTQ53TUmfP93FSr        0.5    0.578947  0.997018   
 1  user_007XIjOr  DaTQ53TUmfP93FSr        0.5    0.578947  0.997018   
 2  user_02DWuQOR  DaTQ53TUmfP93FSr        0.5    0.578947  0.997018   
 3  user_02DWuQOR  DaTQ53TUmfP93FSr        0.5    0.578947  0.997018   
 4  user_02DWuQOR  DaTQ53TUmfP93FSr        0.5    0.578947  0.997018   
 
    danceability  energy       key  mode   valence     tempo  duration_ms  \
 0      0.364372   0.257  0.636364   1.0  0.130261  0.316884     0.041089   
 1      0.364372   0.257  0.636364   1.0  0.130261  0.316884     0.041089   
 2      0.364372   0.257  0.636364   1.0  0.130261  0.316884     0.041089   
 3      0.364372   0.257  0.636364   1.0  0.130261  0.316884     0.041089   
 4      0.364372   0.257  0.636364   1.0  0.130261  0.316884     0.041089   
 
    user_id  song_id  
 0        0      443  
 1        0      443  
 2        1      443  
 3        

In [3]:
# We will use the 'song_id' of the next song as the label.
# The features will be all other columns except 'user' and 'song' which are now represented by 'user_id' and 'song_id'.

# Shift the song_id column to get the next song
data['next_song_id'] = data.groupby('user_id')['song_id'].shift(-1)

# Drop the last occurrence of each user where the next song is NaN due to the shift operation
data = data.dropna(subset=['next_song_id'])

# Convert next_song_id to integer as the shift operation converts it to float
data['next_song_id'] = data['next_song_id'].astype(int)

# Now, let's separate the dataframe into X (features) and y (labels)
X = data.drop(columns=['user', 'song', 'next_song_id'])
y = data['next_song_id']

# Display the first few rows of features and labels to verify
X.head(), y.head()


(   time_zone  popularity   release  danceability  energy       key  mode  \
 0        0.5    0.578947  0.997018      0.364372   0.257  0.636364   1.0   
 1        0.5    0.578947  0.997018      0.364372   0.257  0.636364   1.0   
 2        0.5    0.578947  0.997018      0.364372   0.257  0.636364   1.0   
 3        0.5    0.578947  0.997018      0.364372   0.257  0.636364   1.0   
 4        0.5    0.578947  0.997018      0.364372   0.257  0.636364   1.0   
 
     valence     tempo  duration_ms  user_id  song_id  
 0  0.130261  0.316884     0.041089        0      443  
 1  0.130261  0.316884     0.041089        0      443  
 2  0.130261  0.316884     0.041089        1      443  
 3  0.130261  0.316884     0.041089        1      443  
 4  0.130261  0.316884     0.041089        1      443  ,
 0     443
 1    1238
 2     443
 3     443
 4     443
 Name: next_song_id, dtype: int32)

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and validation sets
(X_train.shape, X_val.shape), (y_train.shape, y_val.shape)


(((3088, 12), (773, 12)), ((3088,), (773,)))

In [9]:
sequence_length = 10

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Flatten, concatenate
from tensorflow.keras.optimizers import Adam

# Assuming sequence_length and num_features are defined based on your data
def create_model_with_lstm(num_users, num_songs, sequence_length, num_features, embedding_size=50, lstm_units=64):
    # User and song sequence input layers
    user_sequence_input = Input(shape=(sequence_length,), name='user_sequence_input')
    song_sequence_input = Input(shape=(sequence_length,), name='song_sequence_input')

    # Timestamp input (assuming preprocessed to a suitable format, e.g., scaled)
    timestamp_input = Input(shape=(sequence_length, 1), name='timestamp_input')

    # Embedding layers for user and song sequences
    user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=sequence_length, name='user_embedding')(user_sequence_input)
    song_embedding = Embedding(output_dim=embedding_size, input_dim=num_songs, input_length=sequence_length, name='song_embedding')(song_sequence_input)

    # Flatten the embeddings
    user_vec = Flatten()(user_embedding)
    song_vec = Flatten()(song_embedding)

    # LSTM layer for combined sequence and timestamp data
    combined_input = concatenate([user_vec, song_vec, Flatten()(timestamp_input)])
    lstm_layer = LSTM(lstm_units, return_sequences=False)(Reshape((sequence_length, -1))(combined_input))

    # Concatenate LSTM output with other attributes (if any)
    other_input = Input(shape=(num_features,), name='other_input') 
    concat = concatenate([lstm_layer, other_input])

    # Dense layers
    dense = Dense(256, activation='relu')(concat)
    dropout = Dropout(0.5)(dense)
    output = Dense(num_songs, activation='softmax')(dropout)

    # Create and compile the model
    model = Model(inputs=[user_sequence_input, song_sequence_input, timestamp_input, other_input], outputs=output)
    model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Assuming num_users, num_songs, sequence_length, and num_features are defined based on your data
model = create_model_with_lstm(num_users, num_songs, sequence_length, num_features)

# Display the model summary
model.summary()



NameError: name 'num_features' is not defined

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, concatenate, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

# Define the model
def create_model(num_users, num_songs, embedding_size=50):
    # User and song input layers
    user_input = Input(shape=(1,), name='user_input')
    song_input = Input(shape=(1,), name='song_input')
    print(user_input)
    
    # Other song attributes input layer
    other_input = Input(shape=(X_train.shape[1]-2,), name='other_input')  # minus 2 for user_id and song_id

    # Embedding layers
    user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=1, name='user_embedding')(user_input)
    song_embedding = Embedding(output_dim=embedding_size, input_dim=num_songs, input_length=1, name='song_embedding')(song_input)
    
    # Flatten the embeddings
    user_vec = Flatten()(user_embedding)
    song_vec = Flatten()(song_embedding)
    
    # Concatenate embeddings with other attributes
    concat = concatenate([user_vec, song_vec, other_input])
    
    # Dense layers [[
    dense = Dense(256, activation='relu')(concat)
    dropout = Dropout(0.5)(dense)
    output = Dense(num_songs, activation='softmax')(dropout)
    
    # Create and compile the model
    model = Model(inputs=[user_input, song_input, other_input], outputs=output)
    model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Create the model
model = create_model(num_users, num_songs)

# Display the model summary
model.summary()


KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name='user_input'), name='user_input', description="created by layer 'user_input'")
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 song_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 50)        500         ['user_input[0][0]']             
                                                                                                  
 song_embedding (Embedding)     (None, 1,

In [6]:
# Assuming X_train and y_train are already defined and correctly formatted

# Prepare inputs for the embedding layers
user_ids = X_train['user_id'].values
song_ids = X_train['song_id'].values

# Prepare the other song attributes input
other_attributes = X_train.drop(columns=['user_id', 'song_id']).values

# Train the model
history = model.fit(
    [user_ids, song_ids, other_attributes], # input as a list
    y_train, # labels
    batch_size=32, # example batch size
    epochs=100, # example number of epochs
    validation_split=0.2, # use part of the training data for validation
    verbose=1
)

# Save the model
model.save('song_recommendation_model.h5')

# You can then plot the training history to evaluate the training process
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Similarly for the loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100

KeyboardInterrupt: 

In [None]:
import numpy as np

def make_recommendation(model, user_id, current_song_id, song_features, num_recommendations=5):
    # Prepare the input for the model
    user_input = np.array([user_id])
    song_input = np.array([current_song_id])
    other_features_input = np.array([song_features])
    
    # Get the prediction (probability distribution over songs)
    predictions = model.predict([user_input, song_input, other_features_input])
    
    # Get the indices of the top N recommendations
    recommended_song_ids = predictions[0].argsort()[-num_recommendations:][::-1]
    
    return recommended_song_ids

# Example usage:
user_id = 5  # example user id
current_song_id = 100  # example current song id
song_features = X_val.iloc[0, 2:].values  # example song features excluding user and song id

recommendations = make_recommendation(model, user_id, current_song_id, song_features)
print(f"Recommended song IDs for user {user_id}: {recommendations}")


In [None]:
# Create a mapping from 'song_id' to 'song' names
song_id_to_name = pd.Series(data['song'].values, index=data['song_id']).to_dict()

# Display the first few elements of the mapping
list(song_id_to_name.items())[:5]

# Example recommended song IDs
recommended_song_ids = [100]

# Get the recommended song names using the mapping
recommended_song_names = [song_id_to_name[song_id] for song_id in recommended_song_ids]

recommended_song_names
