In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Flatten, concatenate, Reshape
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split


In [2]:
metadata_path = '../data/id_metadata.csv'
listening_history_path = '../data/listening_history.csv'

metadata_df = pd.read_csv(metadata_path, delimiter='\t')
listening_history_df = pd.read_csv(listening_history_path, delimiter='\t')

In [3]:
listening_history_df = listening_history_df.head(10000)

In [4]:
listening_history_df['timestamp'] = pd.to_datetime(listening_history_df['timestamp'])
listening_history_df.sort_values(by=['user', 'timestamp'], inplace=True)
user_encoder = LabelEncoder()
song_encoder = LabelEncoder()
listening_history_df['user_id'] = user_encoder.fit_transform(listening_history_df['user'])
listening_history_df['song_id'] = song_encoder.fit_transform(listening_history_df['song'])

In [5]:
def classify_time_of_day(timestamp):
    hour = timestamp.hour
    if 4 <= hour < 12:
        return 1  # 4:00 AM to 11:59 AM
    elif 12 <= hour < 20:
        return 2  # 12:00 PM to 7:59 PM
    else:
        return 3  # 8:00 PM to 3:59 AM

In [6]:
listening_history_df['time_of_day'] = listening_history_df['timestamp'].apply(classify_time_of_day)

In [7]:
listening_history_df

Unnamed: 0,user,song,timestamp,user_id,song_id,time_of_day
0,user_007XIjOr,DaTQ53TUmfP93FSr,2019-02-20 12:28:00,0,995,2
1,user_007XIjOr,dGeyvi5WCOjDU7da,2019-02-20 12:35:00,0,2727,2
2,user_007XIjOr,qUm54NYOjeFhmKYx,2019-02-20 12:48:00,0,3678,2
3,user_007XIjOr,FtnuMT1DlevSR2n5,2019-02-20 12:52:00,0,1146,2
4,user_007XIjOr,LHETTZcSZLeaVOGh,2019-02-20 13:09:00,0,1503,2
...,...,...,...,...,...,...
9995,user_06kNhNYa,ovi7JfEwHwZVac7L,2019-01-27 20:35:00,26,3551,3
9996,user_06kNhNYa,xfSE1aB7yIH42NPi,2019-01-27 20:39:00,26,4208,3
9997,user_06kNhNYa,E3iQOe4M979du9C8,2019-01-28 16:10:00,26,1023,2
9998,user_06kNhNYa,LuFeu9smK7RiTZqD,2019-01-28 16:18:00,26,1546,2


In [8]:
listening_history_df['timestamp'] = (listening_history_df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [9]:
# Assuming the 'id' column in metadata_df corresponds to the song ID
# Set 'id' as the index for easy lookup
metadata_df.set_index('id', inplace=True)
# Selecting the columns to be scaled
features_to_scale = ['popularity', 'release', 'danceability', 'energy', 'key', 'mode', 'valence', 'tempo', 'duration_ms']

# Fit the scaler to the metadata features
scaler = MinMaxScaler()
metadata_df[features_to_scale] = scaler.fit_transform(metadata_df[features_to_scale])


In [10]:
# Now the get_song_metadata function will return scaled values
def get_song_metadata(song_id):
    if song_id in metadata_df.index:
        metadata = metadata_df.loc[song_id, features_to_scale]
        return metadata.values
    else:
        return np.zeros(len(features_to_scale))

In [11]:
get_song_metadata('0010xmHR6UICBOYT')

array([0.4842105263157895, 0.9940357852882702, 0.5981781376518218, 0.513,
       0.6363636363636364, 0.0, 0.2635270541082164, 0.7089578967736092,
       0.06372524451262468], dtype=object)

In [12]:

# classify_time_of_day(pd.to_datetime(user_df['timestamp'].tolist()[0], unit='s'))

In [13]:
sequence_length = 10
users = listening_history_df['user_id'].unique()
song_sequences = []
timestamp_sequences = []
next_song_labels = []
metadata_sequences = []
time_of_day_sequences = []  # To store time-of-day sequences

In [14]:
for user in users:
    user_df = listening_history_df[listening_history_df['user_id'] == user]
    user_songs = user_df['song_id'].tolist()
    user_timestamps = user_df['timestamp'].tolist()
    user_time_of_day = user_df['time_of_day'].tolist()  # Fetch time-of-day data

    for i in range(1, len(user_songs)):
        end_idx = min(i + sequence_length, len(user_songs))
        start_idx = max(0, end_idx - sequence_length)

        song_sequence = user_songs[start_idx:end_idx]
        timestamp_sequence = user_timestamps[start_idx:end_idx]
        metadata_sequence = [get_song_metadata(song) for song in user_songs[start_idx:end_idx]]
        time_of_day_sequence = user_time_of_day[start_idx:end_idx]  # Use the time-of-day data

        if len(song_sequence) == sequence_length:
            song_sequences.append(song_sequence)
            timestamp_sequences.append(timestamp_sequence)
            metadata_sequences.append(metadata_sequence)
            time_of_day_sequences.append(time_of_day_sequence)

            if i + 1 < len(user_songs):
                next_song_labels.append(user_songs[i + 1])


In [15]:
# Number of unique users and songs
num_users = len(user_encoder.classes_)
num_songs = len(song_encoder.classes_)
metadata_dim = len(features_to_scale)

# Embedding dimensions
user_embedding_dim = 50
song_embedding_dim = 50
time_of_day_dim = 3  # Assuming 4 categories for time of day

# Model architecture
user_input = Input(shape=(sequence_length,), name='user_input')
user_embedding = Embedding(num_users, user_embedding_dim, input_length=sequence_length)(user_input)

song_input = Input(shape=(sequence_length,), name='song_input')
song_embedding = Embedding(num_songs, song_embedding_dim, input_length=sequence_length)(song_input)

metadata_input = Input(shape=(sequence_length, metadata_dim), name='metadata_input')
# No need to flatten the metadata
# reshape_metadata = Reshape((sequence_length, metadata_dim))(metadata_input)

time_of_day_input = Input(shape=(sequence_length,), name='time_of_day_input')
time_of_day_embedding = Embedding(time_of_day_dim, time_of_day_dim, input_length=sequence_length)(time_of_day_input)

concatenated = concatenate([user_embedding, song_embedding, metadata_input, time_of_day_embedding])

lstm_layer = LSTM(128, return_sequences=False)(concatenated)
dense_layer = Dense(64, activation='relu')(lstm_layer)
dropout_layer = Dropout(0.2)(dense_layer)
output_layer = Dense(num_songs, activation='softmax')(dropout_layer)

model = Model(inputs=[user_input, song_input, metadata_input, time_of_day_input], outputs=output_layer)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 10)]         0           []                               
                                                                                                  
 song_input (InputLayer)        [(None, 10)]         0           []                               
                                                                                                  
 time_of_day_input (InputLayer)  [(None, 10)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 50)       1350        ['user_input[0][0]']             
                                                                                              

In [20]:
# Ensure the lengths of song_sequences and next_song_labels are the same
assert len(song_sequences) == len(next_song_labels)

# Create user_array
user_array = np.array([user_id for user_id in user_df['user_id'].unique() for _ in range(len(song_sequences)//len(user_df['user_id'].unique()))])

# Check lengths again
assert len(user_array) == len(song_sequences) == len(next_song_labels)

AssertionError: 

In [17]:
user_array = np.array([np.full(shape=(sequence_length,), fill_value=user_id) for user_id, _ in enumerate(song_sequences)])
song_array = np.array(song_sequences)
metadata_array = np.array(metadata_sequences)
time_of_day_array = np.array(time_of_day_sequences)
label_array = np.array(next_song_labels)

In [18]:
# Splitting parameters
test_size = 0.2  # 20% of the data will be used for validation

# Splitting the data
train_user_array, val_user_array, train_label_array, val_label_array = train_test_split(user_array, label_array, test_size=test_size, random_state=42)
train_song_array, val_song_array = train_test_split(song_array, test_size=test_size, random_state=42)
train_metadata_array, val_metadata_array = train_test_split(metadata_array, test_size=test_size, random_state=42)
train_time_of_day_array, val_time_of_day_array = train_test_split(time_of_day_array, test_size=test_size, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [9973, 9946]

In [None]:
song_sequences_padded = pad_sequences(song_sequences, maxlen=sequence_length)
timestamp_sequences_padded = pad_sequences(timestamp_sequences, maxlen=sequence_length)
next_song_labels = np.array(next_song_labels)

In [None]:
# Assuming metadata_embedding_size and time_of_day_size are defined based on your data
# def create_model_with_lstm(num_users, num_songs, sequence_length, metadata_embedding_size, time_of_day_size, num_features=0, embedding_size=50, lstm_units=64):    # Existing input layers
#     user_sequence_input = Input(shape=(sequence_length,), name='user_sequence_input')
#     song_sequence_input = Input(shape=(sequence_length,), name='song_sequence_input')
#     timestamp_input = Input(shape=(sequence_length,), name='timestamp_input')

#     # New input layers for song metadata and time of day
#     song_metadata_input = Input(shape=(sequence_length, metadata_embedding_size), name='song_metadata_input')
#     time_of_day_input = Input(shape=(sequence_length, time_of_day_size), name='time_of_day_input')

#     # Embedding layers
#     user_embedding = Embedding(output_dim=embedding_size, input_dim=num_users, input_length=sequence_length, name='user_embedding')(user_sequence_input)
#     song_embedding = Embedding(output_dim=embedding_size, input_dim=num_songs, input_length=sequence_length, name='song_embedding')(song_sequence_input)

#     # Flatten embeddings
#     user_vec = Flatten()(user_embedding)
#     song_vec = Flatten()(song_embedding)

#     # Combine all inputs
#     combined_input = concatenate([
#         user_vec, 
#         song_vec, 
#         Flatten()(timestamp_input), 
#         Flatten()(song_metadata_input), 
#         Flatten()(time_of_day_input)
#     ])

#     # LSTM layer
#     lstm_layer = LSTM(lstm_units, return_sequences=False)(Reshape((sequence_length, -1))(combined_input))

#     # Concatenate with other attributes
#     concat = concatenate([lstm_layer, other_input])

#     # Dense layers
#     dense = Dense(256, activation='relu')(concat)
#     dropout = Dropout(0.5)(dense)
#     output = Dense(num_songs, activation='softmax')(dropout)

#     # Create and compile the model
#     model = Model(inputs=[user_sequence_input, song_sequence_input, timestamp_input, song_metadata_input, time_of_day_input], outputs=output)
#     model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#     return model
def create_model(sequence_length, num_songs, num_users, metadata_embedding_size):
    # Define input layers
    song_input = Input(shape=(sequence_length,), name='song_input')
    timestamp_input = Input(shape=(sequence_length, 1), name='timestamp_input')
    metadata_input = Input(shape=(sequence_length, metadata_embedding_size), name='metadata_input')
    time_of_day_input = Input(shape=(sequence_length, 1), name='time_of_day_input')  # Assuming time_of_day is a single value per timestep

    # Embedding layer for songs
    song_embedding = Embedding(input_dim=num_songs, output_dim=50, input_length=sequence_length)(song_input)

    # LSTM layer for songs
    song_lstm = LSTM(64, return_sequences=False)(song_embedding)

    # Flatten other inputs
    flattened_timestamp = Flatten()(timestamp_input)
    flattened_metadata = Flatten()(metadata_input)
    flattened_time_of_day = Flatten()(time_of_day_input)

    # Combine all inputs
    combined = concatenate([song_lstm, flattened_timestamp, flattened_metadata, flattened_time_of_day])

    # Dense layers
    dense1 = Dense(256, activation='relu')(combined)
    dropout1 = Dropout(0.5)(dense1)
    output = Dense(num_songs, activation='softmax')(dropout1)

    # Create and compile model
    model = Model(inputs=[song_input, timestamp_input, metadata_input, time_of_day_input], outputs=output)
    model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Model parameters
num_songs = len(np.unique(song_sequences_padded))
num_users = len(users)  # or replace with the actual number of unique users
metadata_embedding_size = len(features_to_scale)  # Based on your scaled metadata features

# Create the model
model = create_model(sequence_length, num_songs, num_users, metadata_embedding_size)

In [None]:
# Flatten each metadata sequence since they are currently a list of arrays
metadata_sequences_flat = [[item for sublist in sequence for item in sublist] for sequence in metadata_sequences]

# Pad the flattened metadata sequences
metadata_sequences_padded = pad_sequences(metadata_sequences_flat, maxlen=sequence_length * len(features_to_scale), dtype='float32')

# Reshape the metadata to have the correct form for the LSTM layer
metadata_sequences_padded = metadata_sequences_padded.reshape(-1, sequence_length, len(features_to_scale))

In [None]:
# Ensure the time_of_day_sequences is a numpy array with an extra dimension
timestamp_sequences_padded_3d = np.expand_dims(timestamp_sequences_padded, axis=-1)

model.fit(
    [song_sequences_padded, timestamp_sequences_padded_3d, metadata_sequences_padded, time_of_day_sequences_padded],
    next_song_labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)

In [None]:
# Flatten each metadata sequence since they are currently a list of arrays
metadata_sequences_flat = [[item for sublist in sequence for item in sublist] for sequence in metadata_sequences]

# Pad the flattened metadata sequences
metadata_sequences_padded = pad_sequences(metadata_sequences_flat, maxlen=sequence_length * len(features_to_scale), dtype='float32')

# Reshape the metadata to have the correct form for the LSTM layer
metadata_sequences_padded = metadata_sequences_padded.reshape(-1, sequence_length, len(features_to_scale))

In [None]:
time_of_day_sequences_padded = pad_sequences(time_of_day_sequences, maxlen=sequence_length, padding='post', dtype='int32')


In [None]:
# Model parameters
num_users = len(listening_history_df['user_id'].unique())
num_songs = len(listening_history_df['song_id'].unique())
metadata_embedding_size = len(features_to_scale)  # Number of features in song metadata
time_of_day_size = 0  # Update this if you add time-of-day information


In [None]:
song_sequences_padded

In [None]:
timestamp_sequences_padded

In [None]:
metadata_sequences_padded

In [None]:
time_of_day_sequences_padded

In [None]:

# Corrected model creation call
model = create_model_with_lstm(num_users, num_songs, sequence_length, metadata_embedding_size, time_of_day_size)

# Fit the model
model.fit(
    [song_sequences_padded, song_sequences_padded, timestamp_sequences_padded, metadata_sequences_padded, time_of_day_sequences_padded],
    next_song_labels,
    batch_size=32,
    epochs=10,
    validation_split=0.2
)