# libraries

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import numpy as np
import pandas as pd

In [5]:
path_to_usertrack_data = '/content/gdrive/MyDrive/adaptive-recommedation-system/nowplayingrs/nowplaying_rs_dataset/user_track_hashtag_timestamp.csv'

In [6]:
data = pd.read_csv(path_to_usertrack_data)

In [7]:
data

Unnamed: 0,user_id,track_id,hashtag,created_at
0,81496937,cd52b3e5b51da29e5893dba82a418a4b,nowplaying,2014-01-01 05:54:21
1,81496937,cd52b3e5b51da29e5893dba82a418a4b,goth,2014-01-01 05:54:21
2,81496937,cd52b3e5b51da29e5893dba82a418a4b,deathrock,2014-01-01 05:54:21
3,81496937,cd52b3e5b51da29e5893dba82a418a4b,postpunk,2014-01-01 05:54:21
4,2205686924,da3110a77b724072b08f231c9d6f7534,NowPlaying,2014-01-01 05:54:22
...,...,...,...,...
17560108,2819332208,03498f305040835c5f76d7c5660204a2,nowplaying,2014-12-23 07:21:04
17560109,154070865,8bacefe018a221d933529dd466e7c1c0,nowplaying,2014-12-23 07:21:07
17560110,985591650,0e64c11b9a77e93f343f9c1c0cdbcf54,nowplaying,2014-12-23 07:21:08
17560111,15518784,af5c5f220e0a872ac129f4f88b3db5f9,nowplaying,2014-12-23 07:21:11


# parsing the data

In [8]:
# convert created_at to datetime object and unix timestamp

data['created_at'] = pd.to_datetime(data['created_at'])

In [9]:
data['timestamp'] = (data['created_at'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [10]:
data

Unnamed: 0,user_id,track_id,hashtag,created_at,timestamp
0,81496937,cd52b3e5b51da29e5893dba82a418a4b,nowplaying,2014-01-01 05:54:21,1388555661
1,81496937,cd52b3e5b51da29e5893dba82a418a4b,goth,2014-01-01 05:54:21,1388555661
2,81496937,cd52b3e5b51da29e5893dba82a418a4b,deathrock,2014-01-01 05:54:21,1388555661
3,81496937,cd52b3e5b51da29e5893dba82a418a4b,postpunk,2014-01-01 05:54:21,1388555661
4,2205686924,da3110a77b724072b08f231c9d6f7534,NowPlaying,2014-01-01 05:54:22,1388555662
...,...,...,...,...,...
17560108,2819332208,03498f305040835c5f76d7c5660204a2,nowplaying,2014-12-23 07:21:04,1419319264
17560109,154070865,8bacefe018a221d933529dd466e7c1c0,nowplaying,2014-12-23 07:21:07,1419319267
17560110,985591650,0e64c11b9a77e93f343f9c1c0cdbcf54,nowplaying,2014-12-23 07:21:08,1419319268
17560111,15518784,af5c5f220e0a872ac129f4f88b3db5f9,nowplaying,2014-12-23 07:21:11,1419319271


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17560113 entries, 0 to 17560112
Data columns (total 5 columns):
 #   Column      Dtype         
---  ------      -----         
 0   user_id     int64         
 1   track_id    object        
 2   hashtag     object        
 3   created_at  datetime64[ns]
 4   timestamp   int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 669.9+ MB


# feature engineering


### Time-decay function definition

This function will serve to establish a weight on recent interactions using exponential decay function, given as;

$$w(t) = e^{^{-\lambda(t - t_0)}}.$$

where, **w(t)** is the weight at time **t, t_0** is the current time, and **λ** is the decay rate.


In [12]:
## getting current time, assuming it's the latest timestamp from the data - t0
current_time = data['timestamp'].max()

# setting the decay rate - lamda λ
decay_rate = 0.001

# the time-decay funtion
def time_decay(timestamps, current_time, decay_rate):
  return np.exp(-decay_rate * (current_time - timestamps))

In [13]:
data['decay_weight'] = time_decay(data['timestamp'], current_time, decay_rate)

In [14]:
data.tail()

Unnamed: 0,user_id,track_id,hashtag,created_at,timestamp,decay_weight
17560108,2819332208,03498f305040835c5f76d7c5660204a2,nowplaying,2014-12-23 07:21:04,1419319264,0.993024
17560109,154070865,8bacefe018a221d933529dd466e7c1c0,nowplaying,2014-12-23 07:21:07,1419319267,0.996008
17560110,985591650,0e64c11b9a77e93f343f9c1c0cdbcf54,nowplaying,2014-12-23 07:21:08,1419319268,0.997004
17560111,15518784,af5c5f220e0a872ac129f4f88b3db5f9,nowplaying,2014-12-23 07:21:11,1419319271,1.0
17560112,15518784,af5c5f220e0a872ac129f4f88b3db5f9,listenlive,2014-12-23 07:21:11,1419319271,1.0


### Time-decay weight application

In [15]:
# from related data, the explicit interaction value (which was rating) is 1 for each, we'll use it to multiply our decay weight

data['weighted_interaction'] = data['decay_weight'] * 1

In [16]:
data.tail()

Unnamed: 0,user_id,track_id,hashtag,created_at,timestamp,decay_weight,weighted_interaction
17560108,2819332208,03498f305040835c5f76d7c5660204a2,nowplaying,2014-12-23 07:21:04,1419319264,0.993024,0.993024
17560109,154070865,8bacefe018a221d933529dd466e7c1c0,nowplaying,2014-12-23 07:21:07,1419319267,0.996008,0.996008
17560110,985591650,0e64c11b9a77e93f343f9c1c0cdbcf54,nowplaying,2014-12-23 07:21:08,1419319268,0.997004,0.997004
17560111,15518784,af5c5f220e0a872ac129f4f88b3db5f9,nowplaying,2014-12-23 07:21:11,1419319271,1.0,1.0
17560112,15518784,af5c5f220e0a872ac129f4f88b3db5f9,listenlive,2014-12-23 07:21:11,1419319271,1.0,1.0


### Interaction matrix

This is the user-to-item matrix, as `users(user_id)` on the rows, `item(track_id)` on the columns, and then feature cells containing the `weighted_interactions`

In [17]:
# interaction_matrix = data.pivot_table(index='user_id', columns='track_id', values='weighted_interaction', fill_value=0)

due to inefficient memory and GPU, I'll be using a sparse matrix representation with SciPy

In [18]:
from scipy.sparse import csr_matrix

In [20]:
# creating a sparse matrix from the dataset
user_ids = data['user_id'].astype('category').cat.codes
track_ids = data['track_id'].astype('category').cat.codes

sparse_interaction_matrix = csr_matrix((data['weighted_interaction'], (user_ids, track_ids)))


In [21]:
# matrix shape will be by (number of users, number of tracks)
print(sparse_interaction_matrix)

  (0, 237753)	0.0
  (1, 68886)	0.0
  (1, 342349)	0.0
  (2, 8806)	0.0
  (2, 16896)	0.0
  (2, 18493)	0.0
  (2, 23231)	0.0
  (2, 26298)	0.0
  (2, 48655)	0.0
  (2, 70991)	0.0
  (2, 80499)	0.0
  (2, 82544)	0.0
  (2, 152652)	0.0
  (2, 189249)	0.0
  (2, 203266)	0.0
  (2, 216217)	0.0
  (2, 229388)	0.0
  (2, 240076)	0.0
  (2, 241284)	0.0
  (2, 253737)	0.0
  (2, 284627)	0.0
  (3, 339042)	0.0
  (4, 5077)	0.0
  (4, 12515)	0.0
  (4, 13541)	0.0
  :	:
  (138217, 327963)	7.454176694349057e-25
  (138218, 119051)	6.111407515426417e-53
  (138218, 316430)	3.392485092637685e-83
  (138219, 16682)	1.3065532103069673e-81
  (138220, 32629)	1.0757171486667669e-135
  (138220, 58041)	2.3080487282943027e-134
  (138220, 69959)	9.639610622923227e-169
  (138220, 109913)	1.7845547198195654e-96
  (138220, 119699)	1.7337676651169857e-74
  (138220, 126709)	1.2338726759101766e-137
  (138220, 139971)	2.8858438942955707e-133
  (138220, 151874)	1.611607737115131e-132
  (138220, 155890)	5.186850617598935e-72
  (138220, 159393

## model building

* **selection:** for this **TDCF**, a matrix factorization model is used, one which can handle sparse matrices:

    * TruncatedSVD: *Truncated Singular Value Decomposition*

In [31]:
from sklearn.decomposition import TruncatedSVD
import random

In [25]:
# TRAINING THE MODEL (sparse matrix)

# latent factors
n_components = 20

# training
model = TruncatedSVD(n_components=n_components)
model.fit(sparse_interaction_matrix)

# matrix transfomation
lower_dimensional_features = model.transform(sparse_interaction_matrix)

In [26]:
sparse_interaction_matrix

<138223x344536 sparse matrix of type '<class 'numpy.float64'>'
	with 3017064 stored elements in Compressed Sparse Row format>

In [27]:
lower_dimensional_features

array([[ 0.00000000e+000,  0.00000000e+000,  0.00000000e+000, ...,
         0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
       [ 0.00000000e+000,  0.00000000e+000,  0.00000000e+000, ...,
         0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
       [ 0.00000000e+000,  0.00000000e+000,  0.00000000e+000, ...,
         0.00000000e+000,  0.00000000e+000,  0.00000000e+000],
       ...,
       [ 2.34924591e-089, -6.88529057e-090,  1.30689726e-088, ...,
         1.76334999e-085, -1.18502016e-085, -6.58265983e-087],
       [ 5.96470322e-143,  1.96391313e-140,  3.73444805e-140, ...,
         1.28856602e-137,  8.14215051e-139, -2.94278380e-138],
       [ 0.00000000e+000,  0.00000000e+000,  0.00000000e+000, ...,
         0.00000000e+000,  0.00000000e+000,  0.00000000e+000]])

# Recommendation Generation

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# using cosine similarity to compute item(track)
track_similarity = cosine_similarity(lower_dimensional_features.T)

# getting recommendation function for a user
def get_recommendations(track_id, similarity_matrix, k=5):
  # similariy scores for a specific track with all tracks
  similarity_scores = list(enumerate(similarity_matrix[track_id]))

  # sort tracks
  similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

  # the score of k most similar track
  most_similar_tracks = similarity_scores[1:k+1]

  # get track ID
  track_ids = [track[0] for track in most_similar_tracks]

  return track_ids

get recommendation for an item

In [29]:
recommended_track_ids = get_recommendations(10, track_similarity)
print(recommended_track_ids)

[8, 14, 18, 2, 3]


In [30]:
track_similarity

array([[ 1.00000000e+00,  2.51664775e-15, -4.95628737e-18,
        -1.07291246e-17, -1.57577430e-17,  3.43485537e-18,
         2.24762741e-18,  6.07555073e-18,  9.77615957e-19,
        -2.56611868e-18,  5.41268488e-18,  6.61143461e-18,
        -8.03606730e-18, -2.87270573e-19, -9.45588994e-18,
        -5.91701317e-19,  9.16870979e-19, -8.70382974e-18,
         1.02238837e-17, -2.85556644e-18],
       [ 2.51664775e-15,  1.00000000e+00,  9.37971765e-17,
         6.55874388e-16, -5.15250981e-16,  4.24761539e-16,
        -5.58871726e-16,  6.48378254e-16,  1.56167180e-18,
         3.52322036e-17,  4.19028045e-17, -1.89779195e-17,
        -2.14082263e-17,  1.32186544e-17,  9.84460028e-19,
        -8.88288223e-18, -1.28087483e-18, -3.15582871e-17,
        -4.14971980e-18, -1.50608918e-17],
       [-4.95628737e-18,  9.37971765e-17,  1.00000000e+00,
         8.18227055e-16,  1.24805874e-15, -6.74104899e-16,
        -7.32352285e-16,  1.08098418e-16,  2.13749445e-16,
        -3.94504740e-16,  1.4

In [35]:
# Extract unique user IDs and track IDs
unique_user_ids = data['user_id'].unique()
unique_track_ids = data['track_id'].unique()

# Create a mapping from user and track IDs to indices
user_id_to_index_mapping = {user_id: index for index, user_id in enumerate(unique_user_ids)}
index_to_track_id_mapping = {index: track_id for index, track_id in enumerate(unique_track_ids)}


# Select a random user ID from the unique user IDs
random_user_id = random.choice(unique_user_ids)

# Use the selected user ID to get the index
user_index = user_id_to_index_mapping[random_user_id]

def get_user_recommendations(user_id, user_features, track_similarity_matrix, top_k=5):
    # reduced features for the user
    user_profile = user_features[user_id]

    # calculate similarity of this user to all tracks
    similarity_scores = cosine_similarity([user_profile], track_similarity_matrix)[0]

    # top K track indices with highest similarity scores
    top_track_indices = similarity_scores.argsort()[-top_k:][::-1]

    # map indices to original track IDs
    top_track_ids = [index_to_track_id_mapping[index] for index in top_track_indices]

    return top_track_ids



# Get recommendations for the randomly selected user
recommended_track_ids = get_user_recommendations(user_index, lower_dimensional_features, track_similarity.T)
print("Recommendations for user", random_user_id, ":", recommended_track_ids)

Recommendations for user 859083768 : ['fb45503a9e3596197fa64bd4649e8f78', 'b8dc623f5b936f9959e648323f061b3a', 'da3110a77b724072b08f231c9d6f7534', 'ba84d88c10fb0e42d4754a27ead10546', '33f95122281f76e7134f9cbea3be980f']
