In [6]:
import pandas as pd
import json
import ast
import os
from tqdm import tqdm
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

In [7]:
# Location of the playlists CSV (one row per playlist; later cells read its
# "user_id" and "tracks" columns). The original machine-specific path is kept as
# the default so existing runs behave the same; set the PLAYLISTS_CSV environment
# variable to load the file from another location without editing the notebook.
playlists_csv = os.environ.get(
    "PLAYLISTS_CSV",
    r'C:\Users\frmar\OneDrive\Desktop\GitHub\Music-Recommender-System\dataframes\playlists_trackId.csv',
)
data = pd.read_csv(playlists_csv)

In [8]:
def _parse_tracks(raw):
    """Return one playlist's track ids as a list of ints.

    Accepts either the string form read from the CSV (a Python-literal list)
    or an already-parsed sequence. The second case makes this cell safe to
    re-run: ast.literal_eval raises ValueError when handed a list.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    return [int(track_id) for track_id in raw]


# Parse the tracks column into actual lists of integers
data["tracks"] = data["tracks"].apply(_parse_tracks)

# Display the transformed DataFrame
print(data)

        user_id                                             tracks
0             0  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
1             1  [52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...
2             2  [91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101,...
3             3  [155, 156, 157, 158, 159, 160, 161, 162, 163, ...
4             4  [281, 282, 283, 284, 285, 286, 287, 288, 289, ...
...         ...                                                ...
999995   999995  [116671, 5783, 48788, 5784, 91283, 17696, 5046...
999996   999996  [2262285, 2262286, 2262287, 2262288, 2262289, ...
999997   999997  [39643, 41143, 5434, 22056, 3860, 1043, 9152, ...
999998   999998  [5768, 5872, 14504, 3936, 76224, 5838, 5838, 2...
999999   999999  [12421, 12518, 14294, 268821, 311299, 92033, 1...

[1000000 rows x 2 columns]


In [9]:
# Step 1: Extract rows, columns, and values for the sparse matrix
rows = []
cols = []
values = []

# Iterate the two needed columns directly instead of DataFrame.iterrows(), which
# builds a full Series object for every row. Passing total= lets tqdm show a
# percentage and ETA rather than a bare iteration counter.
playlists = zip(data.index, data["user_id"], data["tracks"])
for index, user_id, tracks in tqdm(playlists, total=len(data)):
    try:
        n_tracks = len(tracks)
        rows.extend([user_id] * n_tracks)  # Repeat user_id for each track
        cols.extend(tracks)                # Track indices as columns
        values.extend([1] * n_tracks)      # One unit of weight per occurrence
    except Exception as e:
        # Best effort: report the offending row and keep going
        print(f'Error at index {index}: {e}')

1000000it [00:49, 20259.27it/s]


In [10]:
# Step 2: Create the sparse matrix
# (row, col) pairs that occur more than once are summed by the constructor, so a
# track repeated inside one playlist is stored as a count greater than 1.
# NOTE(review): the loaded frame lists user ids 0..999999, so 1000001 rows appears
# to leave one all-zero row at the end - confirm the intended shape.
matrix_shape = (1000001, 2262293)  # (user rows, track columns)
sparse_matrix = csr_matrix((values, (rows, cols)), shape=matrix_shape)

In [11]:
# Print the matrix summary (format, dtype, stored-element count, shape) followed by
# a listing of its stored (row, col) -> value entries.
print(sparse_matrix)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 65464776 stored elements and shape (1000001, 2262293)>
  Coords	Values
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	1
  (0, 16)	1
  (0, 17)	1
  (0, 18)	1
  (0, 19)	1
  (0, 20)	1
  (0, 21)	1
  (0, 22)	1
  (0, 23)	1
  (0, 24)	1
  (0, 25)	1
  :	:
  (999999, 194398)	1
  (999999, 194410)	1
  (999999, 221578)	1
  (999999, 250248)	1
  (999999, 250249)	1
  (999999, 263925)	2
  (999999, 268821)	1
  (999999, 270010)	1
  (999999, 273644)	1
  (999999, 274131)	1
  (999999, 288295)	1
  (999999, 311291)	1
  (999999, 311299)	1
  (999999, 314025)	1
  (999999, 327267)	1
  (999999, 340556)	1
  (999999, 363031)	1
  (999999, 377469)	1
  (999999, 379395)	1
  (999999, 462400)	1
  (999999, 466733)	1
  (999999, 466735)	1
  (999999, 582309)	1
  (999999, 582312)	1
  (999999, 627056)	1


In [None]:
# Latent dimensionality shared by the user and item factor matrices
num_features = 50

# Configure the NMF estimator: random initialisation with a fixed seed,
# and at most 1000 solver iterations
nmf_model = NMF(
    n_components=num_features,
    init='random',
    random_state=0,
    max_iter=1000,
)

# Fitting returns the user-side factors; the item-side factors are read
# from the fitted model afterwards
P = nmf_model.fit_transform(sparse_matrix)  # User features matrix
Q = nmf_model.components_  # Item features matrix

In [7]:
# Step 3: Truncated SVD of the interaction matrix using SciPy's sparse solver
# (numpy, coo_matrix and TruncatedSVD are already imported by the first cell, so
# they are not re-imported here; svds is the only extra name this cell needs).
from scipy.sparse.linalg import svds

# sparse_matrix is already in CSR format, so no format conversion is required;
# only cast the integer counts to a floating dtype for the solver.
csr = sparse_matrix.astype(float)

# Compute only the k largest singular triplets
k = 50  # Number of components
U, sigma, Vt = svds(csr, k=k)

# NOTE(review): the singular values printed below come back in ascending order,
# so slicing the leading columns (e.g. U[:, :10]) would select the weakest
# components - sort the factors first if fewer than k components are ever used.
print("U Matrix:", U)
print("Singular Values:", sigma)
print("Vt Matrix:", Vt)

U Matrix: [[ 7.91152764e-05  4.55646943e-04 -5.02736905e-04 ... -1.44587858e-03
   1.82402848e-03  7.74491536e-04]
 [ 1.75035423e-05  3.12490426e-05  8.15832347e-05 ...  1.92904045e-03
   8.90942199e-04  1.31713749e-04]
 [ 5.69781061e-05  4.32067831e-05  1.12561262e-05 ... -1.92183629e-06
   2.77222113e-06  4.10188747e-06]
 ...
 [ 1.49053367e-05  1.33442312e-06  1.13877515e-05 ... -1.33526936e-06
   6.66563146e-06  2.76084294e-06]
 [ 9.32822601e-05 -1.24941083e-05 -4.67362381e-05 ... -4.75555934e-06
   3.69669216e-06  5.87576196e-06]
 [ 6.13680740e-20  3.31500616e-20 -1.78220890e-19 ... -1.23308129e-19
   1.10936867e-19  1.57812097e-19]]
Singular Values: [209.28778874 210.28093996 212.42397963 213.19873828 218.01344851
 219.65115984 221.54174498 223.27100752 224.37757141 226.87397191
 231.72266705 232.93402124 234.09808801 235.74833292 242.61872012
 243.18283765 249.04423834 249.49920643 252.33963985 260.88655205
 262.95955062 266.90232618 269.17892512 274.47680879 274.89211621
 281.16

In [8]:
# Report the shape of each SVD factor, one line per factor
for label, factor in (("U", U), ("sigma", sigma), ("Vt", Vt)):
    print(f'Dimensione {label}: {factor.shape}')

Dimensione U: (1000001, 50)
Dimensione sigma: (50,)
Dimensione Vt: (50, 2262293)


In [9]:
# Convert sigma to diagonal matrix form.
# np.diag builds a diagonal matrix from a 1-D vector but EXTRACTS the diagonal from
# a 2-D array, so an unguarded re-run of this cell would collapse sigma back into a
# vector. Converting only while sigma is still 1-D keeps the cell idempotent.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [10]:
# Display sigma to inspect it after the diagonal-matrix conversion
sigma

array([[209.28778874,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        , 210.28093996,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        , 212.42397963, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ..., 596.02062965,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
        690.3306173 ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        , 906.25404211]], shape=(50, 50))

In [37]:
# User whose row of the reconstructed matrix is computed
user = 999999

# Scale the user's latent vector by the singular values, then project it onto
# the track factors to obtain one predicted score per track column
user_latent = np.dot(U[user, :k], sigma[:k, :k])
R_pred_user = np.dot(user_latent, Vt[:k, :])

print(R_pred_user)

[ 6.85434061e-17 -5.95851875e-04 -1.00930750e-03 ... -2.43592414e-08
 -2.43592414e-08 -2.43592414e-08]


In [38]:
# Select the playlist row belonging to `user` and take its track list
user_mask = data['user_id'] == user
# The final track is dropped from the list
# (presumably held out for evaluation - TODO confirm)
user_tracks = data.loc[user_mask, 'tracks'].values[0][:-1]
# Show the tracks kept for this user
print(f"Tracks for user {user}: {user_tracks}")

Tracks for user 999999: [12421, 12518, 14294, 268821, 311299, 92033, 12463, 41601, 12423, 41168, 39990, 12422, 41184, 273644, 97741, 72435, 314025, 12556, 189369, 72395, 153573, 39988, 40000, 14304, 12426, 12425, 153550, 462400, 12493, 12424, 45776, 12451, 12455, 138207, 379395, 83427, 72425, 40017, 13644, 12489, 12552, 274131, 168841, 194398, 125607, 12444, 71112, 189377, 263925, 377469, 133842, 12461, 14260, 163478, 12535, 153545, 363031, 69331, 39982, 77299, 12511, 12512, 12462, 153537, 153538, 97742, 53263, 153539, 153540, 153541, 153542, 153543, 153544, 12462, 12420, 40001, 84864, 311291, 45751, 270010, 125617, 582309, 582312, 12424, 627056, 3897, 83428, 340556, 194410, 41163, 12489, 263925, 101245, 327267, 250248, 250249, 128315, 221578, 12426, 41167, 153549, 466735, 466733, 72395, 72411, 72433, 288295, 141055, 141040]


In [39]:
# Predicted scores with the user's known tracks removed (original order preserved).
# Membership is tested against a set: checking `idx in user_tracks` on the list
# would rescan the whole playlist for every one of the track columns.
known_tracks = set(user_tracks)
new_list = [item for idx, item in enumerate(R_pred_user) if idx not in known_tracks]

In [40]:
# Position of the best score inside the filtered list
max_position = np.argmax(new_list)
print(max_position)

# new_list has the user's known tracks removed, so a position in it is shifted
# relative to the track-id columns of R_pred_user whenever a removed id precedes it.
# Rebuild the ascending list of surviving track ids (same order as new_list) and
# map the position back to the actual track id.
candidate_track_ids = np.setdiff1d(np.arange(len(R_pred_user)), user_tracks)
recommended_track_id = int(candidate_track_ids[max_position])
print(f"Recommended track id: {recommended_track_id}")

5313
