In [None]:

# Load required modules
import csv
import numpy as np
import h5py
from scipy import sparse


UAM_MATLAB_FILE = 'LFM-1b_LEs.mat'         # Matlab .mat file where the listening events are stored
ARTISTS_FILE = "LFM-1b_artists.txt"        # artist names for UAM
USER_DATA_FILE = "output_data_new.txt"     # user names for UAM
K = 3                                      # maximum number of seed's neighbors to select


# Read the user-artist-matrix and corresponding artist and user indices from Matlab file
def read_UAM(m_file, user_data_file):
    """Load the user-artist matrix (UAM) and restrict it to female users.

    Parameters
    ----------
    m_file : str
        Path to the HDF5-based Matlab file. It must provide the datasets
        ``idx_users`` and ``idx_artists`` and the group ``/LEs/`` with the
        sparse components ``data``, ``ir`` and ``jc``.
    user_data_file : str
        Path to the whitespace-delimited user table read with ``np.loadtxt``.
        Column index 3 is compared against ``'f'``.

    Returns
    -------
    tuple
        ``(UAM, UAM_user_idx, UAM_artist_idx, female_user_ids, artist_ids)``
        where ``UAM`` is the sparse matrix restricted to the selected users,
        ``female_user_ids`` are the ids of those users and ``artist_ids`` are
        all artist ids from the Matlab file.
    """
    # NOTE(review): hard-coded cap on the number of users; presumably the
    # number of rows in the user table -- confirm against the data set.
    max_users = 47587

    # Read everything into memory inside the context manager so the HDF5
    # handle is released as soon as loading is done.
    with h5py.File(m_file, 'r') as mf:
        user_ids = np.array(mf.get('idx_users')).astype(np.int64)[:max_users]
        artist_ids = np.array(mf.get('idx_artists')).astype(np.int64)
        les = mf['/LEs/']
        # Load UAM
        UAM = sparse.csr_matrix((les["data"][()],
                                 les["ir"][()],
                                 les["jc"][()])).transpose()

    # Read the user table as strings. `np.str` was only an alias of the
    # builtin `str` (deprecated in NumPy 1.20), so `str` is equivalent.
    # NOTE(review): np.loadtxt does not skip a header row, whereas
    # read_from_file() does -- confirm the two stay aligned for this file.
    user_data = np.loadtxt(user_data_file, dtype=str)
    # Assuming the gender information is in the fourth column
    gender_ids = (user_data[:, 3] == 'f').astype(np.int64)

    # Filter users based on gender (female)
    female_user_ids = user_ids[gender_ids == 1][:max_users]
    user_indices = np.where(np.isin(user_ids, female_user_ids))[0]
    # Every artist id is trivially contained in artist_ids, so this keeps all
    # artists; retained so the column selection below is unchanged.
    artist_indices = np.where(np.isin(artist_ids, artist_ids))[0]
    UAM = UAM[user_indices][:, artist_indices]

    # NOTE(review): these are the compressed-storage `indices` / `indptr`
    # arrays of the sparse matrix, not COO row/col arrays as the names
    # suggest; `indptr` is a pointer array, not one index per stored entry.
    UAM_user_idx = UAM.indices
    UAM_artist_idx = UAM.indptr
    return UAM, UAM_user_idx, UAM_artist_idx, female_user_ids, artist_ids


# Function to read metadata (users or artists)
def read_from_file(filename, col):
    """Return column `col` of a tab-separated file, skipping the header row.

    Parameters
    ----------
    filename : str
        Path of the tab-delimited text file to read.
    col : int
        Zero-based index of the column to extract from every data row.

    Returns
    -------
    list of str
        One entry per non-empty data row, in file order. An empty file
        yields an empty list.

    Raises
    ------
    IndexError
        If a non-empty data row has fewer than ``col + 1`` fields.
    """
    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends for file objects passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)
        # Blank lines come back as empty rows and carry no record.
        return [row[col] for row in reader if row]


# Main program
if __name__ == '__main__':
    # User-based collaborative filtering over the female-only UAM: for every
    # seed user, find the K most similar users and collect artists the seed
    # user has not listened to yet.
    import heapq

    # Read UAM
    UAM, UAM_user_idx, UAM_artist_idx, female_user_ids, artist_ids = read_UAM(UAM_MATLAB_FILE, USER_DATA_FILE)
    print('Female Users: ', len(female_user_ids))
    print('Artists: ', len(artist_ids))

    # Load metadata from provided files into lists (not used by the loop
    # below; kept so the names stay available in the notebook session).
    artists = read_from_file(ARTISTS_FILE, 1)
    users = read_from_file(USER_DATA_FILE, 0)

    # Row u of the gender-filtered UAM belongs to female_user_ids[u] -- the
    # same mapping used for the neighbours below -- not to row u of the
    # unfiltered user file.
    seed_user_ids = np.ravel(female_user_ids)

    # The transpose does not depend on the seed user, so build it once.
    UAM_T = UAM.transpose()

    # For all users
    for u in range(0, UAM.shape[0]):
        print("Seed user-id: " + str(seed_user_ids[u]))

        # get (normalized) playcount vector for current user u
        pc_vec = UAM.getrow(u)

        # Compute similarities as dot product between playcount vector of
        # user and all users via UAM (assuming that UAM is already normalized)
        uU_sim = pc_vec.dot(UAM_T).tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        #
        # Determine nearest neighbors to seed based on uUM
        #

        # Find the occurrence of the seed user in the similarity columns
        # and set it to 0 so that it is not selected as its own NN
        occ_user_idx = (uU_user_idx == u)
        uU_data[occ_user_idx] = 0

        # Eliminate zeros
        uU_sim.data = uU_data
        uU_sim = uU_sim.tocsr()
        uU_sim.eliminate_zeros()
        uU_sim = uU_sim.tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Sort users according to the similarity (ascending)
        sort_index = np.argsort(uU_data)

        # Select the K nearest neighbors among all users (the last K entries
        # of the ascending sort; fewer if less than K users are similar).
        # Note that uU_user_idx indeed provides the indices for users in UAM
        recommended_user_idx = uU_user_idx[sort_index[-K:]]
        # Get user_ids corresponding to nearest neighbors
        recommended_user_ids = female_user_ids[recommended_user_idx]
        # Get similarity score for nearest neighbors
        recommended_user_scores = uU_data[sort_index[-K:]]

        print("Nearest K=" + str(K) + " neighbors\' user-ids: ", recommended_user_ids.flatten())
        print("Scores/similarities:" + str(recommended_user_scores))
        print("Index in UAM for recommended user-ids:" + str(recommended_user_idx))

        #
        # Determine set of recommended artists
        #
        recommended_artists_idx = []
        for u_idx in recommended_user_idx:
            recommended_artists_idx.extend(list(UAM.getrow(u_idx).indices))

        # Convert to set to remove duplicates and sort it
        recommended_artists_idx = sorted(set(recommended_artists_idx))
        # Remove artists already known to seed user
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, pc_vec.indices)

        # Take up to 10 candidate artists. NOTE(review): this picks the 10
        # smallest artist indices, not the 10 highest-scoring artists.
        top_10_artists_idx = heapq.nsmallest(10, recommended_artists_idx)

        print("Indices of 10 recommended artists: ", top_10_artists_idx)




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  user_data = np.loadtxt(user_data_file, dtype=np.str)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Seed user-id: 6637338
Seed user-id: 6637392
Seed user-id: 6639528
Seed user-id: 6640603
Seed user-id: 6641136
Seed user-id: 6641327
Seed user-id: 6641715
Seed user-id: 6642166
Seed user-id: 6642212
Seed user-id: 6643067
Seed user-id: 6644706
Seed user-id: 6645358
Seed user-id: 6646298
Seed user-id: 6646467
Seed user-id: 6647544
Seed user-id: 6649074
Seed user-id: 6649766
Seed user-id: 6650024
Seed user-id: 6651434
Seed user-id: 6651944
Seed user-id: 6653806
Seed user-id: 6653887
Seed user-id: 6653912
Seed user-id: 6654458
Seed user-id: 6654494
Seed user-id: 6654733
Seed user-id: 6655888
Seed user-id: 6656019
Seed user-id: 6656229
Seed user-id: 6656446
Seed user-id: 6656548
Seed user-id: 6657738
Seed user-id: 6658416
Seed user-id: 6659890
Seed user-id: 6660531
Seed user-id: 6660658
Seed user-id: 6660777
Seed user-id: 6661114
Seed user-id: 6662348
Seed user-id: 6662597
Seed user-id: 6662662
Seed user-id: 6662699
Seed user-i

In [None]:
# Load required modules
import csv
import numpy as np
import h5py
from scipy import sparse


UAM_MATLAB_FILE = 'LFM-1b_LEs.mat'         # Matlab .mat file where the listening events are stored
ARTISTS_FILE = "LFM-1b_artists.txt"        # artist
USER_DATA_FILE = "output_data_new.txt"     # user
K = 3                                      # maximum number of seed's neighbors to select


# Read the user-artist-matrix and corresponding artist and user indices from Matlab file
def read_UAM(m_file, user_data_file):
    """Load the user-artist matrix (UAM) and restrict it to male users.

    Parameters
    ----------
    m_file : str
        Path to the HDF5-based Matlab file. It must provide the datasets
        ``idx_users`` and ``idx_artists`` and the group ``/LEs/`` with the
        sparse components ``data``, ``ir`` and ``jc``.
    user_data_file : str
        Path to the whitespace-delimited user table read with ``np.loadtxt``.
        Column index 3 is compared against ``'m'``.

    Returns
    -------
    tuple
        ``(UAM, UAM_user_idx, UAM_artist_idx, male_user_ids, artist_ids)``
        where ``UAM`` is the sparse matrix restricted to the selected users,
        ``male_user_ids`` are the ids of those users and ``artist_ids`` are
        all artist ids from the Matlab file.
    """
    # NOTE(review): hard-coded cap on the number of users; presumably the
    # number of rows in the user table -- confirm against the data set.
    max_users = 47587

    # Load all arrays while the file is open; the context manager then closes
    # the HDF5 handle, which the original code left open.
    with h5py.File(m_file, 'r') as mf:
        user_ids = np.array(mf.get('idx_users')).astype(np.int64)[:max_users]
        artist_ids = np.array(mf.get('idx_artists')).astype(np.int64)
        les = mf['/LEs/']
        # Load UAM
        UAM = sparse.csr_matrix((les["data"][()],
                                 les["ir"][()],
                                 les["jc"][()])).transpose()

    # `np.str` was a deprecated alias of the builtin `str` (NumPy 1.20);
    # using `str` directly gives the same dtype without the warning.
    # NOTE(review): np.loadtxt does not skip a header row, whereas
    # read_from_file() does -- confirm the two stay aligned for this file.
    user_data = np.loadtxt(user_data_file, dtype=str)
    # Assuming the gender information is in the fourth column
    gender_ids = (user_data[:, 3] == 'm').astype(np.int64)

    # Filter users based on gender (male)
    male_user_ids = user_ids[gender_ids == 1][:max_users]
    user_indices = np.where(np.isin(user_ids, male_user_ids))[0]
    # isin() of an array against itself is all True, so all artists are kept;
    # retained so the column selection below is unchanged.
    artist_indices = np.where(np.isin(artist_ids, artist_ids))[0]
    UAM = UAM[user_indices][:, artist_indices]

    # NOTE(review): these are the compressed-storage `indices` / `indptr`
    # arrays of the sparse matrix, not COO row/col arrays as the names
    # suggest; `indptr` is a pointer array, not one index per stored entry.
    UAM_user_idx = UAM.indices
    UAM_artist_idx = UAM.indptr
    return UAM, UAM_user_idx, UAM_artist_idx, male_user_ids, artist_ids


# Function to read metadata (users or artists)
def read_from_file(filename, col):
    """Read one column of a tab-separated metadata file (users or artists).

    The first row is treated as a header and skipped.

    Parameters
    ----------
    filename : str
        Path of the tab-delimited text file to read.
    col : int
        Zero-based index of the column to extract from every data row.

    Returns
    -------
    list of str
        The selected field of each non-empty data row, in file order; an
        empty list when the file has no data rows.

    Raises
    ------
    IndexError
        If a non-empty data row has fewer than ``col + 1`` fields.
    """
    # The csv documentation recommends newline='' so the reader handles line
    # endings itself.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header row; tolerate a completely empty file.
        next(reader, None)
        # Empty rows (blank lines) are not records and are ignored.
        return [row[col] for row in reader if row]


# Main program
if __name__ == '__main__':
    # User-based collaborative filtering over the male-only UAM: for every
    # seed user, find the K most similar users and collect artists the seed
    # user has not listened to yet.
    import heapq

    # Read UAM
    UAM, UAM_user_idx, UAM_artist_idx, male_user_ids, artist_ids = read_UAM(UAM_MATLAB_FILE, USER_DATA_FILE)
    print('Male Users: ', len(male_user_ids))
    print('Artists: ', len(artist_ids))

    # Load metadata from provided files into lists (not used by the loop
    # below; kept so the names stay available in the notebook session).
    artists = read_from_file(ARTISTS_FILE, 1)
    users = read_from_file(USER_DATA_FILE, 0)

    # Row u of the gender-filtered UAM belongs to male_user_ids[u] -- the
    # same mapping used for the neighbours below -- not to row u of the
    # unfiltered user file.
    seed_user_ids = np.ravel(male_user_ids)

    # The transpose does not depend on the seed user, so build it once.
    UAM_T = UAM.transpose()

    # For all users
    for u in range(0, UAM.shape[0]):
        print("Seed user-id: " + str(seed_user_ids[u]))

        # get (normalized) playcount vector for current user u
        pc_vec = UAM.getrow(u)

        # Compute similarities as dot product between playcount vector of
        # user and all users via UAM (assuming that UAM is already normalized)
        uU_sim = pc_vec.dot(UAM_T).tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        #
        # Determine nearest neighbors to seed based on uUM
        #

        # Find the occurrence of the seed user in the similarity columns
        # and set it to 0 so that it is not selected as its own NN
        occ_user_idx = (uU_user_idx == u)
        uU_data[occ_user_idx] = 0

        # Eliminate zeros
        uU_sim.data = uU_data
        uU_sim = uU_sim.tocsr()
        uU_sim.eliminate_zeros()
        uU_sim = uU_sim.tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Sort users according to the similarity (ascending)
        sort_index = np.argsort(uU_data)

        # Select the K nearest neighbors among all users (the last K entries
        # of the ascending sort; fewer if less than K users are similar).
        # Note that uU_user_idx indeed provides the indices for users in UAM
        recommended_user_idx = uU_user_idx[sort_index[-K:]]
        # Get user_ids corresponding to nearest neighbors
        recommended_user_ids = male_user_ids[recommended_user_idx]
        # Get similarity score for nearest neighbors
        recommended_user_scores = uU_data[sort_index[-K:]]

        print("Nearest K=" + str(K) + " neighbors\' user-ids: ", recommended_user_ids.flatten())
        print("Scores/similarities:" + str(recommended_user_scores))
        print("Index in UAM for recommended user-ids:" + str(recommended_user_idx))

        #
        # Determine set of recommended artists
        #
        recommended_artists_idx = []
        for u_idx in recommended_user_idx:
            recommended_artists_idx.extend(list(UAM.getrow(u_idx).indices))

        # Convert to set to remove duplicates and sort it
        recommended_artists_idx = sorted(set(recommended_artists_idx))
        # Remove artists already known to seed user
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, pc_vec.indices)

        # Take up to 10 candidate artists. NOTE(review): this picks the 10
        # smallest artist indices, not the 10 highest-scoring artists.
        top_10_artists_idx = heapq.nsmallest(10, recommended_artists_idx)

        print("Indices of 10 recommended artists: ", top_10_artists_idx)




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  user_data = np.loadtxt(user_data_file, dtype=np.str)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Seed user-id: 30729490
Seed user-id: 30736147
Seed user-id: 30736620
Seed user-id: 30736678
Seed user-id: 30736795
Seed user-id: 30737312
Seed user-id: 30738157
Seed user-id: 30738892
Seed user-id: 30739242
Seed user-id: 30740088
Seed user-id: 30743292
Seed user-id: 30745062
Seed user-id: 30746884
Seed user-id: 30748150
Seed user-id: 30750128
Seed user-id: 30753555
Seed user-id: 30756231
Seed user-id: 30757642
Seed user-id: 30760801
Seed user-id: 30762172
Seed user-id: 30762814
Seed user-id: 30762972
Seed user-id: 30763502
Seed user-id: 30763768
Seed user-id: 30767643
Seed user-id: 30769790
Seed user-id: 30773667
Seed user-id: 30774250
Seed user-id: 30774640
Seed user-id: 30776222
Seed user-id: 30778224
Seed user-id: 30779589
Seed user-id: 30779630
Seed user-id: 30780785
Seed user-id: 30784847
Seed user-id: 30787199
Seed user-id: 30789895
Seed user-id: 30790869
Seed user-id: 30791108
Seed user-id: 30793035
Seed user-id: 3

In [None]:
# Load required modules
import csv
import numpy as np
import h5py
from scipy import sparse


UAM_MATLAB_FILE = 'LFM-1b_LEs.mat'         # Matlab .mat file where the listening events are stored
ARTISTS_FILE = "LFM-1b_artists.txt"        # artist names for UAM
USER_DATA_FILE = "output_data_new.txt"            # user names for UAM
K = 3                                      # maximum number of seed's neighbors to select


# Read the user-artist-matrix and corresponding artist and user indices from Matlab file
def read_UAM(m_file, user_data_file):
    """Load the user-artist matrix (UAM) for users whose gender is 'f' or 'm'.

    Parameters
    ----------
    m_file : str
        Path to the HDF5-based Matlab file. It must provide the datasets
        ``idx_users`` and ``idx_artists`` and the group ``/LEs/`` with the
        sparse components ``data``, ``ir`` and ``jc``.
    user_data_file : str
        Path to the whitespace-delimited user table read with ``np.loadtxt``;
        each row describes one user and column index 3 holds the gender.

    Returns
    -------
    tuple
        ``(UAM, UAM_user_idx, UAM_artist_idx, both_user_ids, artist_ids)``
        where ``UAM`` is the sparse matrix restricted to the selected users,
        ``both_user_ids`` are the ids of those users and ``artist_ids`` are
        all artist ids from the Matlab file.
    """
    # NOTE(review): hard-coded cap on the number of users; presumably the
    # number of rows in the user table -- confirm against the data set.
    max_users = 47587

    # Everything is copied into memory inside the `with`, so the HDF5 handle
    # can be closed right after loading (the original never closed it).
    with h5py.File(m_file, 'r') as mf:
        user_ids = np.array(mf.get('idx_users')).astype(np.int64)[:max_users]
        artist_ids = np.array(mf.get('idx_artists')).astype(np.int64)
        les = mf['/LEs/']
        # Load UAM
        UAM = sparse.csr_matrix((les['data'][()],
                                 les['ir'][()],
                                 les['jc'][()])).transpose()

    # Read gender information from the user data file. `np.str` was a
    # deprecated alias of the builtin `str` (NumPy 1.20), so `str` is
    # equivalent and avoids the warning.
    # NOTE(review): np.loadtxt does not skip a header row, whereas
    # read_from_file() does -- confirm the two stay aligned for this file.
    user_data = np.loadtxt(user_data_file, dtype=str)
    # Include both female and male users
    gender_ids = np.isin(user_data[:, 3], ['f', 'm']).astype(np.int64)

    # Filter users based on gender (both female and male)
    both_user_ids = user_ids[gender_ids == 1][:max_users]
    user_indices = np.where(np.isin(user_ids, both_user_ids))[0]
    # isin() of an array against itself is all True, so all artists are kept;
    # retained so the column selection below is unchanged.
    artist_indices = np.where(np.isin(artist_ids, artist_ids))[0]
    UAM = UAM[user_indices][:, artist_indices]

    # NOTE(review): these are the compressed-storage `indices` / `indptr`
    # arrays of the sparse matrix, not per-entry user/artist indices as the
    # names suggest; `indptr` is a pointer array.
    UAM_user_idx = UAM.indices
    UAM_artist_idx = UAM.indptr
    return UAM, UAM_user_idx, UAM_artist_idx, both_user_ids, artist_ids



# Function to read metadata (users or artists)
def read_from_file(filename, col):
    """Extract a single column from a tab-delimited file with a header row.

    Parameters
    ----------
    filename : str
        Path of the tab-delimited text file to read.
    col : int
        Zero-based index of the column to extract from every data row.

    Returns
    -------
    list of str
        Field ``col`` of every non-empty row after the header, in file
        order. A file without data rows (or without any rows) gives ``[]``.

    Raises
    ------
    IndexError
        If a non-empty data row has fewer than ``col + 1`` fields.
    """
    # Open with newline='' as recommended by the csv module documentation.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Drop the header; passing a default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines; those are skipped.
        return [row[col] for row in reader if row]


# Main program
if __name__ == '__main__':
    # User-based collaborative filtering over the UAM of female and male
    # users: for every seed user, find the K most similar users and collect
    # artists the seed user has not listened to yet.
    import heapq

    # Read UAM
    UAM, UAM_user_idx, UAM_artist_idx, both_user_ids, artist_ids = read_UAM(UAM_MATLAB_FILE, USER_DATA_FILE)
    print('Both Female & Male Users: ', len(both_user_ids))
    print('Artists: ', len(artist_ids))

    # Load metadata from provided files into lists (not used by the loop
    # below; kept so the names stay available in the notebook session).
    artists = read_from_file(ARTISTS_FILE, 1)
    users = read_from_file(USER_DATA_FILE, 0)

    # Row u of the gender-filtered UAM belongs to both_user_ids[u] -- the
    # same mapping used for the neighbours below -- not to row u of the
    # unfiltered user file.
    seed_user_ids = np.ravel(both_user_ids)

    # The transpose does not depend on the seed user, so build it once.
    UAM_T = UAM.transpose()

    # For all users
    for u in range(0, UAM.shape[0]):
        print("Seed user-id: " + str(seed_user_ids[u]))

        # get (normalized) playcount vector for current user u
        pc_vec = UAM.getrow(u)

        # Compute similarities as dot product between playcount vector of
        # user and all users via UAM (assuming that UAM is already normalized)
        uU_sim = pc_vec.dot(UAM_T).tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        #
        # Determine nearest neighbors to seed based on uUM
        #

        # Find the occurrence of the seed user in the similarity columns
        # and set it to 0 so that it is not selected as its own NN
        occ_user_idx = (uU_user_idx == u)
        uU_data[occ_user_idx] = 0

        # Eliminate zeros
        uU_sim.data = uU_data
        uU_sim = uU_sim.tocsr()
        uU_sim.eliminate_zeros()
        uU_sim = uU_sim.tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Sort users according to the similarity (ascending)
        sort_index = np.argsort(uU_data)

        # Select the K nearest neighbors among all users (the last K entries
        # of the ascending sort; fewer if less than K users are similar).
        # Note that uU_user_idx indeed provides the indices for users in UAM
        recommended_user_idx = uU_user_idx[sort_index[-K:]]
        # Get user_ids corresponding to nearest neighbors
        recommended_user_ids = both_user_ids[recommended_user_idx]
        # Get similarity score for nearest neighbors
        recommended_user_scores = uU_data[sort_index[-K:]]

        print("Nearest K=" + str(K) + " neighbors\' user-ids: ", recommended_user_ids.flatten())
        print("Scores/similarities:" + str(recommended_user_scores))
        print("Index in UAM for recommended user-ids:" + str(recommended_user_idx))

        #
        # Determine set of recommended artists
        #
        recommended_artists_idx = []
        for u_idx in recommended_user_idx:
            recommended_artists_idx.extend(list(UAM.getrow(u_idx).indices))

        # Convert to set to remove duplicates and sort it
        recommended_artists_idx = sorted(set(recommended_artists_idx))
        # Remove artists already known to seed user
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, pc_vec.indices)

        # Take up to 10 candidate artists. NOTE(review): this picks the 10
        # smallest artist indices, not the 10 highest-scoring artists.
        top_10_artists_idx = heapq.nsmallest(10, recommended_artists_idx)

        print("Indices of 10 recommended artists: ", top_10_artists_idx)




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  user_data = np.loadtxt(user_data_file, dtype=np.str)  # Assuming user data file is a text file with each row representing a user's data


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Seed user-id: 48290686
Seed user-id: 48290952
Seed user-id: 48291656
Seed user-id: 48291749
Seed user-id: 48291771
Seed user-id: 48292323
Seed user-id: 48293125
Seed user-id: 48293339
Seed user-id: 48294858
Seed user-id: 48295089
Seed user-id: 48295671
Seed user-id: 48295977
Seed user-id: 48296059
Seed user-id: 48296191
Seed user-id: 48296889
Seed user-id: 48296977
Seed user-id: 48298585
Seed user-id: 48299003
Seed user-id: 48299153
Seed user-id: 48299201
Seed user-id: 48299950
Seed user-id: 48301096
Seed user-id: 48301171
Seed user-id: 48301480
Seed user-id: 48301510
Seed user-id: 48301889
Seed user-id: 48302031
Seed user-id: 48302262
Seed user-id: 48302403
Seed user-id: 48303361
Seed user-id: 48303528
Seed user-id: 48304182
Seed user-id: 48304973
Seed user-id: 48305384
Seed user-id: 48306143
Seed user-id: 48307643
Seed user-id: 48308528
Seed user-id: 48308911
Seed user-id: 48309627
Seed user-id: 48309730
Seed user-id: 4