In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

## Data Handling Process

### 1. **Load Metadata**:
   - The metadata is loaded from `id_metadata.csv`, using tab (`'\t'`) as the delimiter, into the `metadata_df` DataFrame.

### 2. **Load Listening History**:
   - Listening history data is loaded from `listening_history.csv` into the `df` DataFrame.

### 3. **Convert Timestamps**:
   - The `timestamp` column in `df` is converted to datetime format to facilitate time-based operations.

In [2]:
metadata_path = '../data/id_metadata.csv'
listening_history_path = '../data/listening_history.csv'
metadata_df = pd.read_csv(metadata_path, delimiter='\t')
df = pd.read_csv(listening_history_path, delimiter='\t')

df['timestamp'] = pd.to_datetime(df['timestamp'])
df

Unnamed: 0,user,song,timestamp
0,user_007XIjOr,DaTQ53TUmfP93FSr,2019-02-20 12:28:00
1,user_007XIjOr,dGeyvi5WCOjDU7da,2019-02-20 12:35:00
2,user_007XIjOr,qUm54NYOjeFhmKYx,2019-02-20 12:48:00
3,user_007XIjOr,FtnuMT1DlevSR2n5,2019-02-20 12:52:00
4,user_007XIjOr,LHETTZcSZLeaVOGh,2019-02-20 13:09:00
...,...,...,...
5109587,user_zzWscYTy,BBiswLufo26YQCT7,2019-01-10 15:57:00
5109588,user_zzWscYTy,5ZHgff3sjETIiedr,2019-01-10 16:21:00
5109589,user_zzWscYTy,m4O1iLh6fC43xjRy,2019-01-10 16:48:00
5109590,user_zzWscYTy,mvUaP8k67qOFfA65,2019-01-10 21:13:00


## Data Filtering Process Based on Recent Dates

### 1. **Determine Latest Date**:
   - Calculate the most recent date (`latest_date`) in the `timestamp` column of the DataFrame `df`.

### 2. **Compute Date for One Week Ago**:
   - Subtract 7 days from the `latest_date` using `pd.Timedelta`, resulting in the date `one_week_ago`.

### 3. **Filter Recent Data**:
   - Restrict `df` to only include rows where the `timestamp` is on or after `one_week_ago`, effectively filtering the data to the last week.

### 4. **Output Filtered DataFrame**:
   - The resulting DataFrame `df` now contains only the records from the past week, ready for analysis or further processing.


In [3]:
latest_date = df['timestamp'].max()
one_week_ago = latest_date - pd.Timedelta(days=7)
df = df[df['timestamp'] >= one_week_ago]
df

Unnamed: 0,user,song,timestamp
5288,user_02jFGVkG,ke5JKpLa8Dw7dCDO,2019-03-19 12:39:00
5289,user_02jFGVkG,nRpEFcFzYZ8Z8Cye,2019-03-19 12:44:00
5290,user_02jFGVkG,UTDxdZ1outySsU7O,2019-03-19 15:18:00
5291,user_02jFGVkG,UTDxdZ1outySsU7O,2019-03-19 15:24:00
5292,user_02jFGVkG,bxo3drSzBGDlrodp,2019-03-19 15:24:00
...,...,...,...
5104162,user_zwQunLVn,A8GhNkPLuKusMIvF,2019-03-21 18:29:00
5104163,user_zwQunLVn,0m1MyuwLx1EjuQqD,2019-03-21 22:29:00
5104164,user_zwQunLVn,0m1MyuwLx1EjuQqD,2019-03-21 22:33:00
5104165,user_zwQunLVn,l2kISVY4j8iz1Gg7,2019-03-21 22:36:00


## DataFrame Operations Overview

### 1. **Check for Missing Values**:
   - `df.isnull().sum()` calculates the total number of missing values in each column of the DataFrame.
   
### 2. **Identify Unique Entries**:
   - `df.song.unique()` retrieves an array of unique song IDs from the `song` column.
   - `df.user.unique()` retrieves an array of unique user IDs from the `user` column.

### 3. **Calculate Song Popularity**:
   - The popularity of each song is calculated as the frequency of the song's appearance in the DataFrame divided by the total number of unique songs.
   - This calculated popularity is then mapped back to the `song` column of `df` and stored in a new column `song_popularity`.

The final DataFrame `df` is enhanced with a new `song_popularity` column which provides a relative measure of how frequently each song appears in the dataset, adjusted by the number of unique songs.


In [4]:
df.isnull().sum()

user         0
song         0
timestamp    0
dtype: int64

In [5]:
unique_names_song = df.song.unique()
unique_names_user = df.user.unique()
unique_names_song.shape, unique_names_user.shape

((18193,), (633,))

In [6]:
df.isnull().values.any()
song_popularity = df['song'].value_counts() / len(unique_names_song)
df['song_popularity'] = df['song'].map(song_popularity)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['song_popularity'] = df['song'].map(song_popularity)


Unnamed: 0,user,song,timestamp,song_popularity
5288,user_02jFGVkG,ke5JKpLa8Dw7dCDO,2019-03-19 12:39:00,0.000220
5289,user_02jFGVkG,nRpEFcFzYZ8Z8Cye,2019-03-19 12:44:00,0.000715
5290,user_02jFGVkG,UTDxdZ1outySsU7O,2019-03-19 15:18:00,0.000110
5291,user_02jFGVkG,UTDxdZ1outySsU7O,2019-03-19 15:24:00,0.000110
5292,user_02jFGVkG,bxo3drSzBGDlrodp,2019-03-19 15:24:00,0.000110
...,...,...,...,...
5104162,user_zwQunLVn,A8GhNkPLuKusMIvF,2019-03-21 18:29:00,0.000055
5104163,user_zwQunLVn,0m1MyuwLx1EjuQqD,2019-03-21 22:29:00,0.000879
5104164,user_zwQunLVn,0m1MyuwLx1EjuQqD,2019-03-21 22:33:00,0.000879
5104165,user_zwQunLVn,l2kISVY4j8iz1Gg7,2019-03-21 22:36:00,0.000879


## Building Interaction Matrix and Enriched Data

### 1. **Initialize Interaction Matrix**:
   - An interaction matrix is created with dimensions corresponding to the unique count of users and songs, initialized with zeros.

### 2. **Map Users and Songs to Matrix Indices**:
   - Dictionaries `user_indices` and `song_indices` are created to map user and song identifiers to matrix indices for easy access.

### 3. **Populate Interaction Matrix**:
   - Iterate through the DataFrame `df`, using mapped indices to fill the matrix with the logarithm of song popularity incremented by one, to factor in popularity dynamics in interactions.

### 5. **Prepare Data for Detailed Interaction DataFrame**:
   - Arrays are prepared for user IDs, song IDs and interaction values by iterating over the interaction matrix for each song-user pair.

### 6. **Construct Feature-Rich DataFrame**:
   - A new DataFrame `interaction_df` is created to encapsulate user IDs, song IDs and their interaction.


In [7]:
interaction_matrix = np.zeros((df['user'].nunique(), len(unique_names_song)))

# Map users and songs to matrix indices
user_indices = {user: idx for idx, user in enumerate(df['user'].unique())}
song_indices = {song: idx for idx, song in enumerate(unique_names_song)}

for index, row in df.iterrows():
    user_idx = user_indices[row['user']]
    song_idx = song_indices[row['song']]
    interaction_matrix[user_idx, song_idx] = np.log(row['song_popularity'] + 1)

# Create lists for DataFrame
user_ids, song_ids,  interactions = [], [], []
for user in user_indices:
    for song in song_indices:
        user_ids.append(user_indices[user])
        song_ids.append(song_indices[song])
        interactions.append(interaction_matrix[user_indices[user], song_indices[song]])

# Create the interaction DataFrame
interaction_df = pd.DataFrame({
    'user_id': user_ids,
    'song_id': song_ids,
    'interaction': interactions
})

interaction_df

Unnamed: 0,user_id,song_id,interaction
0,0,0,0.000220
1,0,1,0.000714
2,0,2,0.000110
3,0,3,0.000110
4,0,4,0.000110
...,...,...,...
11516164,632,18188,0.000055
11516165,632,18189,0.000055
11516166,632,18190,0.000055
11516167,632,18191,0.000055


## Data Encoding and Basic Statistics

### 1. **Encode User and Song Identifiers**:
   - `LabelEncoder` is used to transform non-numeric user and song identifiers into numeric representations. The transformed identifiers are stored in new columns `user_id` and `song_id` in the DataFrame `df`.

### 2. **Calculate Unique Counts**:
   - Calculate the number of unique users (`N`) and the number of unique songs (`M`) from the newly encoded `user_id` and `song_id` columns.


In [8]:
user_encoder = LabelEncoder()
song_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['user'])
df['song_id'] = song_encoder.fit_transform(df['song'])

N = df.user_id.nunique()  # Number of users
M = df.song_id.nunique()  # Number of songs

print(N, M)
print(df.shape, interaction_df.shape)

633 18193
(68090, 6) (11516169, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = user_encoder.fit_transform(df['user'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['song_id'] = song_encoder.fit_transform(df['song'])


## Data Splitting and Exploration

### 1. **Split Data into Training and Testing Sets**:
   - The `interaction_df` DataFrame is split into training (`df_train`) and testing sets (`df_test`) using a 20% test size allocation and a random seed for reproducibility.

### 2. **Evaluate Unique Interaction Values**:
   - Determine the number of unique interaction values within the `df_train` using `interaction.nunique()` to understand the diversity of user-song interactions.

### 4. **Check for Missing Values**:
   - Calculate and display the total count of missing values per column in both `df_train` and `df_test` using `isnull().sum()` to assess data cleanliness and readiness for further processing.

In [9]:
df_train, df_test = train_test_split(interaction_df, test_size=0.2, random_state=42)
df_train

Unnamed: 0,user_id,song_id,interaction
3437593,188,17309,0.0
10363724,569,11907,0.0
6280692,345,4107,0.0
3531948,194,2506,0.0
4391169,241,6656,0.0
...,...,...,...
2234489,122,14943,0.0
4304572,236,11024,0.0
10081351,554,2429,0.0
6550634,360,1154,0.0


In [10]:
df_train.interaction.nunique()

109

In [11]:
df_test.isnull().sum()

user_id        0
song_id        0
interaction    0
dtype: int64

In [12]:
df_train.isnull().sum()

user_id        0
song_id        0
interaction    0
dtype: int64

## Model Loading and Inspection

### 1. **Load Pre-trained Model**:
   - The Keras model is loaded from a specified path (`model_path`), where it was previously saved as `baseline-model.h5`.

### 2. **Display Model Architecture**:
   - Use `model.summary()` to print the structure of the model. This includes details of all layers, their types, outputs, and the number of parameters both trainable and non-trainable.

In [13]:
model_path = '../helpers/baseline-model.h5'
model = keras.models.load_model(model_path)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 15)        9495        ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 15)        272895      ['input_2[0][0]']                
                                                                                              

## Predictive Model Input Preparation and Execution

### 1. **Select Specific User**:
   - A specific user (`specific_user_id`) is selected from the test data to focus the predictions on.

### 2. **Prepare User Input Array**:
   - Create an array `user_input` where the selected user’s ID is repeated for each song, ensuring each song is paired with the user for prediction purposes.

### 3. **Prepare Song Input Array**:
   - Generate `song_input` as an array of indices representing all unique songs (`M` is the total count of unique songs).

### 4. **Map Song IDs to DataFrame Indices**:
   - Construct a dictionary `song_id_to_index` that maps each song ID to its corresponding index in `interaction_df` for efficient data retrieval.

### 5. **Execute Predictions**:
   - Make predictions for all songs for the selected user using the prepared inputs (`user_input`, `song_input`). 

### 6. **Adjust Predictions**:
   - Normalize predicted interactions by adding the mean interaction value (`mu`) from the training set to each prediction, compensating for any baseline shifts in interaction levels.

In [14]:
user_ids_test = df_test.user_id.values
song_ids_test = df_test.song_id.values

# Select a specific user for the prediction
specific_user_id = user_ids_test[0]

# Prepare input data for the model
M = interaction_df['song_id'].nunique()  # Total number of unique songs
user_input = np.array([specific_user_id] * M)  # Repeat the user ID for each song
song_input = np.array(range(M))  # Array of all unique song IDs

# Map song IDs to indices in df
song_id_to_index = {id: idx for idx, id in enumerate(interaction_df['song_id'].unique())}

# Make predictions for this user with all songs
predicted_interactions = model.predict([user_input, song_input])

mu = df_train.interaction.mean()  # Mean interaction value for normalization
predicted_interactions = predicted_interactions.flatten() + mu
predicted_interactions



array([ 0.00088141, -0.00157933, -0.0007543 , ...,  0.00566827,
       -0.00082867, -0.00432805], dtype=float32)

## Extracting Top Song Recommendations

### 1. **Determine Number of Recommendations**:
   - Define `N` as 10 to specify the number of top recommendations to be retrieved.

### 2. **Identify Top Recommendations**:
   - Use `np.argsort()` on the `predicted_interactions` array to get indices of songs sorted by predicted interaction strength.
   - Reverse the order (`[::-1]`) to start with the highest values and select the top `N` indices (`top_n_indices`).

### 3. **Map Indices to Original Song IDs**:
   - Convert the top indices (`top_n_indices`) back to original song IDs using the `song_encoder.inverse_transform()` function, yielding `top_n_song_ids`.

In [15]:
# Determine the number of top recommendations top 10
N = 10
top_n_indices = np.argsort(predicted_interactions)[::-1][:N]

# Convert the indices to original song IDs
top_n_song_ids = song_encoder.inverse_transform(top_n_indices)

print(f"Top {N} recommended song IDs for user {specific_user_id} are:", top_n_song_ids)
print(user_input.shape)
print(song_input.shape)
print(df[df.user_id == specific_user_id].song.unique())

Top 10 recommended song IDs for user 277 are: ['BajuXAK1vYSV9DqI' 'A561aRQ5xWskGyTy' 'XCh4gauugO7p5yEq'
 '0lHFJ5a22QJltO8s' 'QDtAFWCvEaRwFPnC' '2iuSNjkeDoqs4zjd'
 'DyoJSQa9jxP6YjKw' 'RwP7Pv5i89I1rHvi' 'Su1CX9btXIQzVWUY'
 'prOM9bF9uzuC4rqR']
(18193,)
(18193,)
['wG3YQktIKVlZruUu' 'J4tQCUzKKKj3OkV0' 'AWLAvZOhu4vLV6RA'
 'b5ef6NhN3RBrQXIE' 'SAryWbXDkmkFWW3Z' 'GQoNBkAIX2obbtPn'
 'Ax8Us7NbJqe08R0a' 'Br8x0lv4QnA6pnqC' 'ilrbbVMFVbnwmdFp'
 'rPcvhlfan3sOTDX8' 'RCGLlYCigkopKd7D' '6WlGmqDauWacBf9D'
 'vZ3hXeIO65MA9fBi' 's7f6VKariF11TTjK' 'aAXwzXQZdbdPxFqT'
 'L6AP1Ype1fRuWBwD' '5yY4kVyuJPdbOegL' '2HgkT8eEkxxEo2Tg'
 'bhOXqQHAGOeJqm6I' 'X8ZpmMJo8zT2QvcT' 'MwpFtrA5qYK4v5tl'
 '6U91Xztu28VOYkmU' 'NsNXlxbPuFnyPv9p' 'OBkwaSaAAVVcBaag']


## Displaying Detailed Song Information for Recommendations

### 1. **Load Song Metadata**:
   - Import song metadata from `id_information.csv` using `pd.read_csv()`, specifying a tab (`'\t'`) as the delimiter, into the DataFrame `id_information`.

### 2. **Filter Relevant Song Details**:
   - Filter `id_information` to include only the entries corresponding to the `top_n_song_ids`, which are the IDs of the top recommended songs. This is accomplished using the `isin()` method, ensuring that only relevant song information is considered.

### 3. **Display Top Recommended Songs**:
   - Print details of the top recommended songs specifically tailored for the user (`specific_user_id`). The details displayed include the artist, song title, and album name from the `recommended_songs` DataFrame.
   - This step highlights the song information, providing a more meaningful context to the recommendations, such as knowing the artist and album for each recommended song.

In [16]:
# Load song information
id_information = pd.read_csv('../data/id_information.csv', sep='\t')

# Filter id_information to only include the top N recommended song IDs
recommended_songs = id_information[id_information['id'].isin(top_n_song_ids)]

# Print the details of the top N recommended songs
print(f"Top {N} recommended songs for user {specific_user_id} are:")
print(recommended_songs[['artist', 'song', 'album_name']])

Top 10 recommended songs for user 277 are:
                     artist  \
1372    Pinkshinyultrablast   
4875               Gorillaz   
17728       Brandy & Monica   
20425  Ludwig van Beethoven   
24578         Kylie Minogue   
46270          iamamiwhoami   
49352           Nicki Minaj   
51068           beabadoobee   
58665           Paula Abdul   
91362       Hiroyuki Sawano   

                                                    song  \
1372                                          Land's End   
4875                                          Magic City   
17728                                    The Boy Is Mine   
20425  Symphony No. 7 in A Major, Op. 92: II. Allegretto   
24578                                           Timebomb   
46270                                          blue blue   
49352                                    Roman's Revenge   
51068                                     If You Want To   
58665                                       Cold Hearted   
91362          

## Analyzing and Displaying User's Recent Song History

### 1. **Filter Songs for Specific User**:
   - Extract rows from the DataFrame `df` where the `user_id` matches the specific user (`specific_user_id`). This subset contains all the songs interacted with by this particular user.

### 2. **Sort Songs by Recent Play**:
   - Sort the filtered data (`user_songs`) by the `timestamp` column in descending order to prioritize the most recent interactions. This sorted DataFrame is stored as `user_songs_sorted`.

### 3. **Identify Last 5 Played Songs**:
   - Retrieve the IDs of the last five songs played by this user from the top of the sorted DataFrame, ensuring these are the most recent songs interacted with.

### 4. **Filter Song Metadata**:
   - Use the song IDs (`last_5_song_ids`) to filter `id_information` to include only metadata for these last five songs. This step ensures that the information displayed pertains only to the most recent song interactions.

### 5. **Display Song Information**:
   - Print details about these last five songs, including the artist, song title, and album name, providing a comprehensive view of the user’s most recent music preferences.

In [17]:
# Filter the DataFrame for the specific user
user_songs = df[df.user_id == specific_user_id]

# Sort the data by the timestamp column in descending order
user_songs_sorted = user_songs.sort_values(by='timestamp', ascending=False)

last_5_song_ids = user_songs_sorted['song'].head(5).values
print(last_5_song_ids)

last_5_songs_info = id_information[id_information['id'].isin(last_5_song_ids)]
print("Information about the last 5 songs played:")
print(last_5_songs_info[['artist', 'song', 'album_name']])

['rPcvhlfan3sOTDX8' 'ilrbbVMFVbnwmdFp' 'OBkwaSaAAVVcBaag'
 'NsNXlxbPuFnyPv9p' '6U91Xztu28VOYkmU']
Information about the last 5 songs played:
              artist              song                          album_name
11445  Marty Robbins          Big Iron  Gunfighter Ballads And Trail Songs
42102  Marty Robbins     Billy the Kid  Gunfighter Ballads And Trail Songs
42687  Marty Robbins        Utah Carol  Gunfighter Ballads And Trail Songs
78856      blink-182  When I Was Young                    Dogs Eating Dogs
94075      blink-182        Boxing Day                    Dogs Eating Dogs


## Generating and Comparing Predicted Recommendations with Actual Preferences

### 1. **Define Function to Retrieve Actual Relevant Songs**:
   - The `get_relevant_songs` function is designed to fetch the unique song IDs that a specific user has interacted with from the DataFrame `df`. This establishes a baseline of songs that are known to be relevant to the user.

### 2. **Initialize Prediction and Actual Dictionaries**:
   - Two dictionaries, `predictions` and `actual`, are initialized to store the predicted top song IDs and actual relevant song IDs for each user, respectively.

### 3. **Iterate Over a Subset of Users**:
   - Loop through each user ID in the test set. This looping facilitates the prediction and validation process for multiple users in a manageable subset.

### 4. **Generate Predictions for Each User**:
   - For each user:
     - Create an input array (`user_input`) that repeats the user ID for each song, corresponding to the total number of unique songs (`M`).
     - Predict interaction scores using the model for all songs with the prepared inputs. Flatten the result to simplify handling.
     - Sort the predicted scores in descending order and extract the indices of the top `N` scores.
     - Use `song_encoder.inverse_transform` to convert these indices back into original song IDs (`top_n_song_ids`).

### 5. **Store Predictions and Actual Song IDs**:
   - Store the top `N` predicted song IDs for each user in the `predictions` dictionary.
   - Fetch and store the actual relevant songs for the user using `get_relevant_songs` and store them in the `actual` dictionary.

### 6. **Output Progress**:
   - Print the total number of users being processed and the current progress after each user's data is processed to monitor the computation and ensure it is proceeding correctly.

In [18]:
def get_relevant_songs(user_id, df):
    return df[df.user_id == user_id]['song'].unique()

predictions = {}
actual = {}

i = 0
# Get predictions and actual relevant songs
for user_id in user_ids_test[:500]: 
    # Predict top N songs
    user_input = np.array([user_id] * M)
    predicted_interactions = model.predict([user_input, song_input]).flatten()
    top_n_indices = np.argsort(predicted_interactions)[::-1][:N]
    top_n_song_ids = song_encoder.inverse_transform(top_n_indices)

    # Store the top N song IDs
    predictions[user_id] = top_n_song_ids.tolist()

    # Get actual relevant songs
    actual_relevant_songs = get_relevant_songs(user_id, df)
    actual[user_id] = actual_relevant_songs.tolist()
    i += 1
    print(len(user_ids_test), i)

2303234 1
2303234 2
2303234 3
2303234 4
2303234 5
2303234 6
2303234 7
2303234 8
2303234 9
2303234 10
2303234 11
2303234 12
2303234 13
2303234 14
2303234 15
2303234 16
2303234 17
2303234 18
2303234 19
2303234 20
2303234 21
2303234 22
2303234 23
2303234 24
2303234 25
2303234 26
2303234 27
2303234 28
2303234 29
2303234 30
2303234 31
2303234 32
2303234 33
2303234 34
2303234 35
2303234 36
2303234 37
2303234 38
2303234 39
2303234 40
2303234 41
2303234 42
2303234 43
2303234 44
2303234 45
2303234 46
2303234 47
2303234 48
2303234 49
2303234 50
2303234 51
2303234 52
2303234 53
2303234 54
2303234 55
2303234 56
2303234 57
2303234 58
2303234 59
2303234 60
2303234 61
2303234 62
2303234 63
2303234 64
2303234 65
2303234 66
2303234 67
2303234 68
2303234 69
2303234 70
2303234 71
2303234 72
2303234 73
2303234 74
2303234 75
2303234 76
2303234 77
2303234 78
2303234 79
2303234 80
2303234 81
2303234 82
2303234 83
2303234 84
2303234 85
2303234 86
2303234 87
2303234 88
2303234 89
2303234 90
2303234 91
2303234 

2303234 121
2303234 122
2303234 123
2303234 124
2303234 125
2303234 126
2303234 127
2303234 128
2303234 129
2303234 130
2303234 131
2303234 132
2303234 133
2303234 134
2303234 135
2303234 136
2303234 137
2303234 138
2303234 139
2303234 140
2303234 141
2303234 142
2303234 143
2303234 144
2303234 145
2303234 146
2303234 147
2303234 148
2303234 149
2303234 150
2303234 151
2303234 152
2303234 153
2303234 154
2303234 155
2303234 156
2303234 157
2303234 158
2303234 159
2303234 160
2303234 161
2303234 162
2303234 163
2303234 164
2303234 165
2303234 166
2303234 167
2303234 168
2303234 169
2303234 170
2303234 171
2303234 172
2303234 173
2303234 174
2303234 175
2303234 176
2303234 177
2303234 178
2303234 179
2303234 180
2303234 181
2303234 182
2303234 183
2303234 184
2303234 185
2303234 186
2303234 187
2303234 188
2303234 189
2303234 190
2303234 191
2303234 192
2303234 193
2303234 194
2303234 195
2303234 196
2303234 197
2303234 198
2303234 199
2303234 200
2303234 201
2303234 202
2303234 203
2303

2303234 239
2303234 240
2303234 241
2303234 242
2303234 243
2303234 244
2303234 245
2303234 246
2303234 247
2303234 248
2303234 249
2303234 250
2303234 251
2303234 252
2303234 253
2303234 254
2303234 255
2303234 256
2303234 257
2303234 258
2303234 259
2303234 260
2303234 261
2303234 262
2303234 263
2303234 264
2303234 265
2303234 266
2303234 267
2303234 268
2303234 269
2303234 270
2303234 271
2303234 272
2303234 273
2303234 274
2303234 275
2303234 276
2303234 277
2303234 278
2303234 279
2303234 280
2303234 281
2303234 282
2303234 283
2303234 284
2303234 285
2303234 286
2303234 287
2303234 288
2303234 289
2303234 290
2303234 291
2303234 292
2303234 293
2303234 294
2303234 295
2303234 296
2303234 297
2303234 298
2303234 299
2303234 300
2303234 301
2303234 302
2303234 303
2303234 304
2303234 305
2303234 306
2303234 307
2303234 308
2303234 309
2303234 310
2303234 311
2303234 312
2303234 313
2303234 314
2303234 315
2303234 316
2303234 317
2303234 318
2303234 319
2303234 320
2303234 321
2303

2303234 358
2303234 359
2303234 360
2303234 361
2303234 362
2303234 363
2303234 364
2303234 365
2303234 366
2303234 367
2303234 368
2303234 369
2303234 370
2303234 371
2303234 372
2303234 373
2303234 374
2303234 375
2303234 376
2303234 377
2303234 378
2303234 379
2303234 380
2303234 381
2303234 382
2303234 383
2303234 384
2303234 385
2303234 386
2303234 387
2303234 388
2303234 389
2303234 390
2303234 391
2303234 392
2303234 393
2303234 394
2303234 395
2303234 396
2303234 397
2303234 398
2303234 399
2303234 400
2303234 401
2303234 402
2303234 403
2303234 404
2303234 405
2303234 406
2303234 407
2303234 408
2303234 409
2303234 410
2303234 411
2303234 412
2303234 413
2303234 414
2303234 415
2303234 416
2303234 417
2303234 418
2303234 419
2303234 420
2303234 421
2303234 422
2303234 423
2303234 424
2303234 425
2303234 426
2303234 427
2303234 428
2303234 429
2303234 430
2303234 431
2303234 432
2303234 433
2303234 434
2303234 435
2303234 436
2303234 437
2303234 438
2303234 439
2303234 440
2303

2303234 476
2303234 477
2303234 478
2303234 479
2303234 480
2303234 481
2303234 482
2303234 483
2303234 484
2303234 485
2303234 486
2303234 487
2303234 488
2303234 489
2303234 490
2303234 491
2303234 492
2303234 493
2303234 494
2303234 495
2303234 496
2303234 497
2303234 498
2303234 499
2303234 500


## Recommendation System Evaluation Metrics

### Metric Calculations

#### 1. **Precision at K**:
   - Measures the proportion of recommended items in the top-K set that are relevant.

#### 2. **Recall at K**:
   - Assesses how many relevant items are found in the top-K recommendations.

#### 3. **Mean Average Precision at K (MAP@K)**:
   - Computes the mean of the average precision scores for each user, considering only the top-K recommendations.

#### 4. **Mean Reciprocal Rank (MRR)**:
   - Calculates the average of the reciprocal of the rank of the first relevant item among the recommendations.

#### 5. **Normalized Discounted Cumulative Gain at K (NDCG@K)**:
   - Evaluates the gain of a recommendation based on its position in the result list, giving higher importance to hits at top ranks.

### Functions Defined

- **`precision_at_k`**: Compares the top-K predicted items to the actual relevant items for each user to calculate precision.
- **`recall_at_k`**: Identifies how many of the relevant items appear in the top-K predictions for each user.
- **`mean_avg_precision_at_k`** and **`mean_average_precision_at_k`**: Both calculate the average precision at K for predictions against actual data.
- **`mean_reciprocal_rank`**: Computes the average reciprocal rank where the rank is the position of the first relevant recommendation.
- **`dcg_at_k`**: Computes the Discounted Cumulative Gain at K, a measure of ranking quality.
- **`ndcg_at_k`**: Normalizes the DCG at K by the ideal or perfect DCG at K, providing a measure of the model's performance relative to the best possible scenario.


In [19]:
def precision_at_k(actual, predicted, k):
    precision_scores = []
    for user_id in actual:
        # Initialize true positives count
        true_positives = 0
        # Check if user exists in predictions
        if user_id in predicted and len(predicted[user_id]) >= k:
            # Count the number of relevant items in the top k predictions
            true_positives = len(set(predicted[user_id][:k]) & set(actual[user_id]))
        # Calculate precision for this user
        precision = true_positives / float(k)
        precision_scores.append(precision)
    # Return the average precision at k for all users
    return sum(precision_scores) / len(precision_scores)


def recall_at_k(actual, predicted, k):
    recall_scores = []
    for user_id in actual:
        # Initialize true positives count
        true_positives = 0
        # Check if user exists in predictions
        if user_id in predicted:
            # Count the number of relevant items in the top k predictions
            true_positives = len(set(predicted[user_id][:k]) & set(actual[user_id]))
            recall = true_positives / float(len(actual[user_id]))
        else:
            # If no predictions for the user, recall is 0
            recall = 0.0
        recall_scores.append(recall)
    # Return the average recall at k for all users
    return sum(recall_scores) / len(recall_scores)


def avg_precision_at_k(actual, predicted, k=10):
    ap_sum = 0
    for user, true_items in actual.items():
        pred_items = predicted[user][:k]
        hits = 0
        sum_precs = 0
        for i, p in enumerate(pred_items):
            if p in true_items:
                hits += 1
                sum_precs += hits / (i + 1.0)
        ap_sum += sum_precs / min(len(true_items), k)
    return ap_sum / len(actual)


def mean_avg_precision_at_k(actual, predicted, k=10):
    return avg_precision_at_k(actual, predicted, k)


def mean_average_precision_at_k(actual, predicted, k=10):
    AP_sum = 0.0
    for user_id in actual:
        if user_id in predicted:
            pred_items = predicted[user_id][:k]
            hits = 0
            sum_precisions = 0
            for i, p in enumerate(pred_items):
                if p in actual[user_id] and p not in pred_items[:i]:
                    hits += 1
                    sum_precisions += hits / (i + 1.0)
            AP_sum += sum_precisions / min(len(actual[user_id]), k)
    return AP_sum / len(actual)


def mean_reciprocal_rank(actual, predicted):
    MRR_sum = 0.0
    for user_id in actual:
        if user_id in predicted:
            pred_items = predicted[user_id]
            for rank, p in enumerate(pred_items, start=1):
                if p in actual[user_id]:
                    MRR_sum += 1.0 / rank
                    break
    return MRR_sum / len(actual)


def dcg_at_k(relevances, k):
    relevances = np.asfarray(relevances)[:k]
    if relevances.size:
        return np.sum(relevances / np.log2(np.arange(2, relevances.size + 2)))
    return 0.0


def ndcg_at_k(actual, predicted, k=10):
    NDCG_sum = 0.0
    for user_id in actual:
        if user_id in predicted:
            pred_items = predicted[user_id][:k]
            true_relevances = [1 if item in actual[user_id] else 0 for item in pred_items]
            ideal_relevances = [1] * len(actual[user_id])
            NDCG_sum += dcg_at_k(true_relevances, k) / dcg_at_k(ideal_relevances, k)
    return NDCG_sum / len(actual)

k = 10
precision = precision_at_k(actual, predictions, k)
recall = recall_at_k(actual, predictions, k)
map_k = mean_average_precision_at_k(actual, predictions, k)
mrr = mean_reciprocal_rank(actual, predictions)
ndcg_k = ndcg_at_k(actual, predictions, k)

print(f"Precision@{k}: {precision}")
print(f"Recall@{k}: {recall}")
print(f"MAP@{k}: {map_k}")
print(f"MRR: {mrr}")
print(f"NDCG@{k}: {ndcg_k}")

Precision@10: 0.0041666666666666675
Recall@10: 0.0009490057461604342
MAP@10: 0.0015367535903250192
MRR: 0.015334467120181404
NDCG@10: 0.004696199420932304
