In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Suppress native-hadoop warning
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties

In [2]:
import sys

# Project root. Defined once so the import path below and the model paths
# built from BASE_DIR later in the notebook cannot drift apart.
BASE_DIR = '/home/work'

# Make modules under BASE_DIR importable. The membership check keeps sys.path
# from accumulating duplicate entries when this cell is re-run.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

In [4]:
import pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

from data.utils import load_from_hdfs, user_based_train_test_split, load_data
from models.utils import load_model
from models.evaluation_metrics import calculate_rmse, calculate_mae, calculate_song_coverage, calculate_user_coverage, calculate_precision_recall

In [5]:
# Spark configuration: key/value pairs applied to the SparkConf below
spark_options = [
    ('spark.master', 'local[3]'),
    ('spark.app.name', 'MusicRecommender'),
    ('spark.driver.memory','14g'),
    # ('spark.sql.shuffle.partitions', '200'),
]

# Build the configuration object, then create (or reuse) the session from it
conf = pyspark.SparkConf()
conf.setAll(spark_options)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Echo every setting reported by the running SparkContext, one tuple per line
settings = spark.sparkContext.getConf().getAll()
for setting in settings:
    print(setting)

('spark.master', 'local[3]')
('spark.executor.id', 'driver')
('spark.driver.host', '693f94dcf7da')
('spark.app.startTime', '1717134134288')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false')
('spark.driver.por

## Evaluate Models

In [6]:
# --- Parameters forwarded to user_based_train_test_split ---
test_user_ratings = 10
data_scale = 0.2
seed = 42

# --- Models to evaluate (model_dirs and model_names are index-aligned) ---
model_type = 'ALS'
model_names = ['raw', 'user_rating_downsampled', 'user_song_rating_downsampled']
model_dirs = ['als_model/raw', 'als_model/user_rating_downsampled', 'als_model/user_song_rating_downsampled']

# One result row (dict) is appended here per evaluated model
results = []

# --- Source datasets ---
# The raw data arrives as a train/test pair from HDFS and is merged back into
# a single frame so it can be re-split the same way as the other datasets.
raw_train, raw_test = load_from_hdfs('raw', 2)
combined_raw = raw_train.union(raw_test)

user_rating_downsampled = load_data('processed/user_rating_downsampled.txt')
user_song_rating_downsampled = load_data('processed/user_and_song_rating_downsampled.txt')

datasets = [combined_raw, user_rating_downsampled, user_song_rating_downsampled]

# Apply the same split parameters to every dataset
split_datasets = [
    user_based_train_test_split(source_df, test_user_ratings, data_scale, seed)
    for source_df in datasets
]

# --- Fixed totals used as the "overall" denominators for coverage ---
total_size = 717_872_016
total_users = 1_823_179
total_songs = 136_736

# Placeholders, overwritten on every loop iteration
user_recs = None
predictions = None

batch_size = 10_000
n_recs = 10

# Human-readable dataset labels, index-aligned with `datasets` / `split_datasets`.
# The results row needs a string here: storing the (train, test) DataFrame
# tuple itself cannot be turned into a results table with a string
# "Dataset" column.
dataset_names = ['combined_raw', 'user_rating_downsampled', 'user_song_rating_downsampled']

# Evaluate each saved model against the split of the dataset at the same index.
for model_dir, model_name, dataset_name, dataset in zip(model_dirs, model_names, dataset_names, split_datasets):
    train_data, test_data = dataset

    # Clear Cache (release anything cached by the previous iteration)
    spark.catalog.clearCache()

    # Load Model
    model_path = f'file://{BASE_DIR}/models/{model_dir}'
    model = load_model(model_type, model_path)

    # Train Metrics: distinct users / songs in the training split
    train_user_ids = train_data.select('user_id').distinct()
    train_users = train_user_ids.count()
    train_song_ids = train_data.select('song_id').distinct()
    train_songs = train_song_ids.count()

    # Test Metrics: distinct users / songs in the test split
    test_user_ids = test_data.select('user_id').distinct()
    test_users = test_user_ids.count()
    test_song_ids = test_data.select('song_id').distinct()
    test_songs = test_song_ids.count()

    spark.catalog.clearCache()

    # Recommendation Metrics: top-n recommendations per user, flattened to one
    # (user_id, song_id, rating) row per recommended song
    user_recs = model.recommendForAllUsers(n_recs)
    user_recs = user_recs.select('user_id', F.explode('recommendations').alias('recommendation'))
    user_recs = user_recs.select('user_id', F.col('recommendation.song_id').alias('song_id'), F.col('recommendation.rating').alias('rating'))

    recommendation_users = user_recs.select('user_id').distinct().count()
    recommendation_songs = user_recs.select('song_id').distinct().count()

    # Coverage Metrics: each count is compared against the training split
    # ("Model") and against the fixed dataset totals ("Overall").
    # NOTE(review): presumably the helpers return second_arg / first_arg —
    # confirm in models.evaluation_metrics.
    test_song_coverage = calculate_song_coverage(train_songs, test_songs)
    test_overall_song_coverage = calculate_song_coverage(total_songs, test_songs)
    test_user_coverage = calculate_user_coverage(train_users, test_users)
    test_overall_user_coverage = calculate_user_coverage(total_users, test_users)
    recommendations_song_coverage = calculate_song_coverage(train_songs, recommendation_songs)
    recommendations_overall_song_coverage = calculate_song_coverage(total_songs, recommendation_songs)
    recommendations_user_coverage = calculate_user_coverage(train_users, recommendation_users)
    recommendations_overall_user_coverage = calculate_user_coverage(total_users, recommendation_users)

    # Get Predictions
    predictions = model.transform(test_data)

    # Evaluation Metrics
    # NOTE(review): the 0.0 argument is presumably a relevance threshold — confirm.
    rmse = calculate_rmse(predictions)
    mae = calculate_mae(predictions)
    precision, recall = calculate_precision_recall(user_recs, test_data, 0.0)

    results.append({
        'Model': model_name,
        'Dataset': dataset_name,
        'Users(Train:Test)': f'{train_users} : {test_users}',
        'Songs(Train:Test)': f'{train_songs} : {test_songs}',
        'Test User Coverage(Model:Overall)': f'{round(test_user_coverage, 2)} : {round(test_overall_user_coverage, 2)}',
        'Test Song Coverage(Model:Overall)': f'{round(test_song_coverage, 2)} : {round(test_overall_song_coverage, 2)}',
        'Recommendations User Coverage(Model:Overall)': f'{round(recommendations_user_coverage, 2)} : {round(recommendations_overall_user_coverage, 2)}',
        'Recommendations Song Coverage(Model:Overall)': f'{round(recommendations_song_coverage, 2)} : {round(recommendations_overall_song_coverage, 2)}',
        'Recommendations Precision': round(precision, 4),
        'Recommendations Recall': round(recall, 4),
        'Predictions RMSE': rmse,
        'Predictions MAE': mae,
    })

In [7]:
# Model and dataset selection for the single-model evaluation below
model_type = 'ALS'
model_dir = 'als_model/user_song_balanced'
dataset = 'processed/user_song_balanced'
par = 0  # second argument to load_from_hdfs

# Fixed dataset totals
total_size = 717_872_016
total_users = 1_823_179
total_songs = 136_736

# Number of recommendations requested per user
n_recs = 10

# Start from an empty Spark cache
spark.catalog.clearCache()

# Load the saved model from the local filesystem
model_path = f'file://{BASE_DIR}/models/{model_dir}'
model = load_model(model_type, model_path)

# Load the train/test pair for the selected dataset
train_data, test_data = load_from_hdfs(dataset, par)

                                                                                

Loaded partition 0: 11650865 training records and 1922481 test records from HDFS
root
 |-- user_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- partition_id: integer (nullable = false)



In [8]:
# # Train Metrics
# train_user_ids = train_data.select('user_id').distinct()
# train_users = train_user_ids.count()
# train_song_ids = train_data.select('song_id').distinct()
# train_songs = train_song_ids.count()

In [9]:

# # Test Metrics
# test_user_ids = test_data.select('user_id').distinct()
# test_users = test_user_ids.count()
# test_song_ids = test_data.select('song_id').distinct()
# test_songs = test_song_ids.count()

In [10]:
# print(f"'Users(Train:Test)': {train_users} : {test_users}")
# print(f"'Songs(Train:Test)': {train_songs} : {test_songs}")

In [11]:
# Recommendation Metrics
# Top-n recommendations per user, flattened to one (user_id, song_id, rating)
# row per recommended song.
user_recs = model.recommendForAllUsers(n_recs)
user_recs = user_recs.select('user_id', F.explode('recommendations').alias('recommendation'))
user_recs = user_recs.select('user_id', F.col('recommendation.song_id').alias('song_id'), F.col('recommendation.rating').alias('rating'))

# Distinct users that received recommendations and distinct songs that were
# recommended, both as integer counts. Previously recommendation_users was left
# as a DataFrame and recommendation_songs held the total row count of
# user_recs, so neither was usable as a coverage numerator.
recommendation_users = user_recs.select('user_id').distinct().count()
recommendation_songs = user_recs.select('song_id').distinct().count()

                                                                                

In [12]:
# # Coverage Metrics
# test_song_coverage = calculate_song_coverage(train_songs, test_songs)
# test_overall_song_coverage = calculate_song_coverage(total_songs, test_songs)
# test_user_coverage = calculate_user_coverage(train_users, test_users)
# test_overall_user_coverage = calculate_user_coverage(total_users, test_users)
# recommendations_song_coverage = calculate_song_coverage(train_songs, recommendation_songs)
# recommendations_overall_song_coverage = calculate_song_coverage(total_songs, recommendation_songs)
# recommendations_user_coverage = calculate_user_coverage(train_users, recommendation_users)
# recommendations_overall_user_coverage = calculate_user_coverage(total_users, recommendation_users)

In [13]:
# print(f"'Test User Coverage(Model:Overall)': {round(test_user_coverage, 2)} : {round(test_overall_user_coverage, 2)}")
# print(f"'Test Song Coverage(Model:Overall)': {round(test_song_coverage, 2)} : {round(test_overall_song_coverage, 2)}")
# print(f"'Recommendations User Coverage(Model:Overall)': {round(recommendations_user_coverage, 2)} : {round(recommendations_overall_user_coverage, 2)}")
# print(f"'Recommendations Song Coverage(Model:Overall)': {round(recommendations_song_coverage, 2)} : {round(recommendations_overall_song_coverage, 2)}")

In [14]:
# Get Predictions
# Clear the Spark cache first, then run the loaded model over test_data.
spark.catalog.clearCache()
predictions = model.transform(test_data)

In [15]:

# Evaluation Metrics
# rmse / mae are computed from the predictions on test_data.
# precision / recall are computed from the flattened recommendations
# (user_recs) and test_data.
# NOTE(review): the third argument (0.0) is presumably a relevance threshold —
# confirm in models.evaluation_metrics.
rmse = calculate_rmse(predictions)
mae = calculate_mae(predictions)
precision, recall = calculate_precision_recall(user_recs, test_data, 0.0)

                                                                                

Precision: 0.0005877247224475671 = Relevant Recommendations: 1036 / Total Recommendations: 1762730
Recall: 0.0005388869903005544 = Relevant Recommendations: 1036 / Relevant Test Items: 1922481


In [16]:
# Report the four headline metrics, one per line (precision/recall rounded to 4 dp)
metric_report = [
    f"'Recommendations Precision': {round(precision, 4)}",
    f"'Recommendations Recall': {round(recall, 4)}",
    f"'Predictions RMSE': {rmse}",
    f"'Predictions MAE': {mae}",
]
print('\n'.join(metric_report))

'Recommendations Precision': 0.0006
'Recommendations Recall': 0.0005
'Predictions RMSE': 1.0726797541719026
'Predictions MAE': 0.8087341378166875


## Evaluation Results

In [17]:
# # Evaluation Results Schema
# schema = StructType([
#     StructField("Model", StringType(), True),
#     StructField("Dataset", StringType(), True),
#     StructField("Users(Train:Test)", StringType(), True),
#     StructField("Songs(Train:Test)", StringType(), True),
#     StructField("Test User Coverage(Model:Overall)", StringType(), True),
#     StructField("Test Song Coverage(Model:Overall)", StringType(), True),
#     StructField("Recommendations User Coverage(Model:Overall)", StringType(), True),
#     StructField("Recommendations Song Coverage(Model:Overall)", StringType(), True),
#     StructField("Recommendations Precision", FloatType(), True),
#     StructField("Recommendations Recall", FloatType(), True),
#     StructField("Predictions RMSE", FloatType(), True),
#     StructField("Predictions MAE", FloatType(), True),
# ])

# # Output Results as DF
# results_df = spark.createDataFrame(results, schema)
# results_df.show(truncate=False) 

In [18]:
import matplotlib.pyplot as plt
import numpy as np

# Data points for three datasets (one value per entry in `splits`, same order).
# NOTE(review): these values are hard-coded — presumably copied by hand from
# earlier evaluation runs; confirm they are current before relying on the chart.
# The lists use *_values names so they do not overwrite the scalar
# precision / recall / rmse / mae results computed earlier in the notebook.
splits = ['Raw', 'Downscaled User Ratings', 'Downscaled User & Song Ratings']
precision_values = [0.0005, 0.0002, 0.0006]
recall_values = [0.0005, 0.0001, 0.0005]
rmse_values = [1.119, 1.031, 1.072]
mae_values = [0.884, 0.809, 0.808]

# (legend label, values, bar colour) for each metric, in left-to-right bar order
metric_series = [
    ('Precision', precision_values, 'blue'),
    ('Recall', recall_values, 'orange'),
    ('RMSE', rmse_values, 'green'),
    ('MAE', mae_values, 'red'),
]

# Width of a single bar; each group holds len(metric_series) adjacent bars
bar_width = 0.2

# Left-most bar position of each group on the X axis
group_positions = np.arange(len(splits))

# Grouped bar chart: one group per split, one bar per metric
fig, ax = plt.subplots(figsize=(14, 7))
for series_index, (label, values, color) in enumerate(metric_series):
    ax.bar(
        group_positions + series_index * bar_width,
        values,
        color=color,
        width=bar_width,
        edgecolor='grey',
        label=label,
    )

# Centre each tick label under its group of bars. With four bars the centre is
# 1.5 bar widths right of the first bar, not 1 bar width.
ax.set_xticks(group_positions + bar_width * (len(metric_series) - 1) / 2)
ax.set_xticklabels(splits)

# Adding labels, title, and legend
ax.set_xlabel('Splits', fontweight='bold')
ax.set_ylabel('Values (Log Scale)', fontweight='bold')
ax.set_yscale('log')  # precision/recall are orders of magnitude below RMSE/MAE
ax.set_title('Model Evaluation Metrics Across Splits')
ax.legend()

# Display the plot
plt.show()