In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Suppress native-hadoop warning 
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties

In [3]:
import sys
sys.path.append('/home/work')

BASE_DIR = '/home/work'

In [4]:
import pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

from data.utils import load_from_hdfs, load_data, user_based_train_test_split

In [5]:
# Set Spark Settings
conf = pyspark.SparkConf().setAll([
    ('spark.master', 'local[4]'),
    ('spark.app.name', 'MusicRecommender'),
    # ('spark.executor.instances', '2'),  # Number of executors
    # ('spark.executor.cores', '8'),  # Cores per executor
    # ('spark.executor.memory', '10g'),  # Memory per executor
    ('spark.driver.memory','14g'),
])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Print Spark Settings
settings = spark.sparkContext.getConf().getAll()
for s in settings:
    print(s)

('spark.app.submitTime', '1717561770979')
('spark.driver.port', '43095')
('spark.master', 'local[4]')
('spark.app.startTime', '1717561771088')
('spark.driver.host', '3e27c8492344')
('spark.executor.id', 'driver')
('spark.app.id', 'local-1717561771836')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-open

In [6]:
test_user_ratings = 10
data_scale = 0.2
seed = 42

raw_train, raw_test = load_from_hdfs('raw', 2)
combined_raw = raw_train.union(raw_test)

user_rating_downsampled = load_data('processed/user_rating_downsampled.txt')
user_song_rating_downsampled = load_data('processed/user_and_song_rating_downsampled.txt')

datasets = [combined_raw, user_rating_downsampled, user_song_rating_downsampled]
spit_dataset = [user_based_train_test_split(dataset, test_user_ratings, data_scale, seed) for dataset in datasets]
model_names = ['raw', 'user_rating_downsampled', 'user_song_rating_downsampled']

                                                                                

Loaded partition 2: 76909821 training records and 2000000 test records from HDFS
root
 |-- user_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- partition_id: integer (nullable = false)



                                                                                

root
 |-- user_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



                                                                                

Loaded 50296259 records from HDFS


                                                                                

root
 |-- user_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



                                                                                

Loaded 26960370 records from HDFS


                                                                                

Initial train data count: 76909821


                                                                                

Initial test data count: 2000000


                                                                                

Final train data count: 13781964


                                                                                

Final test data count: 2000000


                                                                                

Initial train data count: 48296709


                                                                                

Initial test data count: 1999550


                                                                                

Final train data count: 8059701


                                                                                

Final test data count: 1999550


                                                                                

Initial train data count: 24985887


                                                                                

Initial test data count: 1974483


                                                                                

Final train data count: 3417591


[Stage 180:>                                                        (0 + 4) / 4]

Final test data count: 1974483


                                                                                

In [7]:
# Define ALS model
als = ALS(userCol="user_id", itemCol="song_id", ratingCol="rating",
          coldStartStrategy="drop")

In [8]:
# Tune model hyperparameters
param_grid = ParamGridBuilder() \
    .addGrid(als.maxIter, [5]) \
    .addGrid(als.regParam, [0.1]) \
    .addGrid(als.rank, [10]) \
    .build()
    
# Define a model evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Build cross validator
# crossval = CrossValidator(estimator=als,
#                             estimatorParamMaps=param_grid,
#                             evaluator=evaluator,
#                             numFolds=5)

tvs = TrainValidationSplit(estimator=als,
                            estimatorParamMaps=param_grid,
                            evaluator=evaluator,
                            trainRatio=0.8)

In [9]:
# Loop through datasets and train the model
for i, dataset in enumerate(datasets):
    print(f"Processing dataset {i+1}/{len(datasets)}")
    train_data, test_data = spit_dataset[i]
    
    try:
        # Clear spark cache to avoid memory issues
        spark.catalog.clearCache()

        # Fit ALS model
        print(f"Training model for dataset {i+1}")
        model = tvs.fit(train_data)

        # Get best model
        best_model = model.bestModel
        print(f'Best model for dataset {i+1}: {best_model}')

        # Save the best model for later evaluation
        best_model.write().overwrite().save(f'file://{BASE_DIR}/models/als_model/{model_names[i]}')
    
    except Exception as e:
        print(f'Error training ALS model for dataset {i+1}: {e}')

Processing dataset 1/3
Training model for dataset 1


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

Du

Error training ALS model for dataset 1: An error occurred while calling o345.fit
Processing dataset 2/3
Error training ALS model for dataset 2: An error occurred while calling o273.clearCache
Processing dataset 3/3
Error training ALS model for dataset 3: [Errno 111] Connection refused


In [None]:
# spark.stop()