# Preliminary EDA

In [1]:
# Suppress native-hadoop warning
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties
# Auto reload modules
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/work')
BASE_DIR = '/home/work'
DATA_DIR = BASE_DIR + '/data'

DATASET = '/processed/user_rating_balanced'
TRAIN_DIR = DATA_DIR + DATASET + '/train'
TEST_DIR = DATA_DIR + DATASET + '/test'

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, count, desc, min, max, log, abs, mean, stddev, row_number, rand
from pyspark.sql.window import Window
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from data.utils.data_loader import load_file_from_hdfs

from EDA.clean_data import cleaned_df

In [4]:
conf = pyspark.SparkConf()
settings = conf.getAll()

# Set Spark Settings
conf = pyspark.SparkConf().setAll([
('spark.master', 'local[4]'),
('spark.app.name', 'MusicRecommender'),
('spark.driver.memory','14g')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Print the Spark Session settings
settings = spark.sparkContext.getConf().getAll()
for s in settings:
    print(s)

('spark.app.submitTime', '1717117718523')
('spark.master', 'local[4]')
('spark.app.id', 'local-1717117719346')
('spark.executor.id', 'driver')
('spark.driver.host', '693f94dcf7da')
('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDire

Load choosen partition of song ratings, song attributes, and genre hierarchy.

In [5]:
song_ratings_train_file_path = "/raw/train/train_2.txt"
song_ratings_train = load_file_from_hdfs(song_ratings_train_file_path)

song_ratings_test_file_path = "/raw/test/test_2.txt"
song_ratings_test = load_file_from_hdfs(song_ratings_test_file_path)

song_ratings = song_ratings_train.union(song_ratings_test)

song_attributes_file_path = "song-attributes.txt"
song_attributes = load_file_from_hdfs(song_attributes_file_path)

genre_hierarchy_file_path = "genre-hierarchy.txt"
genre_hierarchy = load_file_from_hdfs(genre_hierarchy_file_path)

                                                                                

Call cleaned_df function to get cleaned df from EDA

In [6]:
df = cleaned_df(song_ratings,song_attributes,genre_hierarchy)

root
 |-- user_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- album_id: integer (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- genre_id: integer (nullable = true)
 |-- genre_name: string (nullable = true)



                                                                                

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|summary|           user_id|          song_id|            rating|          album_id|         artist_id|          genre_id|       genre_name|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------+
|  count|          78909821|         78909821|          78909821|          78909821|          78909821|          78909821|         78909821|
|   mean|499746.46840439545|68274.02707812504|3.1528440674070213|10361.592222392697|4776.0641216636395|16.666298850177345|             NULL|
| stddev| 57936.75037674207|39459.18779033151|1.6030326088607252| 5906.090895444379| 2680.786813237493| 42.91624825455018|             NULL|
|    min|            400000|                0|                 1|                 0|                 0|                 0|Adult Alternative|
|    max|    

                                                                                

Null Columns in cleaned df: {'user_id': 0, 'song_id': 0, 'rating': 0, 'album_id': 0, 'artist_id': 0, 'genre_id': 0, 'genre_name': 0}


In [7]:
df.count()

                                                                                

78909821

Get users' rating count to ensure balanced distribution of ratings per user.

In [8]:
user_ratings_count = df.groupBy("user_id").count()
user_ratings_count = user_ratings_count.withColumnRenamed("count", "ratings_count")
# user_ratings_count.show(5)

Min and max values of users are quite different. There must be some outliers.

In [9]:
min_max_values = user_ratings_count.select(min("ratings_count").alias("min_value"), max("ratings_count").alias("max_value")).first()

min_value = min_max_values["min_value"]
max_value = min_max_values["max_value"]

print("Minimum value:", min_value)
print("Maximum value:", max_value)



Minimum value: 30
Maximum value: 131533


                                                                                

Distribution is extremely skewed.

In [10]:
# data= user_ratings_count.toPandas()

# plt.figure(figsize=(10, 6))
# plt.hist(data['ratings_count'], bins=30, edgecolor='black')
# plt.title('Distribution of Rating Counts per User')
# plt.xlabel('Number of Ratings')
# plt.ylabel('Frequency')
# plt.show()

total_users = user_ratings_count.count()
print("Total number of users:", total_users)



Total number of users: 200000


                                                                                

In [11]:
user_ratings_count[user_ratings_count['ratings_count'] == 0].count() # no zero
user_ratings_count = user_ratings_count.withColumn('log_ratings_count', log(user_ratings_count['ratings_count']))  

                                                                                

In [12]:
mean_log = user_ratings_count.select(mean(col("log_ratings_count"))).collect()[0][0]
stddev_log = user_ratings_count.select(stddev(col("log_ratings_count"))).collect()[0][0]

user_ratings_count = user_ratings_count.withColumn("z_score", (col("log_ratings_count") - mean_log) / stddev_log)
user_ratings_count = user_ratings_count.withColumn("is_outlier", abs(col("z_score")) > 3) # threshold 3

                                                                                

In [13]:
# user_ratings_count_cleaned = user_ratings_count.filter(col("is_outlier") == False)
# print("Original DataFrame:", user_ratings_count.count())
# print("Cleaned DataFrame (without outliers):", user_ratings_count_cleaned.count())

Still extremely skewed with large range of ratings count.

In [14]:
# user_ratings_count_cleaned_data= user_ratings_count_cleaned.toPandas()

# plt.figure(figsize=(10, 6))
# plt.hist(user_ratings_count_cleaned_data['ratings_count'], bins=30, edgecolor='black')
# plt.title('New Distribution of Rating Counts per User')
# plt.xlabel('Number of Ratings')
# plt.ylabel('Frequency')
# plt.show()

In [15]:
# user_ratings_count_cleaned_df = df.join(user_ratings_count_cleaned.select("user_id"), on="user_id", how="inner")
#user_ratings_count_cleaned_df.show()

In [16]:
# user_ratings_count_cleaned_df.count()

Use IQR, lower bound, upper bound to remove outliers

In [17]:
quantiles = user_ratings_count.approxQuantile("ratings_count", [0.25, 0.75], 0.05)
Q1 = quantiles[0]
Q3 = quantiles[1]
# IQR = Q3 - Q1

# lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3
print("Upper bound:", upper_bound)

filtered_below_upper_bound = user_ratings_count.filter(col("ratings_count") < upper_bound)
ratings_below_upper_bound = df.join(filtered_below_upper_bound, on="user_id", how="inner")
users_above_upper_bound = user_ratings_count.filter(col("ratings_count") > upper_bound)
ratings_above_upper_bound = df.join(users_above_upper_bound, on="user_id", how="inner")

# Shuffle user ratings above the upper bound
ratings_with_random = ratings_above_upper_bound.withColumn("rand", rand())
window_spec = Window.partitionBy("user_id").orderBy("rand")

# Sample ratings above the upper bound to match the number of ratings at the upper bound
ratings_with_row_number = ratings_with_random.withColumn("row_number", row_number().over(window_spec))
resampled_ratings = ratings_with_row_number.filter(col("row_number") <= upper_bound).drop("rand", "row_number")

filtered_df = ratings_below_upper_bound.union(resampled_ratings)

print("Data count after downsampling outliers:", filtered_df.count())

                                                                                

Upper bound: 328.0




Data count after downsampling outliers: 35003255


                                                                                

In [18]:
# filtered_df_data = filtered_df.select("user_id", "ratings_count").toPandas()

# plt.figure(figsize=(10, 6))
# plt.hist(filtered_df_data['ratings_count'], bins=30, edgecolor='black')
# plt.title('New Distribution of Rating Counts per User')
# plt.xlabel('Number of Ratings')
# plt.ylabel('Frequency')
# plt.show()

In [19]:
# Create train test split such that there is still at least 10 random ratings per user in the test set
filtered_df = filtered_df.select("user_id", "song_id", "rating")
# Shuffle the data
shuffled_df = filtered_df.orderBy(rand())

# Assign row numbers within each user_id partition
window = Window.partitionBy("user_id").orderBy(rand())
df = shuffled_df.withColumn("row_number", row_number().over(window))

train_df = df.filter(col("row_number") > 10)
test_df = df.filter(col("row_number") <= 10)

train_df = train_df.drop("row_number")
test_df = test_df.drop("row_number")
print("Train data count:", train_df.count())
print("Test data count:", test_df.count())

                                                                                

+-------+-------+------+
|user_id|song_id|rating|
+-------+-------+------+
| 400001|  82451|     5|
| 400001|  84589|     5|
| 400001|  56660|     4|
| 400001|  38174|     5|
| 400001|  18878|     5|
| 400001|  39847|     5|
| 400001| 125351|     4|
| 400001|  86910|     5|
| 400001|   3723|     5|
| 400001|  18663|     4|
| 400001|  94003|     4|
| 400001|  30670|     4|
| 400001| 125173|     5|
| 400001|  83028|     5|
| 400001|  83509|     4|
| 400001|  66132|     5|
| 400001| 114859|     5|
| 400001| 129487|     3|
| 400001|  18936|     5|
| 400001|  13843|     4|
+-------+-------+------+
only showing top 20 rows



                                                                                

+-------+-------+------+
|user_id|song_id|rating|
+-------+-------+------+
| 400001|  30670|     4|
| 400001|  13843|     4|
| 400001|  59796|     5|
| 400001|  86910|     5|
| 400001|  38174|     5|
| 400001| 117092|     5|
| 400001|  66132|     5|
| 400001| 116627|     5|
| 400001| 114356|     4|
| 400001| 105698|     5|
| 400003| 129582|     5|
| 400003|   7475|     5|
| 400003|   3378|     4|
| 400003|  62462|     3|
| 400003|  57959|     4|
| 400003| 126373|     4|
| 400003|  46722|     5|
| 400003|  31139|     5|
| 400003| 102062|     4|
| 400003|  33806|     5|
+-------+-------+------+
only showing top 20 rows



                                                                                

Train data count: 33004575


[Stage 181:>                                                        (0 + 4) / 5]

Test data count: 1998680


                                                                                

In [20]:
# Make dirs if not exist
!mkdir -p $TRAIN_DIR
!mkdir -p $TEST_DIR

# Save Train to local
train_df = train_df.coalesce(1)
test_df = test_df.coalesce(1)
train_df.write.csv(f"file://{TRAIN_DIR}/temp", header=False, mode="overwrite", sep="\t")
test_df.write.csv(f"file://{TEST_DIR}/temp", header=False, mode="overwrite", sep="\t")

                                                                                

In [21]:
# Move to single file
! rm -rf $TRAIN_DIR/train_0.txt
! mv $TRAIN_DIR/temp/part-00000* $TRAIN_DIR/train_0.txt
! rm -rf $TEST_DIR/test_0.txt
! mv $TEST_DIR/temp/part-00000* $TEST_DIR/test_0.txt

# Clean up
! rm -rf $TRAIN_DIR/temp
! rm -rf $TEST_DIR/temp