In [1]:
# Suppress native-hadoop warning
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

# Spark configuration: run locally on all available cores under a fixed app name.
conf = (
    pyspark.SparkConf()
    .set("spark.master", "local[*]")
    .set("spark.app.name", "App Name")
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)


In [3]:
# Read the ratings file as raw text (one string column per line) and cache it,
# since the frame is counted here and reused by later cells.
song_ratings = (
    spark.read
    .text("file:///home/work/Project/yahoo_dataset/train_0.txt")
    .cache()
)
# Number of lines loaded; running this action also populates the cache.
song_ratings.count()

                                                                                

76344627

In [4]:
# Inspect the schema of the raw text load (a single string column named `value`).
song_ratings.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
# Preview the first five raw lines before they are parsed into columns.
song_ratings.show(n=5)

+----------+
|     value|
+----------+
| 0\t166\t5|
|0\t2245\t4|
|0\t3637\t4|
|0\t5580\t4|
|0\t5859\t4|
+----------+
only showing top 5 rows



In [6]:
# Parse each tab-separated line into its three fields. Selecting only the
# aliased fields leaves the raw `value` column behind; all fields stay strings.
song_ratings = song_ratings.select(*[
    split(song_ratings["value"], "\t")[position].alias(column_name)
    for position, column_name in enumerate(("user_id", "song_id", "ratings"))
])

In [7]:
# Check the parsed schema: user_id, song_id and ratings, all still string-typed.
song_ratings.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- ratings: string (nullable = true)



In [8]:
# Preview the first five parsed rating rows.
song_ratings.show(n=5)

+-------+-------+-------+
|user_id|song_id|ratings|
+-------+-------+-------+
|      0|    166|      5|
|      0|   2245|      4|
|      0|   3637|      4|
|      0|   5580|      4|
|      0|   5859|      4|
+-------+-------+-------+
only showing top 5 rows



In [9]:
# Read the song attributes file as raw text (one string column per line) and
# cache it, since the frame is counted here and reused by later cells.
song_attributes = (
    spark.read
    .text("file:///home/work/Project/yahoo_dataset/song-attributes.txt")
    .cache()
)
# Number of lines loaded; running this action also populates the cache.
song_attributes.count()

136736

In [10]:
# Inspect the schema of the raw text load (a single string column named `value`).
song_attributes.printSchema()

root
 |-- value: string (nullable = true)



In [11]:
# Preview the first five raw lines before they are parsed into columns.
song_attributes.show(n=5)

+-------------------+
|              value|
+-------------------+
|  0\t12070\t8490\t0|
|1\t19512\t7975\t134|
|  2\t18953\t3492\t0|
|    3\t695\t2653\t0|
|    4\t243\t2282\t0|
+-------------------+
only showing top 5 rows



In [12]:
# Parse each tab-separated line into its four fields. Selecting only the
# aliased fields leaves the raw `value` column behind; all fields stay strings.
song_attributes = song_attributes.select(*[
    split(song_attributes["value"], "\t")[position].alias(column_name)
    for position, column_name in enumerate(
        ("song_id", "album_id", "artist_id", "genre_id")
    )
])

In [13]:
# Check the parsed schema: song_id, album_id, artist_id and genre_id, all string-typed.
song_attributes.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- genre_id: string (nullable = true)



In [14]:
# Preview the first five parsed song attribute rows.
song_attributes.show(n=5)

+-------+--------+---------+--------+
|song_id|album_id|artist_id|genre_id|
+-------+--------+---------+--------+
|      0|   12070|     8490|       0|
|      1|   19512|     7975|     134|
|      2|   18953|     3492|       0|
|      3|     695|     2653|       0|
|      4|     243|     2282|       0|
+-------+--------+---------+--------+
only showing top 5 rows



In [15]:
# Read the genre hierarchy file as raw text (one string column per line) and
# cache it, since the frame is counted here and reused by later cells.
genre_hierarchy = (
    spark.read
    .text("file:///home/work/Project/yahoo_dataset/genre-hierarchy.txt")
    .cache()
)
# Number of lines loaded; running this action also populates the cache.
genre_hierarchy.count()

216

In [16]:
# Inspect the schema of the raw text load (a single string column named `value`).
genre_hierarchy.printSchema()

root
 |-- value: string (nullable = true)



In [17]:
# Preview the first five raw lines before they are parsed into columns.
genre_hierarchy.show(n=5)

+--------------------+
|               value|
+--------------------+
|    0\t0\t1\tUnknown|
|1\t1\t1\tElectron...|
|    2\t1\t2\tAmbient|
|3\t2\t3\tAmbient Dub|
|4\t2\t3\tAmbient ...|
+--------------------+
only showing top 5 rows



In [18]:
# Parse each tab-separated line into its four fields. Selecting only the
# aliased fields leaves the raw `value` column behind; all fields stay strings.
genre_hierarchy = genre_hierarchy.select(*[
    split(genre_hierarchy["value"], "\t")[position].alias(column_name)
    for position, column_name in enumerate(
        ("genre_id", "parent_genre_id", "level", "genre_name")
    )
])

In [19]:
# Check the parsed schema: genre_id, parent_genre_id, level and genre_name, all string-typed.
genre_hierarchy.printSchema()

root
 |-- genre_id: string (nullable = true)
 |-- parent_genre_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- genre_name: string (nullable = true)



In [20]:
# Preview the first five parsed genre rows.
genre_hierarchy.show(n=5)

+--------+---------------+-----+----------------+
|genre_id|parent_genre_id|level|      genre_name|
+--------+---------------+-----+----------------+
|       0|              0|    1|         Unknown|
|       1|              1|    1|Electronic/Dance|
|       2|              1|    2|         Ambient|
|       3|              2|    3|     Ambient Dub|
|       4|              2|    3|    Ambient Tech|
+--------+---------------+-----+----------------+
only showing top 5 rows



In [21]:
# Attach album/artist/genre attributes to every rating. The inner join keeps
# only ratings whose song_id appears in song_attributes.
df = song_ratings.join(
    song_attributes,
    on="song_id",
    how="inner",
)

In [22]:
# Preview the first ten rows of the ratings joined with song attributes.
df.show(n=10)

+-------+-------+-------+--------+---------+--------+
|song_id|user_id|ratings|album_id|artist_id|genre_id|
+-------+-------+-------+--------+---------+--------+
|    166|      0|      5|    5303|     7231|       0|
|   2245|      0|      4|   14290|     3265|       0|
|   3637|      0|      4|   15761|      629|       0|
|   5580|      0|      4|    8324|     4761|       0|
|   5859|      0|      4|   19671|     4716|       0|
|   7121|      0|      3|    5303|     7372|       0|
|  10405|      0|      4|    5303|     7149|       0|
|  16794|      0|      5|    2517|     7620|       0|
|  17821|      0|      5|    8620|     2020|     135|
|  21252|      0|      4|   15761|     2535|       0|
+-------+-------+-------+--------+---------+--------+
only showing top 10 rows



In [23]:
# Build the fully joined frame from the base tables rather than re-joining `df`
# onto itself. `df = df.join(genre_hierarchy, ...)` is not idempotent: running
# the cell a second time joins genre_hierarchy again and duplicates the
# parent_genre_id, level and genre_name columns. Deriving from song_ratings and
# song_attributes produces the same columns, in the same order, on every run.
df = (
    song_ratings
    .join(song_attributes, "song_id", how="inner")
    .join(genre_hierarchy, "genre_id", how="inner")
)

In [31]:
# Preview the first ten rows of the ratings joined with attributes and genre details.
df.show(n=10)

+--------+-------+-------+-------+--------+---------+---------------+-----+------------+
|genre_id|song_id|user_id|ratings|album_id|artist_id|parent_genre_id|level|  genre_name|
+--------+-------+-------+-------+--------+---------+---------------+-----+------------+
|       0|    166|      0|      5|    5303|     7231|              0|    1|     Unknown|
|       0|   2245|      0|      4|   14290|     3265|              0|    1|     Unknown|
|       0|   3637|      0|      4|   15761|      629|              0|    1|     Unknown|
|       0|   5580|      0|      4|    8324|     4761|              0|    1|     Unknown|
|       0|   5859|      0|      4|   19671|     4716|              0|    1|     Unknown|
|       0|   7121|      0|      3|    5303|     7372|              0|    1|     Unknown|
|       0|  10405|      0|      4|    5303|     7149|              0|    1|     Unknown|
|       0|  16794|      0|      5|    2517|     7620|              0|    1|     Unknown|
|     135|  17821|   