# Preliminary EDA

In [2]:
# Suppress native-hadoop warning
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties

In [52]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import matplotlib.pyplot as plt
import seaborn as sns

conf = pyspark.SparkConf().setAll([
    ('spark.master', 'local[*]'),
    ('spark.app.name', 'App Name')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()


#### EDA on songs with ratings train data file

In [4]:
song_ratings = spark.read.text("file:///home/work/Project/yahoo_dataset/train_0.txt").cache()
song_ratings.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
song_ratings.show(5)

                                                                                

+----------+
|     value|
+----------+
| 0\t166\t5|
|0\t2245\t4|
|0\t3637\t4|
|0\t5580\t4|
|0\t5859\t4|
+----------+
only showing top 5 rows



In [6]:
song_ratings = song_ratings.withColumn("user_id", split(song_ratings.value, "\t")[0]) \
       .withColumn("song_id", split(song_ratings.value, "\t")[1]) \
       .withColumn("ratings", split(song_ratings.value, "\t")[2]) \
       .drop("value")

In [7]:
song_ratings.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- ratings: string (nullable = true)



In [8]:
song_ratings.show(5)

+-------+-------+-------+
|user_id|song_id|ratings|
+-------+-------+-------+
|      0|    166|      5|
|      0|   2245|      4|
|      0|   3637|      4|
|      0|   5580|      4|
|      0|   5859|      4|
+-------+-------+-------+
only showing top 5 rows



#### EDA on song attribute file

In [9]:
song_attributes = spark.read.text("file:///home/work/Project/yahoo_dataset/song-attributes.txt").cache()
song_attributes.printSchema()

root
 |-- value: string (nullable = true)



In [10]:
song_attributes.show(5)

+-------------------+
|              value|
+-------------------+
|  0\t12070\t8490\t0|
|1\t19512\t7975\t134|
|  2\t18953\t3492\t0|
|    3\t695\t2653\t0|
|    4\t243\t2282\t0|
+-------------------+
only showing top 5 rows



In [11]:
song_attributes = song_attributes.withColumn("song_id", split(song_attributes.value, "\t")[0]) \
       .withColumn("album_id", split(song_attributes.value, "\t")[1]) \
       .withColumn("artist_id", split(song_attributes.value, "\t")[2]) \
       .withColumn("genre_id", split(song_attributes.value, "\t")[3]) \
       .drop("value")

In [12]:
song_attributes.printSchema()

root
 |-- song_id: string (nullable = true)
 |-- album_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- genre_id: string (nullable = true)



In [13]:
song_attributes.show(5)

+-------+--------+---------+--------+
|song_id|album_id|artist_id|genre_id|
+-------+--------+---------+--------+
|      0|   12070|     8490|       0|
|      1|   19512|     7975|     134|
|      2|   18953|     3492|       0|
|      3|     695|     2653|       0|
|      4|     243|     2282|       0|
+-------+--------+---------+--------+
only showing top 5 rows



#### EDA on genre hierarchy file

In [14]:
genre_hierarchy = spark.read.text("file:///home/work/Project/yahoo_dataset/genre-hierarchy.txt").cache()
genre_hierarchy.printSchema()

root
 |-- value: string (nullable = true)



In [15]:
genre_hierarchy.show(5)

+--------------------+
|               value|
+--------------------+
|    0\t0\t1\tUnknown|
|1\t1\t1\tElectron...|
|    2\t1\t2\tAmbient|
|3\t2\t3\tAmbient Dub|
|4\t2\t3\tAmbient ...|
+--------------------+
only showing top 5 rows



In [16]:
genre_hierarchy = genre_hierarchy.withColumn("genre_id", split(genre_hierarchy.value, "\t")[0]) \
       .withColumn("parent_genre_id", split(genre_hierarchy.value, "\t")[1]) \
       .withColumn("level", split(genre_hierarchy.value, "\t")[2]) \
       .withColumn("genre_name", split(genre_hierarchy.value, "\t")[3]) \
       .drop("value")

In [17]:
genre_hierarchy.printSchema()

root
 |-- genre_id: string (nullable = true)
 |-- parent_genre_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- genre_name: string (nullable = true)



In [18]:
genre_hierarchy.show(5)

+--------+---------------+-----+----------------+
|genre_id|parent_genre_id|level|      genre_name|
+--------+---------------+-----+----------------+
|       0|              0|    1|         Unknown|
|       1|              1|    1|Electronic/Dance|
|       2|              1|    2|         Ambient|
|       3|              2|    3|     Ambient Dub|
|       4|              2|    3|    Ambient Tech|
+--------+---------------+-----+----------------+
only showing top 5 rows



#### Joining all files together based on song_id and genre_id

In [19]:
df = song_ratings.join(song_attributes, "song_id", how="inner")
df = df.join(genre_hierarchy,"genre_id", how="inner")

In [50]:
df = df.drop('parent_genre_id').drop('level')
df.describe().show()



+-------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|summary|         genre_id|           song_id|           user_id|           ratings|          album_id|         artist_id|       genre_name|
+-------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+
|  count|         76344627|          76344627|          76344627|          76344627|          76344627|          76344627|         76344627|
|   mean|16.77831798169634| 68279.70831871378|100040.94482474844|3.1534895834909245|10362.502501191078| 4776.541211367763|             NULL|
| stddev|43.07284877305486|39461.312338210184| 57753.04973969481| 1.599535981362968| 5905.052390982918|2681.1466705977155|             NULL|
|    min|                0|                 0|                 0|                 1|                 0|                 0|Adult Alternative|
|    max|    

                                                                                

In [36]:
from pyspark.sql.functions import col

null_columns = {column: df.filter(col(column).isNull()).count() for column in df.columns}
null_columns

                                                                                

{'genre_id': 0,
 'song_id': 0,
 'user_id': 0,
 'ratings': 0,
 'album_id': 0,
 'artist_id': 0,
 'parent_genre_id': 0,
 'level': 0,
 'genre_name': 0}

In [51]:
df.show(10)

+--------+-------+-------+-------+--------+---------+------------+
|genre_id|song_id|user_id|ratings|album_id|artist_id|  genre_name|
+--------+-------+-------+-------+--------+---------+------------+
|       0|    166|      0|      5|    5303|     7231|     Unknown|
|       0|   2245|      0|      4|   14290|     3265|     Unknown|
|       0|   3637|      0|      4|   15761|      629|     Unknown|
|       0|   5580|      0|      4|    8324|     4761|     Unknown|
|       0|   5859|      0|      4|   19671|     4716|     Unknown|
|       0|   7121|      0|      3|    5303|     7372|     Unknown|
|       0|  10405|      0|      4|    5303|     7149|     Unknown|
|       0|  16794|      0|      5|    2517|     7620|     Unknown|
|     135|  17821|      0|      5|    8620|     2020|Classic Rock|
|       0|  21252|      0|      4|   15761|     2535|     Unknown|
+--------+-------+-------+-------+--------+---------+------------+
only showing top 10 rows



In [45]:
df.groupBy('genre_name').count().show()



+----------------+--------+
|      genre_name|   count|
+----------------+--------+
|           World|   10899|
|      Indie Rock|    2949|
|     Classic R&B|   64083|
|           Latin|   44735|
|      Modern R&B|    2518|
|           Metal|   27826|
|Electronic/Dance|   19236|
|            Rock| 5293264|
|     Death Metal|    4710|
|            Jazz|   67980|
|     Early Blues|    2379|
|       Pop Metal|   28399|
|  Country Comedy|     281|
|         New Age|    7730|
|            Folk|   80525|
|         Unknown|65499138|
|          Lounge|      99|
|         Holiday|    2337|
|     Modern Rock|   29596|
|  Minimal Techno|     158|
+----------------+--------+
only showing top 20 rows



                                                                                