In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("Daniel_Hjelm_Test")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/04 11:20:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/04 11:20:50 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read all the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A')

# Read a small subset of data:
#data = spark_session.read.json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A/A/*.json')

                                                                                

In [3]:
# Count the amount of rows in the dataset
data.count()

                                                                                

1289

In [4]:
# Display the data
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F92FC856|
|          The Troggs|[[TRGQCBQ12903CBC...|[[60s, 100], [cla...|2011-08-05 01:01:...|          Wild Thing|TRBAKTC128F9

                                                                                

In [160]:
# Flatten the tags column
# newData = data.select(flatten(data.tags).alias("new tags"))
# newData.count()

                                                                                

1289

In [6]:
# Flatten the tags column
from pyspark.sql.functions import flatten
dataframe = data.withColumn("new tags", flatten(data.tags))
dataframe.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|            new tags|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|[pop, 100, rock, ...|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|[Disco, 100, 70s,...|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|[oldies, 100, roc...|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F9

In [7]:
dataframe.printSchema()

root
 |-- artist: string (nullable = true)
 |-- similars: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- tags: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- new tags: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [8]:
# Inspect the new tags column
dataframe.select("new tags").first()

Row(new tags=['pop', '100', 'rock', '83', 'maroon 5', '78', 'alternative', '59', 'pop rock', '51', 'Love', '27', 'alternative rock', '18', 'she will be loved', '16', 'Ballad', '12', 'Mellow', '10', 'male vocalists', '10', '00s', '9', 'soft rock', '8', 'favorites', '8', 'romantic', '7', 'Maroon5', '7', 'chill', '7', 'american', '6', 'love songs', '6', 'indie', '6', 'beautiful', '6', 'chillout', '5', 'memories', '5', 'sad', '4', 'Pop-Rock', '3', 'acoustic', '3', 'Favorite', '3', 'love song', '3', 'Maroon 5 - She will be loved', '3', 'easy listening', '3', 'relax', '2', 'male vocalist', '2', 'Favourites', '2', 'favorite songs', '2', 'Maroon 5 She Will Be Loved', '2', 'funk', '2', '2004', '2', 'indie rock', '2', 'Sentimental', '2', 'loved', '1', 'slow', '1', 'One Tree Hill', '1', 'sweet', '1', 'ballads', '1', 'nostalgic', '1', 'catchy', '1', 'emo', '1', 'melancholic', '1', 'soft', '1', 'amazing', '1', 'cool', '1', 'Awesome', '1', 'top 40', '1', 'Good Stuff', '1', 'favourite', '1', 'guilty 

In [14]:
# Remove the numbers from the tags
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType()))

dataFrame = dataframe.withColumn("only tags", removeNumbers(col("new tags")))

In [15]:
dataFrame.select("only tags").first()

                                                                                

Row(only tags=['pop', 'rock', 'maroon 5', 'alternative', 'pop rock', 'Love', 'alternative rock', 'she will be loved', 'Ballad', 'Mellow', 'male vocalists', '00s', 'soft rock', 'favorites', 'romantic', 'Maroon5', 'chill', 'american', 'love songs', 'indie', 'beautiful', 'chillout', 'memories', 'sad', 'Pop-Rock', 'acoustic', 'Favorite', 'love song', 'Maroon 5 - She will be loved', 'easy listening', 'relax', 'male vocalist', 'Favourites', 'favorite songs', 'Maroon 5 She Will Be Loved', 'funk', '2004', 'indie rock', 'Sentimental', 'loved', 'slow', 'One Tree Hill', 'sweet', 'ballads', 'nostalgic', 'catchy', 'emo', 'melancholic', 'soft', 'amazing', 'cool', 'Awesome', 'top 40', 'Good Stuff', 'favourite', 'guilty pleasure', 'soul', 'lovely', 'West Coast swing', 'singer-songwriter', 'USA', 'my youth', 'maroon', 'fav', '2000s', 'breaks my heart', 'Guilty Pleasures', 'good', 'Love it', 'falling asleep sadly', 'melodic', 'expressive', 'america', 'poprock', 'adult contemporary', 'Verbotene Liebe', '

In [17]:
dataFrame = dataFrame.drop("tags", "new tags")

In [18]:
dataFrame.printSchema()

root
 |-- artist: string (nullable = true)
 |-- similars: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- only tags: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
dataFrame.show()

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|              artist|            similars|           timestamp|               title|          track_id|           only tags|
+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|            Maroon 5|[[TRZHXFO128F9302...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|[pop, rock, maroo...|
|          Anita Ward|[[TRQCINT128F4245...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|[Disco, 70s, danc...|
|Bill Haley and th...|[[TRISKEM128F427D...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|[oldies, rock n r...|
|       Eddie Cochran|[[TRIEPQF128F9329...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F92FC856|[rock n roll, 50s...|
|          The Troggs|[[TRGQCBQ12903CBC...|2011-08-05 01:01:...|          Wild Thing|TRBAKTC128F93358D6|[60s, classic 

                                                                                

In [26]:
# Show the top 20 most frequent vehicle makers, and their frequencies.
from pyspark.sql.functions import explode, desc
#dataFrame.select("*", explode('only tags')).show()
dataFrame.groupBy('only tags').count().sort(desc('count')).show()



+--------------------+-----+
|           only tags|count|
+--------------------+-----+
|                  []|  602|
|              [rock]|    4|
|         [grindcore]|    3|
|           [country]|    3|
|             [blues]|    3|
|               [pop]|    2|
|         [dancehall]|    2|
|        [electronic]|    2|
|              [jazz]|    2|
|            [reggae]|    2|
|[80s, rock, Love,...|    1|
|[chillout, funk, ...|    1|
|[post-punk, punk,...|    1|
|[italian, female ...|    1|
|[songs i should h...|    1|
|[Hip-Hop, rap, hi...|    1|
|[latin, spanish, ...|    1|
|[indie rock, indi...|    1|
|[pop rock, britpo...|    1|
|[trip-hop, electr...|    1|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [74]:
# Create new dataframe with only the tags
tagsDF = dataFrame.select("only tags")
# Create a rdd from that dataframe
tagsRDD = tagsDF.rdd

tagsRDD.take(3)

[Row(only tags=['pop', 'rock', 'maroon 5', 'alternative', 'pop rock', 'Love', 'alternative rock', 'she will be loved', 'Ballad', 'Mellow', 'male vocalists', '00s', 'soft rock', 'favorites', 'romantic', 'Maroon5', 'chill', 'american', 'love songs', 'indie', 'beautiful', 'chillout', 'memories', 'sad', 'Pop-Rock', 'acoustic', 'Favorite', 'love song', 'Maroon 5 - She will be loved', 'easy listening', 'relax', 'male vocalist', 'Favourites', 'favorite songs', 'Maroon 5 She Will Be Loved', 'funk', '2004', 'indie rock', 'Sentimental', 'loved', 'slow', 'One Tree Hill', 'sweet', 'ballads', 'nostalgic', 'catchy', 'emo', 'melancholic', 'soft', 'amazing', 'cool', 'Awesome', 'top 40', 'Good Stuff', 'favourite', 'guilty pleasure', 'soul', 'lovely', 'West Coast swing', 'singer-songwriter', 'USA', 'my youth', 'maroon', 'fav', '2000s', 'breaks my heart', 'Guilty Pleasures', 'good', 'Love it', 'falling asleep sadly', 'melodic', 'expressive', 'america', 'poprock', 'adult contemporary', 'Verbotene Liebe', 

In [136]:
tagsCount = tagsRDD\
            .map(lambda lst: lst[0])\
            .flatMap(lambda x: x)\
            .map(lambda x: (x,1))\
            .reduceByKey(lambda x, y: x+y)\
            .map(lambda x: (x[1], x[0]))\
            .sortByKey(False)
        
            
            # 
#wordCount = wordCount.mapValues(lambda x: (x,1))
tagsCount.take(10)

                                                                                

[(153, 'rock'),
 (100, 'pop'),
 (78, 'alternative'),
 (68, 'female vocalists'),
 (66, 'Love'),
 (65, 'electronic'),
 (62, 'indie'),
 (51, 'favorites'),
 (51, 'dance'),
 (51, '00s')]