In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("Daniel_Hjelm_Test")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/04 07:19:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/04 07:20:03 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [168]:
# Read all the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A')

# Read a small subset of data:
#data = spark_session.read.json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A/A/*.json')

                                                                                

In [169]:
# Count the amount of rows in the dataset
data.count()

                                                                                

1289

In [170]:
# Display the data
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F92FC856|
|          The Troggs|[[TRGQCBQ12903CBC...|[[60s, 100], [cla...|2011-08-05 01:01:...|          Wild Thing|TRBAKTC128F9

In [160]:
# Flatten the tags column
# newData = data.select(flatten(data.tags).alias("new tags"))
# newData.count()

                                                                                

1289

In [174]:
# Flatten the tags column
dataframe = data.withColumn("new tags", flatten(data.tags))
dataframe.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|            new tags|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|[pop, 100, rock, ...|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|[Disco, 100, 70s,...|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|[oldies, 100, roc...|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F9

In [175]:
dataframe.printSchema()

root
 |-- artist: string (nullable = true)
 |-- similars: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- tags: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- new tags: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [178]:
dataframe.select("new tags").first()

Row(new tags=['pop', '100', 'rock', '83', 'maroon 5', '78', 'alternative', '59', 'pop rock', '51', 'Love', '27', 'alternative rock', '18', 'she will be loved', '16', 'Ballad', '12', 'Mellow', '10', 'male vocalists', '10', '00s', '9', 'soft rock', '8', 'favorites', '8', 'romantic', '7', 'Maroon5', '7', 'chill', '7', 'american', '6', 'love songs', '6', 'indie', '6', 'beautiful', '6', 'chillout', '5', 'memories', '5', 'sad', '4', 'Pop-Rock', '3', 'acoustic', '3', 'Favorite', '3', 'love song', '3', 'Maroon 5 - She will be loved', '3', 'easy listening', '3', 'relax', '2', 'male vocalist', '2', 'Favourites', '2', 'favorite songs', '2', 'Maroon 5 She Will Be Loved', '2', 'funk', '2', '2004', '2', 'indie rock', '2', 'Sentimental', '2', 'loved', '1', 'slow', '1', 'One Tree Hill', '1', 'sweet', '1', 'ballads', '1', 'nostalgic', '1', 'catchy', '1', 'emo', '1', 'melancholic', '1', 'soft', '1', 'amazing', '1', 'cool', '1', 'Awesome', '1', 'top 40', '1', 'Good Stuff', '1', 'favourite', '1', 'guilty 

In [None]:
# Remove the numbers from the tags
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType))

dataFrame = dataframe.withColumn("only tags", removeNumbers(col("new tags")))

In [180]:
dataFrame.select("only tags").first()

                                                                                

Row(only tags='[pop, rock, maroon 5, alternative, pop rock, Love, alternative rock, she will be loved, Ballad, Mellow, male vocalists, 00s, soft rock, favorites, romantic, Maroon5, chill, american, love songs, indie, beautiful, chillout, memories, sad, Pop-Rock, acoustic, Favorite, love song, Maroon 5 - She will be loved, easy listening, relax, male vocalist, Favourites, favorite songs, Maroon 5 She Will Be Loved, funk, 2004, indie rock, Sentimental, loved, slow, One Tree Hill, sweet, ballads, nostalgic, catchy, emo, melancholic, soft, amazing, cool, Awesome, top 40, Good Stuff, favourite, guilty pleasure, soul, lovely, West Coast swing, singer-songwriter, USA, my youth, maroon, fav, 2000s, breaks my heart, Guilty Pleasures, good, Love it, falling asleep sadly, melodic, expressive, america, poprock, adult contemporary, Verbotene Liebe, The 4400, new, sadness, perfect, 2002, broken smile, Favourite Songs, melancholy, zierkafasza, smooth, personal value, sing along, lounge, anthem, roman

In [182]:
dataFrame = dataFrame.drop("tags", "new tags")

In [193]:
dataFrame.printSchema()

root
 |-- artist: string (nullable = true)
 |-- similars: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- tags: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- new tags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- only tags 2: string (nullable = true)



In [194]:
dataFrame.show()

22/03/04 10:03:39 ERROR TaskSetManager: Task 0 in stage 130.0 failed 4 times; aborting job


PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_175505/899565006.py", line 1, in <lambda>
AttributeError: 'list' object has no attribute 'split'


In [None]:
df.select("*", explode("list_of_numbers").alias("exploded"))\
    .where(col("exploded") == 1)\
    .groupBy("letter", "list_of_numbers")\
    .agg(count("exploded").alias("ones"))\
    .show()

In [189]:
# Show the top 20 most frequent vehicle makers, and their frequencies.
from pyspark.sql.functions import explode
dataFrame.select("*", explode('only tags')).show()
#dataFrame.groupBy('only tags').count().sort(desc('count')).show()

AnalysisException: cannot resolve 'explode(`only tags`)' due to data type mismatch: input to function explode should be array or map type, not string;
'Project [artist#1218, similars#1219, timestamp#1221, title#1222, track_id#1223, only tags#1412, explode(only tags#1412) AS List()]
+- Project [artist#1218, similars#1219, timestamp#1221, title#1222, track_id#1223, only tags#1412]
   +- Project [artist#1218, similars#1219, tags#1220, timestamp#1221, title#1222, track_id#1223, new tags#1319, <lambda>(new tags#1319) AS only tags#1412]
      +- Project [artist#1218, similars#1219, tags#1220, timestamp#1221, title#1222, track_id#1223, flatten(tags#1220) AS new tags#1319]
         +- Relation [artist#1218,similars#1219,tags#1220,timestamp#1221,title#1222,track_id#1223] json
