In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("Daniel_Hjelm_Test")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/07 07:21:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/07 07:21:23 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read all the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A')

# Read a small subset of data:
#data = spark_session.read.json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A/A/*.json')

                                                                                

In [3]:
# Count the amount of rows in the dataset
data.count()

                                                                                

1289

In [4]:
# Display the data
data.show()

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F92FC856|
|          The Troggs|[[TRGQCBQ12903CBC...|[[60s, 100], [cla...|2011-08-05 01:01:...|          Wild Thing|TRBAKTC128F9

                                                                                

In [5]:
# Flatten the tags column
from pyspark.sql.functions import flatten
dataframe = data.withColumn("new tags", flatten(data.tags))
dataframe.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|              artist|            similars|                tags|           timestamp|               title|          track_id|            new tags|
+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|            Maroon 5|[[TRZHXFO128F9302...|[[pop, 100], [roc...|2011-08-02 20:57:...|   She Will Be Loved|TRBAMHJ128F9302A08|[pop, 100, rock, ...|
|          Anita Ward|[[TRQCINT128F4245...|[[Disco, 100], [7...|2011-08-15 12:34:...|        Ring My Bell|TRBASRX128F92E9CD1|[Disco, 100, 70s,...|
|Bill Haley and th...|[[TRISKEM128F427D...|[[oldies, 100], [...|2011-08-02 09:55:...|See You Later All...|TRBAJNU128F4260039|[oldies, 100, roc...|
|       Eddie Cochran|[[TRIEPQF128F9329...|[[rock n roll, 10...|2011-08-11 20:51:...|     C'mon Everybody|TRBAMXP128F9

In [6]:
dataframe.printSchema()

root
 |-- artist: string (nullable = true)
 |-- similars: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- tags: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- new tags: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [7]:
# Inspect the new tags column
dataframe.select("new tags").first()

Row(new tags=['pop', '100', 'rock', '83', 'maroon 5', '78', 'alternative', '59', 'pop rock', '51', 'Love', '27', 'alternative rock', '18', 'she will be loved', '16', 'Ballad', '12', 'Mellow', '10', 'male vocalists', '10', '00s', '9', 'soft rock', '8', 'favorites', '8', 'romantic', '7', 'Maroon5', '7', 'chill', '7', 'american', '6', 'love songs', '6', 'indie', '6', 'beautiful', '6', 'chillout', '5', 'memories', '5', 'sad', '4', 'Pop-Rock', '3', 'acoustic', '3', 'Favorite', '3', 'love song', '3', 'Maroon 5 - She will be loved', '3', 'easy listening', '3', 'relax', '2', 'male vocalist', '2', 'Favourites', '2', 'favorite songs', '2', 'Maroon 5 She Will Be Loved', '2', 'funk', '2', '2004', '2', 'indie rock', '2', 'Sentimental', '2', 'loved', '1', 'slow', '1', 'One Tree Hill', '1', 'sweet', '1', 'ballads', '1', 'nostalgic', '1', 'catchy', '1', 'emo', '1', 'melancholic', '1', 'soft', '1', 'amazing', '1', 'cool', '1', 'Awesome', '1', 'top 40', '1', 'Good Stuff', '1', 'favourite', '1', 'guilty 

In [8]:
# Remove the numbers from the tags
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType()))

dataFrame = dataframe.withColumn("only tags", removeNumbers(col("new tags")))

In [9]:
dataFrame.select("only tags").first()

                                                                                

Row(only tags=['pop', 'rock', 'maroon 5', 'alternative', 'pop rock', 'Love', 'alternative rock', 'she will be loved', 'Ballad', 'Mellow', 'male vocalists', '00s', 'soft rock', 'favorites', 'romantic', 'Maroon5', 'chill', 'american', 'love songs', 'indie', 'beautiful', 'chillout', 'memories', 'sad', 'Pop-Rock', 'acoustic', 'Favorite', 'love song', 'Maroon 5 - She will be loved', 'easy listening', 'relax', 'male vocalist', 'Favourites', 'favorite songs', 'Maroon 5 She Will Be Loved', 'funk', '2004', 'indie rock', 'Sentimental', 'loved', 'slow', 'One Tree Hill', 'sweet', 'ballads', 'nostalgic', 'catchy', 'emo', 'melancholic', 'soft', 'amazing', 'cool', 'Awesome', 'top 40', 'Good Stuff', 'favourite', 'guilty pleasure', 'soul', 'lovely', 'West Coast swing', 'singer-songwriter', 'USA', 'my youth', 'maroon', 'fav', '2000s', 'breaks my heart', 'Guilty Pleasures', 'good', 'Love it', 'falling asleep sadly', 'melodic', 'expressive', 'america', 'poprock', 'adult contemporary', 'Verbotene Liebe', '

In [77]:
import re
# Function for fetching the gender from the tags
def genderTags(lst):
    genderList = []
    for element in lst:
        if re.search("female|Female", element) and "female" not in genderList:
            genderList.append("female")
            continue
        elif re.search("male|Male", element) and "male" not in genderList:
            genderList.append("male")
            continue
        
    return genderList

# Create udf from the above function
genderTagUDF = udf(genderTags, ArrayType(StringType()))

# Create a new column representing wether a song has been tagged with "female", "male" or both
dataframe = dataFrame.withColumn("Gender tag", genderTagUDF(col("only tags")))

dataframe.select("Gender tag").take(20)
        


                                                                                

[Row(Gender tag=['male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=['male']),
 Row(Gender tag=['male']),
 Row(Gender tag=['male']),
 Row(Gender tag=['male', 'female']),
 Row(Gender tag=['male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=['female']),
 Row(Gender tag=['male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=['female', 'male']),
 Row(Gender tag=[]),
 Row(Gender tag=['male']),
 Row(Gender tag=['male']),
 Row(Gender tag=['male']),
 Row(Gender tag=['male']),
 Row(Gender tag=[])]

In [78]:
# Create new dataframe with only the tags and the gender tag
genderDataFrame = dataframe.select("only tags", "Gender tag")

In [79]:
# Explode the tags column
from pyspark.sql.functions import explode
genderDataFrame = genderDataFrame.select(genderDataFrame["Gender tag"], explode(genderDataFrame["only tags"]))
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Tag")

In [80]:
genderDataFrame.show()

+----------+-----------------+
|Gender tag|              Tag|
+----------+-----------------+
|    [male]|              pop|
|    [male]|             rock|
|    [male]|         maroon 5|
|    [male]|      alternative|
|    [male]|         pop rock|
|    [male]|             Love|
|    [male]| alternative rock|
|    [male]|she will be loved|
|    [male]|           Ballad|
|    [male]|           Mellow|
|    [male]|   male vocalists|
|    [male]|              00s|
|    [male]|        soft rock|
|    [male]|        favorites|
|    [male]|         romantic|
|    [male]|          Maroon5|
|    [male]|            chill|
|    [male]|         american|
|    [male]|       love songs|
|    [male]|            indie|
+----------+-----------------+
only showing top 20 rows



In [82]:
# Explode the gender tag column
genderDataFrame = genderDataFrame.select(explode(genderDataFrame["Gender tag"]), genderDataFrame["Tag"])
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Gender")

In [83]:
genderDataFrame.printSchema()

root
 |-- Gender: string (nullable = true)
 |-- Tag: string (nullable = true)



In [84]:
genderDataFrame.take(1)

                                                                                

[Row(Gender='male', Tag='pop')]

In [86]:
# Find the most frequent tags for "female"
tag_freq_df_female = genderDataFrame.select("Tag").filter(genderDataFrame["Gender"] == "female").groupBy("Tag").agg({"Tag": "count"})\
        .withColumnRenamed("count(Tag)","Frequency of tag").orderBy("Frequency of tag", ascending=False)

tag_freq_df_female.show()



+-----------------+----------------+
|              Tag|Frequency of tag|
+-----------------+----------------+
| female vocalists|              68|
|              pop|              40|
|             rock|              31|
|  female vocalist|              31|
|             Love|              24|
|            dance|              22|
|           female|              21|
|              00s|              20|
|       electronic|              18|
|singer-songwriter|              17|
|        beautiful|              16|
|             soul|              16|
|         chillout|              15|
|      alternative|              14|
|         american|              13|
|              80s|              11|
|        favorites|              11|
|            chill|              10|
|             jazz|              10|
|            indie|              10|
+-----------------+----------------+
only showing top 20 rows



                                                                                