In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("Daniel_Hjelm_Test")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/07 09:00:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/07 09:00:27 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read all the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A')

# Read a small subset of data:
#data = spark_session.read.json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A/A/*.json')

                                                                                

In [3]:
# Flatten the tags column
from pyspark.sql.functions import flatten
dataframe = data.withColumn("new tags", flatten(data.tags))

In [4]:
# Remove the numbers from the tags
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType()))

dataFrame = dataframe.withColumn("only tags", removeNumbers(col("new tags")))

In [5]:
import re
# Function for fetching the gender from the tags
def genderTags(lst):
    genderList = []
    for element in lst:
        if re.search("female|Female", element) and "female" not in genderList:
            genderList.append("female")
            continue
        elif re.search("male|Male", element) and "male" not in genderList:
            genderList.append("male")
            continue
        
    return genderList

# Create udf from the above function
genderTagUDF = udf(genderTags, ArrayType(StringType()))

# Create a new column representing wether a song has been tagged with "female", "male" or both
dataframe = dataFrame.withColumn("Gender tag", genderTagUDF(col("only tags")))
        

In [6]:
# Create new dataframe with only the tags and the gender tag
genderDataFrame = dataframe.select("only tags", "Gender tag")

In [7]:
# Explode the tags column
from pyspark.sql.functions import explode
genderDataFrame = genderDataFrame.select(genderDataFrame["Gender tag"], explode(genderDataFrame["only tags"]))
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Tag")

In [8]:
# Explode the gender tag column
genderDataFrame = genderDataFrame.select(explode(genderDataFrame["Gender tag"]), genderDataFrame["Tag"])
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Gender")

In [9]:
# Find the most frequent tags for "female"
tagsFemale = genderDataFrame.select("Tag").filter(genderDataFrame["Gender"] == "female").groupBy("Tag").agg({"Tag": "count"})\
        .withColumnRenamed("count(Tag)","Frequency of tag").orderBy("Frequency of tag", ascending=False)

tagsFemale.show()

                                                                                

+-----------------+----------------+
|              Tag|Frequency of tag|
+-----------------+----------------+
| female vocalists|              68|
|              pop|              40|
|             rock|              31|
|  female vocalist|              31|
|             Love|              24|
|            dance|              22|
|           female|              21|
|              00s|              20|
|       electronic|              18|
|singer-songwriter|              17|
|        beautiful|              16|
|             soul|              16|
|         chillout|              15|
|      alternative|              14|
|         american|              13|
|              80s|              11|
|        favorites|              11|
|             folk|              10|
|            chill|              10|
|             jazz|              10|
+-----------------+----------------+
only showing top 20 rows



In [10]:
# Find the most frequent tags for "male"
tagsMale = genderDataFrame.select("Tag").filter(genderDataFrame["Gender"] == "male").groupBy("Tag").agg({"Tag": "count"})\
        .withColumnRenamed("count(Tag)","Frequency of tag").orderBy("Frequency of tag", ascending=False)

tagsMale.show()



+-----------------+----------------+
|              Tag|Frequency of tag|
+-----------------+----------------+
|             rock|              56|
|              pop|              54|
|   male vocalists|              42|
|             Love|              41|
| female vocalists|              38|
|        favorites|              33|
|      alternative|              32|
|        beautiful|              32|
|            dance|              27|
|              00s|              26|
|  female vocalist|              25|
|singer-songwriter|              25|
|          Awesome|              25|
|            indie|              23|
|         american|              22|
|         chillout|              20|
|           catchy|              19|
|           female|              19|
|       electronic|              19|
|           oldies|              18|
+-----------------+----------------+
only showing top 20 rows



                                                                                