In [1]:
from pyspark.sql import SparkSession
from operator import add

# Spark session
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("GenderExctractionDaniel")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Spark context
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 13:23:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/08 13:23:46 WARN Utils: Service 'sparkDriver' could not bind on port 9998. Attempting port 9999.
22/03/08 13:23:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/03/08 13:23:47 WARN Utils: Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 10005. Attempting port 10006.
22/03/08 13:23:47 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train/B/A')


                                                                                

In [3]:
# Flatten the tags column
from pyspark.sql.functions import flatten
dataframe = data.withColumn("New tags", flatten(data.tags))

In [4]:
# Remove the numbers from the tags (the numbers are every other element)
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType()))

dataFrame = dataframe.withColumn("Only tags", removeNumbers(col("New tags")))

In [5]:
import re
# Function for determine if a song has been tagged with a tag including "male" or "female" or both
def genderTags(lst):
    genderList = []
    for element in lst:
        if re.search("female|Female", element) and "female" not in genderList:
            genderList.append("female")
            continue
        elif re.search("male|Male", element) and "male" not in genderList:
            genderList.append("male")
            continue
        
    return genderList

# Create udf from the above function
genderTagUDF = udf(genderTags, ArrayType(StringType()))

# Create a new column representing wether a song has been tagged with "female", "male" or both
dataframe = dataFrame.withColumn("Gender tag", genderTagUDF(col("Only tags")))
        

In [6]:
# Function for removing the gender tag from the tags
def removeGenderTags(lst):
    tags = []
    for element in lst:
        if re.search("female|Female", element) or re.search("male|Male", element):
            continue
        else:
            tags.append(element)
        
        
    return tags

# Create udf from the above function
removeGenderTagUDF = udf(removeGenderTags, ArrayType(StringType()))

# Create a new column where the"female" and "male" tags have been removed from the other tags
filteredDF = dataframe.withColumn("Tags", removeGenderTagUDF(col("Only tags")))
        

In [7]:
# Create new dataframe with only the tags and the gender tag
genderDataFrame = filteredDF.select("Tags", "Gender tag")

In [8]:
# Explode the tags column
from pyspark.sql.functions import explode
genderDataFrame = genderDataFrame.select(genderDataFrame["Gender tag"], explode(genderDataFrame["Tags"]))
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Tag")

In [9]:
# Explode the gender tag column
genderDataFrame = genderDataFrame.select(explode(genderDataFrame["Gender tag"]), genderDataFrame["Tag"])
genderDataFrame = genderDataFrame.withColumnRenamed("col", "Gender")

In [10]:
# Find the most frequent tags for "female"
tagsFemale = genderDataFrame.select("Tag").filter(genderDataFrame["Gender"] == "female").groupBy("Tag").agg({"Tag": "count"})\
        .withColumnRenamed("count(Tag)","Frequency").orderBy("Frequency", ascending=False)

tagsFemale.show()



+-----------------+---------+
|              Tag|Frequency|
+-----------------+---------+
|              pop|       40|
|             rock|       31|
|             Love|       24|
|            dance|       22|
|              00s|       20|
|       electronic|       18|
|singer-songwriter|       17|
|        beautiful|       16|
|             soul|       16|
|         chillout|       15|
|      alternative|       14|
|         american|       13|
|              80s|       11|
|        favorites|       11|
|             jazz|       10|
|            indie|       10|
|            chill|       10|
|             folk|       10|
|              sad|        9|
|              rnb|        9|
+-----------------+---------+
only showing top 20 rows



                                                                                

In [11]:
# Find the most frequent tags for "male"
tagsMale = genderDataFrame.select("Tag").filter(genderDataFrame["Gender"] == "male").groupBy("Tag").agg({"Tag": "count"})\
        .withColumnRenamed("count(Tag)","Frequency").orderBy("Frequency", ascending=False)

tagsMale.show()



+-----------------+---------+
|              Tag|Frequency|
+-----------------+---------+
|             rock|       56|
|              pop|       54|
|             Love|       41|
|        favorites|       33|
|      alternative|       32|
|        beautiful|       32|
|            dance|       27|
|              00s|       26|
|singer-songwriter|       25|
|          Awesome|       25|
|            indie|       23|
|         american|       22|
|         chillout|       20|
|           catchy|       19|
|       electronic|       19|
|           oldies|       18|
|     classic rock|       17|
|              sad|       16|
|              80s|       16|
| alternative rock|       16|
+-----------------+---------+
only showing top 20 rows



