In [57]:
from pyspark.sql import SparkSession
from operator import add

# Spark session
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.218:7077") \
        .appName("Song titles Manne")\
        .config("spark.executor.cores",1)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Spark context
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

In [58]:
# Read the data
data = spark_session.read.option("recursiveFileLookup", "true").json('hdfs://192.168.2.200:9000/user/ubuntu/lastfm_train')


In [None]:
# Flatten the tags column
from pyspark.sql.functions import flatten
dataframe = data.withColumn("New tags", flatten(data.tags))

In [None]:
# Remove the numbers from the tags (the numbers are every other element)
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col

removeNumbers = udf(lambda lst: lst[0::2], ArrayType(StringType()))

dataFrame = dataframe.withColumn("Only tags", removeNumbers(col("New tags")))

In [None]:
import re
# Function for determine if a song has been tagged with a tag including "male" or "female" or both
def genderTags(lst):
    genderList = []
    for element in lst:
        if re.search("female|Female", element) and "female" not in genderList:
            genderList.append("female")
            continue
        elif re.search("male|Male", element) and "male" not in genderList:
            genderList.append("male")
            continue
        
    return genderList

# Create udf from the above function
genderTagUDF = udf(genderTags, ArrayType(StringType()))

# Create a new column representing wether a song has been tagged with "female", "male" or both
dataframe = dataFrame.withColumn("GenderTag", genderTagUDF(col("Only tags")))
        

In [None]:
from pyspark.sql.functions import size
maleTitles = dataframe.where(size("GenderTag") == 1).filter(dataframe.GenderTag[0] == "male").select("title")


In [None]:
femaleTitles = dataframe.where(size("GenderTag") == 1).filter(dataframe.GenderTag[0] == "female").select("title")


In [None]:
wordsMale = maleTitles.rdd.flatMap(lambda word: word).map(lambda line: line.split(" ")).flatMap(lambda x: x).map(lambda word: word.lower()).map(lambda word: (word, 1)).reduceByKey(add).map(lambda word: (word[1], word[0])).sortByKey(False)
wordsMale.take(20)


                                                                                

[(203, 'the'),
 (91, 'a'),
 (85, 'you'),
 (73, 'i'),
 (73, 'in'),
 (66, 'me'),
 (63, 'of'),
 (62, 'love'),
 (55, 'to'),
 (52, 'on'),
 (50, 'my'),
 (38, 'your'),
 (37, 'it'),
 (33, 'for'),
 (30, 'and'),
 (26, 'be'),
 (21, "don't"),
 (20, 'is'),
 (20, 'all'),
 (20, 'with')]

In [None]:
wordsFemale = femaleTitles.rdd.flatMap(lambda word: word).map(lambda line: line.split(" ")).flatMap(lambda x: x).map(lambda word: word.lower()).map(lambda word: (word, 1)).reduceByKey(add).map(lambda word: (word[1], word[0])).sortByKey(False)
wordsFemale.take(20)

                                                                                

[(88, 'the'),
 (67, 'you'),
 (51, 'me'),
 (43, 'i'),
 (41, 'a'),
 (38, 'love'),
 (31, 'my'),
 (30, 'of'),
 (28, 'to'),
 (27, 'version)'),
 (26, 'in'),
 (26, 'it'),
 (23, 'on'),
 (21, 'is'),
 (18, "don't"),
 (18, 'one'),
 (17, 'your'),
 (16, 'and'),
 (13, '(album'),
 (13, 'up')]