In [1]:
import os
import sys
import numpy as np
import h5py
from pyspark import SparkContext, SparkConf
import time
import math
import subprocess
from operator import add
# Reference point for the total run time reported at the end of the script.
start_time = time.time()

# Spark configuration: standalone master, application name, two cores per
# executor, and the Python interpreter used by PySpark.
conf = SparkConf()
conf.setMaster("spark://namenode:7077")
conf.setAppName("test_h5_2")
conf.set("spark.executor.cores", 2)
conf.set("spark.pyspark.python", "python3.6")

spark_context = SparkContext(conf=conf)

# Build the list of .h5 file paths stored on HDFS.
# `hadoop fs` expands the glob itself, so no shell (and no awk) is needed.
# stderr is deliberately not captured: Hadoop warnings printed there must
# never end up in the list of paths. check=True makes a failed listing stop
# the script here instead of producing a bogus file list.
listing = subprocess.run(
    ["hadoop", "fs", "-ls", "hdfs://namenode:9000/user/ubuntu/data/A/A/*/*.h5"],
    stdout=subprocess.PIPE,
    check=True,
)
data = []
for line in listing.stdout.decode("utf-8").splitlines():
    # The path is the 8th whitespace-separated column of an entry line
    # (the column the original awk '{print $8}' selected). Lines with fewer
    # columns (blank lines, "Found N items" headers) are skipped.
    fields = line.split(None, 7)
    if len(fields) == 8:
        data.append(fields[7].rstrip())

# Read every listed .h5 file from HDFS as a (path, raw bytes) record.
# binaryFiles accepts a comma-separated list of paths, so one call replaces
# the union of one RDD per file, whose lineage grows with the file count.
rdd_binary = spark_context.binaryFiles(",".join(data))

# Create a function reading the h5 file using h5py and io (to read bytes)
# it returns the artist name and the year of the song
def get_h5(line):
    """Extract the artist name and the year from one HDF5 song file.

    Args:
        line: A ``(path, content)`` record; only ``line[1]``, the raw bytes
            of the .h5 file, is used.

    Returns:
        A tuple ``(artist, year)``: column 9 of the first row of
        ``metadata/songs`` and column 1 of the first row of
        ``musicbrainz/songs``.
    """
    # Local imports kept from the original; presumably so the function is
    # self-contained when it runs on the Spark executors.
    import h5py
    import io

    # Open explicitly read-only: the file is only read, and the default mode
    # of h5py.File differs between h5py versions.
    with h5py.File(io.BytesIO(line[1]), "r") as f:
        artist = f['metadata']['songs'][0]
        year = f['musicbrainz']['songs'][0]
        return (artist[9], year[1])
# Turn every binary file into an (artist, year) tuple.
rdd_artist_year = rdd_binary.map(get_h5)

# Songs without a known year carry year == 0; drop those tuples.
rdd_artist_year_no0 = rdd_artist_year.filter(
    lambda artist_year: artist_year[1] != 0
)

# Decades of interest, one row per decade: [label, first year, last year],
# from the 1920's up to and including the 2010's.
list_decades = [
    ["{}'s".format(first_year), first_year, first_year + 9]
    for first_year in range(1920, 2020, 10)
]

# Function to get the decade, returns the artist name and the decade
def get_decade(line, decades=None):
    """Replace the year of an (artist, year) tuple by its decade label.

    Args:
        line: A tuple whose first item is the artist and whose second item
            is the year.
        decades: Optional rows of ``[label, first_year, last_year]`` (both
            bounds inclusive). Defaults to the module-level ``list_decades``.

    Returns:
        ``(artist, label)`` for the first decade containing the year, or
        ``(artist, None)`` when no decade contains it.
    """
    if decades is None:
        decades = list_decades
    artist, year = line[0], line[1]
    for label, first_year, last_year in decades:
        if first_year <= year <= last_year:
            return (artist, label)
    # Keep the tuple shape for years outside every decade: returning a bare
    # None would break later stages that index into the (artist, decade) key.
    return (artist, None)
        
# Swap the year of every song for its decade: (artist, year) -> (artist, decade).
rdd_decade = rdd_artist_year_no0.map(get_decade)

# Count the songs per (artist, decade) key:
# first pair every record with a count of 1 ...
rdd_mapped = rdd_decade.map(lambda artist_decade: (artist_decade, 1))
# ... then sum the counts of identical keys.
rdd_reduced = rdd_mapped.reduceByKey(lambda left, right: left + right)

# For every decade, collect the three artists with the most songs.
# Each entry of list_top3 is (decade label, artist name as str, song count).
list_top3 = []
for decade_row in list_decades:
    decade_label = decade_row[0]
    songs_of_decade = rdd_reduced.filter(
        lambda record: record[0][1] == decade_label
    )
    # Ordering by the negated count yields the largest counts first.
    best_three = songs_of_decade.takeOrdered(3, key=lambda record: -record[1])
    list_top3.extend(
        (key[1], key[0].decode("utf-8"), count) for key, count in best_three
    )

# Print the report: one header per decade followed by its top artists.
# Feel free to change it if you find another way to represent the output.
for decade_row in list_decades:
    decade_label = decade_row[0]
    print("####################################")
    print("Top 3 artist of ", decade_label, ":")
    for entry_decade, artist_name, song_count in list_top3:
        # Only the entries collected for the current decade are shown.
        if entry_decade != decade_label:
            continue
        print("- ", artist_name, "\t", song_count)

# Report the wall-clock time since start_time as whole minutes and seconds.
total_time = time.time() - start_time
elapsed_minutes = math.floor(total_time / 60)
elapsed_seconds = math.floor(total_time % 60)
print('Time taken for the execution: {}m {}s'.format(elapsed_minutes, elapsed_seconds))

####################################
Top 3 artist of  1920's :
####################################
Top 3 artist of  1930's :
-  Billie Holiday 	 1
####################################
Top 3 artist of  1940's :
####################################
Top 3 artist of  1950's :
-  Elvis Presley 	 1
-  Ritchie Valens 	 1
-  Nat King Cole 	 1
####################################
Top 3 artist of  1960's :
-  Paul Revere & The Raiders 	 1
-  Lesley Gore 	 1
-  The Mar-Keys 	 1
####################################
Top 3 artist of  1970's :
-  Johnny Clarke 	 1
-  Riccardo Fogli 	 1
-  Novalis 	 1
####################################
Top 3 artist of  1980's :
-  Soda Stereo 	 2
-  Descendents 	 1
-  Phil Ochs 	 1
####################################
Top 3 artist of  1990's :
-  Big Muff 	 2
-  Baby Mammoth 	 2
-  Planetary Assault Systems 	 1
####################################
Top 3 artist of  2000's :
-  Jag Panzer 	 2
-  Karunesh 	 2
-  Linda Eder 	 2
####################################
Top 