This Python module goes through all pages in "./HTML_pages" and recovers all possible descriptors. This was a necessary step to get a grasp of how many occurrences there are of each descriptor, which is important information for the next step: selecting which descriptors are going to be features of the dataset we'll be building for this problem, and deciding how to model the selected descriptors themselves.

The final output of this module was stored in "sorted_descriptor_list.txt"

In [1]:
from bs4 import BeautifulSoup

# Returns the list of descriptors found in one album page, or None if there are none
def getDescriptors(path):
    """Extract the descriptors listed in a saved album HTML page.

    Args:
        path: Path of the HTML file to parse; it is read as UTF-8 text.

    Returns:
        The list of descriptor strings obtained by splitting the text of the
        first ``<span class="release_pri_descriptors">`` element, or None when
        the page has no such span or the span's ``.string`` is None.
    """
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    descriptors = soup.find("span", class_="release_pri_descriptors")

    # find() returns None when the page has no descriptor span; without this
    # guard the attribute access below would raise AttributeError and abort
    # the whole scan. None is the value callers already treat as "nothing".
    if descriptors is None or descriptors.string is None:
        return None

    # Descriptors are split on a comma followed by two spaces.
    return descriptors.string.split(",  ")

In [2]:
# Tallies a list of descriptors into a running occurrence-count dictionary.
def addDescriptor(dict, descriptorsList):
    """Increment the count of every descriptor in descriptorsList.

    Args:
        dict: Mapping from descriptor to its occurrence count; updated in place.
            A descriptor seen for the first time starts from a count of 0.
        descriptorsList: Iterable of descriptors, or None, in which case the
            mapping is left untouched.

    Returns:
        Always None; the result is the mutation of ``dict``.
    """
    if descriptorsList is not None:
        for name in descriptorsList:
            dict[name] = dict.get(name, 0) + 1
    return None

In [3]:
# Runs "getDescriptors" and "addDescriptor" over every numbered album page
def createDescriptorDict(max_albums):
    """Count descriptor occurrences across the numbered album pages.

    Pages "./HTML_pages/1.htm" through "./HTML_pages/<max_albums>.htm" are
    parsed with getDescriptors and tallied with addDescriptor. A progress line
    "<n>/<max_albums>" is printed before each page whose number is a multiple
    of 100.

    Args:
        max_albums: Number of the last page to process (inclusive).

    Returns:
        A dictionary mapping each descriptor to its number of occurrences.
    """
    counts = {}

    for page in range(1, max_albums + 1):
        if not page % 100:
            print(page, max_albums, sep="/")

        page_path = "./HTML_pages/" + str(page) + ".htm"
        addDescriptor(counts, getDescriptors(page_path))

    return counts

In [4]:
# Ranks the descriptors found in the album pages from most to least frequent
def sortedDescriptorsList(max_albums):
    """Return (descriptor, count) pairs ordered by descending count.

    Args:
        max_albums: Passed through to createDescriptorDict.

    Returns:
        A list of (descriptor, count) tuples. The sort is stable, so
        descriptors with equal counts keep the order they have in the
        dictionary.
    """
    ranked = list(createDescriptorDict(max_albums).items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [5]:
# Build the ranked descriptor list from pages 1 through 706.
sorted_descriptors = sortedDescriptorsList(706)

100/706
200/706
300/706
400/706
500/706
600/706
700/706


In [6]:
# Display the ranked (descriptor, count) pairs.
sorted_descriptors

[('male vocalist', 633),
 ('melodic', 430),
 ('energetic', 377),
 ('rhythmic', 320),
 ('passionate', 280),
 ('playful', 274),
 ('raw', 233),
 ('anxious', 230),
 ('bittersweet', 217),
 ('warm', 206),
 ('rebellious', 200),
 ('psychedelic', 198),
 ('melancholic', 191),
 ('poetic', 186),
 ('introspective', 184),
 ('quirky', 179),
 ('eclectic', 176),
 ('love', 170),
 ('atmospheric', 163),
 ('sarcastic', 159),
 ('conscious', 147),
 ('lush', 146),
 ('surreal', 146),
 ('urban', 145),
 ('nocturnal', 144),
 ('dark', 136),
 ('progressive', 135),
 ('longing', 133),
 ('political', 130),
 ('mellow', 126),
 ('alienation', 121),
 ('female vocalist', 119),
 ('humorous', 116),
 ('noisy', 116),
 ('existential', 115),
 ('lonely', 113),
 ('dense', 110),
 ('summer', 110),
 ('uplifting', 107),
 ('manic', 107),
 ('sentimental', 104),
 ('anthemic', 102),
 ('abstract', 100),
 ('romantic', 99),
 ('hypnotic', 99),
 ('cryptic', 95),
 ('angry', 93),
 ('repetitive', 89),
 ('sombre', 87),
 ('aggressive', 86),
 ('pess

In [7]:
# Number of distinct descriptors found (one list entry per dictionary key).
len(sorted_descriptors)

167