This Python module goes through all pages in "./HTML_pages" once again, but this time it:
    1) Recovers only the descriptors we selected in "02_Data_Modelling.ipynb";
    2) Groups similar descriptors under a common name (the groups were also determined in "02_Data_Modelling.ipynb");
    3) Stores each album's descriptor counts in an individual list (conceptually a vector), with all of these lists held inside a single list;
    4) Saves this list of lists, which acts as a collection of vectors, to a .csv file.

The final output of this module was stored in "full_descriptor_dataset.csv"

In [1]:
from bs4 import BeautifulSoup
import pandas as pd

def getDescriptors(path):
    """Read a saved album page and return its name followed by its descriptors.

    Args:
        path: Path of the HTML file to parse (read as UTF-8).

    Returns:
        A list whose first element is the album name (the ``content`` of the
        ``<meta itemprop="name">`` tag) and whose remaining elements are the
        descriptor strings found in the ``release_pri_descriptors`` span, or
        ``None`` when the page has no such span or the span has no single
        string child.
    """
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, 'html.parser')

    # Element holding the comma-separated descriptor string
    descriptors = soup.find("span", class_="release_pri_descriptors")

    # find() yields None when the span is absent; treat that the same way as
    # an empty span instead of failing on the attribute access below.
    if descriptors is None or descriptors.string is None:
        return None

    # Retrieves album name
    meta_tag = soup.find(name="meta", attrs={"itemprop": "name"})
    album_name = meta_tag["content"]

    # Album name first, then one string per descriptor
    return [album_name, *descriptors.string.split(",  ")]

In [2]:
def listToSampleVector(descriptorsList):
    """Turn an album's descriptor list into a vector of per-axis counts.

    Args:
        descriptorsList: A list whose first element is the album name and
            whose remaining elements are descriptor strings, or ``None``.

    Returns:
        ``None`` when ``descriptorsList`` is ``None``. Otherwise a list of 39
        elements: the album name followed by 38 integer counters, one per
        descriptor axis, updated through ``descriptorWrapper`` and
        ``computeAxis``.
    """
    if descriptorsList is None:
        return None

    # Album name in slot 0, followed by 38 zeroed descriptor counters
    album_name = descriptorsList[0]
    sample_vector = [album_name] + [0] * 38

    # Start at index 1: element 0 is the album name, and a title that happens
    # to equal a descriptor (e.g. "epic") must not be counted as one.
    for raw_descriptor in descriptorsList[1:]:
        descriptor = descriptorWrapper(raw_descriptor)

        # Descriptors outside the selected set map to None and are ignored
        if descriptor is not None:
            computeAxis(descriptor, sample_vector)

    return sample_vector

In [3]:
# Descriptor axes in vector order. Slot 0 of a sample vector holds the album
# name, so the axis at position i in this tuple lives at sample_vector[i + 1].
_AXIS_NAMES = (
    'melodic', 'rhythmic', 'energetic', 'apathetic', 'raw', 'surreal',
    'anxious', 'bittersweet', 'warm', 'cold', 'rebellious', 'mysterious',
    'poetic', 'quirky', 'eclectic', 'atmospheric', 'urban', 'pastoral',
    'dark', 'romantic', 'progressive', 'anthemic', 'humorous', 'serious',
    'calm', 'heavy', 'noisy', 'dense', 'acoustic', 'sentimental', 'angry',
    'spiritual', 'sad', 'happy', 'epic', 'minimalistic', 'futuristic',
    'natural',
)
_AXIS_INDEX = {name: slot for slot, name in enumerate(_AXIS_NAMES, start=1)}


def computeAxis(descriptor, sample_vector):
    """Count one occurrence of ``descriptor`` in ``sample_vector``.

    Args:
        descriptor: Canonical descriptor name (for example ``'melodic'``).
        sample_vector: List with the album name at index 0 and the 38 axis
            counters at indices 1 to 38. It is modified in place.

    Returns:
        None. A descriptor that is not one of the 38 axes leaves the vector
        untouched.
    """
    slot = _AXIS_INDEX.get(descriptor)
    if slot is not None:
        # Every axis, 'noisy' included, counts a single occurrence as 1
        sample_vector[slot] += 1

In [4]:
# (common name, descriptors grouped under it), checked in this order.
_DESCRIPTOR_GROUPS = (
    ('natural', ['natural', 'rain', 'forest', 'desert', 'aquatic', 'tropical', 'seasonal', 'autumn', 'spring']),
    ('dark', ['dark', 'funereal', 'infernal', 'ominous', 'scary', 'disturbing', 'apocalyptic']),
    ('sad', ['sad', 'depressive', 'lonely', 'melancholic', 'sombre', 'pessimistic', 'hateful']),
    ('warm', ['warm', 'summer']),
    ('cold', ['cold', 'winter', 'nocturnal']),
    ('angry', ['angry', 'aggressive']),
    ('calm', ['calm', 'meditative', 'mellow', 'soothing', 'peaceful', 'soft']),
    ('energetic', ['energetic', 'manic']),
    ('happy', ['happy', 'playful', 'uplifting', 'triumphant', 'optimistic']),
    ('futuristic', ['futuristic', 'space']),
    ('humorous', ['humorous', 'sarcastic', 'vulgar', 'satirical']),
    ('spiritual', ['spiritual', 'ethereal', 'hypnotic']),
    ('minimalistic', ['minimalistic', 'repetitive', 'sparse']),
    ('progressive', ['progressive', 'microtonal', 'complex', 'polyphonic', 'avant-garde', 'atonal', 'technical']),
    ('bittersweet', ['bittersweet', 'longing']),
    ('surreal', ['surreal', 'psychedelic', 'lush']),
    ('apathetic', ['apathetic', 'lethargic', 'deadpan']),
    ('mysterious', ['mysterious', 'cryptic']),
    ('noisy', ['noisy', 'chaotic', 'dissonant']),
    ('sentimental', ['sentimental', 'passionate']),
    ('raw', ['raw', 'lo-fi']),
    ('urban', ['urban', 'party']),
    ('romantic', ['romantic', 'sensual']),
)

# Descriptors that are kept under their own name without any grouping.
_UNGROUPED_DESCRIPTORS = [
    'melodic', 'rhythmic', 'anxious', 'rebellious', 'poetic', 'quirky',
    'eclectic', 'atmospheric', 'pastoral', 'anthemic', 'serious', 'heavy',
    'dense', 'acoustic', 'epic',
]


def descriptorWrapper(descriptor):
    """Map a raw descriptor to the common name of its group.

    Returns the group's common name when ``descriptor`` belongs to one of the
    groups, the descriptor itself when it is one of the ungrouped descriptors,
    and ``None`` for anything else.
    """
    for common_name, members in _DESCRIPTOR_GROUPS:
        if descriptor in members:
            return common_name

    if descriptor in _UNGROUPED_DESCRIPTORS:
        return descriptor

    return None

In [5]:
def nestedListOfSamples(max_albums, pages_dir="./HTML_pages"):
    """Build one sample vector per album page and collect them in a list.

    Pages are read from ``<pages_dir>/1.htm`` through
    ``<pages_dir>/<max_albums>.htm``; each one goes through ``getDescriptors``
    and ``listToSampleVector``.

    Args:
        max_albums: Number of the last page to process (inclusive).
        pages_dir: Directory holding the numbered ``.htm`` files.

    Returns:
        A list of sample vectors, one for every page that yielded
        descriptors. Pages without descriptors are left out and their number
        is printed.
    """
    df_nested_list = []

    for file_name in range(1, max_albums + 1):
        # Progress report every 100 pages
        if file_name % 100 == 0:
            print(f"{file_name}/{max_albums}")

        descriptorsList = getDescriptors(f"{pages_dir}/{file_name}.htm")

        if descriptorsList is None:
            # No descriptors on this page: report its number and move on
            print(file_name)
            continue

        df_nested_list.append(listToSampleVector(descriptorsList))

    return df_nested_list

In [6]:
# Build the sample vectors for the album pages numbered 1.htm through 706.htm.
nested = nestedListOfSamples(706)

100/706
200/706
300/706
400/706
500/706
600/706
700/706


In [7]:
# One row per album: the album name followed by the 38 descriptor counters,
# labelled in the same order in which the sample vectors store them.
df = pd.DataFrame(
    nested,
    columns=[
        'name',
        'melodic', 'rhythmic', 'energetic', 'apathetic', 'raw', 'surreal',
        'anxious', 'bittersweet', 'warm', 'cold', 'rebellious', 'mysterious',
        'poetic', 'quirky', 'eclectic', 'atmospheric', 'urban', 'pastoral',
        'dark', 'romantic', 'progressive', 'anthemic', 'humorous', 'serious',
        'calm', 'heavy', 'noisy', 'dense', 'acoustic', 'sentimental', 'angry',
        'spiritual', 'sad', 'happy', 'epic', 'minimalistic', 'futuristic',
        'natural',
    ],
)

In [8]:
# Bare expression: evaluates the DataFrame so the notebook cell displays it.
df

Unnamed: 0,name,melodic,rhythmic,energetic,apathetic,raw,surreal,anxious,bittersweet,warm,...,acoustic,sentimental,angry,spiritual,sad,happy,epic,minimalistic,futuristic,natural
0,Adolescents,0,0,1,1,1,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
1,Living in Darkness,1,1,1,0,1,0,1,0,0,...,0,0,1,0,2,0,0,0,0,0
2,Souvenirs d'un autre monde,1,0,0,0,0,1,0,1,1,...,0,2,0,1,1,0,0,0,0,2
3,Brothers and Sisters,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,Canta triste,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701,McCartney,1,0,0,0,2,0,0,1,1,...,0,0,0,0,1,1,0,1,0,0
702,Hail to the Thief,1,1,1,0,0,1,1,0,0,...,0,1,1,0,3,0,0,0,0,1
703,Wish You Were Here,1,0,0,0,0,1,0,2,0,...,0,1,0,0,2,0,1,0,0,0
704,(I'm) Stranded,0,0,1,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
# Write the table to 'full_descriptor_dataset.csv', leaving out the row index.
df.to_csv('full_descriptor_dataset.csv', index=False)