# Model Training and Song Classification 

Functions used in the main code.

In [None]:
# Count occurrences of words in all the tweets (no cleaning is done here; pass already-cleaned text)

def counter_wd(txt_tweets):
    """Count how often each word occurs across all tweets.

    Words are obtained by splitting every tweet on a single space, so
    consecutive spaces produce empty-string "words" that are counted too.

    Args:
        txt_tweets: iterable of tweet strings.

    Returns:
        A ``Counter`` mapping each word to its total number of occurrences.
    """
    return Counter(
        word
        for text in txt_tweets
        for word in text.split(" ")
    )

In [None]:
# Prediction function handed to LIME to inspect the explanation of the predictions

def lime_pred(texts):
    """Return the model's predictions for a batch of raw texts.

    The texts are converted to integer sequences with the module-level
    ``tknizer``, padded/truncated at the end to 24 tokens, and passed to the
    module-level ``model``.

    Args:
        texts: list of strings to score.

    Returns:
        Whatever ``model.predict`` returns for the padded batch
        (presumably one row of class probabilities per text -- confirm
        against the model definition).
    """
    padded_batch = pad_sequences(
        tknizer.texts_to_sequences(texts),
        maxlen=24,
        padding='post',
        truncating='post',
    )
    return model.predict(padded_batch)

In [None]:
# Split the lyrics into sentences and clean them

def sent_and_clean(txt):
    """Split lyrics into sentences and return the cleaned, non-trivial ones.

    Non-ASCII characters are dropped and whitespace is collapsed before the
    text is split into sentences. Each sentence is then lower-cased, has its
    punctuation replaced by spaces (``#`` and ``'`` are kept), and is
    stripped of English stopwords, digits and contraction fragments.
    Sentences left with two words or fewer are discarded.

    Args:
        txt: raw lyrics as a single string.

    Returns:
        List of cleaned sentence strings, in their original order.
    """
    str_punct = punctuation.replace("#", "").replace("'", "") + "—" + "“" + "’" + "€" + "£"

    # Loop invariants: build them once per call instead of once per
    # sentence (translation table) or once per word (stopword list).
    punct_table = str.maketrans(str_punct, ' ' * len(str_punct))
    stop_words = set(stopwords.words("english"))
    # NOTE: "'cuz" and "'til" are separate entries; a missing comma used to
    # fuse them into the single, never-matching string "'cuz'til".
    abbreviations = {
        "'s", "n't", "'re", "'ll", "'m", "'", "'ve", "'d", "'cuz",
        "'til", "till", "i'mma", "'cause", "'em", "'ma",
    }

    listof = []

    # Drop non-ASCII characters (e.g. Hangul lyrics) and collapse whitespace.
    txt_ascii = " ".join(txt.encode("ascii", errors="ignore").decode().split())

    # The text is already pure ASCII here, so sentences need no re-encoding.
    for sent in nltk.tokenize.sent_tokenize(txt_ascii):
        sent_clean = sent.lower().translate(punct_table)  # remove punctuation
        sent_clean = " ".join(
            wd for wd in nltk.word_tokenize(sent_clean) if wd not in stop_words
        )  # remove stopwords
        sent_clean = re.sub(r'\d+', '', sent_clean)  # remove digits
        sent_clean = " ".join(
            wd for wd in nltk.word_tokenize(sent_clean) if wd not in abbreviations
        )  # remove abbreviations

        # Keep only sentences that still carry more than two words.
        if len(sent_clean.split()) > 2:
            listof.append(sent_clean)

    return listof

In [None]:
# Extract the padded sequences

def sent_to_padseq(list_txt):
    """Turn cleaned sentences into fixed-length (24) padded token sequences.

    Sentences longer than 24 whitespace-separated words are skipped. Each
    remaining sentence is encoded with the module-level ``tknizer`` and
    padded at the end to length 24.

    Args:
        list_txt: list of sentence strings.

    Returns:
        List of padded sequences (each a list of ints). Empty when
        ``list_txt`` holds fewer than two sentences.
    """
    # NOTE(review): an input with exactly one sentence yields no sequences
    # at all -- confirm this is the intended rule.
    if len(list_txt) <= 1:
        return []

    return [
        pad_sequences(
            tknizer.texts_to_sequences([sentence]),
            maxlen=24,
            padding="post",
            truncating="post",
        ).tolist()[0]
        for sentence in list_txt
        if len(sentence.split()) <= 24
    ]

In [None]:
# Create a Ragged Tensor to store the predictions of the model

def pred_tensor(pad_column):
    """Run the model on every element of ``pad_column`` and collect the results.

    Args:
        pad_column: iterable whose elements are each passed as-is to the
            module-level ``model.predict`` (presumably one batch of padded
            sentences per song -- confirm against the caller).

    Returns:
        A ragged tensor built by ``tf.ragged.constant`` from the list of
        per-element predictions, in input order.
    """
    predictions = [model.predict(padded) for padded in pad_column]
    return tf.ragged.constant(predictions)

In [None]:
# Get the max by columns for each row, representing the prevalent emotion for each sentence in a song

def get_maxEmotion(prob_tensor):
    """Find the highest-scoring column index for every row of every entry.

    Args:
        prob_tensor: iterable of tensors exposing ``.numpy()``; each is
            reduced with ``argmax`` along axis 1 (presumably sentences x
            emotion probabilities -- confirm against the model output).

    Returns:
        A ragged tensor built by ``tf.ragged.constant`` holding, per entry,
        the winning column indices.
    """
    winners = []

    for probs in prob_tensor:
        best_idx = np.argmax(probs.numpy(), axis=1)

        if best_idx.shape[0] <= 1:
            # At most one row: the index array itself is stored as the only item.
            winners.append([best_idx])
        else:
            # Several rows: wrap each index in its own single-element list.
            winners.append([[idx] for idx in np.transpose(best_idx).tolist()])

    return tf.ragged.constant(winners)

In [None]:
# Frequencies of emotions with respect to each song at their corresponding index position 

def get_emotefreq(max_tensor, n_classes=4):
    """Count how often each emotion label occurs in every entry.

    The count for label ``k`` is stored at index ``k`` of the entry's row,
    and labels that never occur get a count of zero.

    Args:
        max_tensor: iterable of tensors exposing ``.numpy()`` and holding
            integer labels in ``range(n_classes)``.
        n_classes: number of distinct labels; defaults to 4.

    Returns:
        numpy array with one row of ``n_classes`` counts per entry.

    Raises:
        ValueError: if an entry contains a label outside
            ``range(n_classes)``.
    """
    list_freq = []

    for array in max_tensor:
        labels = np.asarray(array.numpy()).ravel()

        # Reject bad labels explicitly instead of letting them shift or
        # wrap around the per-label positions.
        if labels.size and (labels.min() < 0 or labels.max() >= n_classes):
            raise ValueError(
                "emotion labels must lie in range(%d)" % n_classes
            )

        # bincount needs integers; the cast also covers empty arrays, whose
        # dtype may default to float.
        list_freq.append(np.bincount(labels.astype(np.intp), minlength=n_classes))

    freq_numpy = np.array(list_freq)
    return freq_numpy

In [None]:
# Heuristic rule suitable for multi-label classification of the songs

def classify_emote(emote_counts, perc_rule):
    """Label every song with the emotions whose share reaches a threshold.

    An emotion is assigned to a song when its count divided by the song's
    total count is at least ``perc_rule``, so a song can receive zero, one
    or several labels.

    Args:
        emote_counts: iterable of 1-D numpy arrays (e.g. the rows of a 2-D
            array), one per song, holding the counts for anger, fear, joy
            and sadness in that order.
        perc_rule: minimum fraction of the song's total count an emotion
            needs in order to be assigned.

    Returns:
        pandas Series named ``'Classification'``; each entry is the list of
        emotion names assigned to the corresponding song.
    """
    class_voc = {0: 'anger', 1: 'fear', 2: 'joy', 3: 'sadness'}
    list_class = []

    for song in emote_counts:
        # Sum once per song rather than once per emotion.
        total = np.sum(song)

        if total == 0:
            # No counts at all: nothing can reach the threshold, and this
            # avoids a 0/0 division warning.
            list_class.append([])
            continue

        list_class.append([
            class_voc[idx]
            for idx, val in enumerate(song.tolist())
            if val / total >= perc_rule
        ])

    return pd.Series(list_class, name='Classification')

In [None]:
# Distinct Combination of genres and emotions

def unique_tup(list1):
    """Return the distinct items of ``list1`` in first-seen order.

    Membership is checked by equality against a list, so unhashable items
    (such as lists) are supported.

    Args:
        list1: iterable of items to de-duplicate.

    Returns:
        New list with each distinct item once, ordered by first appearance.
    """
    distinct = []
    for item in list1:
        if item in distinct:
            continue
        distinct.append(item)
    return distinct