In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Coherence Analysis

## Approach 1: Using Semantic Similarity Graph
Implementation of Semantic Similarity Graph based on Putra et al. Class use to compute the coherence of a given text


## Approach 2: Using Neural Network 
Utilizing LSTM and fine-tuned BERT architecture to generate a coherence score for song lyrics. The architecture will be train on real world lyrics and then compared it to generated lyrics by other ML tasks. 

In [2]:
!pip install gensim
!pip install nltk
!pip install transformer
!pip install networkx
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement transformer (from versions: none)[0m
[31mERROR: No matching distribution found for transformer[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# import relevant libraries
import re
import random
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm.notebook import tqdm
import networkx as nx


# import nltk and gensim library for token level embeddings
import nltk
import gensim
import gensim.downloader
from nltk.data import find


In [4]:
DATA_DIR = "/content/drive/MyDrive/W266 Project/w266-finalproj/data"


In [5]:
#@title Word2Vec Embedding Matrix Creation
# instantiate Word2Vec token level embedding
nltk.download('word2vec_sample')
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

# let's try 300 dimension Word2Vec Embedding
EMBEDDING_DIM = 300
# initialize embedding matrix and word-to-id map:
# embedding_matrix = np.zeros((len(model.key_to_index) + 1, EMBEDDING_DIM))       
embedding_matrix = np.zeros((len(model.vocab.keys()) + 1, EMBEDDING_DIM))       

word2Vec_embedder = {}
# build the embedding matrix and the word-to-id map:
# for i, word in enumerate(model.key_to_index):
for i, word in enumerate(model.vocab.keys()):
    embedding_vector = model[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        word2Vec_embedder[word] = embedding_vector

# we can use the last index at the end of the vocab for unknown tokens
word2Vec_embedder['[UNK]'] = np.zeros(EMBEDDING_DIM)
embedding_matrix.shape

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


(43982, 300)

In [6]:
# preparing preprocessing tools
def text_to_tokens(text):
  """
    Process input text into individual token per sentence
  """
  processed_lyric_tokenzied = [i.lower().replace(',', '').split() for i in text.split("\n")]
  
  return processed_lyric_tokenzied

def tokens_to_vectors(sentences, embedder=word2Vec_embedder):
  """
    Process each token into it embedded vector using pre-trained embedder
    If the word is not in corpus, replace with embedding of unknown token
  """

  embedded_matrix = []
  for s in sentences:
    word_embedding_arr = []
    for word in s:
      try:
        word_embedding_arr.append(embedder[word])
      except:
        word_embedding_arr.append(embedder['[UNK]'])
    embedded_matrix.append(word_embedding_arr)

  return embedded_matrix

In [7]:
class SemanticGraph():
  """
    Semantic Similar Graph Implementation
  """
  def __init__(self, data=None):
    self.lemma = []
    self.vectors = []
    self.G = None

    if data != None:
      self.set_document(data)

  def build_graph(self):
    self.G = nx.DiGraph()
    self.set_nodes()
    self.set_edges()


  def set_document(self, text):
    """
      Tokenize and embedd the tokens
      Load the document into the system
      Generate a node for each token index
    """
    document = {
        'lemma': [],
        'vectors': []
    }
    document['lemma'] = text_to_tokens(text)
    embeddings = tokens_to_vectors(text_to_tokens(text))
    vectors = [np.mean(embedding, axis=0) for embedding in embeddings]
    document['vectors'] = vectors
    self.lemma = document['lemma']
    self.vectors = document['vectors']
    self.build_graph()


  def set_nodes(self):
    """
      Generate a node per sentence in the stored dictionary
    """
    for idx, sentence in enumerate(self.lemma):
      self.G.add_node(idx)


  def set_edges(self):
    """
      Class Abstract Function
    """
    pass


  def print(self):
    """
      Print the graph in the console
    """
    labels = nx.get_edge_attributes(self.G, 'weight')
    for key in labels:
      labels[key] = round(labels[key], 3)
    pos = nx.spring_layout(self.G)
    nx.draw(self.G, pos, with_labels=True, font_weight='bold')
    nx.draw_networkx_edge_labels(self.G, pos, edge_labels=labels)


  def evaluate_coherence(self):
    """
      Compute coherence score the entire document
      Based on Putra et al.
        - Average over number of sentences, and number of outgoing edges weights
    """
    labels = nx.get_edge_attributes(self.G, 'weight')
    if len(labels.keys()) == 0:
      return 0
    return np.mean(np.array(list(labels.values())))

In [8]:
from scipy import spatial

class PAV(SemanticGraph):

    def __init__(self):
        # Inherit the Semantic Graph Class
        # super().__init__(self)
        self.alpha = 0


    def set_alpha(self, _alpha):
        self.alpha = _alpha
        self.build_graph()


    def set_edges(self):
        """
            Polymorp the set_edges method
            Define coherence as previous sentences give context to current sentence
            For PAV:
                1. compute the cosine similarity of setence pairs
                2. Compute Unique Overlapping Terms (UOT)

        """
        i = len(self.vectors) -1
        while i > 0:
            # cosine similarity
            cosine = 1 - spatial.distance.cosine(self.vectors[i], self.vectors[i - 1])
            
            # for two sentences, determine the UOT
            terms1 = set(self.lemma[i])
            terms2 = set(self.lemma[i - 1])
            unique = terms1.intersection(terms2)
            common = terms1.union(terms2)
            uot = 1.0 * len(unique) / len(common)

            # compute the weight
            weight = self.alpha * uot + (1 - self.alpha) * cosine
            self.G.add_weighted_edges_from([(i, i - 1, weight)])
            i -= 1


In [9]:
class SSV(SemanticGraph):
    def __init__(self):
        #super().__init__(self)
        pass

    def set_edges(self):
        """
            Polymorp the set_edges method
            Define coherence as the dependency among sentences
            For SSV:
                1. compute the weight as cosine similarity
        """
        i = 0

        # for each sentence, 
        # loop through each other sentence to compute the weight
        while i < len(self.lemma):
          j = 0

          current_related_idx = -1
          current_related_weight = 0
          while j < len(self.lemma):
            # if the same sentence, simply ignore
            if i == j:
                j += 1
                continue
            

            # compute the weight of both sentence
            # the weights are penalized by the distance between two senteces (closer sentences are more prefer)
            # keep only the most similar sentence among all sentences in a given text
            weight = (1 - spatial.distance.cosine(self.vectors[i], self.vectors[j]) / abs(i-j))
            if weight > current_related_weight:
                current_related_weight = weight
                current_related_idx = j
            j += 1

          # update weights between two sentence nodes
          # go to next sentence
          self.G.add_weighted_edges_from([(i, current_related_idx, current_related_weight)])
          i += 1


In [10]:
class MSV(SemanticGraph):
    def __init__(self):
        #super().__init__(self)
        self.theta = 0


    def set_theta(self, _theta):
        self.theta = _theta
        self.build_graph()


    def set_edges(self):
        """
            Polymorp the set_edges method
            Define coherence as the dependency among sentences
            Similar to SSV but allow for multiple outgoing edges
            For MSV:
                1. compute the weight as cosine similarity
        """
        i = 0
        while i < len(self.lemma):
            j = 0
            while j < len(self.lemma):
                if i == j:
                    j += 1
                    continue

                # compute cosine similarity weight between current sentence and all other sentences in the document
                # the weights are penalized by the distance between two senteces (closer sentences are more prefer)
                # if a the weight is high and pass a certain threshold, that edge is created
                weight = (1 - spatial.distance.cosine(self.vectors[i], self.vectors[j])) / (abs(i - j))
                if weight > self.theta:
                    self.G.add_weighted_edges_from([(i, j, weight)])
                j += 1
            i += 1

In [11]:
data = pd.read_csv(f"{DATA_DIR}/lstm_attention_lyrics_all_genres.csv")
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data

Unnamed: 0,Soul,Rock,R&B,Pop,Metal,Jazz,Folk,Rap
0,"should you stay should you go \n hmm ""you're"" ...","""daddy's"" flown across the ocean \n leaving ju...",i remember like it was yesterday \n we were ov...,"""never look back,"" we said \n how was i to kno...",down fell the stars as they splashed into the ...,"with all my heart, i love you, baby \n stay wi...",i know how it feels to be head over heels \n t...,visualizing the realism of life and actuality ...
1,i am in love with a beautiful girl \n i am in ...,"hello, hello, hello, hello \n check, check, ch...","tell me how you been \n well, ""it's"" so good t...",look at those beautiful eyes \n she reminds me...,yesterday is gone forever \n no turning back t...,"good things will come your way \n ""you'll"" fin...",holy war \n genocide \n suicide \n hate and cr...,"+ (lyfe) \n knew i said i would roll ""wit'chu""..."
2,no no no \n no no no \n my world has fell apar...,"oh, sweet thing \n zion ""doesn't"" love you \n ...","yeah, mmm, yeah \n do you know that some folks...",chestnuts roasting on an open fire \n jack fro...,crisis feeds the lunacy \n all fear the new ma...,"your love, like wine. tasting sweeter to me ev...",late last night so far away \n i dreamed mysel...,"""baby, did you hear that ? "" ""yeah, baby, i he..."
3,some boys just got the look of a greek adonis ...,"waking up dead, inside of my head \n would nev...",walk in my shoes \n this hurt inside is too mu...,if you love somebody \n put a light in the air...,"lay beside me, tell me what ""they've"" done \n ...",everyday we go through \n i do something that ...,photographs and memories \n christmas cards yo...,sugar come by and get me high \n sugar come by...
4,"sitting here thinking about the times ""i've"" r...","at my feeding time \n ""she'd"" push food throug...",once upon a time was a dream that sounded craz...,"my friend, hmm my love \n faded, oh \n \n it w...",the sky was clear that night \n we were alone ...,*miles davis hums to the tune \n *olu dara pla...,"i stole your style \n hope you ""don't"" mind \n...","tony dead and his brother too \n streets ""talk..."
...,...,...,...,...,...,...,...,...
95,"lay it down, lay it down, lay it down \n put y...",i jumped in the river and what did i see ? \n ...,i just want to lay next to you for a while \n ...,"once upon a time, you whispered softly in my e...",headless \n i skid like rita lifted ocean \n i...,i got rhythm i got music i got my man who coul...,there is a town in north ontario \n with dream...,"ghetto guitar \n run that back, turbo \n \n i ..."
96,"was blind, but now i see \n \n amazing, amazin...",when she goes storming out \n i run for cover ...,"hey mama, ""don't"" you treat me wrong \n come a...","i see trees of green, red roses too \n i see t...",would you care for me if i was deaf and blind ...,"inseparable \n ""that's"" how ""we'll"" always be ...","gonna tell you a story, that you ""won't"" belie...","yeah, yeah, yeah, yeah, yeah, yeah \n big 14, ..."
97,"there ""ain't"" no reason for us sitting down \n...","sick and tired of the bottom, need a better vi...",ah baby \n after many tears fall from your eye...,oho oho oh oho oho \n ba da ba bum \n oho oho ...,high velocity bullet at close range \n can dam...,falling in love with love is falling for make-...,all you have to do is touch my hand \n to show...,"oh, oh, ""don't"" pray for love \n oh, oh, say i..."
98,willow weep for me \n willow weep for me \n be...,the unknown distance to the great beyond \n st...,"""there's"" a ghost down in the hall \n ""there's...","""i'm"" five years old, ""it's"" getting cold, ""i'...","""i'm"" looking through a hole in the sky \n ""i'...",my heart is like a hand-me-down made soft by o...,it was a cold day in london \n dark clouds rum...,a girl like you in a world like this \n a nigg...


In [12]:
data.columns


Index(['Soul', 'Rock', 'R&B', 'Pop', 'Metal', 'Jazz', 'Folk', 'Rap'], dtype='object')

In [29]:
def preprocessText(text):
    stopChars = [',','(',')','.','-','[',']','"',"?"]
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText

In [30]:
scoring = {}
for genre in data.columns:
  # instantiate place holder variables
  PAV_scores = []
  SSV_scores = []
  MSV_scores = []
  lyrics = []

  genre_df = data[[genre]]

  for i, row in genre_df.iterrows():
    generated_lyrics = row.values[0]

    try:
      generated_lyrics = preprocessText(generated_lyrics)

      try:
        alpha = 0.6
        pav = PAV()
        pav.set_document(generated_lyrics)
        pav.set_alpha(alpha)
        PAV_scores.append(pav.evaluate_coherence())
      except Exception as e:
        print(f"PAV Probem found in lyric id: {i} of genre: {genre}, {e}")
        PAV_scores.append(0)
        pass

      try:
        ssv = SSV()
        ssv.set_document(generated_lyrics)
        SSV_scores.append(ssv.evaluate_coherence())
      except:
        print(f"SSV Probem found in lyric id: {i} of genre: {genre}")
        SSV_scores.append(0)
        pass

      try:
        theta = 0.2
        msv = MSV()
        msv.set_document(generated_lyrics)
        msv.set_theta(theta)
        MSV_scores.append(msv.evaluate_coherence())
      except:
        print(f"MSV Probem found in lyric id: {i} of genre: {genre}")
        MSV_scores.append(0)
        pass
    except:
      print(f'{genre}: Failed')

  genre_df['PAV'] = PAV_scores
  genre_df['SSV'] = SSV_scores
  genre_df['MSV'] = MSV_scores
  genre_df.to_csv(f"{DATA_DIR}/lstm_attention_{genre.lower()}_generated_lyrics.csv")

  scoring[genre.lower()] = {
      'PAV': round(sum(PAV_scores)/len(PAV_scores), 5),
      'SSV': round(sum(SSV_scores)/len(SSV_scores), 5),
      'MSV': round(sum(MSV_scores)/len(MSV_scores), 5)
  }

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)


PAV Probem found in lyric id: 20 of genre: Soul, float division by zero
PAV Probem found in lyric id: 24 of genre: Soul, float division by zero
PAV Probem found in lyric id: 38 of genre: Soul, float division by zero
PAV Probem found in lyric id: 46 of genre: Soul, float division by zero
PAV Probem found in lyric id: 58 of genre: Soul, float division by zero
PAV Probem found in lyric id: 77 of genre: Soul, float division by zero
PAV Probem found in lyric id: 88 of genre: Soul, float division by zero
PAV Probem found in lyric id: 91 of genre: Soul, float division by zero
PAV Probem found in lyric id: 92 of genre: Soul, float division by zero


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


PAV Probem found in lyric id: 1 of genre: Rock, float division by zero
PAV Probem found in lyric id: 5 of genre: Rock, float division by zero
PAV Probem found in lyric id: 7 of genre: Rock, float division by zero
PAV Probem found in lyric id: 8 of genre: Rock, float division by zero
PAV Probem found in lyric id: 14 of genre: Rock, float division by zero
PAV Probem found in lyric id: 21 of genre: Rock, float division by zero
PAV Probem found in lyric id: 29 of genre: Rock, float division by zero
PAV Probem found in lyric id: 31 of genre: Rock, float division by zero
PAV Probem found in lyric id: 32 of genre: Rock, float division by zero
PAV Probem found in lyric id: 33 of genre: Rock, float division by zero
PAV Probem found in lyric id: 34 of genre: Rock, float division by zero
PAV Probem found in lyric id: 39 of genre: Rock, float division by zero
PAV Probem found in lyric id: 42 of genre: Rock, float division by zero
PAV Probem found in lyric id: 44 of genre: Rock, float division by z

In [31]:
lstm_attention_scoring = pd.DataFrame().from_dict(scoring).T
lstm_attention_scoring.to_csv(f"{DATA_DIR}/lstm_attention_semantic_graph_coherence_scores.csv")

In [32]:
lstm_attention_scoring

Unnamed: 0,PAV,SSV,MSV
soul,0.50241,0.99776,0.49785
rock,0.34189,0.9992,0.50014
r&b,0.49977,0.99892,0.50277
pop,0.46629,1.0,0.51594
metal,0.21875,0.99869,0.5041
jazz,0.43015,0.9837,0.45688
folk,0.42286,0.98404,0.4635
rap,0.55093,0.99697,0.51946
