In [None]:
# Mount Google Drive at /content/drive so the project data stored there is reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Coherence Analysis

## Approach 1: Using Semantic Similarity Graph
Implementation of the Semantic Similarity Graph based on Putra et al. The classes are used to compute the coherence of a given text.


## Approach 2: Using Neural Network 
Utilizing an LSTM and a fine-tuned BERT architecture to generate a coherence score for song lyrics. The architecture will be trained on real-world lyrics, and its scores will then be compared against those of lyrics generated by other ML tasks.

In [None]:
!pip install gensim
!pip install nltk
!pip install transformer
!pip install networkx
!pip install tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement transformer (from versions: none)[0m
[31mERROR: No matching distribution found for transformer[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# import relevant libraries
import re
import random
import joblib
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tqdm.notebook import tqdm
import networkx as nx


# import nltk and gensim library for token level embeddings
import nltk
import gensim
import gensim.downloader
from nltk.data import find


In [None]:
# Root folder on the mounted Google Drive that holds the project's CSV data sets.
DATA_DIR = "/content/drive/MyDrive/W266 Project/w266-finalproj/data"


In [None]:
# Download the Punkt models and load NLTK's pre-trained English tokenizer.
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Load the pre-trained 'glove-wiki-gigaword-50' vectors through gensim's downloader.
gloVe_embedder = gensim.downloader.load('glove-wiki-gigaword-50')

# gensim >= 4.0 exposes the vocabulary as `key_to_index`; accessing the old
# `vocab` attribute raises there, so only fall back to it on older releases.
_glove_vocab = getattr(gloVe_embedder, 'key_to_index', None)
if _glove_vocab is None:
    _glove_vocab = gloVe_embedder.vocab
gloVe_embed_vectors_vocab = _glove_vocab.keys()
len(gloVe_embed_vectors_vocab)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


400000

In [None]:
#@title Word2Vec Embedding Matrix Creation
# Load the pruned Word2Vec sample distributed with NLTK as gensim KeyedVectors.
nltk.download('word2vec_sample')
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

# Dimensionality of the Word2Vec vectors used below.
EMBEDDING_DIM = 300

# gensim >= 4.0 exposes the vocabulary as `key_to_index`; accessing the old
# `vocab` attribute raises there, so only fall back to it on older releases.
_word2vec_vocab = getattr(model, 'key_to_index', None)
if _word2vec_vocab is None:
    _word2vec_vocab = model.vocab
_word2vec_words = list(_word2vec_vocab.keys())

# One row per vocabulary word plus one trailing all-zero row for unknown tokens.
embedding_matrix = np.zeros((len(_word2vec_words) + 1, EMBEDDING_DIM))

# word -> embedding vector lookup used by the preprocessing helpers.
word2Vec_embedder = {}
for i, word in enumerate(_word2vec_words):
    # Every word comes from the model's own vocabulary, so the lookup always succeeds.
    embedding_vector = model[word]
    embedding_matrix[i] = embedding_vector
    word2Vec_embedder[word] = embedding_vector

# Unknown tokens map to an all-zero vector.
word2Vec_embedder['[UNK]'] = np.zeros(EMBEDDING_DIM)
embedding_matrix.shape

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


(43982, 300)

In [None]:
# preparing preprocessing tools
def text_to_tokens(text):
  """
    Split a text into lines and every line into lowercase tokens.

    Commas are removed before the line is split on whitespace.
    Returns a list holding one token list per newline-separated line
    (an empty line yields an empty token list).
  """
  tokens_per_line = []
  for line in text.split("\n"):
    normalized = line.lower().replace(',', '')
    tokens_per_line.append(normalized.split())

  return tokens_per_line

def tokens_to_vectors(sentences, embedder=word2Vec_embedder):
  """
    Map every token of every sentence to its embedding vector.

    Args:
      sentences: iterable of token lists (one list per sentence).
      embedder:  mapping from token to vector; it must also provide the
                 '[UNK]' entry that is used for tokens it does not contain.

    Returns:
      A list with one list of vectors per input sentence, in the same order.
  """

  embedded_matrix = []
  for s in sentences:
    word_embedding_arr = []
    for word in s:
      try:
        word_embedding_arr.append(embedder[word])
      except KeyError:
        # Only a missing token falls back to the unknown-token vector;
        # any other error (or a keyboard interrupt) is no longer swallowed.
        word_embedding_arr.append(embedder['[UNK]'])
    embedded_matrix.append(word_embedding_arr)

  return embedded_matrix

In [None]:
class SemanticGraph():
  """
    Base class of the semantic similarity graph used to score text coherence.

    Every non-blank line of a document becomes one node of a directed graph.
    Subclasses decide which weighted edges connect the nodes by overriding
    `set_edges`.

    Attributes:
      lemma:   list of token lists, one per sentence (line) of the document.
      vectors: one vector per entry of `lemma`: the mean of its token embeddings.
      G:       the networkx DiGraph, or None until a graph has been built.
  """
  def __init__(self, data=None):
    """
      Create an empty graph holder; when `data` (a text) is given, load it immediately.
    """
    self.lemma = []
    self.vectors = []
    self.G = None

    # Identity test: `!= None` would invoke a custom __ne__ on the argument.
    if data is not None:
      self.set_document(data)

  def build_graph(self):
    """
      (Re)build the graph from the stored sentences: nodes first, then edges.
    """
    self.G = nx.DiGraph()
    self.set_nodes()
    self.set_edges()


  def set_document(self, text):
    """
      Tokenize and embed `text`, store the result and build the graph.

      Blank lines are dropped: they contain no tokens, so their mean
      embedding would be NaN and they would appear as meaningless nodes.
    """
    # Tokenize once and reuse the result for both the tokens and the vectors.
    sentences = [tokens for tokens in text_to_tokens(text) if tokens]
    embeddings = tokens_to_vectors(sentences)

    self.lemma = sentences
    # Sentence vector = element-wise mean of the sentence's token vectors.
    self.vectors = [np.mean(embedding, axis=0) for embedding in embeddings]
    self.build_graph()


  def set_nodes(self):
    """
      Generate one node per stored sentence, identified by the sentence index.
    """
    self.G.add_nodes_from(range(len(self.lemma)))


  def set_edges(self):
    """
      Abstract hook: subclasses add their weighted edges here.
    """
    pass


  def print(self):
    """
      Draw the graph, labelling every edge with its weight rounded to 3 decimals.
    """
    labels = nx.get_edge_attributes(self.G, 'weight')
    for key in labels:
      labels[key] = round(labels[key], 3)
    pos = nx.spring_layout(self.G)
    nx.draw(self.G, pos, with_labels=True, font_weight='bold')
    nx.draw_networkx_edge_labels(self.G, pos, edge_labels=labels)


  def evaluate_coherence(self):
    """
      Compute the coherence score of the entire document (after Putra et al.).

      The score is the mean of all edge weights in the graph.
      Returns 0 when no graph has been built or the graph has no edges.
    """
    if self.G is None:
      return 0
    weights = list(nx.get_edge_attributes(self.G, 'weight').values())
    if not weights:
      return 0
    return np.mean(np.array(weights))

In [None]:
from scipy import spatial

class PAV(SemanticGraph):
    """
        Graph variant in which every sentence points to the sentence before it.

        The edge weight mixes the cosine similarity of the two sentence
        vectors with their Unique Overlapping Terms (UOT) ratio:
            weight = alpha * uot + (1 - alpha) * cosine
    """

    def __init__(self, data=None, alpha=0):
        """
            Args:
                data:  optional text to load right away.
                alpha: share of the UOT ratio in an edge weight.
        """
        # alpha must exist before the base initializer runs: loading a
        # document immediately builds the graph through set_edges().
        self.alpha = alpha
        super().__init__(data)


    def set_alpha(self, _alpha):
        """
            Change alpha and rebuild the graph with the new weighting.
        """
        self.alpha = _alpha
        self.build_graph()


    def set_edges(self):
        """
            Override of the set_edges hook.
            Define coherence as previous sentences giving context to the current sentence.
            For each sentence i > 0 an edge (i -> i - 1) is added, weighted by:
                1. the cosine similarity of the sentence pair
                2. the Unique Overlapping Terms (UOT) ratio of the pair
        """
        for i in range(len(self.vectors) - 1, 0, -1):
            current_terms = set(self.lemma[i])
            previous_terms = set(self.lemma[i - 1])

            # Cosine similarity; a sentence without tokens has no usable vector,
            # and an all-zero vector yields NaN, so both cases count as 0.
            if current_terms and previous_terms:
                cosine = 1 - spatial.distance.cosine(self.vectors[i], self.vectors[i - 1])
                if not np.isfinite(cosine):
                    cosine = 0.0
            else:
                cosine = 0.0

            # UOT: shared terms over all distinct terms of the two sentences.
            shared_terms = current_terms.intersection(previous_terms)
            all_terms = current_terms.union(previous_terms)
            # Two empty sentences share nothing; avoid dividing by zero.
            uot = 1.0 * len(shared_terms) / len(all_terms) if all_terms else 0.0

            # compute the weight
            weight = self.alpha * uot + (1 - self.alpha) * cosine
            self.G.add_weighted_edges_from([(i, i - 1, weight)])


In [None]:
class SSV(SemanticGraph):
    """
        Graph variant in which every sentence gets exactly one outgoing edge,
        pointing to the sentence it is most similar to.
    """

    def __init__(self, data=None):
        """
            Args:
                data: optional text to load right away.
        """
        super().__init__(data)

    def set_edges(self):
        """
            Override of the set_edges hook.
            Define coherence as the dependency among sentences.
            For SSV the weight of a pair is its cosine similarity divided by
            the distance between the two sentences, so closer sentences are
            preferred; only the best-scoring partner of each sentence is kept.
        """
        sentence_count = len(self.lemma)

        for i in range(sentence_count):
            best_idx = -1
            best_weight = 0

            for j in range(sentence_count):
                # a sentence is never compared with itself
                if i == j:
                    continue

                # The similarity (not the cosine distance) is what gets divided
                # by the sentence distance; otherwise far-apart sentences would
                # be rewarded instead of penalized.
                similarity = 1 - spatial.distance.cosine(self.vectors[i], self.vectors[j])
                weight = similarity / abs(i - j)
                if weight > best_weight:
                    best_weight = weight
                    best_idx = j

            # NOTE: when no partner has a positive weight, the edge goes to the
            # placeholder node -1 with weight 0, so the sentence still counts
            # (as 0) in the document average.
            self.G.add_weighted_edges_from([(i, best_idx, best_weight)])


In [None]:
class MSV(SemanticGraph):
    """
        Graph variant similar to SSV, but a sentence may have several outgoing
        edges: one to every other sentence whose weight exceeds theta.
    """

    def __init__(self, data=None, theta=0):
        """
            Args:
                data:  optional text to load right away.
                theta: threshold an edge weight must exceed to be kept.
        """
        # theta must exist before the base initializer runs: loading a
        # document immediately builds the graph through set_edges().
        self.theta = theta
        super().__init__(data)


    def set_theta(self, _theta):
        """
            Change theta and rebuild the graph with the new threshold.
        """
        self.theta = _theta
        self.build_graph()


    def set_edges(self):
        """
            Override of the set_edges hook.
            Define coherence as the dependency among sentences.
            Similar to SSV but allows multiple outgoing edges.
            For MSV the weight of a pair is its cosine similarity divided by
            the distance between the two sentences (closer sentences are
            preferred); an edge is created when the weight exceeds theta.
        """
        sentence_count = len(self.lemma)

        for i in range(sentence_count):
            for j in range(sentence_count):
                # a sentence is never compared with itself
                if i == j:
                    continue

                similarity = 1 - spatial.distance.cosine(self.vectors[i], self.vectors[j])
                weight = similarity / abs(i - j)
                if weight > self.theta:
                    self.G.add_weighted_edges_from([(i, j, weight)])

In [None]:
# Preview the GPT-generated pop lyrics file.
data = pd.read_csv(f"{DATA_DIR}/gpt-generated-lyrics/gpt_pop_generated_lyrics.csv")
data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,genre,original_lyric,generated_lyric,model,PAV,SSV,MSV,cleaned_generated_lyrics
0,0,0,Pop,"""Never look back,"" we said\nHow was I to know ...","""never look back,"" we said how was i to know i...",gpt2,0.0000,0.000000,0.000000,"never look back, we said how was i to know id ..."
1,1,1,Pop,Look at those beautiful eyes\nShe reminds me s...,look at those beautiful eyes she reminds me so...,gpt2,0.0000,0.000000,0.000000,look at those beautiful eyes she reminds me so...
2,2,2,Pop,Chestnuts roasting on an open fire\nJack Frost...,chestnuts roasting on an open fire jack frost ...,gpt2,0.0000,0.000000,0.000000,chestnuts roasting on an open fire jack frost ...
3,3,3,Pop,If you love somebody\nPut a light in the air\n...,if you love somebody put a light in the air fr...,gpt2,0.4000,1.000000,0.593303,if you love somebody put a light in the air fr...
4,4,4,Pop,"My friend, hmm my love\nFaded, oh\n\nIt was ov...","my friend, hmm my love faded, oh it was overca...",gpt2,0.0000,1.000000,0.722222,"my friend, hmm my love faded, oh it was overca..."
...,...,...,...,...,...,...,...,...,...,...
95,95,95,Pop,"Once upon a time, you whispered softly in my e...","once upon a time, you whispered softly in my e...",gpt2,0.4000,1.000000,0.796734,"once upon a time, you whispered softly in my e..."
96,96,96,Pop,"I see trees of green, red roses too\nI see the...","i see trees of green, red roses too i see them...",gpt2,0.3531,0.801668,0.801668,"i see trees of green, red roses too i see them..."
97,97,97,Pop,Oho oho oh oho oho\nBa da ba bum\nOho oho oh o...,oho oho oh oho oho ba da ba bum oho oho oh oho...,gpt2,0.0000,0.000000,0.000000,oho oho oh oho oho ba da ba bum oho oho oh oho...
98,98,98,Pop,"I'm five years old, it's getting cold, I've go...","im five years old, its getting cold, ive got m...",gpt2,0.0000,0.000000,0.000000,"im five years old, its getting cold, ive got m..."


In [None]:

# Graph hyper-parameters, defined once instead of on every loop iteration.
alpha = 0.6  # PAV: share of the term-overlap ratio in an edge weight
theta = 0.2  # MSV: weight an edge must exceed to be kept


def _clean_lyric(lyric):
  """Remove quotes, brackets, ';', ':', '-' and the end-of-text marker, then lowercase."""
  for char in "'\"();:-[]":
    lyric = lyric.replace(char, '')
  return lyric.replace("<|endoftext|>", '').lower()


def _coherence_or_zero(graph_cls, label, lyric, lyric_id, genre, configure=None):
  """
    Score `lyric` with one graph variant.

    Builds `graph_cls()`, loads the lyric, applies the optional `configure`
    callback (used to set alpha / theta) and returns the coherence score.
    A failure is reported together with its cause and scored as 0.
  """
  try:
    graph = graph_cls()
    graph.set_document(lyric)
    if configure is not None:
      configure(graph)
    return graph.evaluate_coherence()
  except Exception as err:
    # `Exception` (not a bare except) keeps the loop interruptible and the
    # message now says why the lyric could not be scored.
    print(f"{label} Probem found in lyric id: {lyric_id} of genre: {genre} ({type(err).__name__}: {err})")
    return 0


scoring = {}
for genre in ['Metal', 'Rap', 'Rock', 'Jazz', 'Folk', 'Pop', 'Soul', 'RNB']:

  # load data set
  csv_path = f"{DATA_DIR}/gpt-generated-lyrics/gpt_{genre.lower()}_generated_lyrics.csv"
  data = pd.read_csv(csv_path)

  # instantiate place holder variables
  PAV_scores = []
  SSV_scores = []
  MSV_scores = []
  lyrics = []

  for i, lyric in data['generated_lyric'].items():
    # load the lyric and clean it up a little bit
    generated_lyrics = _clean_lyric(lyric)
    lyrics.append(generated_lyrics)

    PAV_scores.append(_coherence_or_zero(
        PAV, 'PAV', generated_lyrics, i, genre, lambda graph: graph.set_alpha(alpha)))
    SSV_scores.append(_coherence_or_zero(
        SSV, 'SSV', generated_lyrics, i, genre))
    MSV_scores.append(_coherence_or_zero(
        MSV, 'MSV', generated_lyrics, i, genre, lambda graph: graph.set_theta(theta)))

  data['PAV'] = PAV_scores
  data['SSV'] = SSV_scores
  data['MSV'] = MSV_scores
  data['cleaned_generated_lyrics'] = lyrics
  # index=False: the file is read back without index_col, so writing the index
  # would add one more "Unnamed: 0" column every time this cell is run.
  data.to_csv(csv_path, index=False)

  scoring[genre.lower()] = {
      'PAV': round(sum(PAV_scores)/len(PAV_scores), 5),
      'SSV': round(sum(SSV_scores)/len(SSV_scores), 5),
      'MSV': round(sum(MSV_scores)/len(MSV_scores), 5)
  }

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)


PAV Probem found in lyric id: 21 of genre: Metal
PAV Probem found in lyric id: 38 of genre: Metal
PAV Probem found in lyric id: 51 of genre: Metal
PAV Probem found in lyric id: 58 of genre: Metal
PAV Probem found in lyric id: 59 of genre: Metal
PAV Probem found in lyric id: 62 of genre: Metal
PAV Probem found in lyric id: 87 of genre: Metal
PAV Probem found in lyric id: 58 of genre: Rap
PAV Probem found in lyric id: 78 of genre: Rap
PAV Probem found in lyric id: 80 of genre: Rap
PAV Probem found in lyric id: 98 of genre: Rap
PAV Probem found in lyric id: 39 of genre: Rock
PAV Probem found in lyric id: 44 of genre: Jazz
PAV Probem found in lyric id: 46 of genre: Jazz
PAV Probem found in lyric id: 50 of genre: Jazz
PAV Probem found in lyric id: 69 of genre: Jazz
PAV Probem found in lyric id: 72 of genre: Folk
PAV Probem found in lyric id: 92 of genre: Folk
PAV Probem found in lyric id: 99 of genre: Folk
PAV Probem found in lyric id: 4 of genre: Pop
PAV Probem found in lyric id: 92 of gen

In [None]:
# Turn the {genre: {metric: mean score}} mapping into a genre-by-metric table and save it.
gpt_scoring = pd.DataFrame(scoring).transpose()
gpt_scoring.to_csv(
    f"{DATA_DIR}/gpt-generated-lyrics/semantic_graph_coherence_scores.csv"
)

In [None]:
# Display the mean PAV / SSV / MSV coherence score of every genre.
gpt_scoring

Unnamed: 0,PAV,SSV,MSV
metal,0.1498,0.43541,0.31116
rap,0.09527,0.28046,0.1988
rock,0.15458,0.38372,0.29393
jazz,0.12091,0.3214,0.25694
folk,0.09527,0.25776,0.20067
pop,0.07974,0.20323,0.15926
soul,0.13772,0.35291,0.26155
rnb,0.10211,0.25173,0.18377


In [None]:
# Load the LSTM-with-attention generated lyrics (one column per genre) and display them.
data = pd.read_csv(f"{DATA_DIR}/lstm_attention_generated_lyrics_all_genres.csv") #, index_col=0)
data

Unnamed: 0.1,Unnamed: 0,Soul,Rock,R&B,Pop,Metal,Jazz,Folk
0,0,"should you stay should you go \n hmm ""you're"" ...","""daddy's"" flown across the ocean \n leaving ju...",i remember like it was yesterday \n we were ov...,"""never look back,"" we said \n how was i to kno...",down fell the stars as they splashed into the ...,"with all my heart, i love you, baby \n stay wi...",i know how it feels to be head over heels \n t...
1,1,i am in love with a beautiful girl \n i am in ...,"hello, hello, hello, hello \n check, check, ch...","tell me how you been \n well, ""it's"" so good t...",look at those beautiful eyes \n she reminds me...,yesterday is gone forever \n no turning back t...,"good things will come your way \n ""you'll"" fin...",holy war \n genocide \n suicide \n hate and cr...
2,2,no no no \n no no no \n my world has fell apar...,"oh, sweet thing \n zion ""doesn't"" love you \n ...","yeah, mmm, yeah \n do you know that some folks...",chestnuts roasting on an open fire \n jack fro...,crisis feeds the lunacy \n all fear the new ma...,"your love, like wine. tasting sweeter to me ev...",late last night so far away \n i dreamed mysel...
3,3,some boys just got the look of a greek adonis ...,"waking up dead, inside of my head \n would nev...",walk in my shoes \n this hurt inside is too mu...,if you love somebody \n put a light in the air...,"lay beside me, tell me what ""they've"" done \n ...",everyday we go through \n i do something that ...,photographs and memories \n christmas cards yo...
4,4,"sitting here thinking about the times ""i've"" r...","at my feeding time \n ""she'd"" push food throug...",once upon a time was a dream that sounded craz...,"my friend, hmm my love \n faded, oh \n \n it w...",the sky was clear that night \n we were alone ...,*miles davis hums to the tune \n *olu dara pla...,"i stole your style \n hope you ""don't"" mind \n..."
...,...,...,...,...,...,...,...,...
95,95,"lay it down, lay it down, lay it down \n put y...",i jumped in the river and what did i see ? \n ...,i just want to lay next to you for a while \n ...,"once upon a time, you whispered softly in my e...",headless \n i skid like rita lifted ocean \n i...,i got rhythm i got music i got my man who coul...,there is a town in north ontario \n with dream...
96,96,"was blind, but now i see \n \n amazing, amazin...",when she goes storming out \n i run for cover ...,"hey mama, ""don't"" you treat me wrong \n come a...","i see trees of green, red roses too \n i see t...",would you care for me if i was deaf and blind ...,"inseparable \n ""that's"" how ""we'll"" always be ...","gonna tell you a story, that you ""won't"" belie..."
97,97,"there ""ain't"" no reason for us sitting down \n...","sick and tired of the bottom, need a better vi...",ah baby \n after many tears fall from your eye...,oho oho oh oho oho \n ba da ba bum \n oho oho ...,high velocity bullet at close range \n can dam...,falling in love with love is falling for make-...,all you have to do is touch my hand \n to show...
98,98,willow weep for me \n willow weep for me \n be...,the unknown distance to the great beyond \n st...,"""there's"" a ghost down in the hall \n ""there's...","""i'm"" five years old, ""it's"" getting cold, ""i'...","""i'm"" looking through a hole in the sky \n ""i'...",my heart is like a hand-me-down made soft by o...,it was a cold day in london \n dark clouds rum...
