In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Coherence Analysis
Preprocess the data to be used for coherence analysis of machine-generated lyrics.

In [None]:
!pip install langdetect
!pip install gensim==4.1.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.1.2
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 82.9 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [None]:
import re
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# import nltk and gensim library for token level embeddings
import nltk
from nltk.corpus import reuters
from nltk.data import find
import gensim

from langdetect import detect

In [None]:
# NOTE: Root folder of the project data on the mounted Google Drive.
# Change this to your own directory as needed.
DATA_DIR = "/content/drive/MyDrive/W266 Project/w266-finalproj/data"

In [None]:
#@title NLTK & Word2Vec

# instantiate Word2Vec token level embedding
# Download the NLTK 'word2vec_sample' package into the local nltk_data folder.
nltk.download('word2vec_sample')

# Resolve the on-disk path of the pruned word2vec text file shipped in that package.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

# Load the vectors as gensim KeyedVectors; binary=False because the file is in text format.
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [None]:
#@title Embedding Matrix Creation

EMBEDDING_DIM = 300

# gensim 4.0 removed `KeyedVectors.vocab` (accessing it raises AttributeError);
# `key_to_index` is its replacement. Fall back to `.vocab` so the cell still
# runs if an older gensim 3.x is the version actually loaded in the kernel.
if hasattr(model, "key_to_index"):
    vocab_words = list(model.key_to_index)
else:
    vocab_words = list(model.vocab)

# initialize embedding matrix and word-to-id map:
# one row per vocabulary word plus one extra all-zero row for unknown tokens
embedding_matrix = np.zeros((len(vocab_words) + 1, EMBEDDING_DIM))
vocab_dict = {}

# build the embedding matrix and the word-to-id map:
# every word comes from the model's own vocabulary, so the lookup always
# succeeds and row i of the matrix holds the vector of the word with id i
for i, word in enumerate(vocab_words):
    embedding_vector = model[word]
    embedding_matrix[i] = embedding_vector
    vocab_dict[word] = i

# we can use the last index at the end of the vocab for unknown tokens
# (its row in embedding_matrix stays all-zeros)
vocab_dict['[UNK]'] = len(vocab_dict)

In [None]:
# Load full dataset for some additional processing
# index_col=0: the first CSV column is used as the row index.
data = pd.read_csv(f"{DATA_DIR}/02_intermediate/language-processed-data.csv", index_col=0)
# Bare expression: display the loaded frame as the cell output.
data

Unnamed: 0,artist,genre,title,lyrics,word_num,language
0,Iron Maiden,metal,The Number of the Beast,"Woe to you, o'er Earth and Sea\nFor the Devil ...",303,en
1,Iron Maiden,metal,Fear of the Dark,I am a man who walks alone\nAnd when I'm walki...,358,en
2,Iron Maiden,metal,The Trooper,You'll take my life but I'll take yours too\nY...,242,en
3,Iron Maiden,metal,Hallowed Be Thy Name,"I'm waiting in my cold cell, when the bell beg...",261,en
4,Iron Maiden,metal,Run to the Hills,White man came across the sea\nHe brought us p...,199,en
...,...,...,...,...,...,...
4205,Donny Hathaway,soul,"The Ghetto - Live @ Troubadour, Hollywood, CA.","Whoo, yeah\nMm-hmm\nYes\nThis is the Ghetto\nS...",262,en
4206,Donny Hathaway,soul,What’s going on - live version,"Mother, mother\nThere's too many of you crying...",159,en
4207,Donny Hathaway,soul,Hey girl - live version,Hey girl\nI've been watching you\nThe rapid be...,161,en
4208,Donny Hathaway,soul,Make It Your Own,"Yesterday, you were mine\nNow, you're gone\nI ...",88,en


In [None]:
def process_lyric(data):
    """
      Function to process each song's lyrics into lines of lower-cased, tokenized words
      Inputs:
        data: pd.DataFrame() with (at least) the columns 'genre', 'lyrics',
              'word_num' and 'language'. The frame is not modified.
      Output:
        pd.DataFrame() with one row per input row, indexed 0..n-1, and the columns
        ['genre', 'processed_lyric', 'word_num', 'language'].
        'processed_lyric' is a list of lines; each line is a list of tokens.
        '!' and '?' become separate tokens, '#' and ',' are dropped.
    """
    columns = ['genre', 'processed_lyric', 'word_num', 'language']

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row with DataFrame.append is quadratic, and the method
    # was removed in pandas 2.0.
    records = []
    for _, row in data.iterrows():
        lyric = row['lyrics']
        # Pad '!' and '?' with spaces so they split off as their own tokens.
        lyric = lyric.replace('!', ' ! ')
        lyric = lyric.replace('?', ' ? ')
        lyric = lyric.replace('#', ' ')
        # One token list per lyric line; str.split() with no argument already
        # discards the surrounding whitespace, so blank lines become [].
        tokenized_lines = [
            line.lower().replace(',', '').split()
            for line in lyric.split("\n")
        ]
        records.append({
            "genre": row['genre'],
            "processed_lyric": tokenized_lines,
            "word_num": row['word_num'],
            "language": row['language'],
        })

    # Passing `columns` keeps the expected schema even when `data` is empty.
    return pd.DataFrame(records, columns=columns)




In [None]:
# Tokenize the lyrics of every row in `data`.
processed_lyric_df = process_lyric(data)
# Bare expression: display the processed frame as the cell output.
processed_lyric_df

Unnamed: 0,genre,processed_lyric,word_num,language
0,metal,"[[woe, to, you, o'er, earth, and, sea], [for, ...",303,en
1,metal,"[[i, am, a, man, who, walks, alone], [and, whe...",358,en
2,metal,"[[you'll, take, my, life, but, i'll, take, you...",242,en
3,metal,"[[i'm, waiting, in, my, cold, cell, when, the,...",261,en
4,metal,"[[white, man, came, across, the, sea], [he, br...",199,en
...,...,...,...,...
53277,soul,"[[whoo, yeah], [mm-hmm], [yes], [this, is, the...",262,en
53278,soul,"[[mother, mother], [there's, too, many, of, yo...",159,en
53279,soul,"[[hey, girl], [i've, been, watching, you], [th...",161,en
53280,soul,"[[yesterday, you, were, mine], [now, you're, g...",88,en


# Semantic Similarity Graph
Implementation of the Semantic Similarity Graph based on Putra et al. The class is used to compute the coherence of a given text.

In [None]:
class SemanticGraph():
  """
    Semantic Similar Graph Implementation
  """
  def __init__(self, data=None):
    """
      Initialize empty state and optionally load a document.
      Inputs:
        data: optional document; when it is not None it is passed to
              set_document(). Defaults to None (nothing is loaded).
    """
    self.lemma = []
    self.vectors = []
    self.G = None

    # Identity check instead of `data != None`: `!=` dispatches to the
    # argument's own __ne__, which is element-wise for numpy arrays and pandas
    # objects and then cannot be used as a plain truth value.
    if data is not None:
      self.set_document(data)


  def set_document(self, text):
    """
      Load the document into the system
    """
    document = {
        "lemma": []
        "vectors": []
    }

    document['lemma'] = text_to_tokens(text, True)