<a href="https://colab.research.google.com/github/11doris/jazz-maestro/blob/colab_word_embeddings/colab_h_lda_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sections as Input

In [1]:
pip install wandb



In [2]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdoris[0m (use `wandb login --relogin` to force relogin)


In [3]:
import wandb

In [4]:
!pip uninstall gensim -y

Found existing installation: gensim 4.1.2
Uninstalling gensim-4.1.2:
  Successfully uninstalled gensim-4.1.2


In [5]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [6]:
pip install pyLDAvis



In [7]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pprint
import pandas as pd
import numpy as np
from collections import Counter
import plotly.express as px
from tqdm import tqdm 
from gensim.models.doc2vec import Doc2Vec
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.lsimodel import LsiModel
from gensim.models import CoherenceModel
from gensim import corpora
from gensim import similarities
import pickle
import os
import zipfile
from gensim.models.phrases import Phrases

In [8]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [9]:
print(gensim.__version__)

4.1.2


In [10]:
!rm data.csv

In [11]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Configuration


In [12]:
use_wandb = False

In [13]:
generate_webapp_data = False

In [14]:
chords_preprocessing = 'rootAndDegreesSimplified'

In [15]:
ngrams_for_input = [1]

In [16]:
remove_repetitions = False

In [17]:
input_files = {
    'sections': {
        # M7 and 6 reduced to major triad, m7 reduced to m, dominant 7, m7b5, diminished, and all (b5) left as they are.
        'rootAndDegreesPlus': '1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq',
        'rootAndDegrees7': '',
        'rootAndDegreesSimplified': '1h9dRRpbTSEE_x9CpbQ9rOi-GXKcCFUCU'
    },
    'tunes': {
        'rootAndDegreesPlus': '',
        'rootAndDegrees7': '',
        'rootAndDegreesSimplified': '',
    }
}

In [18]:
tunes_eval_list = [
  "Sweet Sue, Just You [jazz1350]",
  "On The Sunny Side Of The Street [jazz1350]",
  "These Foolish Things [jazz1350]", 
  "Blue Moon [jazz1350]",
  "All Of Me [jazz1350]",
  "All God's Chillun Got Rhythm [jazz1350]",
  "I Got Rhythm [jazz1350]",
  "Bye Bye Blackbird [jazz1350]",
  "Old Fashioned Love [trad]",
  "Exactly Like You [jazz1350]",
  "Honeysuckle Rose [jazz1350]",
  "Misty [jazz1350]",
  "Naima [jazz1350]",
]

tunes_eval_list = sorted(tunes_eval_list)
tunes_eval_list

["All God's Chillun Got Rhythm [jazz1350]",
 'All Of Me [jazz1350]',
 'Blue Moon [jazz1350]',
 'Bye Bye Blackbird [jazz1350]',
 'Exactly Like You [jazz1350]',
 'Honeysuckle Rose [jazz1350]',
 'I Got Rhythm [jazz1350]',
 'Misty [jazz1350]',
 'Naima [jazz1350]',
 'Old Fashioned Love [trad]',
 'On The Sunny Side Of The Street [jazz1350]',
 'Sweet Sue, Just You [jazz1350]',
 'These Foolish Things [jazz1350]']

In [19]:
contrafacts = [
               ("26-2 [jazz1350]", "Confirmation [jazz1350]"),
               ("52nd Street Theme [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ablution [jazz1350]", "All The Things You Are [jazz1350]"),
               ("Anthropology [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ballade [jazz1350]", "As Long As I Live [jazz1350]"),
               ("Bright Mississippi [jazz1350]", "Sweet Georgia Brown [jazz1350]"),
               ("C.T.A. [jazz1350]", "I Got Rhythm [jazz1350]"),
               #( "Celia [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Cottontail [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Countdown [jazz1350]", "Tune Up [jazz1350]"),
               ("Dewey Square [jazz1350]", "Oh, Lady Be Good [jazz1350]"),
               ("Dexterity [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Dig [jazz1350]", "Sweet Georgia Brown [jazz1350]"),
               ("Donna Lee [jazz1350]", "Indiana (Back Home Again In) [jazz1350]"),
               ("Don't Be That Way [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Eternal Triangle [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Evidence [jazz1350]", "Just You, Just Me [jazz1350]"),
               ("Flintstones [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Four On Six [jazz1350]", "Summertime [jazz1350]"),
               ("Freight Train [jazz1350]", "Blues For Alice [jazz1350]"),
               ("Good Bait [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Hackensack [jazz1350]", "Oh, Lady Be Good [jazz1350]"),
               ("Half Nelson [jazz1350]", "Lady Bird [jazz1350]"),
               ("Hot House [jazz1350]", "What Is This Thing Called Love [jazz1350]"),
               ("Impressions [jazz1350]", "So What [jazz1350]"),
               ("In A Mellow Tone (In A Mellotone) [jazz1350]", "Rose Room [jazz1350]"),
               ("In Walked Bud [jazz1350]", "Blue Skies [jazz1350]"),
               ("Ko Ko [jazz1350]", "Cherokee [jazz1350]"),
               ("Lennie's Pennies [jazz1350]", "Pennies From Heaven [jazz1350]"),   ## Lennie's Pennies is in minor and therefore transposed to Amin... not possible to recognize like that
               #( "Let's Call This [jazz1350]", "Honeysuckle Rose [jazz1350]"),
               ("Little Rootie Tootie [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Little Willie Leaps [jazz1350]", "All God's Chillun Got Rhythm [jazz1350]"),
               ("Lullaby Of Birdland [jazz1350]", "Love Me Or Leave Me [jazz1350]"),
               #("Moose The Mooche [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("My Little Suede Shoes [jazz1350]", "Jeepers Creepers [jazz1350]"),
               #("Oleo [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ornithology [jazz1350]", "How High The Moon [jazz1350]"),
               #("Passport [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Quasimodo (Theme) [jazz1350]", "Embraceable You [jazz1350]"),
               #("Rhythm-a-ning [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Room 608 [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Salt Peanuts [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Satellite [jazz1350]", "How High The Moon [jazz1350]"),
               ("Scrapple From The Apple [jazz1350]", "Honeysuckle Rose [jazz1350]"), # A section
               ("Scrapple From The Apple [jazz1350]", "I Got Rhythm [jazz1350]"), # B section
               #("Segment [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Seven Come Eleven [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Shaw 'Nuff [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Theme, The [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Tour De Force [jazz1350]", "Jeepers Creepers [jazz1350]"),
               ("Wow [jazz1350]", "You Can Depend On Me [jazz1350]"),
               ("Yardbird Suite [jazz1350]", "Rosetta [jazz1350]"),

               # following tunes are not from wikipedia),
               ("Sweet Sue, Just You [jazz1350]", "Honeysuckle Rose [jazz1350]"),  # A section
               #("All Of Me [jazz1350]", "Pennies From Heaven [jazz1350]"), # bars 25-28 of All of Me are same as bars 17-20 of Pennies From Heaven, but different key!
               ("Sweet Sue, Just You [jazz1350]", "Bye Bye Blackbird [jazz1350]"), # Bridge same
               ("These Foolish Things [jazz1350]", "Blue Moon [jazz1350]"), # first 8 bars same
               ("These Foolish Things [jazz1350]", "More Than You Know [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Isn't It A Pity [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Soultrain [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Why Do I Love You [jazz1350]"),
               ("Misty [jazz1350]", "Portrait Of Jennie [jazz1350]"),
               ("Misty [jazz1350]", "September In The Rain [jazz1350]"),
               ("Misty [jazz1350]", "I May Be Wrong [jazz1350]"),  

               # identical tunes
               ("Five Foot Two [trad]", "Please Don't Talk About Me When I'm Gone [trad]"),
               ("What Is This Thing Called Love [jazz1350]", "Subconscious Lee [jazz1350]"),
               ("Sweet Georgia Brown [jazz1350]", "Dig [jazz1350]"),


               # almost identical tunes
               ("What Is This Thing Called Love [jazz1350]", "Hot House [jazz1350]"),
               ("Jeannie's Song [jazz1350]", "Shiny Stockings [jazz1350]"),
               ("Alone Together [jazz1350]", "Segment [jazz1350]"),
               ("Baubles, Bangles and Beads [jazz1350]", "Bossa Antigua [jazz1350]"),
               ("There Will Never Be Another You [jazz1350]", "A Weaver Of Dreams [jazz1350]"),
               ("Moten Swing [jazz1350]", "Once In A While (Ballad) [trad]"), # same bridge, similar A
               ("All I Do Is Dream Of You [trad]", "L-O-V-E [jazz1350]"),


               # same A section
               ("Nancy (With The Laughing Face) [jazz1350]", "Body And Soul [jazz1350]"),
               ("Exactly Like You [jazz1350]", "True (You Don't Love Me ) [trad]"),
               ("Exactly Like You [jazz1350]", "True (You Don't Love Me ) [trad]"),
               ("Exactly Like You [jazz1350]", "Jersey Bounce [trad]"),
               ("Take The A Train [jazz1350]", "Girl From Ipanema, The [jazz1350]"),
               ("My Heart Stood Still [jazz1350]", "All Too Soon [jazz1350]"),
               ("Undecided [jazz1350]", "Broadway [jazz1350]"),
               ("My Heart Stood Still [jazz1350]", "All Too Soon [jazz1350]"),
               ("Let's Fall In Love [jazz1350]", "Heart And Soul [jazz1350]"),
               ("Come Back To Me [jazz1350]", "I Wish I Knew [jazz1350]"),
               ("Wait Till You See Her [jazz1350]", "A Certain Smile [jazz1350]"),
               ("Killer Joe [jazz1350]", "Straight Life [jazz1350]"),
               ("Softly, As In A Morning Sunrise [jazz1350]", "Segment [jazz1350]"),
               ("Bei Mir Bist Du Schon (Root Hog Or Die) [trad]", "Egyptian Fantasy [trad]"),
               ("Bei Mir Bist Du Schon (Root Hog Or Die) [trad]", "Puttin' On The Ritz [jazz1350]"),
               ("Coquette [trad]", "Pretend You're Happy When You're Blue [trad]"),
               ("Softly, As In A Morning Sunrise [jazz1350]", "Strode Rode [jazz1350]"),
               ("Glory Of Love, The [jazz1350]", "I've Got My Fingers Crossed [trad]"),


               # same bridge
               ("If I Had You [jazz1350]", "Too Young To Go Steady [jazz1350]"),
               ("Undecided [jazz1350]", "Satin Doll [jazz1350]"),
               ("Billy Boy [jazz1350]", "Elora [jazz1350]"),
               ("Dearly Beloved [jazz1350]", "We See [jazz1350]"),
               ("Alone Together [jazz1350]", "A Night In Tunisia [jazz1350]"),
               ("A Night In Tunisia [jazz1350]", "Segment [jazz1350]"),
               ("Oh! Lady Be Good [trad]", "Sentimental Journey [jazz1350]"),
               ("You Can Depend On Me [jazz1350]", "Move [jazz1350]"),
               ("I Want To Be Happy [jazz1350]", "A Beautiful Friendship [jazz1350]"),
               ("Flying Home [jazz1350]", "Down For Double [jazz1350]"),
               ("Cheek To Cheek [jazz1350]", "Violets For Your Furs [jazz1350]"),
               ("Let's Fall In Love [jazz1350]", "At Last [jazz1350]"),
               ("Don't Be That Way [jazz1350]", "Long Ago And Far Away [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "I'm Confessin' That I Love You [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "Eclypso [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "You Stepped Out Of A Dream [jazz1350]"),
               

               # similar A section
               ("I Like The Likes Of You [jazz1350]", "Mountain Greenery [jazz1350]"),
               ("My Secret Love [jazz1350]", "Samba De Orfeu [jazz1350]"),
               ("Let's Call The Whole Thing Off [jazz1350]", "Fine And Dandy [jazz1350]"),


               # similar B section
               ("Folks Who Live On The Hill, The [jazz1350]", "My One And Only Love [jazz1350]"),
               ("As Long As I Live [jazz1350]", "I'm Glad There Is You [jazz1350]"),
               ("I May Be Wrong [jazz1350]", "Teach Me Tonight [jazz1350]"),
               ("Am I Blue [jazz1350]", "Come Back To Me [jazz1350]"),
               ("My One And Only Love [jazz1350]", "Am I Blue [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "September In The Rain [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "Mountain Greenery [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "There's No You [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Embraceable You [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Rosetta [jazz1350]"),

               # same C section
               ("Bill Bailey [jazz1350]", "Bourbon Street Parade [jazz1350]"),

               # Stella C is like Woody B
               ("Woody'n You [jazz1350]", "Stella By Starlight [jazz1350]"),

               # similar vocabulary, different progressions
               ("Tangerine [jazz1350]", "Tea For Two [jazz1350]"),
               ("I Can't Give You Anything But Love [jazz1350]", "You Can Depend On Me [jazz1350]"),
               ("This Year's Kisses [jazz1350]", "My Monday Date [trad]"),
               ("A Blossom Fell [jazz1350]", "Among My Souvenirs [jazz1350]"),



]

# Initialization

## Download the Data

In [20]:
input_data = input_files['sections'][chords_preprocessing]

input_path = f"https://docs.google.com/uc?export=download&id={input_data}"
data_file_name = 'data.csv'


In [21]:
input_path

'https://docs.google.com/uc?export=download&id=1h9dRRpbTSEE_x9CpbQ9rOi-GXKcCFUCU'

In [22]:
!wget --no-check-certificate "$input_path" -O "$data_file_name"

--2021-11-28 18:59:45--  https://docs.google.com/uc?export=download&id=1h9dRRpbTSEE_x9CpbQ9rOi-GXKcCFUCU
Resolving docs.google.com (docs.google.com)... 108.177.120.139, 108.177.120.113, 108.177.120.138, ...
Connecting to docs.google.com (docs.google.com)|108.177.120.139|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-10-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/8ke4b9csbrmb6q620tol4co075vqrrp1/1638125925000/14329102864480165501/*/1h9dRRpbTSEE_x9CpbQ9rOi-GXKcCFUCU?e=download [following]
--2021-11-28 18:59:45--  https://doc-10-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/8ke4b9csbrmb6q620tol4co075vqrrp1/1638125925000/14329102864480165501/*/1h9dRRpbTSEE_x9CpbQ9rOi-GXKcCFUCU?e=download
Resolving doc-10-4c-docs.googleusercontent.com (doc-10-4c-docs.googleusercontent.com)... 142.250.128.132, 2607:f8b0:4001:c32::84
Connecting to doc-10-4c-docs.googleusercontent.com (doc-10

### Read Chords Input Data

In [23]:
df = pd.read_csv(data_file_name, sep='\t', index_col="id")
df = df.reset_index()
df.head(5)

Unnamed: 0,id,file_name,title,title_playlist,tune_mode,tune_id,section_name,section_id,chords
0,0,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,1,CM7 Eb7 G#M7 B7 EM7 G7 Gm7 C7 FM7 G#7 C#M7 E7 ...
1,1,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,2,CM7 Eb7 G#M7 B7 EM7 G7 Gm7 C7 FM7 Eb7 G#M7 B7 ...
2,2,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,B,3,Gm7 C7 Bm7 E7 AM7 C7 FM7 Bbm7 Eb7 G#M7 Dm7 G7
3,3,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,4,CM7 Eb7 G#M7 B7 EM7 G7 Gm7 C7 FM7 Eb7 G#M7 B7 ...
4,4,dataset/jazz1350/500 Miles High.xml,500 Miles High,500 Miles High [jazz1350],minor,1,,0,Am7 Am7 Cm7 Cm7 EbM7 EbM7 Em7b5 A7 Dm7 Dm7 Bm7...


### Meta Data

In [24]:
titles = df.loc[:, ['id', 'tune_id', 'section_id', 'section_name', 'title', 'title_playlist', 'tune_mode']]
titles[:5]

Unnamed: 0,id,tune_id,section_id,section_name,title,title_playlist,tune_mode
0,0,0,1,A,26-2,26-2 [jazz1350],major
1,1,0,2,A,26-2,26-2 [jazz1350],major
2,2,0,3,B,26-2,26-2 [jazz1350],major
3,3,0,4,A,26-2,26-2 [jazz1350],major
4,4,1,0,,500 Miles High,500 Miles High [jazz1350],minor


In [25]:
titles_dict = titles.to_dict()

sectionid_to_title = titles_dict['title_playlist']
sectionid_to_titleid = titles_dict['tune_id']


In [26]:
tunes = df.loc[:, ['tune_id', 'title_playlist']].drop_duplicates()
tunes = tunes.set_index('tune_id').to_dict()
titleid_to_title = tunes['title_playlist']

In [27]:
title_to_titleid = {v: k for k, v in titleid_to_title.items()}


In [28]:
titles_rows = titles.to_dict(orient='records')
sectionid_to_section = []
for i, row in enumerate(titles_rows):
  name = f"{row['title']}, section{row['section_id']} ({row['section_name']})"
  sectionid_to_section.append(name)
  

In [29]:
title_to_sectionid = {}

for row in titles.iterrows():
  title = row[1]['title_playlist']
  if title not in title_to_sectionid:
    title_to_sectionid[title] = [row[1]['id']]
  else:
    title_to_sectionid[title].append(row[1]['id'])

### Create Directories on Colab

In [30]:
!rm -R output
!mkdir output

## Initialization for wandb variables

In [31]:
recommender_results_cols = ['reference', 'id', 'method', 'similar', 'score_div_max', 'score']
recommender_results = pd.DataFrame(columns=recommender_results_cols)

In [32]:
lsi_config = {
    'num_topics': 100,
}

In [33]:
doc2vec_config = {
    'general': {
        'chords_preprocessing': chords_preprocessing,
        'tag_sections_and_tunes': False,
    },
    'model': {
        'dm': 1,
        'vector_size': 100,
        'window': 4,
        'epochs': 40,
        #'workers': 1,
        'min_count': 1,
        'negative': 10,
        'sample': 0.001,
        'seed': 42
    }
}

In [34]:
if use_wandb:
  wandb.init(
        # Set entity to specify your username or team name
        # ex: entity="carey",
        # Set the project where this run will be logged
        project="jazztunes-lda", 
        
        # Track hyperparameters and run metadata
        config={
            "input_data": input_path,
            "ngrams_input": ngrams_for_input,
            "comparison": "sections",
            "remove_repeated_chords": remove_repetitions,
        }
    )

In [35]:
if use_wandb:
  artifact = wandb.Artifact('input_data', type='dataset')
  artifact.add_file('data.csv')
  wandb.log_artifact(artifact)

## Helpers functions

In [36]:
def ngrams(tokens, n=2, sep='-'):
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

In [37]:
def raw_chords_to_df(tunes):
  tunes_chords = [item for tune in tunes for item in tune]
  counts = Counter(tunes_chords)
  df = pd.DataFrame(counts.items(),
                    columns=['chord', 'count']).sort_values(by='count', ascending=False)

  return df

# Data Preparation

In [38]:
def remove_chord_repetitions(chords):
  previous = ''
  chords_norep = []
  for c in chords:
    if c != previous:
      chords_norep.append(c)
      previous = c
  return chords_norep
  

In [39]:
lines = df.loc[:, 'chords'].tolist()
data = [line.split(' ') for line in lines]

In [40]:
processed_corpus = []
for line in data:
  tune_n = []
  if remove_repetitions:
    line = remove_chord_repetitions(line)
  for n in ngrams_for_input:
    tune_n.extend(ngrams(line, n=n))
  processed_corpus.append(tune_n)

for line in processed_corpus[:10]:
  print(line)

['CM7', 'Eb7', 'G#M7', 'B7', 'EM7', 'G7', 'Gm7', 'C7', 'FM7', 'G#7', 'C#M7', 'E7', 'Am7', 'D7', 'Dm7', 'G7']
['CM7', 'Eb7', 'G#M7', 'B7', 'EM7', 'G7', 'Gm7', 'C7', 'FM7', 'Eb7', 'G#M7', 'B7', 'EM7', 'G7', 'CM7']
['Gm7', 'C7', 'Bm7', 'E7', 'AM7', 'C7', 'FM7', 'Bbm7', 'Eb7', 'G#M7', 'Dm7', 'G7']
['CM7', 'Eb7', 'G#M7', 'B7', 'EM7', 'G7', 'Gm7', 'C7', 'FM7', 'Eb7', 'G#M7', 'B7', 'EM7', 'G7', 'CM7']
['Am7', 'Am7', 'Cm7', 'Cm7', 'EbM7', 'EbM7', 'Em7b5', 'A7', 'Dm7', 'Dm7', 'Bm7b5', 'Bm7b5', 'Bbm7', 'Bbm7', 'Fm7', 'Fm7', 'E7', 'E7', 'Fm7', 'Fm7', 'C#M7', 'C#M7', 'Fm7', 'Fm7', 'C#M7', 'C#M7']
['Am7', 'C#M7', 'Bm7b5', 'E7', 'Am7', 'C#M7', 'Bm7b5', 'E7', 'Cm7', 'F7', 'BbM7', 'G#m7', 'C#7', 'F#m7b5', 'B7', 'EM7', 'EM7', 'E7', 'Am7', 'C#M7', 'Bm7b5', 'E7', 'Am7', 'C#M7', 'Bm7b5', 'E7', 'Cm7', 'F7', 'BbM7', 'G#m7', 'C#7', 'F#m7b5', 'B7', 'Em7', 'Em7']
['C', 'Am7', 'Dm7', 'G7', 'C', 'Am7', 'Dm7', 'G7', 'C', 'Am7', 'Dm7', 'G7', 'C', 'G7', 'C']
['C', 'Am7', 'Dm7', 'G7', 'C', 'Am7', 'Dm7', 'G7', 'C', '

#### Corpus Overview

In [41]:
tokens = [item for l in processed_corpus for item in l]
total_tokens = len(tokens)
vocab_size = len(set(tokens))
vocab_prop = 100*vocab_size/total_tokens

print(f"Total Number of tokens: {total_tokens}")
print(f"Size of vocabulary: {vocab_size}")
print(f"Proportion of vocabulary in corpus: {vocab_prop:.02f}%")

Total Number of tokens: 79498
Size of vocabulary: 154
Proportion of vocabulary in corpus: 0.19%


In [42]:
df_chords = raw_chords_to_df(processed_corpus)
df_chords

Unnamed: 0,chord,count
5,G7,10468
14,Dm7,6970
30,C,5837
0,CM7,5068
21,A7,4261
13,D7,3720
12,Am7,3704
7,C7,3243
11,E7,3009
39,C6,2752


In [43]:
df_chords = pd.DataFrame.from_dict(df_chords)
df_chords.sort_values(by=['count'], ascending=False, inplace=True)
df_chords_top = df_chords.query('count > 100')

fig = px.bar(df_chords_top, x='chord', y='count', log_y=True)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

In [44]:
if use_wandb:
  wandb.log(
      {"corpus": {
              "total_tokens": total_tokens,
              "vocab_size": vocab_size,
              "vocab_proportion_in_corpus": vocab_prop,
              }
      }
  )

# Test Helpers

In [45]:
!rm -R index
!mkdir index

In [46]:
def get_sim_scores(tunes, index, model):

    df_sim = pd.DataFrame(columns=['reference_title',
                                   'reference_titleid',
                                   'similar_title',
                                   'similar_titleid',
                                   'ref_section', 
                                   'similar_section', 
                                   'score', 
                                  ])

    for tune in tunes:
      print()
      print("-"*50)
      for s1 in title_to_sectionid[tune]:
      
          query = processed_corpus[s1]
          query_bow = dictionary.doc2bow(query)

          # perform a similarity query against the corpus
          similarities = index[model[query_bow]]
          sims = sorted(enumerate(similarities), key=lambda item: -item[1])
          
          print(s1, sectionid_to_section[s1])
          n = 0
          for s2, s2_score in sims:
            
            # store the top N best results
            if n > 30:
                break
            # don't count self-similarity between sections of the same tune
            if s2 not in title_to_sectionid[tune]:
                n += 1
            
                # print(f"\t{s2_score:.3f} {sectionid_to_section[s2]}")

                df_sim.loc[len(df_sim)] = [tune,
                                           title_to_titleid[tune],
                                           sectionid_to_title[s2],
                                           sectionid_to_titleid[s2],
                                           sectionid_to_section[s1], 
                                           sectionid_to_section[s2], 
                                           s2_score, 
                                           ]
    return df_sim

In [47]:
def recommend_tune(df, tune_name):

  threshold = 0.5 # arbitrary value, selected based on distribution of max scores

  df_tune = df.query(f'reference_title == "{tune_name}"')
  ff = df_tune.iloc[:]

  # get the maximum similarity score for each section and store in new column
  ff['max'] = ff.groupby('ref_section')['score'].transform('max')

  # consider only results for scores above threshold
  ff = ff.loc[(ff['max'] > threshold)]

  # scale the score with the maxmum value of each section
  ff['score_div_max'] = ff['score'] / ff['max']

  # for each similar title, aggregate all of its sections using the median value
  result = ff.groupby('similar_title').median().sort_values('score_div_max', ascending=False)
    
  # if multiple rows from the same similar tune, keep only the similar_title with the highest score_div_max
  result = result.groupby('similar_title').max('score_div_max').sort_values('score_div_max', ascending=False)

  # add the name of the reference tune
  result['reference'] = tune_name
  result['reference_titleid'] = title_to_titleid[tune_name]

  result = result.reset_index()
  result = result.loc[:, ['reference', 'reference_titleid', 'similar_title', 'score', 'max', 'score_div_max']]
  result['similar_titleid'] = result['similar_title'].apply(lambda x: title_to_titleid[x])
  result = result.reset_index()
  return result, ff

In [48]:
def test_contrafacts(tunes, index, model, N=15):
  matches = 0
  number_of_sections = 0
  results = {}

  for tune, similar_tune in tunes:

    # loop over all sections of the tune
    section_matches = 0
    for s1 in title_to_sectionid[tune]:
      query = processed_corpus[s1]
      query_bow = dictionary.doc2bow(query)

      # perform a similarity query against the corpus
      similarities = index[model[query_bow]]
      sims = sorted(enumerate(similarities), key=lambda item: -item[1])

      # check if the section matches the expected title; consider only the first 15 recommendations
      i = 0
      for sectionid, value in sims:
        if i >= N:
          break
        i += 1
        if sectionid_to_title[sectionid] == similar_tune:
          section_matches += 1

    # for each title, increase matches if at least one of the section matched the expected title
    if section_matches > 0:
      matches += 1  
      results[f'{tune}, {similar_tune}'] = 1
    else:
      results[f'{tune}, {similar_tune}'] = 0
  
  return matches, results

# Train LDA Model

In [49]:
from gensim.test.utils import common_texts
import gensim.models as models
import gensim.corpora as corpora
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
id2word = corpora.Dictionary(processed_corpus)

2021-11-28 18:59:48,668 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 18:59:48,670 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2021-11-28 18:59:48,672 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2021-11-28T18:59:48.672276', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-28 18:59:48,757 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 18:59:48,885 : INFO : built Dictionary(154 unique tokens: ['Am7', 'B7', 'C#M7', 'C7', 'CM7']...) from 5199 documents (total 79498 corpus positions)
2021-11-28 18:59:48,893 : INFO : Dictionary lifecycle event {'msg': "bu

In [50]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(tune) for tune in processed_corpus]

In [51]:
%%time
TOTAL_TOPICS = 30
lda_model = gensim.models.LdaModel(corpus=corpus, 
                                   id2word=id2word, 
                                   chunksize=200, 
                                   alpha='auto', 
                                   eta='auto', 
                                   #eta=0.1,
                                   random_state=42,
                                   #iterations=500, 
                                   num_topics=TOTAL_TOPICS, 
                                   passes=20, 
                                   per_word_topics=True,
                                   eval_every=None,
                                   update_every=0,  # batch learning, start from scratch everytime (<> online learning)
                                   )

2021-11-28 18:59:49,010 : INFO : using autotuned alpha, starting with [0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335]
2021-11-28 18:59:49,013 : INFO : using serial LDA version on this node
2021-11-28 18:59:49,016 : INFO : running batch LDA training, 30 topics, 20 passes over the supplied corpus of 5199 documents, updating model once every 5199 documents, evaluating perplexity every 0 documents, iterating 50x with a convergence threshold of 0.001000
2021-11-28 18:59:49,019 : INFO : PROGRESS: pass 0, at document #200/5199
2021-11-28 18:59:49,149 : INFO : optimized alpha [0.030731946, 0.03001313, 0.030011779, 0.03364776, 0.030729273, 0.03055904, 0.0316

CPU times: user 51 s, sys: 1 s, total: 52 s
Wall time: 53.3 s


In [52]:
lda_model.print_topics()

2021-11-28 19:00:42,348 : INFO : topic #6 (0.003): 0.125*"Dm7" + 0.083*"E7" + 0.079*"G7" + 0.076*"C#6" + 0.066*"Am7" + 0.064*"C6" + 0.045*"Am" + 0.036*"G#6" + 0.033*"Bm7b5" + 0.032*"CM7"
2021-11-28 19:00:42,352 : INFO : topic #15 (0.004): 0.266*"A" + 0.076*"Dm7" + 0.074*"Am7" + 0.065*"D6" + 0.059*"B7" + 0.059*"D" + 0.056*"C7" + 0.052*"G7" + 0.045*"E7" + 0.035*"F7"
2021-11-28 19:00:42,353 : INFO : topic #24 (0.006): 0.148*"G7" + 0.121*"D7" + 0.100*"FM7" + 0.084*"A7" + 0.076*"C7" + 0.073*"Dm7" + 0.056*"CM7" + 0.056*"Gm7" + 0.040*"Am7" + 0.039*"Bb7"
2021-11-28 19:00:42,355 : INFO : topic #22 (0.006): 0.144*"G7" + 0.116*"C" + 0.097*"Dm7" + 0.060*"A7" + 0.055*"C#dim" + 0.052*"D7" + 0.047*"CM7" + 0.033*"Am7" + 0.032*"C7" + 0.030*"E7"
2021-11-28 19:00:42,356 : INFO : topic #2 (0.008): 0.104*"C13" + 0.067*"G#13" + 0.062*"G7" + 0.061*"FmM7" + 0.054*"Bb7" + 0.046*"A7" + 0.045*"G#7" + 0.045*"A6" + 0.043*"G13" + 0.040*"A13"
2021-11-28 19:00:42,357 : INFO : topic #29 (0.009): 0.094*"G#7" + 0.092*"G

[(6,
  '0.125*"Dm7" + 0.083*"E7" + 0.079*"G7" + 0.076*"C#6" + 0.066*"Am7" + 0.064*"C6" + 0.045*"Am" + 0.036*"G#6" + 0.033*"Bm7b5" + 0.032*"CM7"'),
 (15,
  '0.266*"A" + 0.076*"Dm7" + 0.074*"Am7" + 0.065*"D6" + 0.059*"B7" + 0.059*"D" + 0.056*"C7" + 0.052*"G7" + 0.045*"E7" + 0.035*"F7"'),
 (24,
  '0.148*"G7" + 0.121*"D7" + 0.100*"FM7" + 0.084*"A7" + 0.076*"C7" + 0.073*"Dm7" + 0.056*"CM7" + 0.056*"Gm7" + 0.040*"Am7" + 0.039*"Bb7"'),
 (22,
  '0.144*"G7" + 0.116*"C" + 0.097*"Dm7" + 0.060*"A7" + 0.055*"C#dim" + 0.052*"D7" + 0.047*"CM7" + 0.033*"Am7" + 0.032*"C7" + 0.030*"E7"'),
 (2,
  '0.104*"C13" + 0.067*"G#13" + 0.062*"G7" + 0.061*"FmM7" + 0.054*"Bb7" + 0.046*"A7" + 0.045*"G#7" + 0.045*"A6" + 0.043*"G13" + 0.040*"A13"'),
 (29,
  '0.094*"G#7" + 0.092*"G7" + 0.088*"C7" + 0.085*"C#7" + 0.083*"Eb7" + 0.079*"Cdim" + 0.078*"D7" + 0.074*"Bb7" + 0.049*"A7" + 0.039*"C"'),
 (21,
  '0.361*"F6" + 0.123*"D7" + 0.080*"G7" + 0.048*"Gm6" + 0.044*"CM7" + 0.041*"Bb7" + 0.041*"Fm6" + 0.041*"Dm7" + 0.037*"C7" 

## Evaluating Topic Model Quality

We can use perplexity and coherence scores as measures to evaluate the topic
model. Typically, lower the perplexity, the better the model. Similarly, the lower the
UMass score and the higher the Cv score in coherence, the better the model.

In [53]:
cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, 
                                                      texts=data,
                                                      dictionary=id2word, 
                                                      coherence='c_v')
avg_coherence_cv = cv_coherence_model_lda.get_coherence()

umass_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, 
                                                         texts=data,
                                                         dictionary=id2word, 
                                                         coherence='u_mass')
avg_coherence_umass = umass_coherence_model_lda.get_coherence()

perplexity = lda_model.log_perplexity(corpus)

print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)
print('Model Perplexity:', perplexity)

2021-11-28 19:00:42,430 : INFO : using WordOccurrenceAccumulator to estimate probabilities from sliding windows
2021-11-28 19:00:42,567 : INFO : WordOccurrenceAccumulator accumulated stats from 1000 documents
2021-11-28 19:00:42,637 : INFO : WordOccurrenceAccumulator accumulated stats from 2000 documents
2021-11-28 19:00:42,704 : INFO : WordOccurrenceAccumulator accumulated stats from 3000 documents
2021-11-28 19:00:42,783 : INFO : WordOccurrenceAccumulator accumulated stats from 4000 documents
2021-11-28 19:00:42,853 : INFO : WordOccurrenceAccumulator accumulated stats from 5000 documents
2021-11-28 19:00:44,119 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-11-28 19:00:44,130 : INFO : CorpusAccumulator accumulated stats from 2000 documents
2021-11-28 19:00:44,146 : INFO : CorpusAccumulator accumulated stats from 3000 documents
2021-11-28 19:00:44,166 : INFO : CorpusAccumulator accumulated stats from 4000 documents
2021-11-28 19:00:44,176 : INFO : CorpusAccumula

Avg. Coherence Score (Cv): 0.36824092639648076
Avg. Coherence Score (UMass): -3.007971759124289
Model Perplexity: -3.334246358135733


In [54]:
%%time
# Evaluate different Topic Sizes

if False:

  try_topics = [5, 10, 20, 30, 50, 70, 100]

  topic_quality = pd.DataFrame(columns=['num_topics', 'cv', 'umass', 'perp'])

  for t in try_topics:

    lda_model = gensim.models.LdaModel(corpus=corpus, 
                                      id2word=id2word, 
                                      chunksize=200, 
                                      alpha='auto', 
                                      eta='auto', 
                                      #eta=0.1,
                                      random_state=42,
                                      #iterations=500, 
                                      num_topics=t, 
                                      passes=20, 
                                      per_word_topics=True,
                                      eval_every=None,
                                      update_every=0,  # batch learning, start from scratch everytime (<> online learning)
                                      )


    cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, 
                                                          texts=data,
                                                          dictionary=id2word, 
                                                          coherence='c_v')
    avg_coherence_cv = cv_coherence_model_lda.get_coherence()

    umass_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=corpus, 
                                                            texts=data,
                                                            dictionary=id2word, 
                                                            coherence='u_mass')
    avg_coherence_umass = umass_coherence_model_lda.get_coherence()

    perplexity = lda_model.log_perplexity(corpus)

    print('Num Topics: ', t)
    print('Avg. Coherence Score (Cv):', avg_coherence_cv)
    print('Avg. Coherence Score (UMass):', avg_coherence_umass)
    print('Model Perplexity:', perplexity)

    topic_quality.loc[len(topic_quality)] =[t, avg_coherence_cv, avg_coherence_umass, perplexity]

    print(topic_quality)


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs


### Dominant Tunes in Topics

In [55]:
topics = [[(term, round(wt, 3)) 
               for term, wt in lda_model.show_topic(n, topn=20)] 
                   for n in range(0, lda_model.num_topics)]

topics_df = pd.DataFrame([[term for term, wt in topic] 
                              for topic in topics], 
                         columns = ['Term'+str(i) for i in range(1, 21)],
                         index=['Topic '+str(t) for t in range(1, lda_model.num_topics+1)]).T


In [56]:
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])  
                              for topic in topics],
                         columns = ['TermsPerTopic'],
                         index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)]
                         )
topics_df


Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



Unnamed: 0,TermsPerTopic
Topic1,"Am, E7, Bm7b5, D7, F7, B7, G7, AmM7, Dm, Dm7, A7, Am7, F#7, CM7, Gm, G, F#m7b5, Bb7, B7(+b5), FM7"
Topic2,"G13, F#dim7, Dm7, C6, G7, Dm7b5, A7, F6, Gm7, C7, Am7, G#dim7, Em7, G#7, C7(+b5), C, F7, Em7b5, Fm7, FM7"
Topic3,"C13, G#13, G7, FmM7, Bb7, A7, G#7, A6, G13, A13, Bb13, F#7, CM7, F#13, C7, Em7, D13, D7(+b5), Dm7b5, F7"
Topic4,"CM7, G7, Dm7, Em7, A7, Am7, FM7, Ebdim7, C#dim7, D7, Bb7, Fm7, Dm7b5, C7, Fm6, G#7, C6, E7, F7, Gm7"
Topic5,"F#7, GM7, AM7, BM7, C#m7, E7, D7, Bm7, C#m7b5, BbM7, Am7, G7, CM7, Eb13, Cm7, Em7, G#m7, F13, DM7, Fm7b5"
Topic6,"G, E, Cm, Cdim7, G7, Cm6, Dm7b5, F#, CM7, A7(+b5), D7, Am7, E7(+b5), Cm7, Am7b5, C#, Bbdim7, G#7, D13, G6"
Topic7,"Dm7, E7, G7, C#6, Am7, C6, Am, G#6, Bm7b5, CM7, A7, Em7, F7, G#7, D7, D13, FM7, C, Ebm7, B7"
Topic8,"Bb7, C7, Fm7, D7, Bm, B7, Edim7, FM7, G7, Am7, E7, G#7, Dm7, A7, F#7, F7, Em7, Gm7, Bm7b5, F#dim7"
Topic9,"Eb7, G#M7, Bbm7, Fm7, F7, Cm7, G7, F13, Bb13, Bb7, B13, Adim7, F#7(+b5), Bb7(+b5), Dm7, G#7, D7, G#6, Em7, C#7"
Topic10,"Dm7, A7, G7, Em7, CM7, Em7b5, Bb7, C6, F7, D7, Fm7, Ebdim7, E7, Dm7b5, Am7, B7, G#7, C7, Fm6, F#7"


In [57]:
titles.head()

Unnamed: 0,id,tune_id,section_id,section_name,title,title_playlist,tune_mode
0,0,0,1,A,26-2,26-2 [jazz1350],major
1,1,0,2,A,26-2,26-2 [jazz1350],major
2,2,0,3,B,26-2,26-2 [jazz1350],major
3,3,0,4,A,26-2,26-2 [jazz1350],major
4,4,1,0,,500 Miles High,500 Miles High [jazz1350],minor


In [58]:
corpus_topics = [sorted(topics[0], key=lambda record: -record[1])[0] for topics in lda_model[corpus] ]

In [59]:
corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(0, len(data))
corpus_topic_df['DominantTopic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['Contribution%'] = [round(item[1]*100, 2) for item in corpus_topics]
#corpus_topic_df['TopicDesc'] = [topics_df.iloc[t[0]]['TermsPerTopic'] for t in corpus_topics]
corpus_topic_df['Tunes'] = processed_corpus
corpus_topic_df['Title'] = titles['title_playlist']
corpus_topic_df['Section'] = titles['section_id'].map(str) + '_' + titles['section_name'].map(str)

corpus_topic_df

Unnamed: 0,Document,DominantTopic,Contribution%,Tunes,Title,Section
0,0,20,18.29,"[CM7, Eb7, G#M7, B7, EM7, G7, Gm7, C7, FM7, G#7, C#M7, E7, Am7, D7, Dm7, G7]",26-2 [jazz1350],1_A
1,1,19,30.98,"[CM7, Eb7, G#M7, B7, EM7, G7, Gm7, C7, FM7, Eb7, G#M7, B7, EM7, G7, CM7]",26-2 [jazz1350],2_A
2,2,13,42.67,"[Gm7, C7, Bm7, E7, AM7, C7, FM7, Bbm7, Eb7, G#M7, Dm7, G7]",26-2 [jazz1350],3_B
3,3,19,30.98,"[CM7, Eb7, G#M7, B7, EM7, G7, Gm7, C7, FM7, Eb7, G#M7, B7, EM7, G7, CM7]",26-2 [jazz1350],4_A
4,4,29,56.66,"[Am7, Am7, Cm7, Cm7, EbM7, EbM7, Em7b5, A7, Dm7, Dm7, Bm7b5, Bm7b5, Bbm7, Bbm7, Fm7, Fm7, E7, E7, Fm7, Fm7, C#M7, C#M7, Fm7, Fm7, C#M7, C#M7]",500 Miles High [jazz1350],0_nan
...,...,...,...,...,...,...
5194,5194,12,40.81,"[F, FM7, F#dim, C, C, B7, B7, Em, B7, Dm7, G7]",You're Foolin' Someone [trad],3_B
5195,5195,12,69.95,"[C, C, C#dim, G7, G7, G7, G7, C, Ebdim, G7, Dm7, G7]",You're Foolin' Someone [trad],4_A
5196,5196,10,52.14,"[Dm7, G7, Em7b5, A7, Dm7, G#7, G7, C, C]",You're Lucky To Me [trad],1_A
5197,5197,10,64.52,"[Dm7, G7, Em7b5, A7, Dm7, G#7, G7, C, C, A7, A7, A7, A7, Dm7, G7, D7, G7, Dm7, G7, Em7b5, A7, Dm7, G#7, G7, C, C]",You're Lucky To Me [trad],2_A


In [60]:
corpus_topic_df.groupby('DominantTopic').apply(lambda topic_set: (topic_set.sort_values(by=['Contribution%'], 
                                                                                         ascending=False)
                                                                             .iloc[0])
)


Unnamed: 0_level_0,Document,DominantTopic,Contribution%,Tunes,Title,Section
DominantTopic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1259,1,96.41,"[Am, Am, Am, Am, Am, Am, E7, Am, Am, Am, Am, Am, Am, Am, E7, Am, E7, Am, E7, Am, E7, Am, E7, Am]",Fever [jazz1350],1_A
2,77,2,92.35,"[F6, F#dim7, C6, Gm7, C7, F6, F#dim7, C6, A7, Dm7b5, G7]",A Smooth One [jazz1350],3_B
3,476,3,91.61,"[FM7, F7, Fm6, G#7, Am7b5, D7(+b5), G13, G#13, A13, Bb13]",Blood Count [jazz1350],3_B
4,1639,4,96.39,"[CM7, Dm7, G7, CM7, A7, Dm7, G7, Dm7, G7, Dm7, G7, Dm7, G7, CM7, A7, Dm7, G7, Dm7, G7, CM7, Dm7, G7]",I Get Along Without You [jazz1350],1_A
5,210,5,90.89,"[Em7, Am7, D7, GM7, CM7, C#m7, F#7, BM7, BM7]",All The Things You Are [jazz1350],2_B
6,4996,6,95.18,"[Cm, G7, Cm, G7, Cm, Cm, Cm, G7, Cm, G7, D7, G7, Cm, G7, Cm, G7, Cm, Cm]",Shim-Me-Sha-Wabble [trad],1_A
7,3705,7,94.86,"[C6, E7, Am7, D13, G#6, C#6, E6, A6, G#6, G#6, Dm7, G7, Eb6, C#6, C6, Dm7, G7]",Summer In Central Park [jazz1350],4_C
8,2638,8,94.65,"[Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm, Bm]",Milestones (New) [jazz1350],3_B
9,1955,9,93.97,"[G#M7, Fm7, Bbm7, Eb7, G#6, F7, Bbm7, Eb7, G#M7, Fm7, Bbm7, Eb7, Dm7, G7]",In a Sentimental Mood [jazz1350],3_B
10,440,10,96.55,"[CM7, A7, Dm7, G7, Em7, A7, Dm7, G7, CM7, F7, Em7, A7, Dm7, G7, Dm7, G7, Em7b5, A7, Dm7, G7, Em7, A7, Dm7, G7]",Billy Boy [jazz1350],1_A


### List the Tunes of the individual Topics

In [61]:
def display_tunes_per_topic(df, topic):
  tunes = df.query(f"DominantTopic == {topic}").sort_values('Contribution%', ascending=False)
  print(f"\n*** TOPIC {n} ***")
  print(f"{len(tunes)} tunes")
  return tunes

In [62]:
topic_result = corpus_topic_df.loc[:, ['DominantTopic', 'Contribution%', 'Title', 'Section', 'Tunes']]

In [63]:
for n in range(1, TOTAL_TOPICS+1):
  print(display_tunes_per_topic(topic_result, topic=n).head(30))


*** TOPIC 1 ***
148 tunes
      DominantTopic  Contribution%                                           Title  Section                                                                                                                             Tunes
1259  1              96.41          Fever [jazz1350]                                1_A      [Am, Am, Am, Am, Am, Am, E7, Am, Am, Am, Am, Am, Am, Am, E7, Am, E7, Am, E7, Am, E7, Am, E7, Am]                                
4279  1              95.00          Work Song [jazz1350]                            0_nan    [Am, Am, Am, Am, Am, Am, E7, E7, Am, Am, Am, Am, A7, D7, B7, E7, Am]                                                            
701   1              94.71          Caravan [jazz1350]                              2_A      [E7, E7, E7, E7, E7, E7, E7, E7, E7, E7, E7, E7, Am, Am, Am, Am]                                                                
700   1              94.71          Caravan [jazz1350]                              1

## Visualizing Topics

In [64]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

2021-11-28 19:00:53,820 : INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
2021-11-28 19:00:53,856 : INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt

Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



## Distance Metrics


In [65]:
from gensim.matutils import jaccard

# element 0 of the result from lda_model contains the topics that a tune belongs to, with the respective probabilities
jaccard(lda_model[corpus[505]][0], lda_model[corpus[925]][0])

1.0

In [66]:
topic_result.loc[:, ['DominantTopic']].to_csv('topics.csv', index=True, index_label='id')

# LSA (Latent Semantic Analysis), aka LSI (Latent Semantic Index) 

In [67]:
from collections import defaultdict

In [68]:
num_topics = lsi_config['num_topics']

# remove words that appear only once
frequency = defaultdict(int)
for text in processed_corpus:
    for token in text:
        frequency[token] += 1

data = [[token for token in text if frequency[token] > 1] for text in processed_corpus]
dictionary = corpora.Dictionary(data)

# doc2bow counts the number of occurences of each distinct word,
# converts the word to its integer word id and returns the result
# as a sparse vector

bow_corpus = [dictionary.doc2bow(text) for text in data]
lsi = LsiModel(bow_corpus, id2word=dictionary, num_topics=num_topics)  # num_topics can be maximum the size of the number of unique tokens

2021-11-28 19:00:58,787 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 19:00:58,907 : INFO : built Dictionary(153 unique tokens: ['Am7', 'B7', 'C#M7', 'C7', 'CM7']...) from 5199 documents (total 79497 corpus positions)
2021-11-28 19:00:58,909 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(153 unique tokens: ['Am7', 'B7', 'C#M7', 'C7', 'CM7']...) from 5199 documents (total 79497 corpus positions)", 'datetime': '2021-11-28T19:00:58.909527', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-28 19:00:59,167 : INFO : using serial LSI version on this node
2021-11-28 19:00:59,169 : INFO : updating model with new documents
2021-11-28 19:00:59,171 : INFO : preparing a new chunk of documents
2021-11-28 19:00:59,199 : INFO : using 100 extra samples and 2 power iterations
2021-11-28 19:00:59,200 : INFO : 1st phase: constructing (153

In [69]:
index_lsi = similarities.Similarity('/content/index/index_lsi', lsi[bow_corpus], num_features=len(dictionary))

2021-11-28 19:00:59,540 : INFO : starting similarity index under /content/index/index_lsi


In [70]:
#index_lsi = similarities.SparseMatrixSimilarity(lsi[bow_corpus], num_features = len(dictionary))
#index_lsi = similarities.MatrixSimilarity(lsi[bow_corpus])  # transform corpus to LSI space and index it


In [71]:
lsi.save('/content/index/lsi.model')
index_lsi.save('/content/index/lsi_matrixsim.index')


2021-11-28 19:01:00,173 : INFO : Projection lifecycle event {'fname_or_handle': '/content/index/lsi.model.projection', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-28T19:01:00.173271', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-28 19:01:00,176 : INFO : saved /content/index/lsi.model.projection
2021-11-28 19:01:00,178 : INFO : LsiModel lifecycle event {'fname_or_handle': '/content/index/lsi.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': ['projection', 'dispatcher'], 'datetime': '2021-11-28T19:01:00.178288', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-28 19:01:00,179 : INFO : not storing attribute projection
2021-11-28 19:01:00,181 : INFO : not storing attribute dispatch

In [72]:
!ls -la /content/index

total 3252
drwxr-xr-x 2 root root    4096 Nov 28 19:01 .
drwxr-xr-x 1 root root    4096 Nov 28 18:59 ..
-rw-r--r-- 1 root root 3182460 Nov 28 19:01 index_lsi.0
-rw-r--r-- 1 root root     742 Nov 28 19:01 lsi_matrixsim.index
-rw-r--r-- 1 root root    4414 Nov 28 19:01 lsi.model
-rw-r--r-- 1 root root  123889 Nov 28 19:01 lsi.model.projection


## Tests

### Tests for Single Tunes

In [73]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index=index_lsi, model=lsi)

# save a copy of the results; all results will be concatenated at the end
df_lsi = df_sim[:]


--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
653 Bye Bye Blackbird, section1 (A)
654 Bye Bye Blackbird, section2 (B)

--------------------------------------------------
1220 Exactly Like You, section1 (A)
1221 Exactly Like You, section2 (A)
1222 Exactly Like You, section3 (B)
1223 Exactly Like You, section4 (A)

--------------------------------------------------
1538 Honeysuckle Rose, section1 (A)
1539 Honeysuckle Rose, s

In [74]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50, title='LSI Scores')
fig.show()

In [75]:
if use_wandb:
  wandb.log({"scores_hist": fig})

In [76]:
result, details = recommend_tune(df_sim, 'These Foolish Things [jazz1350]')
result.head(30)

Unnamed: 0,index,reference,reference_titleid,similar_title,score,max,score_div_max,similar_titleid
0,0,These Foolish Things [jazz1350],1168,I Loves You Porgy [jazz1350],0.952582,0.968965,1.0,520
1,1,These Foolish Things [jazz1350],1168,More Than You Know [jazz1350],0.916389,0.916389,1.0,829
2,2,These Foolish Things [jazz1350],1168,Easy Street [jazz1350],0.92632,0.952582,0.972431,343
3,3,These Foolish Things [jazz1350],1168,"Party's Over, The [jazz1350]",0.932275,0.968965,0.969447,935
4,4,These Foolish Things [jazz1350],1168,Manhattan [jazz1350],0.920755,0.952582,0.966589,774
5,5,These Foolish Things [jazz1350],1168,It's Easy To Remember [jazz1350],0.918562,0.968965,0.964287,637
6,6,These Foolish Things [jazz1350],1168,Bewitched [jazz1350],0.920767,0.968965,0.962436,132
7,7,These Foolish Things [jazz1350],1168,Why Do I Love You [jazz1350],0.917576,0.960773,0.961577,1285
8,8,These Foolish Things [jazz1350],1168,They All Laughed [jazz1350],0.92686,0.968965,0.957797,1169
9,9,These Foolish Things [jazz1350],1168,"Way You Look Tonight, The [jazz1350]",0.91445,0.968965,0.957143,1253


In [77]:
dd = pd.DataFrame(columns = recommender_results_cols)

for tune in tunes_eval_list:
  result, details = recommend_tune(df_sim, tune)
  dd['id'] = result['index']
  dd['reference'] = tune
  dd['method'] = 'lsi'
  dd['similar'] = result['similar_title']
  dd['score_div_max'] = result['score_div_max']
  dd['score'] = result['score']

recommender_results = recommender_results.append(dd)

### Tests for Contrafacts

In [78]:
topN = 20
matches, results = test_contrafacts(contrafacts, index_lsi, lsi, topN)

print(f"Found matches: {matches} out of {len(results)}: {100*matches/len(results):.3f}%")
print() 
for rr, val in results.items():
  if val == 0:
    print(f"{val}: {rr}")


Found matches: 76 out of 111: 68.468%

0: 52nd Street Theme [jazz1350], I Got Rhythm [jazz1350]
0: Ballade [jazz1350], As Long As I Live [jazz1350]
0: Dewey Square [jazz1350], Oh, Lady Be Good [jazz1350]
0: Hackensack [jazz1350], Oh, Lady Be Good [jazz1350]
0: In A Mellow Tone (In A Mellotone) [jazz1350], Rose Room [jazz1350]
0: Lennie's Pennies [jazz1350], Pennies From Heaven [jazz1350]
0: Little Rootie Tootie [jazz1350], I Got Rhythm [jazz1350]
0: Little Willie Leaps [jazz1350], All God's Chillun Got Rhythm [jazz1350]
0: Lullaby Of Birdland [jazz1350], Love Me Or Leave Me [jazz1350]
0: My Little Suede Shoes [jazz1350], Jeepers Creepers [jazz1350]
0: Quasimodo (Theme) [jazz1350], Embraceable You [jazz1350]
0: Room 608 [jazz1350], I Got Rhythm [jazz1350]
0: Scrapple From The Apple [jazz1350], Honeysuckle Rose [jazz1350]
0: Tour De Force [jazz1350], Jeepers Creepers [jazz1350]
0: Wow [jazz1350], You Can Depend On Me [jazz1350]
0: Sweet Sue, Just You [jazz1350], Honeysuckle Rose [jazz135

In [79]:
model_name = 'lsi'
if use_wandb:
  wandb.log(
      {model_name: {
                'contrafacts': {
                    'topN': topN,
                    'success': matches/len(contrafacts),
                    #'results': results
                    }
                   },
       'all_models': {
           model_name: wandb.Table(
               columns=["Title", "Match"],
               data=[[key, value] for key, value in results.items()]),
       }
       })

### Get Recommender Data for WebApp

In [80]:
%%time
if generate_webapp_data:
  _tunes = list(tunes['title_playlist'].values())
  _tunes = tunes_eval_list

  method = 'lsi'

  df_sim = get_sim_scores(_tunes, index_lsi, lsi)

  result = None
  for tune in _tunes:
    tune_result, details = recommend_tune(df_sim, tune)
    if result is None:
      result = pd.DataFrame(columns=tune_result.columns)
    result = result.append(tune_result)

  # save to file
  (result
   .loc[:,['reference_titleid',
           'similar_titleid',
           'score']]
   .to_csv(f'output/recommender_{method}.csv', encoding='utf8')
  )
  with zipfile.ZipFile(f'output/recommender_{method}.zip', 'w') as zf:
    zf.write(f'output/recommender_{method}.csv')


  # save to wandb
  if use_wandb:
    model_artifact = wandb.Artifact(
        f"recommender_{method}", 
        type="csv",
        description=f"Recommendations for each Tune using {method} Model (csv file)",
        metadata="")

    model_artifact.add_file(f'output/recommender_{method}.zip')
    wandb.log_artifact(model_artifact)

CPU times: user 0 ns, sys: 11 µs, total: 11 µs
Wall time: 10.5 µs


## Store Model to W&B

In [81]:
if use_wandb:
  model_artifact = wandb.Artifact(
      "model_lsi", 
      type="model",
      description="LSI model",
      metadata="")

  model_artifact.add_file("/content/index/lsi.model")
  model_artifact.add_file("/content/index/lsi_matrixsim.index")
  model_artifact.add_file("/content/index/lsi.model.projection")
  wandb.log_artifact(model_artifact)

For unigrams, the best number of topics seems to be around 20.

For unigrams plus bigrams, the coherence score drops down until 100 and then continuously rises until 500 and continues to rise. Same for bigrams-only.

# W&B Logging and Finish

In [82]:
if use_wandb:
  wandb.finish()