<a href="https://colab.research.google.com/github/11doris/jazz-maestro/blob/colab_word_embeddings/colab_h_lda_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sections as Input

In [1]:
pip install wandb



In [2]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdoris[0m (use `wandb login --relogin` to force relogin)


In [3]:
import wandb

In [4]:
!pip uninstall gensim -y

Found existing installation: gensim 4.1.2
Uninstalling gensim-4.1.2:
  Successfully uninstalled gensim-4.1.2


In [5]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [6]:
pip install pyLDAvis



In [7]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pprint
import pandas as pd
import numpy as np
from collections import Counter
import plotly.express as px
from tqdm import tqdm 
from gensim.models.doc2vec import Doc2Vec
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.lsimodel import LsiModel
from gensim.models import CoherenceModel
from gensim import corpora
from gensim import similarities
import pickle
import os
import zipfile
from gensim.models.phrases import Phrases

In [8]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [9]:
print(gensim.__version__)

4.1.2


In [10]:
!rm data.csv

In [11]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Configuration


In [12]:
use_wandb = False

In [13]:
generate_webapp_data = False

In [14]:
chords_preprocessing = 'rootAndDegreesPlus'

In [15]:
ngrams_for_input = [1]

In [16]:
remove_repetitions = False

In [17]:
input_files = {
    'sections': {
        # M7 and 6 reduced to major triad, m7 reduced to m, dominant 7, m7b5, diminished, and all (b5) left as they are.
        'rootAndDegreesPlus': '1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq',
        'rootAndDegrees7': '',
        'rootAndDegreesSimplified': ''
    },
    'tunes': {
        'rootAndDegreesPlus': '',
        'rootAndDegrees7': '',
        'rootAndDegreesSimplified': '',
    }
}

In [18]:
tunes_eval_list = [
  "Sweet Sue, Just You [jazz1350]",
  "On The Sunny Side Of The Street [jazz1350]",
  "These Foolish Things [jazz1350]", 
  "Blue Moon [jazz1350]",
  "All Of Me [jazz1350]",
  "All God's Chillun Got Rhythm [jazz1350]",
  "I Got Rhythm [jazz1350]",
  "Bye Bye Blackbird [jazz1350]",
  "Old Fashioned Love [trad]",
  "Exactly Like You [jazz1350]",
  "Honeysuckle Rose [jazz1350]",
  "Misty [jazz1350]",
  "Naima [jazz1350]",
]

tunes_eval_list = sorted(tunes_eval_list)
tunes_eval_list

["All God's Chillun Got Rhythm [jazz1350]",
 'All Of Me [jazz1350]',
 'Blue Moon [jazz1350]',
 'Bye Bye Blackbird [jazz1350]',
 'Exactly Like You [jazz1350]',
 'Honeysuckle Rose [jazz1350]',
 'I Got Rhythm [jazz1350]',
 'Misty [jazz1350]',
 'Naima [jazz1350]',
 'Old Fashioned Love [trad]',
 'On The Sunny Side Of The Street [jazz1350]',
 'Sweet Sue, Just You [jazz1350]',
 'These Foolish Things [jazz1350]']

In [19]:
contrafacts = [
               ("26-2 [jazz1350]", "Confirmation [jazz1350]"),
               ("52nd Street Theme [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ablution [jazz1350]", "All The Things You Are [jazz1350]"),
               ("Anthropology [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ballade [jazz1350]", "As Long As I Live [jazz1350]"),
               ("Bright Mississippi [jazz1350]", "Sweet Georgia Brown [jazz1350]"),
               ("C.T.A. [jazz1350]", "I Got Rhythm [jazz1350]"),
               #( "Celia [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Cottontail [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Countdown [jazz1350]", "Tune Up [jazz1350]"),
               ("Dewey Square [jazz1350]", "Oh, Lady Be Good [jazz1350]"),
               ("Dexterity [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Dig [jazz1350]", "Sweet Georgia Brown [jazz1350]"),
               ("Donna Lee [jazz1350]", "Indiana (Back Home Again In) [jazz1350]"),
               ("Don't Be That Way [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Eternal Triangle [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Evidence [jazz1350]", "Just You, Just Me [jazz1350]"),
               ("Flintstones [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Four On Six [jazz1350]", "Summertime [jazz1350]"),
               ("Freight Train [jazz1350]", "Blues For Alice [jazz1350]"),
               ("Good Bait [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Hackensack [jazz1350]", "Oh, Lady Be Good [jazz1350]"),
               ("Half Nelson [jazz1350]", "Lady Bird [jazz1350]"),
               ("Hot House [jazz1350]", "What Is This Thing Called Love [jazz1350]"),
               ("Impressions [jazz1350]", "So What [jazz1350]"),
               ("In A Mellow Tone (In A Mellotone) [jazz1350]", "Rose Room [jazz1350]"),
               ("In Walked Bud [jazz1350]", "Blue Skies [jazz1350]"),
               ("Ko Ko [jazz1350]", "Cherokee [jazz1350]"),
               ("Lennie's Pennies [jazz1350]", "Pennies From Heaven [jazz1350]"),   ## Lennie's Pennies is in minor and therefore transposed to Amin... not possible to recognize like that
               #( "Let's Call This [jazz1350]", "Honeysuckle Rose [jazz1350]"),
               ("Little Rootie Tootie [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Little Willie Leaps [jazz1350]", "All God's Chillun Got Rhythm [jazz1350]"),
               ("Lullaby Of Birdland [jazz1350]", "Love Me Or Leave Me [jazz1350]"),
               #("Moose The Mooche [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("My Little Suede Shoes [jazz1350]", "Jeepers Creepers [jazz1350]"),
               #("Oleo [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Ornithology [jazz1350]", "How High The Moon [jazz1350]"),
               #("Passport [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Quasimodo (Theme) [jazz1350]", "Embraceable You [jazz1350]"),
               #("Rhythm-a-ning [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Room 608 [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Salt Peanuts [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Satellite [jazz1350]", "How High The Moon [jazz1350]"),
               ("Scrapple From The Apple [jazz1350]", "Honeysuckle Rose [jazz1350]"), # A section
               ("Scrapple From The Apple [jazz1350]", "I Got Rhythm [jazz1350]"), # B section
               #("Segment [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Seven Come Eleven [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Shaw 'Nuff [jazz1350]", "I Got Rhythm [jazz1350]"),
               #("Theme, The [jazz1350]", "I Got Rhythm [jazz1350]"),
               ("Tour De Force [jazz1350]", "Jeepers Creepers [jazz1350]"),
               ("Wow [jazz1350]", "You Can Depend On Me [jazz1350]"),
               ("Yardbird Suite [jazz1350]", "Rosetta [jazz1350]"),

               # following tunes are not from wikipedia),
               ("Sweet Sue, Just You [jazz1350]", "Honeysuckle Rose [jazz1350]"),  # A section
               #("All Of Me [jazz1350]", "Pennies From Heaven [jazz1350]"), # bars 25-28 of All of Me are same as bars 17-20 of Pennies From Heaven, but different key!
               ("Sweet Sue, Just You [jazz1350]", "Bye Bye Blackbird [jazz1350]"), # Bridge same
               ("These Foolish Things [jazz1350]", "Blue Moon [jazz1350]"), # first 8 bars same
               ("These Foolish Things [jazz1350]", "More Than You Know [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Isn't It A Pity [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Soultrain [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Why Do I Love You [jazz1350]"),
               ("Misty [jazz1350]", "Portrait Of Jennie [jazz1350]"),
               ("Misty [jazz1350]", "September In The Rain [jazz1350]"),
               ("Misty [jazz1350]", "I May Be Wrong [jazz1350]"),  

               # identical tunes
               ("Five Foot Two [trad]", "Please Don't Talk About Me When I'm Gone [trad]"),
               ("What Is This Thing Called Love [jazz1350]", "Subconscious Lee [jazz1350]"),
               ("Sweet Georgia Brown [jazz1350]", "Dig [jazz1350]"),


               # almost identical tunes
               ("What Is This Thing Called Love [jazz1350]", "Hot House [jazz1350]"),
               ("Jeannie's Song [jazz1350]", "Shiny Stockings [jazz1350]"),
               ("Alone Together [jazz1350]", "Segment [jazz1350]"),
               ("Baubles, Bangles and Beads [jazz1350]", "Bossa Antigua [jazz1350]"),
               ("There Will Never Be Another You [jazz1350]", "A Weaver Of Dreams [jazz1350]"),
               ("Moten Swing [jazz1350]", "Once In A While (Ballad) [trad]"), # same bridge, similar A
               ("All I Do Is Dream Of You [trad]", "L-O-V-E [jazz1350]"),


               # same A section
               ("Nancy (With The Laughing Face) [jazz1350]", "Body And Soul [jazz1350]"),
               ("Exactly Like You [jazz1350]", "True (You Don't Love Me ) [trad]"),
               ("Exactly Like You [jazz1350]", "True (You Don't Love Me ) [trad]"),
               ("Exactly Like You [jazz1350]", "Jersey Bounce [trad]"),
               ("Take The A Train [jazz1350]", "Girl From Ipanema, The [jazz1350]"),
               ("My Heart Stood Still [jazz1350]", "All Too Soon [jazz1350]"),
               ("Undecided [trad]", "Broadway [jazz1350]"),
               ("My Heart Stood Still [jazz1350]", "All Too Soon [jazz1350]"),
               ("Let's Fall In Love [jazz1350]", "Heart And Soul [jazz1350]"),
               ("Come Back To Me [jazz1350]", "I Wish I Knew [jazz1350]"),
               ("Wait Till You See Her [jazz1350]", "A Certain Smile [jazz1350]"),
               ("Killer Joe [jazz1350]", "Straight Life [jazz1350]"),
               ("Softly, As In A Morning Sunrise [jazz1350]", "Segment [jazz1350]"),
               ("Bei Mir Bist Du Schon (Root Hog Or Die) [trad]", "Egyptian Fantasy [trad]"),
               ("Bei Mir Bist Du Schon (Root Hog Or Die) [trad]", "Puttin' On The Ritz [jazz1350]"),
               ("Coquette [trad]", "Pretend You're Happy When You're Blue [trad]"),
               ("Softly, As In A Morning Sunrise [jazz1350]", "Strode Rode [jazz1350]"),
               ("Glory Of Love, The [jazz1350]", "I've Got My Fingers Crossed [trad]"),


               # same bridge
               ("If I Had You [jazz1350]", "Too Young To Go Steady [jazz1350]"),
               ("Undecided [jazz1350]", "Satin Doll [jazz1350]"),
               ("Billy Boy [jazz1350]", "Elora [jazz1350]"),
               ("Dearly Beloved [jazz1350]", "We See [jazz1350]"),
               ("Alone Together [jazz1350]", "A Night In Tunisia [jazz1350]"),
               ("A Night In Tunisia [jazz1350]", "Segment [jazz1350]"),
               ("Oh! Lady Be Good [trad]", "Sentimental Journey [jazz1350]"),
               ("You Can Depend On Me [jazz1350]", "Move [jazz1350]"),
               ("I Want To Be Happy [jazz1350]", "A Beautiful Friendship [jazz1350]"),
               ("Flying Home [jazz1350]", "Down For Double [jazz1350]"),
               ("Cheek To Cheek [jazz1350]", "Violets For Your Furs [jazz1350]"),
               ("Let's Fall In Love [jazz1350]", "At Last [jazz1350]"),
               ("Don't Be That Way [jazz1350]", "Long Ago And Far Away [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "I'm Confessin' That I Love You [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "Eclypso [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "You Stepped Out Of A Dream [jazz1350]"),
               

               # similar A section
               ("I Like The Likes Of You [jazz1350]", "Mountain Greenery [jazz1350]"),
               ("My Secret Love [jazz1350]", "Samba De Orfeu [jazz1350]"),
               ("Let's Call The Whole Thing Off [jazz1350]", "Fine And Dandy [jazz1350]"),


               # similar B section
               ("Folks Who Live On The Hill, The [jazz1350]", "My One And Only Love [jazz1350]"),
               ("As Long As I Live [jazz1350]", "I'm Glad There Is You [jazz1350]"),
               ("I May Be Wrong [jazz1350]", "Teach Me Tonight [jazz1350]"),
               ("Am I Blue [jazz1350]", "Come Back To Me [jazz1350]"),
               ("My One And Only Love [jazz1350]", "Am I Blue [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "September In The Rain [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "Mountain Greenery [jazz1350]"),
               ("On The Sunny Side Of The Street [jazz1350]", "There's No You [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Embraceable You [jazz1350]"),
               ("These Foolish Things [jazz1350]", "Rosetta [jazz1350]"),

               # same C section
               ("Bill Bailey [jazz1350]", "Bourbon Street Parade [jazz1350]"),

               # Stella C is like Woody B
               ("Woody'n You [jazz1350]", "Stella By Starlight [jazz1350]"),

               # similar vocabulary, different progressions
               ("Tangerine [jazz1350]", "Tea For Two [jazz1350]"),
               ("I Can't Give You Anything But Love [jazz1350]", "You Can Depend On Me [jazz1350]"),
               ("This Year's Kisses [jazz1350]", "My Monday Date [trad]"),
               ("A Blossom Fell [jazz1350]", "Among My Souvenirs [jazz1350]"),



]

# Initialization

## Download the Data

In [20]:
input_data = input_files['sections'][chords_preprocessing]

input_path = f"https://docs.google.com/uc?export=download&id={input_data}"
data_file_name = 'data.csv'


In [21]:
input_path

'https://docs.google.com/uc?export=download&id=1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq'

In [22]:
!wget --no-check-certificate "$input_path" -O "$data_file_name"

--2021-11-28 14:21:03--  https://docs.google.com/uc?export=download&id=1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq
Resolving docs.google.com (docs.google.com)... 209.85.200.139, 209.85.200.138, 209.85.200.113, ...
Connecting to docs.google.com (docs.google.com)|209.85.200.139|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0k-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ivjm2tp15nvqofnq8a18j4s864umfan9/1638109200000/14329102864480165501/*/1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq?e=download [following]
--2021-11-28 14:21:03--  https://doc-0k-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/ivjm2tp15nvqofnq8a18j4s864umfan9/1638109200000/14329102864480165501/*/1BDYukaIj72jmi9PqZVrCAqKbyxg8Z4yq?e=download
Resolving doc-0k-4c-docs.googleusercontent.com (doc-0k-4c-docs.googleusercontent.com)... 142.250.128.132, 2607:f8b0:4001:c32::84
Connecting to doc-0k-4c-docs.googleusercontent.com (doc-0k-4c-

### Read Chords Input Data

In [23]:
df = pd.read_csv(data_file_name, sep='\t', index_col="id")
df = df.reset_index()
df.head(5)

Unnamed: 0,id,file_name,title,title_playlist,tune_mode,tune_id,section_name,section_id,chords
0,0,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,1,C Eb7 G# B7 E G7 Gm C7 F G#7 C# E7 Am D7 Dm G7
1,1,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,2,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
2,2,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,B,3,Gm C7 Bm E7 A C7 F Bbm Eb7 G# Dm G7
3,3,dataset/jazz1350/26-2.xml,26-2,26-2 [jazz1350],major,0,A,4,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
4,4,dataset/jazz1350/500 Miles High.xml,500 Miles High,500 Miles High [jazz1350],minor,1,,0,Am Am Cm Cm Eb Eb Em7b5 A7 Dm Dm Bm7b5 Bm7b5 B...


### Meta Data

In [24]:
titles = df.loc[:, ['id', 'tune_id', 'section_id', 'section_name', 'title', 'title_playlist', 'tune_mode']]
titles[:5]

Unnamed: 0,id,tune_id,section_id,section_name,title,title_playlist,tune_mode
0,0,0,1,A,26-2,26-2 [jazz1350],major
1,1,0,2,A,26-2,26-2 [jazz1350],major
2,2,0,3,B,26-2,26-2 [jazz1350],major
3,3,0,4,A,26-2,26-2 [jazz1350],major
4,4,1,0,,500 Miles High,500 Miles High [jazz1350],minor


In [25]:
titles_dict = titles.to_dict()

sectionid_to_title = titles_dict['title_playlist']
sectionid_to_titleid = titles_dict['tune_id']


In [26]:
tunes = df.loc[:, ['tune_id', 'title_playlist']].drop_duplicates()
tunes = tunes.set_index('tune_id').to_dict()
titleid_to_title = tunes['title_playlist']

In [27]:
title_to_titleid = {v: k for k, v in titleid_to_title.items()}


In [28]:
titles_rows = titles.to_dict(orient='records')
sectionid_to_section = []
for i, row in enumerate(titles_rows):
  name = f"{row['title']}, section{row['section_id']} ({row['section_name']})"
  sectionid_to_section.append(name)
  

In [29]:
title_to_sectionid = {}

for row in titles.iterrows():
  title = row[1]['title_playlist']
  if title not in title_to_sectionid:
    title_to_sectionid[title] = [row[1]['id']]
  else:
    title_to_sectionid[title].append(row[1]['id'])

### Create Directories on Colab

In [30]:
!rm -R output
!mkdir output

## Initialization for wandb variables

In [31]:
recommender_results_cols = ['reference', 'id', 'method', 'similar', 'score_div_max', 'score']
recommender_results = pd.DataFrame(columns=recommender_results_cols)

In [32]:
lsi_config = {
    'num_topics': 100,
}

In [33]:
doc2vec_config = {
    'general': {
        'chords_preprocessing': chords_preprocessing,
        'tag_sections_and_tunes': False,
    },
    'model': {
        'dm': 1,
        'vector_size': 100,
        'window': 4,
        'epochs': 40,
        #'workers': 1,
        'min_count': 1,
        'negative': 10,
        'sample': 0.001,
        'seed': 42
    }
}

In [34]:
if use_wandb:
  wandb.init(
        # Set entity to specify your username or team name
        # ex: entity="carey",
        # Set the project where this run will be logged
        project="jazztunes-lda", 
        
        # Track hyperparameters and run metadata
        config={
            "input_data": input_path,
            "ngrams_input": ngrams_for_input,
            "comparison": "sections",
            "remove_repeated_chords": remove_repetitions,
        }
    )

In [35]:
if use_wandb:
  artifact = wandb.Artifact('input_data', type='dataset')
  artifact.add_file('data.csv')
  wandb.log_artifact(artifact)

## Helpers functions

In [36]:
def ngrams(tokens, n=2, sep='-'):
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

In [37]:
def raw_chords_to_df(tunes):
  tunes_chords = [item for tune in tunes for item in tune]
  counts = Counter(tunes_chords)
  df = pd.DataFrame(counts.items(),
                    columns=['chord', 'count']).sort_values(by='count', ascending=False)

  return df

# Data Preparation

In [38]:
def remove_chord_repetitions(chords):
  previous = ''
  chords_norep = []
  for c in chords:
    if c != previous:
      chords_norep.append(c)
      previous = c
  return chords_norep
  

In [39]:
lines = df.loc[:, 'chords'].tolist()
data = [line.split(' ') for line in lines]

In [40]:
processed_corpus = []
for line in data:
  tune_n = []
  if remove_repetitions:
    line = remove_chord_repetitions(line)
  for n in ngrams_for_input:
    tune_n.extend(ngrams(line, n=n))
  processed_corpus.append(tune_n)

for line in processed_corpus[:10]:
  print(line)

['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'G#7', 'C#', 'E7', 'Am', 'D7', 'Dm', 'G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C']
['Gm', 'C7', 'Bm', 'E7', 'A', 'C7', 'F', 'Bbm', 'Eb7', 'G#', 'Dm', 'G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C']
['Am', 'Am', 'Cm', 'Cm', 'Eb', 'Eb', 'Em7b5', 'A7', 'Dm', 'Dm', 'Bm7b5', 'Bm7b5', 'Bbm', 'Bbm', 'Fm', 'Fm', 'E7', 'E7', 'Fm', 'Fm', 'C#', 'C#', 'Fm', 'Fm', 'C#', 'C#']
['Am', 'C#', 'Bm7b5', 'E7', 'Am', 'C#', 'Bm7b5', 'E7', 'Cm', 'F7', 'Bb', 'G#m', 'C#7', 'F#m7b5', 'B7', 'E', 'E', 'E7', 'Am', 'C#', 'Bm7b5', 'E7', 'Am', 'C#', 'Bm7b5', 'E7', 'Cm', 'F7', 'Bb', 'G#m', 'C#7', 'F#m7b5', 'B7', 'Em', 'Em']
['C', 'Am', 'Dm', 'G7', 'C', 'Am', 'Dm', 'G7', 'C', 'Am', 'Dm', 'G7', 'C', 'G7', 'C']
['C', 'Am', 'Dm', 'G7', 'C', 'Am', 'Dm', 'G7', 'C', 'Am', 'Dm', 'G7', 'C', 'G7', 'C']
['C7', 'C7', 'F', 'F', 'D7', 'D7', 'G7', 'G7']
['C', 'Am', 'Dm', 'G7', 'C', 'Am', 

#### Corpus Overview

In [41]:
tokens = [item for l in processed_corpus for item in l]
total_tokens = len(tokens)
vocab_size = len(set(tokens))
vocab_prop = 100*vocab_size/total_tokens

print(f"Total Number of tokens: {total_tokens}")
print(f"Size of vocabulary: {vocab_size}")
print(f"Proportion of vocabulary in corpus: {vocab_prop:.02f}%")

Total Number of tokens: 82622
Size of vocabulary: 72
Proportion of vocabulary in corpus: 0.09%


In [42]:
df_chords = raw_chords_to_df(processed_corpus)
df_chords

Unnamed: 0,chord,count
0,C,14387
5,G7,11144
14,Dm,7991
12,Am,5724
21,A7,4467
13,D7,3986
8,F,3746
7,C7,3428
11,E7,3175
29,Em,2710


In [43]:
df_chords = pd.DataFrame.from_dict(df_chords)
df_chords.sort_values(by=['count'], ascending=False, inplace=True)
df_chords_top = df_chords.query('count > 100')

fig = px.bar(df_chords_top, x='chord', y='count', log_y=True)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

In [44]:
if use_wandb:
  wandb.log(
      {"corpus": {
              "total_tokens": total_tokens,
              "vocab_size": vocab_size,
              "vocab_proportion_in_corpus": vocab_prop,
              }
      }
  )

# Test Helpers

In [45]:
!rm -R index
!mkdir index

In [46]:
def get_sim_scores(tunes, index, model):

    df_sim = pd.DataFrame(columns=['reference_title',
                                   'reference_titleid',
                                   'similar_title',
                                   'similar_titleid',
                                   'ref_section', 
                                   'similar_section', 
                                   'score', 
                                  ])

    for tune in tunes:
      print()
      print("-"*50)
      for s1 in title_to_sectionid[tune]:
      
          query = processed_corpus[s1]
          query_bow = dictionary.doc2bow(query)

          # perform a similarity query against the corpus
          similarities = index[model[query_bow]]
          sims = sorted(enumerate(similarities), key=lambda item: -item[1])
          
          print(s1, sectionid_to_section[s1])
          n = 0
          for s2, s2_score in sims:
            
            # store the top N best results
            if n > 30:
                break
            # don't count self-similarity between sections of the same tune
            if s2 not in title_to_sectionid[tune]:
                n += 1
            
                # print(f"\t{s2_score:.3f} {sectionid_to_section[s2]}")

                df_sim.loc[len(df_sim)] = [tune,
                                           title_to_titleid[tune],
                                           sectionid_to_title[s2],
                                           sectionid_to_titleid[s2],
                                           sectionid_to_section[s1], 
                                           sectionid_to_section[s2], 
                                           s2_score, 
                                           ]
    return df_sim

In [47]:
def recommend_tune(df, tune_name):

  threshold = 0.5 # arbitrary value, selected based on distribution of max scores

  df_tune = df.query(f'reference_title == "{tune_name}"')
  ff = df_tune.iloc[:]

  # get the maximum similarity score for each section and store in new column
  ff['max'] = ff.groupby('ref_section')['score'].transform('max')

  # consider only results for scores above threshold
  ff = ff.loc[(ff['max'] > threshold)]

  # scale the score with the maxmum value of each section
  ff['score_div_max'] = ff['score'] / ff['max']

  # for each similar title, aggregate all of its sections using the median value
  result = ff.groupby('similar_title').median().sort_values('score_div_max', ascending=False)
    
  # if multiple rows from the same similar tune, keep only the similar_title with the highest score_div_max
  result = result.groupby('similar_title').max('score_div_max').sort_values('score_div_max', ascending=False)

  # add the name of the reference tune
  result['reference'] = tune_name
  result['reference_titleid'] = title_to_titleid[tune_name]

  result = result.reset_index()
  result = result.loc[:, ['reference', 'reference_titleid', 'similar_title', 'score', 'max', 'score_div_max']]
  result['similar_titleid'] = result['similar_title'].apply(lambda x: title_to_titleid[x])
  result = result.reset_index()
  return result, ff

In [48]:
def test_contrafacts(tunes, index, model, N=15):
  matches = 0
  number_of_sections = 0
  results = {}

  for tune, similar_tune in tunes:

    # loop over all sections of the tune
    section_matches = 0
    for s1 in title_to_sectionid[tune]:
      query = processed_corpus[s1]
      query_bow = dictionary.doc2bow(query)

      # perform a similarity query against the corpus
      similarities = index[model[query_bow]]
      sims = sorted(enumerate(similarities), key=lambda item: -item[1])

      # check if the section matches the expected title; consider only the first 15 recommendations
      i = 0
      for sectionid, value in sims:
        if i >= N:
          break
        i += 1
        if sectionid_to_title[sectionid] == similar_tune:
          section_matches += 1

    # for each title, increase matches if at least one of the section matched the expected title
    if section_matches > 0:
      matches += 1  
      results[f'{tune}, {similar_tune}'] = 1
    else:
      results[f'{tune}, {similar_tune}'] = 0
  
  return matches, results

# Train LDA Model

In [49]:
from gensim.test.utils import common_texts
import gensim.models as models
import gensim.corpora as corpora
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
id2word = corpora.Dictionary(processed_corpus)

2021-11-28 14:21:06,170 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 14:21:06,173 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2021-11-28 14:21:06,175 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2021-11-28T14:21:06.175107', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-28 14:21:06,262 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 14:21:06,382 : INFO : built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5391 documents (total 82622 corpus positions)
2021-11-28 14:21:06,383 : INFO : Dictionary lifecycle event {'msg': "built Di

In [50]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(tune) for tune in processed_corpus]

In [51]:
%%time
TOTAL_TOPICS = 30
lda_model = gensim.models.LdaModel(corpus=corpus, 
                                   id2word=id2word, 
                                   chunksize=200, 
                                   alpha='auto', 
                                   eta='auto', 
                                   #eta=0.1,
                                   random_state=42,
                                   #iterations=500, 
                                   num_topics=TOTAL_TOPICS, 
                                   passes=20, 
                                   per_word_topics=True,
                                   eval_every=None,
                                   update_every=0,  # batch learning, start from scratch everytime (<> online learning)
                                   )

2021-11-28 14:21:06,489 : INFO : using autotuned alpha, starting with [0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335, 0.033333335]
2021-11-28 14:21:06,494 : INFO : using serial LDA version on this node
2021-11-28 14:21:06,499 : INFO : running batch LDA training, 30 topics, 30 passes over the supplied corpus of 5391 documents, updating model once every 5391 documents, evaluating perplexity every 0 documents, iterating 50x with a convergence threshold of 0.001000
2021-11-28 14:21:06,503 : INFO : PROGRESS: pass 0, at document #200/5391
2021-11-28 14:21:06,637 : INFO : optimized alpha [0.029824633, 0.029642295, 0.02982278, 0.031614684, 0.030010518, 0.030379768, 0.03

CPU times: user 1min 20s, sys: 1.77 s, total: 1min 22s
Wall time: 1min 24s


In [52]:
lda_model.print_topics()

2021-11-28 14:22:31,229 : INFO : topic #9 (0.002): 0.181*"C" + 0.140*"G7" + 0.097*"Dm" + 0.074*"Am" + 0.053*"A7" + 0.049*"D7" + 0.049*"F" + 0.041*"C7" + 0.040*"E7" + 0.033*"Em"
2021-11-28 14:22:31,232 : INFO : topic #3 (0.004): 0.228*"G7(+b5)" + 0.171*"A7(+b5)" + 0.147*"Dm" + 0.103*"C" + 0.094*"Bb7" + 0.069*"G7" + 0.057*"C7" + 0.022*"Em" + 0.019*"F" + 0.013*"Am"
2021-11-28 14:22:31,233 : INFO : topic #7 (0.009): 0.203*"Em" + 0.189*"Dm" + 0.138*"Fm" + 0.121*"C" + 0.095*"G7" + 0.068*"A7" + 0.049*"F" + 0.030*"Bb7" + 0.025*"Ebdim" + 0.017*"Am"
2021-11-28 14:22:31,235 : INFO : topic #15 (0.010): 0.359*"Cdim" + 0.185*"G#m" + 0.131*"Fdim" + 0.123*"Adim" + 0.049*"B7" + 0.028*"Gm" + 0.024*"C" + 0.024*"F#" + 0.012*"E7" + 0.012*"F#7"
2021-11-28 14:22:31,236 : INFO : topic #19 (0.012): 0.525*"A" + 0.189*"E7" + 0.057*"Edim" + 0.036*"Bm" + 0.033*"F#m" + 0.033*"D" + 0.020*"C#m" + 0.019*"Dm" + 0.016*"G7" + 0.014*"B"
2021-11-28 14:22:31,238 : INFO : topic #18 (0.017): 0.793*"Bm" + 0.084*"E7" + 0.028*"D

[(9,
  '0.181*"C" + 0.140*"G7" + 0.097*"Dm" + 0.074*"Am" + 0.053*"A7" + 0.049*"D7" + 0.049*"F" + 0.041*"C7" + 0.040*"E7" + 0.033*"Em"'),
 (3,
  '0.228*"G7(+b5)" + 0.171*"A7(+b5)" + 0.147*"Dm" + 0.103*"C" + 0.094*"Bb7" + 0.069*"G7" + 0.057*"C7" + 0.022*"Em" + 0.019*"F" + 0.013*"Am"'),
 (7,
  '0.203*"Em" + 0.189*"Dm" + 0.138*"Fm" + 0.121*"C" + 0.095*"G7" + 0.068*"A7" + 0.049*"F" + 0.030*"Bb7" + 0.025*"Ebdim" + 0.017*"Am"'),
 (15,
  '0.359*"Cdim" + 0.185*"G#m" + 0.131*"Fdim" + 0.123*"Adim" + 0.049*"B7" + 0.028*"Gm" + 0.024*"C" + 0.024*"F#" + 0.012*"E7" + 0.012*"F#7"'),
 (19,
  '0.525*"A" + 0.189*"E7" + 0.057*"Edim" + 0.036*"Bm" + 0.033*"F#m" + 0.033*"D" + 0.020*"C#m" + 0.019*"Dm" + 0.016*"G7" + 0.014*"B"'),
 (18,
  '0.793*"Bm" + 0.084*"E7" + 0.028*"D7" + 0.019*"F#7" + 0.019*"Em" + 0.008*"Bb7" + 0.008*"C" + 0.007*"G7" + 0.007*"Am" + 0.004*"A7"'),
 (24,
  '0.405*"Ebm" + 0.219*"G#7" + 0.098*"Dm" + 0.061*"Fm" + 0.047*"G7" + 0.035*"Bb7" + 0.035*"A7" + 0.018*"C" + 0.016*"Ddim" + 0.015*"Fm7b5"')

### Visualize Topics

In [53]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

2021-11-28 14:22:31,386 : INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
2021-11-28 14:22:31,413 : INFO : Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt

Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



### Dominant Tunes in Topics

In [73]:
#tm_results = lda_model[corpus] 

topics = [[(term, round(wt, 3)) 
               for term, wt in lda_model.show_topic(n, topn=20)] 
                   for n in range(0, lda_model.num_topics)]

topics_df = pd.DataFrame([[term for term, wt in topic] 
                              for topic in topics], 
                         columns = ['Term'+str(i) for i in range(1, 21)],
                         index=['Topic '+str(t) for t in range(1, lda_model.num_topics+1)]).T


In [72]:
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])  
                              for topic in topics],
                         columns = ['TermsPerTopic'],
                         index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)]
                         )
topics_df


Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.



Unnamed: 0,TermsPerTopic
Topic1,"G#7, C#, G7, Bb7, F#7, C, F7, Dm7b5, A7, C7, Ebm, Cm7b5, D7, Dm, Fm, E7, B7, Cm, Bb, Am"
Topic2,"F#7, C#m, B, G#m, Em, C#7, G#m7b5, C#m7b5, C, Ebm, G#7, F7, Eb7, D, Ebm7b5, A7, Bbm7b5, Fm7b5, E7, Bb7"
Topic3,"A7, E7, D7, G7, C, F, F#dim, Gdim, Dm, Bb7, B7, Fm, F7, G#7, D7(+b5), Bbm, Em, C7, Dm7b5, Am7b5"
Topic4,"G7(+b5), A7(+b5), Dm, C, Bb7, G7, C7, Em, F, Am, G#, Fm, Gm, Eb, C#7, Eb7, A7, D7, Ebm, Ebdim"
Topic5,"Am, D7, Dm, G#dim, F7, C#7(+b5), E7(+b5), E7, G7, F7(+b5), Em, G7(+b5), Bb7, Eb7, B7, C, F#m7b5, G, A7, F#m"
Topic6,"Em7b5, A7, C, Dm, G7, B7, F#m7b5, Dm7b5, Fm, D7, F7, Bb7, Em, Am, E7, F, F#m, G#7, F#7, Bm7b5"
Topic7,"Fm, Bb7, G7, Dm, C, A7, D7, Em, Eb, G#7, Am, F, C7, B7, Em7b5, Gm, Ebm, C#7, Am7b5, C#dim"
Topic8,"Em, Dm, Fm, C, G7, A7, F, Bb7, Ebdim, Am, Gm, C7, Eb7, C#7, Eb7(+b5), E7, D7, F#7, F#m7b5, G#7"
Topic9,"Cm, G7, F7, Gm, Dm, C, Fm, D7, G#7, C7, Am, Ebm, G#, C#dim, Em7b5, Em, Eb7, F, A7, Bb"
Topic10,"C, G7, Dm, Am, A7, D7, F, C7, E7, Em, F7, Fm, Bb7, B7, Gm, G, G#7, Bm7b5, Ebdim, F#m7b5"


In [87]:
corpus_topics = [sorted(topics[0], key=lambda record: -record[1])[0] for topics in tm_results]

corpus_topic_df = pd.DataFrame()
corpus_topic_df['Document'] = range(0, len(data))
corpus_topic_df['DominantTopic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['Contribution%'] = [round(item[1]*100, 2) for item in corpus_topics]
#corpus_topic_df['TopicDesc'] = [topics_df.iloc[t[0]]['TermsPerTopic'] for t in corpus_topics]
corpus_topic_df['Tunes'] = processed_corpus
corpus_topic_df['Title'] = titles['title']

corpus_topic_df

Unnamed: 0,Document,DominantTopic,Contribution%,Tunes,Title
0,0,28,24.86,"[C, Eb7, G#, B7, E, G7, Gm, C7, F, G#7, C#, E7, Am, D7, Dm, G7]",26-2
1,1,14,26.94,"[C, Eb7, G#, B7, E, G7, Gm, C7, F, Eb7, G#, B7, E, G7, C]",26-2
2,2,28,41.10,"[Gm, C7, Bm, E7, A, C7, F, Bbm, Eb7, G#, Dm, G7]",26-2
3,3,14,26.94,"[C, Eb7, G#, B7, E, G7, Gm, C7, F, Eb7, G#, B7, E, G7, C]",26-2
4,4,7,36.08,"[Am, Am, Cm, Cm, Eb, Eb, Em7b5, A7, Dm, Dm, Bm7b5, Bm7b5, Bbm, Bbm, Fm, Fm, E7, E7, Fm, Fm, C#, C#, Fm, Fm, C#, C#]",500 Miles High
...,...,...,...,...,...
5386,5386,23,55.31,"[Dm, G7, Em7b5, A7, Dm, G#7, G7, C, C]",You're Lucky To Me
5387,5387,23,40.98,"[Dm, G7, Em7b5, A7, Dm, G#7, G7, C, C, A7, A7, A7, A7, Dm, G7, D7, G7, Dm, G7, Em7b5, A7, Dm, G#7, G7, C, C]",You're Lucky To Me
5388,5388,23,65.56,"[C, E7, A7, A7, Dm, G7, C, Em, C, G7, Em, Cdim, Dm, Dm, Dm, D7, Dm, G7, Dm, G7, C, E7, A7, A7, Dm, A7, Dm, A7, Dm, F, Cdim, C, E7, Dm, D7, G7, C, C]",You're Nobody Till Somebody Loves You
5389,5389,30,64.77,"[G7, G7, C, C, G7, G7, C, F, G#, C, C, G7, G7, C, F, C, C, G7, C, G7, F7, F7, F7, C, F7, F7, C, C, F7, F7, F7, C, F7, F7, C, F, C]",Your Feet's Too Big


In [88]:
corpus_topic_df.groupby('DominantTopic').apply(lambda topic_set: (topic_set.sort_values(by=['Contribution%'], 
                                                                                         ascending=False)
                                                                             .iloc[0])
)


Unnamed: 0_level_0,Document,DominantTopic,Contribution%,Tunes,Title
DominantTopic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3316,1,86.92,"[C#, C#, C#, C#, C#, C#, C#, C#]","Saga Of Harrison Crabfeathers, The"
2,3580,2,77.81,"[B, G#m, C#m, F#7, Ebm, G#7, C#m, F#7, B, G#m, C#m, F#7, B, Em, Dm, Em7b5, A7]",Sophisticated Lady
3,3846,3,88.34,"[A7, A7, D7, D7, E7, A7, D7, D7, G7]",There'll Be Some Changes Made
4,1652,4,50.48,"[G7(+b5), C, G7(+b5), C, G7(+b5), C, C, C7, F, Bb7, Dm, C, F, Bb7, Dm, C]",I Got The Sun In The Morning
5,1344,5,93.15,"[Am, D7, Am, D7, Am, D7, Am, E7, Am, D7, Am, D7, Am, D7, Am, Am]",Full House
6,2351,6,88.07,"[C, F#m7b5, C, F#m7b5, B7, Em7b5, A7, Em7b5, A7]",Let's Get Lost
7,202,7,86.88,"[Fm, C, Fm, Fm, Fm, C, Fm, Fm]",All Of You
8,195,8,93.54,"[Dm, Fm, Bb7, Em, Ebdim, Dm, Em, A7, Dm, G7, Em, A7, Dm, Fm, G7, C, Em, A7]",All My Tomorrows
9,5146,9,78.37,"[Cm, G7, Cm, G7, Cm, Cm, Cm, G7, Cm, G7, D7, G7, Cm, G7, Cm, G7, Cm, Cm]",Shim-Me-Sha-Wabble
11,1234,11,92.89,"[D, Em, D, Em, D, Em, D, Em, Eb, Em, Eb, Em, Eb, Em, Eb, Em]",Fantasy in D (or Ugetsu)


### List the Tunes of the individual Topics

In [89]:
def display_tunes_per_topic(df, topic):
  tunes = df.query(f"DominantTopic == {topic}").sort_values('Contribution%', ascending=False)
  print(f"\n*** TOPIC {n} ***")
  print(f"{len(tunes)} tunes")
  return tunes

In [90]:
topic_result = corpus_topic_df.loc[:, ['DominantTopic', 'Contribution%', 'Title', 'Tunes']]

In [91]:
for n in range(1, TOTAL_TOPICS+1):
  print(display_tunes_per_topic(topic_result, topic=n).head(30))


*** TOPIC 1 ***
60 tunes
      DominantTopic  Contribution%                               Title                                                                                                                                Tunes
3316  1              86.92          Saga Of Harrison Crabfeathers, The  [C#, C#, C#, C#, C#, C#, C#, C#]                                                                                                   
4116  1              86.86          Walkin' up                          [G#7, G#7, G#7, G#7, G7, G7, G7, G7]                                                                                               
1425  1              65.45          Goodbye Pork Pie Hat                [C7, G#7, C#, F#7, Bb7, G#7, Bb7, C7, Fm, G#7, Dm7b5, G7, A7, D7, G#7, C#, F#7, F7, G7, Bb7, C7, G#7, C#, F#7, C#, F#7, C7, C#, Cm]
1423  1              64.93          Goodbye Pork Pie Hat                [C7, G#7, C#, F#7, Bb7, G#7, Bb7, C7, Fm, G#7, Dm7b5, G7, A7, D7, G#7, C#, F#7, F7, G7

In [92]:
from gensim.matutils import jaccard

# element 0 of the result from lda_model contains the topics that a tune belongs to, with the respective probabilities
jaccard(lda_model[corpus[505]][0], lda_model[corpus[925]][0])

0.987698240937541

In [94]:
topic_result.loc[:, ['DominantTopic']].to_csv('topics.csv', index=True, index_label='id')

# LSA (Latent Semantic Analysis), aka LSI (Latent Semantic Index) 

In [54]:
from collections import defaultdict

In [55]:
num_topics = lsi_config['num_topics']

# remove words that appear only once
frequency = defaultdict(int)
for text in processed_corpus:
    for token in text:
        frequency[token] += 1

data = [[token for token in text if frequency[token] > 1] for text in processed_corpus]
dictionary = corpora.Dictionary(data)

# doc2bow counts the number of occurences of each distinct word,
# converts the word to its integer word id and returns the result
# as a sparse vector

bow_corpus = [dictionary.doc2bow(text) for text in data]
lsi = LsiModel(bow_corpus, id2word=dictionary, num_topics=num_topics)  # num_topics can be maximum the size of the number of unique tokens

2021-11-28 14:22:36,449 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-28 14:22:36,564 : INFO : built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5391 documents (total 82622 corpus positions)
2021-11-28 14:22:36,565 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5391 documents (total 82622 corpus positions)", 'datetime': '2021-11-28T14:22:36.565680', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-28 14:22:36,814 : INFO : using serial LSI version on this node
2021-11-28 14:22:36,816 : INFO : updating model with new documents
2021-11-28 14:22:36,820 : INFO : preparing a new chunk of documents
2021-11-28 14:22:36,845 : INFO : using 100 extra samples and 2 power iterations
2021-11-28 14:22:36,847 : INFO : 1st phase: constructing (72, 200) action

In [56]:
index_lsi = similarities.Similarity('/content/index/index_lsi', lsi[bow_corpus], num_features=len(dictionary))

2021-11-28 14:22:37,053 : INFO : starting similarity index under /content/index/index_lsi


In [57]:
#index_lsi = similarities.SparseMatrixSimilarity(lsi[bow_corpus], num_features = len(dictionary))
#index_lsi = similarities.MatrixSimilarity(lsi[bow_corpus])  # transform corpus to LSI space and index it


In [58]:
lsi.save('/content/index/lsi.model')
index_lsi.save('/content/index/lsi_matrixsim.index')


2021-11-28 14:22:37,584 : INFO : Projection lifecycle event {'fname_or_handle': '/content/index/lsi.model.projection', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-28T14:22:37.583997', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-28 14:22:37,587 : INFO : saved /content/index/lsi.model.projection
2021-11-28 14:22:37,589 : INFO : LsiModel lifecycle event {'fname_or_handle': '/content/index/lsi.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': ['projection', 'dispatcher'], 'datetime': '2021-11-28T14:22:37.589661', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-28 14:22:37,592 : INFO : not storing attribute projection
2021-11-28 14:22:37,595 : INFO : not storing attribute dispatch

In [59]:
!ls -la /content/index

total 1580
drwxr-xr-x 2 root root    4096 Nov 28 14:22 .
drwxr-xr-x 1 root root    4096 Nov 28 14:21 ..
-rw-r--r-- 1 root root 1553280 Nov 28 14:22 index_lsi.0
-rw-r--r-- 1 root root     742 Nov 28 14:22 lsi_matrixsim.index
-rw-r--r-- 1 root root    2728 Nov 28 14:22 lsi.model
-rw-r--r-- 1 root root   42728 Nov 28 14:22 lsi.model.projection


## Tests

### Tests for Single Tunes

In [60]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index=index_lsi, model=lsi)

# save a copy of the results; all results will be concatenated at the end
df_lsi = df_sim[:]


--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
653 Bye Bye Blackbird, section1 (A)
654 Bye Bye Blackbird, section2 (B)

--------------------------------------------------
1220 Exactly Like You, section1 (A)
1221 Exactly Like You, section2 (A)
1222 Exactly Like You, section3 (B)
1223 Exactly Like You, section4 (A)

--------------------------------------------------
1538 Honeysuckle Rose, section1 (A)
1539 Honeysuckle Rose, s

In [61]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50, title='LSI Scores')
fig.show()

In [62]:
if use_wandb:
  wandb.log({"scores_hist": fig})

In [63]:
result, details = recommend_tune(df_sim, 'These Foolish Things [jazz1350]')
result.head(30)

Unnamed: 0,index,reference,reference_titleid,similar_title,score,max,score_div_max,similar_titleid
0,0,These Foolish Things [jazz1350],1168,More Than You Know [jazz1350],0.916515,0.916515,1.0,829
1,1,These Foolish Things [jazz1350],1168,I Loves You Porgy [jazz1350],0.954864,0.972306,0.997729,520
2,2,These Foolish Things [jazz1350],1168,It's Easy To Remember [jazz1350],0.951895,0.964712,0.986822,637
3,3,These Foolish Things [jazz1350],1168,"Way You Look Tonight, The [jazz1350]",0.942543,0.972306,0.977049,1253
4,4,These Foolish Things [jazz1350],1168,Rosetta [jazz1350],0.895443,0.916515,0.977008,996
5,5,These Foolish Things [jazz1350],1168,Rosetta [trad],0.895443,0.916515,0.977008,1676
6,6,These Foolish Things [jazz1350],1168,Stairway To The Stars [jazz1350],0.937159,0.964712,0.971599,1097
7,7,These Foolish Things [jazz1350],1168,They All Laughed [jazz1350],0.926855,0.972306,0.970866,1169
8,8,These Foolish Things [jazz1350],1168,Easy Street [jazz1350],0.926302,0.954669,0.970286,343
9,9,These Foolish Things [jazz1350],1168,Among My Souvenirs [jazz1350],0.92582,0.954669,0.969782,75


In [64]:
dd = pd.DataFrame(columns = recommender_results_cols)

for tune in tunes_eval_list:
  result, details = recommend_tune(df_sim, tune)
  dd['id'] = result['index']
  dd['reference'] = tune
  dd['method'] = 'lsi'
  dd['similar'] = result['similar_title']
  dd['score_div_max'] = result['score_div_max']
  dd['score'] = result['score']

recommender_results = recommender_results.append(dd)

### Tests for Contrafacts

In [65]:
topN = 20
matches, results = test_contrafacts(contrafacts, index_lsi, lsi, topN)

print(f"Found matches: {matches} out of {len(results)}: {100*matches/len(results):.3f}%")
print() 
for rr, val in results.items():
  if val == 0:
    print(f"{val}: {rr}")


Found matches: 95 out of 111: 85.586%

0: 52nd Street Theme [jazz1350], I Got Rhythm [jazz1350]
0: Ballade [jazz1350], As Long As I Live [jazz1350]
0: Dewey Square [jazz1350], Oh, Lady Be Good [jazz1350]
0: Don't Be That Way [jazz1350], I Got Rhythm [jazz1350]
0: Good Bait [jazz1350], I Got Rhythm [jazz1350]
0: Lennie's Pennies [jazz1350], Pennies From Heaven [jazz1350]
0: Little Rootie Tootie [jazz1350], I Got Rhythm [jazz1350]
0: Little Willie Leaps [jazz1350], All God's Chillun Got Rhythm [jazz1350]
0: My Little Suede Shoes [jazz1350], Jeepers Creepers [jazz1350]
0: Quasimodo (Theme) [jazz1350], Embraceable You [jazz1350]
0: Room 608 [jazz1350], I Got Rhythm [jazz1350]
0: Sweet Sue, Just You [jazz1350], Honeysuckle Rose [jazz1350]
0: Sweet Sue, Just You [jazz1350], Bye Bye Blackbird [jazz1350]
0: These Foolish Things [jazz1350], Blue Moon [jazz1350]
0: These Foolish Things [jazz1350], Soultrain [jazz1350]
0: Take The A Train [jazz1350], Girl From Ipanema, The [jazz1350]
0: My Secret

In [66]:
model_name = 'lsi'
if use_wandb:
  wandb.log(
      {model_name: {
                'contrafacts': {
                    'topN': topN,
                    'success': matches/len(contrafacts),
                    #'results': results
                    }
                   },
       'all_models': {
           model_name: wandb.Table(
               columns=["Title", "Match"],
               data=[[key, value] for key, value in results.items()]),
       }
       })

### Get Recommender Data for WebApp

In [67]:
%%time
if generate_webapp_data:
  _tunes = list(tunes['title_playlist'].values())
  _tunes = tunes_eval_list

  method = 'lsi'

  df_sim = get_sim_scores(_tunes, index_lsi, lsi)

  result = None
  for tune in _tunes:
    tune_result, details = recommend_tune(df_sim, tune)
    if result is None:
      result = pd.DataFrame(columns=tune_result.columns)
    result = result.append(tune_result)

  # save to file
  (result
   .loc[:,['reference_titleid',
           'similar_titleid',
           'score']]
   .to_csv(f'output/recommender_{method}.csv', encoding='utf8')
  )
  with zipfile.ZipFile(f'output/recommender_{method}.zip', 'w') as zf:
    zf.write(f'output/recommender_{method}.csv')


  # save to wandb
  if use_wandb:
    model_artifact = wandb.Artifact(
        f"recommender_{method}", 
        type="csv",
        description=f"Recommendations for each Tune using {method} Model (csv file)",
        metadata="")

    model_artifact.add_file(f'output/recommender_{method}.zip')
    wandb.log_artifact(model_artifact)

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 10.3 µs


## Store Model to W&B

In [68]:
if use_wandb:
  model_artifact = wandb.Artifact(
      "model_lsi", 
      type="model",
      description="LSI model",
      metadata="")

  model_artifact.add_file("/content/index/lsi.model")
  model_artifact.add_file("/content/index/lsi_matrixsim.index")
  model_artifact.add_file("/content/index/lsi.model.projection")
  wandb.log_artifact(model_artifact)

For unigrams, the best number of topics seems to be around 20.

For unigrams plus bigrams, the coherence score drops down until 100 and then continuously rises until 500 and continues to rise. Same for bigrams-only.

# W&B Logging and Finish

In [69]:
if use_wandb:
  wandb.finish()