<a href="https://colab.research.google.com/github/11doris/jazz-maestro/blob/colab_word_embeddings/recommender_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sections as Input

In [1]:
!pip uninstall gensim -y

Found existing installation: gensim 4.1.2
Uninstalling gensim-4.1.2:
  Successfully uninstalled gensim-4.1.2


In [2]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [3]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pprint
import pandas as pd
import numpy as np
from collections import Counter
import plotly.express as px
from tqdm import tqdm 
from gensim.models.doc2vec import Doc2Vec
import pickle
import os

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
print(gensim.__version__)

4.1.2


# Initialization

## Download the Data

### Basic Plus Chords
M7 and 6 reduced to major triad, m7 reduced to m, dominant 7, m7b5, diminished, and all (b5) left as they are.

In [6]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9' -O data.csv

--2021-11-19 22:12:57--  https://docs.google.com/uc?export=download&id=17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9
Resolving docs.google.com (docs.google.com)... 74.125.70.138, 74.125.70.113, 74.125.70.101, ...
Connecting to docs.google.com (docs.google.com)|74.125.70.138|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0s-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/58lp8r0s32ccb2710dpde55birdrke3k/1637359950000/14329102864480165501/*/17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9?e=download [following]
--2021-11-19 22:12:58--  https://doc-0s-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/58lp8r0s32ccb2710dpde55birdrke3k/1637359950000/14329102864480165501/*/17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9?e=download
Resolving doc-0s-4c-docs.googleusercontent.com (doc-0s-4c-docs.googleusercontent.com)... 108.177.111.132, 2607:f8b0:4001:c07::84
Connecting to doc-0s-4c-docs.googleusercontent.com (doc-0s-4c-docs

### Read Chords Input Data

In [7]:
path_to_file = '/content/data.csv'
df = pd.read_csv(path_to_file, sep='\t', index_col="id")
df = df.reset_index()
df.head(5)

Unnamed: 0,id,file_name,title,tune_mode,tune_id,section_name,section_id,chords
0,0,dataset/jazz1350/26-2.xml,26-2,major,0,A,1,C Eb7 G# B7 E G7 Gm C7 F G#7 C# E7 Am D7 Dm G7
1,1,dataset/jazz1350/26-2.xml,26-2,major,0,A,2,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
2,2,dataset/jazz1350/26-2.xml,26-2,major,0,B,3,Gm C7 Bm E7 A C7 F Bbm Eb7 G# Dm G7
3,3,dataset/jazz1350/26-2.xml,26-2,major,0,A,4,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
4,4,dataset/jazz1350/500 Miles High.xml,500 Miles High,minor,1,,0,Am Am Cm Cm Eb Eb Em7b5 A7 Dm Dm Bm7b5 Bm7b5 B...


### Meta Data

In [8]:
titles = df.loc[:, ['id', 'tune_id', 'section_id', 'section_name', 'title', 'tune_mode']]
titles[:10]

Unnamed: 0,id,tune_id,section_id,section_name,title,tune_mode
0,0,0,1,A,26-2,major
1,1,0,2,A,26-2,major
2,2,0,3,B,26-2,major
3,3,0,4,A,26-2,major
4,4,1,0,,500 Miles High,minor
5,5,2,0,,502 Blues,minor
6,6,3,1,A,52nd Street Theme,major
7,7,3,2,A,52nd Street Theme,major
8,8,3,3,B,52nd Street Theme,major
9,9,3,4,A,52nd Street Theme,major


In [9]:
titles_dict = titles.to_dict()

sectionid_to_title = titles_dict['title']
sectionid_to_titleid = titles_dict['tune_id']


In [10]:
tunes = df.loc[:, ['tune_id', 'title']].drop_duplicates()
tunes = tunes.set_index('tune_id').to_dict()
titleid_to_title = tunes['title']

In [11]:
titles_rows = titles.to_dict(orient='records')
sectionid_to_section = []
for i, row in enumerate(titles_rows):
  name = f"{row['title']}, section{row['section_id']} ({row['section_name']})"
  sectionid_to_section.append(name)
  

In [12]:
title_to_sectionid = {}

for row in titles.iterrows():
  title = row[1]['title']
  if title not in title_to_sectionid:
    title_to_sectionid[title] = [row[1]['id']]
  else:
    title_to_sectionid[title].append(row[1]['id'])

## Helpers for N-Grams

In [13]:
def ngrams(tokens, n=2, sep='-'):
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

# Data Preparation

In [14]:
ngrams_for_input = [1]

In [15]:
lines = df.loc[:, 'chords'].tolist()
data = [line.split(' ') for line in lines]

In [16]:
processed_corpus = []
for line in data:
  tune_n = []
  for n in ngrams_for_input:
    tune_n.extend(ngrams(line, n=n))
  processed_corpus.append(tune_n)

for line in processed_corpus[:5]:
  print(line)

['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'G#7', 'C#', 'E7', 'Am', 'D7', 'Dm', 'G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C']
['Gm', 'C7', 'Bm', 'E7', 'A', 'C7', 'F', 'Bbm', 'Eb7', 'G#', 'Dm', 'G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C']
['Am', 'Am', 'Cm', 'Cm', 'Eb', 'Eb', 'Em7b5', 'A7', 'Dm', 'Dm', 'Bm7b5', 'Bm7b5', 'Bbm', 'Bbm', 'Fm', 'Fm', 'E7', 'E7', 'Fm', 'Fm', 'C#', 'C#', 'Fm', 'Fm', 'C#', 'C#']


# TF-IDF

In [17]:
from gensim import corpora
from gensim import similarities


In [18]:
dictionary = corpora.Dictionary(processed_corpus)

2021-11-19 22:12:59,639 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-19 22:12:59,756 : INFO : built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5403 documents (total 82790 corpus positions)
2021-11-19 22:12:59,759 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5403 documents (total 82790 corpus positions)", 'datetime': '2021-11-19T22:12:59.759047', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}


In [19]:
# dictionary.token2id

In [20]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [21]:
!rm -R index
!mkdir index

In [22]:
# Note: SparseMatrixSimilarity crashes due to lacking memory
#from gensim.test.utils import get_tmpfile
#index_tmpfile = get_tmpfile("index")
#index_tfidf = similarities.Similarity(index_tmpfile, bow_corpus, num_features=len(dictionary))

index_tfidf = similarities.Similarity('/content/index/index_tfidf', bow_corpus, num_features=len(dictionary))

2021-11-19 22:13:00,103 : INFO : starting similarity index under /content/index/index_tfidf


In [23]:
ls -la /content/index

total 8
drwxr-xr-x 2 root root 4096 Nov 19 22:12 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Nov 19 22:12 [01;34m..[0m/


## Test for single tunes

In [24]:
def get_sim_scores(tunes, index):

    df_sim = pd.DataFrame(columns=['reference_title',
                                  'similar_title',
                                  'ref_section', 
                                  'similar_section', 
                                  'score', 
                                  ])

    for tune in tunes:
      print()
      print("-"*50)
      for s1 in title_to_sectionid[tune]:
      
          query = processed_corpus[s1]
          query_bow = dictionary.doc2bow(query)

          # perform a similarity query against the corpus
          similarities = index[query_bow]
          sims = sorted(enumerate(similarities), key=lambda item: -item[1])
          
          print(s1, sectionid_to_section[s1])
          n = 0
          for s2, s2_score in sims:
            
            # store the top N best results
            if n > 40:
                break
            # don't count self-similarity between sections of the same tune
            if s2 not in title_to_sectionid[tune]:
                n += 1
            
                # print(f"\t{s2_score:.3f} {sectionid_to_section[s2]}")

                df_sim.loc[len(df_sim)] = [tune,
                                sectionid_to_title[s2],
                                sectionid_to_section[s1], 
                                sectionid_to_section[s2], 
                                s2_score, 
                                ]
    return df_sim

In [25]:
tunes_eval_list = [
  'Sweet Sue, Just You',
  'These Foolish Things', 
  'Blue Moon',
  'All Of Me',
  "All God's Chillun Got Rhythm",
  'I Got Rhythm',
  'Bye Bye Blackbird',
  'Old Fashioned Love',
  'Dinah',
  'Honeysuckle Rose',
  'Misty'
]

In [26]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index_tfidf)

2021-11-19 22:13:01,758 : INFO : creating sparse index
2021-11-19 22:13:01,760 : INFO : creating sparse matrix from corpus
2021-11-19 22:13:01,765 : INFO : PROGRESS: at document #0/5403



--------------------------------------------------


2021-11-19 22:13:01,922 : INFO : created <5403x72 sparse matrix of type '<class 'numpy.float32'>'
	with 41308 stored elements in Compressed Sparse Row format>
2021-11-19 22:13:01,931 : INFO : creating sparse shard #0
2021-11-19 22:13:01,933 : INFO : saving index shard to /content/index/index_tfidf.0
2021-11-19 22:13:01,935 : INFO : SparseMatrixSimilarity lifecycle event {'fname_or_handle': '/content/index/index_tfidf.0', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-19T22:13:01.935564', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-19 22:13:01,942 : INFO : saved /content/index/index_tfidf.0
2021-11-19 22:13:01,951 : INFO : loading SparseMatrixSimilarity object from /content/index/index_tfidf.0
2021-11-19 22:13:01,953 : INFO : SparseMatrixSimilarity lifecycle event {'fname': '/content/index/index_tfidf.0', 'datetime':

3752 Sweet Sue, Just You, section1 (A)
3753 Sweet Sue, Just You, section2 (A)
3754 Sweet Sue, Just You, section3 (B)
3755 Sweet Sue, Just You, section4 (A)

--------------------------------------------------
3866 These Foolish Things, section1 (A)
3867 These Foolish Things, section2 (A)
3868 These Foolish Things, section3 (B)
3869 These Foolish Things, section4 (A)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

--------------------------------------------------
1647 I Got R

In [27]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50)
fig.show()

In [28]:
def recommend_tune(df, tune_name):

  df_tune = df.query(f"reference_title == '{tune_name}'")
  ff = df_tune.iloc[:]

  # get the maximum similarity score for each section and store in new column
  ff['max'] = ff.groupby('ref_section')['score'].transform('max')

  # scale the score with the maxmum value of each section
  ff['score_div_max'] = ff['score'] / ff['max']

  # consider only the top N tunes for each group
  ff = ff.sort_values(['ref_section', 'score_div_max'], ascending=[True, False])
  result = ff.groupby('ref_section').head(30)
  
  # if multiple rows from the same similar tune, keep only the similar_title with the highest score_div_max
  result = result.groupby('similar_title').max('score_div_max').sort_values('score_div_max', ascending=False)

  return result, ff

In [29]:
result, details = recommend_tune(df_sim, 'Blue Moon')
result.head(30)

2021-11-19 22:13:11,660 : INFO : NumExpr defaulting to 2 threads.


Unnamed: 0_level_0,score,max,score_div_max
similar_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Things We Did Last Summer, The",0.927634,0.927634,1.0
At Last,0.978261,0.978261,1.0
Be Careful It's My Heart,0.928477,0.928477,1.0
"Touch Of Your Lips, The",0.95673,0.978261,0.989706
Long Ago And Far Away,0.968004,0.978261,0.989515
Palo Alto,0.91409,0.927634,0.9854
Speak Low,0.914301,0.928477,0.984732
Jeepers Creepers,0.956929,0.978261,0.982
All God's Chillun Got Rhythm,0.909509,0.927634,0.980461
Mountain Greenery,0.958373,0.978261,0.97967


In [30]:
details.query('ref_section == "These Foolish Things, section3 (B)"').head(10)

Unnamed: 0,reference_title,similar_title,ref_section,similar_section,score,max,score_div_max


# SVD based LSI

In [31]:
from gensim import corpora, models, similarities
from collections import defaultdict

In [32]:
frequency = defaultdict(int)
for text in processed_corpus:
    for token in text:
        frequency[token] += 1

data_ngrams = [[token for token in text if frequency[token] > 1] for text in processed_corpus]
dictionary = corpora.Dictionary(data_ngrams)

# doc2bow counts the number of occurences of each distinct word,
# converts the word to its integer word id and returns the result
# as a sparse vector

bow_corpus = [dictionary.doc2bow(text) for text in data_ngrams]
lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=200)  # num_topics can be maximum the size of the number of unique tokens

2021-11-19 22:13:11,779 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-19 22:13:11,889 : INFO : built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5403 documents (total 82790 corpus positions)
2021-11-19 22:13:11,891 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(72 unique tokens: ['Am', 'B7', 'C', 'C#', 'C7']...) from 5403 documents (total 82790 corpus positions)", 'datetime': '2021-11-19T22:13:11.891815', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-19 22:13:11,947 : INFO : using serial LSI version on this node
2021-11-19 22:13:11,950 : INFO : updating model with new documents
2021-11-19 22:13:11,953 : INFO : preparing a new chunk of documents
2021-11-19 22:13:11,985 : INFO : using 100 extra samples and 2 power iterations
2021-11-19 22:13:11,987 : INFO : 1st phase: constructing (72, 300) action

In [33]:
index_lsi = similarities.Similarity('/content/index/index_lsi', bow_corpus, num_features=len(dictionary))

2021-11-19 22:13:12,234 : INFO : starting similarity index under /content/index/index_lsi


### Test for single tunes

In [34]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index_lsi)

2021-11-19 22:13:13,765 : INFO : creating sparse index
2021-11-19 22:13:13,768 : INFO : creating sparse matrix from corpus
2021-11-19 22:13:13,771 : INFO : PROGRESS: at document #0/5403
2021-11-19 22:13:13,918 : INFO : created <5403x72 sparse matrix of type '<class 'numpy.float32'>'
	with 41308 stored elements in Compressed Sparse Row format>
2021-11-19 22:13:13,920 : INFO : creating sparse shard #0
2021-11-19 22:13:13,923 : INFO : saving index shard to /content/index/index_lsi.0
2021-11-19 22:13:13,926 : INFO : SparseMatrixSimilarity lifecycle event {'fname_or_handle': '/content/index/index_lsi.0', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-19T22:13:13.926919', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-19 22:13:13,931 : INFO : saved /content/index/index_lsi.0
2021-11-19 22:13:13,933 : INFO : loading SparseMat


--------------------------------------------------
3752 Sweet Sue, Just You, section1 (A)
3753 Sweet Sue, Just You, section2 (A)
3754 Sweet Sue, Just You, section3 (B)
3755 Sweet Sue, Just You, section4 (A)

--------------------------------------------------
3866 These Foolish Things, section1 (A)
3867 These Foolish Things, section2 (A)
3868 These Foolish Things, section3 (B)
3869 These Foolish Things, section4 (A)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

-----------

In [35]:
df_sim

Unnamed: 0,reference_title,similar_title,ref_section,similar_section,score
0,"Sweet Sue, Just You",A Certain Smile,"Sweet Sue, Just You, section1 (A)","A Certain Smile, section1 (A)",1.000000
1,"Sweet Sue, Just You",A Certain Smile,"Sweet Sue, Just You, section1 (A)","A Certain Smile, section3 (A)",1.000000
2,"Sweet Sue, Just You",Brazil (Aquarela Do Brasil),"Sweet Sue, Just You, section1 (A)","Brazil (Aquarela Do Brasil), section5 (D)",1.000000
3,"Sweet Sue, Just You",Little Girl Blue,"Sweet Sue, Just You, section1 (A)","Little Girl Blue, section1 (verse)",1.000000
4,"Sweet Sue, Just You",Mimi,"Sweet Sue, Just You, section1 (A)","Mimi, section1 (A)",1.000000
...,...,...,...,...,...
2045,Misty,Chicken,"Misty, section4 (A)","Chicken, section2 (A)",0.919183
2046,Misty,Time After Time,"Misty, section4 (A)","Time After Time, section2 (B)",0.918715
2047,Misty,Could It Be You,"Misty, section4 (A)","Could It Be You, section2 (A)",0.917067
2048,Misty,"Glory Of Love, The","Misty, section4 (A)","Glory Of Love, The, section4 (A)",0.916380


In [36]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50)
fig.show()

In [37]:
result, details = recommend_tune(df_sim, 'These Foolish Things')
result.head(30)

Unnamed: 0_level_0,score,max,score_div_max
similar_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Way You Look Tonight, The",0.966796,0.974755,1.0
I Loves You Porgy,0.974755,0.974755,1.0
More Than You Know,0.916515,0.916515,1.0
It's Easy To Remember,0.951972,0.974755,0.997175
Stairway To The Stars,0.942223,0.974755,0.986963
"Party's Over, The",0.958709,0.974755,0.983539
Rosetta,0.895443,0.916515,0.977008
They All Laughed,0.95098,0.974755,0.975609
And The Angels Sing,0.947678,0.974755,0.972222
Let's Fall In Love,0.930484,0.974755,0.971464


# Doc2Vec

In [38]:
def get_tagged_documents(corpus):
  for i, tokens in enumerate(corpus):
    yield gensim.models.doc2vec.TaggedDocument(tokens, [i, f'titleid_{sectionid_to_titleid[i]}'])
    yield gensim.models.doc2vec.TaggedDocument(tokens, [i])  # diatonic chord distance is a bit better

In [39]:
train_corpus = list(get_tagged_documents(processed_corpus))
train_corpus[1000]

TaggedDocument(words=['C', 'Am', 'Dm', 'G7', 'Em', 'Am', 'Dm', 'G7', 'Gm', 'C7', 'F', 'Bb7', 'Am', 'D7', 'Dm', 'G7'], tags=[500, 'titleid_160'])

In [40]:
import multiprocessing

common_kwargs = dict(
    epochs=40, 
    min_count=1,
    workers=multiprocessing.cpu_count(), 
    negative=10, 
)

print(f"CPU Count: {multiprocessing.cpu_count()}")

dm = 0 # PV-DBOW
dm = 1 # PV-DM
sample = 0.001  # huge influence; 0.001 is best for a high value in the similarity of diatonic chords of the Cmaj scale
window = 2  # 2 is best for diatonic chords
negative = 10 # 10 is best for diatonic chords

model = gensim.models.doc2vec.Doc2Vec(train_corpus,
                                      dm=1, 
                                      vector_size=100, 
                                      window=window, 
                                      epochs=40,
                                      workers=1,
                                      min_count=1,
                                      negative=negative,
                                      sample=sample,
                                      seed=42,
                                      )

2021-11-19 22:13:23,275 : INFO : collecting all words and their counts
2021-11-19 22:13:23,278 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-11-19 22:13:23,326 : INFO : PROGRESS: at example #10000, processed 149208 words (3364535/s), 72 word types, 1619 tags
2021-11-19 22:13:23,361 : INFO : collected 72 word types and 7215 unique tags from a corpus of 10806 examples and 165580 words
2021-11-19 22:13:23,364 : INFO : Creating a fresh vocabulary
2021-11-19 22:13:23,369 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 retains 72 unique words (100.0%% of original 72, drops 0)', 'datetime': '2021-11-19T22:13:23.369297', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'prepare_vocab'}
2021-11-19 22:13:23,378 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 165580 word corpus (100.0%% of original 165580, drops 0

CPU Count: 2


2021-11-19 22:13:24,015 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-19 22:13:24,017 : INFO : EPOCH - 1 : training on 165580 raw words (53265 effective words) took 0.6s, 87938 effective words/s
2021-11-19 22:13:24,649 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-19 22:13:24,651 : INFO : EPOCH - 2 : training on 165580 raw words (53222 effective words) took 0.6s, 85472 effective words/s
2021-11-19 22:13:25,247 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-19 22:13:25,248 : INFO : EPOCH - 3 : training on 165580 raw words (53152 effective words) took 0.6s, 90207 effective words/s
2021-11-19 22:13:25,864 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-19 22:13:25,866 : INFO : EPOCH - 4 : training on 165580 raw words (53217 effective words) took 0.6s, 88758 effective words/s
2021-11-19 22:13:26,469 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-11-19 22

### Test the distance between C and the diatonic chords

Test if the model can capture that the diatonic chords are closer to the root.

Do this only if the input corpus contains the unigrams. 

In [41]:
test_tokens = ['C',
              'Dm',
              'Em',
              'F',
              'G7',
              'Am',
              'Bm7b5',
              'F7',
              'D7',
              'A7',
              'E7',
              'B7',
              'F#7',
               'Cm',
               'D']

if 1 in ngrams_for_input:
  ref = 'C'
  for t in test_tokens:
    print(f"{model.wv.similarity(ref, t):.3f}: {ref} <-> {t}")

1.000: C <-> C
0.746: C <-> Dm
0.594: C <-> Em
0.578: C <-> F
0.629: C <-> G7
0.632: C <-> Am
0.588: C <-> Bm7b5
0.463: C <-> F7
0.598: C <-> D7
0.732: C <-> A7
0.678: C <-> E7
0.597: C <-> B7
0.247: C <-> F#7
0.449: C <-> Cm
0.428: C <-> D


In [42]:
if 1 in ngrams_for_input:
  model.wv.similar_by_word('C', topn=20)

Plot the generated word vectors in 2D space

In [43]:
from sklearn.manifold import TSNE

# input data: vectors for all tokens
weights = model.wv.vectors

# only do this when the vocabulary is not too big....
if len(weights) < 100:

  # do T-SNE
  tsne = TSNE(n_components=2, 
              random_state=42,
              perplexity=30,
              learning_rate='auto',
              init='pca',
              n_iter=2000
              )
  T = tsne.fit_transform(weights)

  # plot
  projected = pd.DataFrame(T)

  fig = px.scatter(
      projected, 
      x=0, y=1,
      #color='mode',
      text=model.wv.index_to_key,
      width=800, height=600,
      title="T-SNE applied to Chord Vectors for Tunes in Cmaj/Amin"
  )
  fig.update_traces(textposition='top center')
  fig.update_traces(textfont_size=12, selector=dict(type='scatter'))
  fig.show()


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



### Test for single tunes

In [44]:
len(sectionid_to_section)

5403

In [45]:
len(model.dv)

7215

In [46]:
title_to_sectionid['These Foolish Things']

[3866, 3867, 3868, 3869]

In [47]:
section_key = 3868

In [48]:
model.dv.similar_by_key(section_key, topn=20)

[(1889, 0.8104585409164429),
 (2760, 0.7966096997261047),
 (1166, 0.7420021891593933),
 (1283, 0.7402357459068298),
 (1511, 0.7371323108673096),
 (5136, 0.7291520237922668),
 (3296, 0.7258016467094421),
 (3595, 0.7229234576225281),
 (5316, 0.7193516492843628),
 ('titleid_1170', 0.710716962814331),
 (5011, 0.7084869146347046),
 (1514, 0.7073755860328674),
 (1906, 0.7019268870353699),
 ('titleid_464', 0.699430525302887),
 (51, 0.6843920946121216),
 (311, 0.6798987984657288),
 (4799, 0.6794383525848389),
 (3875, 0.6720521450042725),
 (3234, 0.6661337614059448),
 ('titleid_1244', 0.665622889995575)]

In [49]:
sectionid_to_section[1889]

"I've Told Eve'ry Little Star, section3 (B)"

In [50]:
titles.query('title == "These Foolish Things"')

Unnamed: 0,id,tune_id,section_id,section_name,title,tune_mode
3866,3866,1170,1,A,These Foolish Things,major
3867,3867,1170,2,A,These Foolish Things,major
3868,3868,1170,3,B,These Foolish Things,major
3869,3869,1170,4,A,These Foolish Things,major


In [51]:
tune_key = 'titleid_1170'

In [52]:
model.dv.similar_by_key(tune_key, topn=20)

[('titleid_616', 0.7653118968009949),
 (3867, 0.7396513819694519),
 ('titleid_520', 0.7178188562393188),
 (1889, 0.7116326689720154),
 (3868, 0.7107169032096863),
 (1698, 0.6806668043136597),
 ('titleid_1240', 0.6595369577407837),
 (2031, 0.6540782451629639),
 (1511, 0.6465381383895874),
 (2033, 0.6448988318443298),
 ('titleid_521', 0.6437559723854065),
 ('titleid_1334', 0.6411613821983337),
 (311, 0.6371945142745972),
 (1166, 0.6207894086837769),
 ('titleid_164', 0.6198546886444092),
 ('titleid_1517', 0.6185333132743835),
 (4347, 0.6144623160362244),
 (1514, 0.6139699220657349),
 (3796, 0.613869845867157),
 (3282, 0.6052935123443604)]

In [53]:
titles.iloc[755]

id                                           755
tune_id                                      234
section_id                                     5
section_name                                   D
title           Chega De Saudade (No More Blues)
tune_mode                                  minor
Name: 755, dtype: object

#### Section Similarity


In [54]:
def get_section_scores_doc2vec(tunes):

    df_sim = pd.DataFrame(columns=['reference_title',
                                  'similar_title',
                                  'ref_section', 
                                  'similar_section', 
                                  'score', 
                                  ])

    for tune in tunes:
      print()
      print("-"*50)
      for s1 in title_to_sectionid[tune]:
      
          # sections are tagged with the numeric sectionid and can be used for querying
          sims = model.dv.similar_by_key(s1, topn=40)
          
          print(s1, sectionid_to_section[s1])
          n = 0
          for s2, s2_score in sims:
              if type(s2) == int:  # only consider similarities to sectionids; tuneids are strings
                  # don't count self-similarity between sections of the same tune
                  if s2 not in title_to_sectionid[tune]:
                      n += 1
                  
                      # print(f"\t{s2_score:.3f} {sectionid_to_section[s2]}")

                      df_sim.loc[len(df_sim)] = [tune,
                                      sectionid_to_title[s2],
                                      sectionid_to_section[s1], 
                                      sectionid_to_section[s2], 
                                      s2_score, 
                                      ]
    return df_sim

In [55]:
df_section_sim = get_section_scores_doc2vec(tunes_eval_list)


--------------------------------------------------
3752 Sweet Sue, Just You, section1 (A)
3753 Sweet Sue, Just You, section2 (A)
3754 Sweet Sue, Just You, section3 (B)
3755 Sweet Sue, Just You, section4 (A)

--------------------------------------------------
3866 These Foolish Things, section1 (A)
3867 These Foolish Things, section2 (A)
3868 These Foolish Things, section3 (B)
3869 These Foolish Things, section4 (A)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

-----------

In [56]:
#df_section_sim.head(50)

In [57]:
import plotly.express as px
fig = px.histogram(df_section_sim, x="score", nbins=50, title='Scores for Section Similarities')
fig.show()

In [58]:
result, details = recommend_tune(df_section_sim, 'These Foolish Things')
result.head(30)

Unnamed: 0_level_0,score,max,score_div_max
similar_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I've Told Eve'ry Little Star,0.810459,0.810459,1.0
I Loves You Porgy,0.94385,0.94385,1.0
More Than You Know,0.79661,0.810459,0.982912
With A Song In My Heart,0.870388,0.911948,0.954428
Isn't It A Pity,0.863836,0.911948,0.947243
Embraceable You,0.742002,0.810459,0.915534
"Folks Who Live On The Hill, The",0.740236,0.810459,0.913354
Here's To Life,0.737132,0.810459,0.909525
Rosetta,0.729152,0.810459,0.899678
Don't Know Why,0.841925,0.94385,0.896281


#### Document Similarity


The TitleId is tagged with the prefix `title_id`, so that it can be distinguished from the sectionid tags, which are numeric.

In [59]:
tunes_eval_list

['Sweet Sue, Just You',
 'These Foolish Things',
 'Blue Moon',
 'All Of Me',
 "All God's Chillun Got Rhythm",
 'I Got Rhythm',
 'Bye Bye Blackbird',
 'Old Fashioned Love',
 'Dinah',
 'Honeysuckle Rose',
 'Misty']

In [60]:
# convert titles to titleid; if multiple tunes with same title, takes the first one. 
titleid_eval = []
for title in tunes_eval_list:
  titleid_eval.append(titles.query(f'title == "{title}"').reset_index().loc[0,'tune_id'])
titleid_eval

[1138, 1170, 159, 60, 57, 505, 206, 1642, 308, 472, 807]

In [61]:
for titleid in titleid_eval:
  print()
  print(titleid_to_title[titleid])
  sims = model.dv.similar_by_key(f'titleid_{titleid}', topn=30)
  for key, value in sims:
    if type(key) == str:  # only display document similarities, skip all section similarities
      print(f"{value:.3f} {titleid_to_title[int(key.replace('titleid_', ''))]}")



Sweet Sue, Just You
0.952 Sweet Sue
0.900 I Want To Be Happy
0.884 New York, New York
0.867 Valse Hot
0.837 Avalon

These Foolish Things
0.765 Isn't It A Pity
0.718 I Loves You Porgy
0.660 Very Thought Of You, The
0.644 I May Be Wrong
0.641 You're The Top
0.620 Blue Turning Grey Over You
0.619 Hundred Years From Today, A
0.604 Woody'n You
0.595 They Can't Take That Away From Me
0.593 Eiderdown
0.585 Apple Blossom Time
0.579 Look To The Rainbow
0.579 It Might As Well Be Spring

Blue Moon
0.737 Friday The 13th
0.663 Sleepy Time Gal
0.654 It's You Or No One
0.647 Blue Daniel
0.646 C.T.A.
0.630 All Through The Day
0.623 African Queen, The
0.618 I Mean You

All Of Me
0.809 I'll Be Seeing You
0.785 Shine
0.763 Tempus Fugit

All God's Chillun Got Rhythm
0.917 But Beautiful
0.890 Let's Get Lost
0.850 Lullaby Of Birdland
0.806 Teach Me Tonight
0.801 Cry Me A River
0.800 Besame Mucho
0.799 Too Young To Go Steady
0.796 Come Back To Me
0.790 I'm Getting Sentimental Over You
0.788 I've Got My Love