<a href="https://colab.research.google.com/github/11doris/jazz-maestro/blob/colab_word_embeddings/recommender_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sections as Input

In [1]:
!pip uninstall gensim -y

Found existing installation: gensim 4.1.2
Uninstalling gensim-4.1.2:
  Successfully uninstalled gensim-4.1.2


In [2]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
Successfully installed gensim-4.1.2


In [38]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pprint
import pandas as pd
import numpy as np
from collections import Counter
import plotly.express as px
from tqdm import tqdm 
from gensim.models.doc2vec import Doc2Vec
import pickle
import os

In [39]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [40]:
print(gensim.__version__)

4.1.2


# Initialization

## Download the Data

### Basic Plus Chords
M7 and 6 reduced to major triad, m7 reduced to m, dominant 7, m7b5, diminished, and all (b5) left as they are.

In [41]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9' -O data.csv

--2021-11-19 18:07:00--  https://docs.google.com/uc?export=download&id=17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9
Resolving docs.google.com (docs.google.com)... 209.85.200.139, 209.85.200.101, 209.85.200.102, ...
Connecting to docs.google.com (docs.google.com)|209.85.200.139|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0s-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mc55bl6bv8ah1v9tr3p2lm2409cfg5i4/1637345175000/14329102864480165501/*/17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9?e=download [following]
--2021-11-19 18:07:00--  https://doc-0s-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mc55bl6bv8ah1v9tr3p2lm2409cfg5i4/1637345175000/14329102864480165501/*/17djlZRWFSUCviOxRTOF-nwbbZqU5gUP9?e=download
Resolving doc-0s-4c-docs.googleusercontent.com (doc-0s-4c-docs.googleusercontent.com)... 108.177.111.132, 2607:f8b0:4001:c07::84
Connecting to doc-0s-4c-docs.googleusercontent.com (doc-0s-4c-

### Read Chords Input Data

In [42]:
path_to_file = '/content/data.csv'
df = pd.read_csv(path_to_file, sep='\t', index_col="id")
df = df.reset_index()
df.head(5)

Unnamed: 0,id,file_name,title,tune_mode,tune_id,section_name,section_id,chords
0,0,dataset/jazz1350/26-2.xml,26-2,major,0,A,1,C Eb7 G# B7 E G7 Gm C7 F G#7 C# E7 Am D7 Dm G7
1,1,dataset/jazz1350/26-2.xml,26-2,major,0,A,2,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
2,2,dataset/jazz1350/26-2.xml,26-2,major,0,B,3,Gm C7 Bm E7 A C7 F Bbm Eb7 G# Dm G7
3,3,dataset/jazz1350/26-2.xml,26-2,major,0,A,4,C Eb7 G# B7 E G7 Gm C7 F Eb7 G# B7 E G7 C
4,4,dataset/jazz1350/500 Miles High.xml,500 Miles High,minor,1,,0,Am Am Cm Cm Eb Eb Em7b5 A7 Dm Dm Bm7b5 Bm7b5 B...


### Meta Data

In [43]:
titles = df.loc[:, ['id', 'tune_id', 'section_id', 'section_name', 'title', 'tune_mode']]
titles[:10]

Unnamed: 0,id,tune_id,section_id,section_name,title,tune_mode
0,0,0,1,A,26-2,major
1,1,0,2,A,26-2,major
2,2,0,3,B,26-2,major
3,3,0,4,A,26-2,major
4,4,1,0,,500 Miles High,minor
5,5,2,0,,502 Blues,minor
6,6,3,1,A,52nd Street Theme,major
7,7,3,2,A,52nd Street Theme,major
8,8,3,3,B,52nd Street Theme,major
9,9,3,4,A,52nd Street Theme,major


In [44]:
titles_dict = titles.to_dict()

sectionid_to_title = titles_dict['title']
sectionid_to_title_id = titles_dict['tune_id']


In [45]:
titles_rows = titles.to_dict(orient='records')
sectionid_to_section = []
for i, row in enumerate(titles_rows):
  name = f"{row['title']}, section{row['section_id']} ({row['section_name']})"
  sectionid_to_section.append(name)
  

In [46]:
title_to_sectionid = {}

for row in titles.iterrows():
  title = row[1]['title']
  if title not in title_to_sectionid:
    title_to_sectionid[title] = [row[1]['id']]
  else:
    title_to_sectionid[title].append(row[1]['id'])

## Helpers for N-Grams

In [47]:
def ngrams(tokens, n=2, sep='-'):
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

# Data Preparation

In [48]:
ngrams_for_input = [1, 2]

In [49]:
lines = df.loc[:, 'chords'].tolist()
data = [line.split(' ') for line in lines]

In [50]:
processed_corpus = []
for line in data:
  tune_n = []
  for n in ngrams_for_input:
    tune_n.extend(ngrams(line, n=n))
  processed_corpus.append(tune_n)

for line in processed_corpus[:5]:
  print(line)

['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'G#7', 'C#', 'E7', 'Am', 'D7', 'Dm', 'G7', 'C-Eb7', 'Eb7-G#', 'G#-B7', 'B7-E', 'E-G7', 'G7-Gm', 'Gm-C7', 'C7-F', 'F-G#7', 'G#7-C#', 'C#-E7', 'E7-Am', 'Am-D7', 'D7-Dm', 'Dm-G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C', 'C-Eb7', 'Eb7-G#', 'G#-B7', 'B7-E', 'E-G7', 'G7-Gm', 'Gm-C7', 'C7-F', 'F-Eb7', 'Eb7-G#', 'G#-B7', 'B7-E', 'E-G7', 'G7-C']
['Gm', 'C7', 'Bm', 'E7', 'A', 'C7', 'F', 'Bbm', 'Eb7', 'G#', 'Dm', 'G7', 'Gm-C7', 'C7-Bm', 'Bm-E7', 'E7-A', 'A-C7', 'C7-F', 'F-Bbm', 'Bbm-Eb7', 'Eb7-G#', 'G#-Dm', 'Dm-G7']
['C', 'Eb7', 'G#', 'B7', 'E', 'G7', 'Gm', 'C7', 'F', 'Eb7', 'G#', 'B7', 'E', 'G7', 'C', 'C-Eb7', 'Eb7-G#', 'G#-B7', 'B7-E', 'E-G7', 'G7-Gm', 'Gm-C7', 'C7-F', 'F-Eb7', 'Eb7-G#', 'G#-B7', 'B7-E', 'E-G7', 'G7-C']
['Am', 'Am', 'Cm', 'Cm', 'Eb', 'Eb', 'Em7b5', 'A7', 'Dm', 'Dm', 'Bm7b5', 'Bm7b5', 'Bbm', 'Bbm', 'Fm', 'Fm', 'E7', 'E7', 'Fm', 'Fm', 'C#', 'C#', 'Fm', 'Fm', 'C#', 'C#', 'Am-Am', '

# TF-IDF

In [51]:
from gensim import corpora
from gensim import similarities


In [52]:
dictionary = corpora.Dictionary(processed_corpus)

2021-11-19 18:07:05,150 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-19 18:07:05,393 : INFO : built Dictionary(2095 unique tokens: ['Am', 'Am-D7', 'B7', 'B7-E', 'C']...) from 5403 documents (total 160177 corpus positions)
2021-11-19 18:07:05,398 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(2095 unique tokens: ['Am', 'Am-D7', 'B7', 'B7-E', 'C']...) from 5403 documents (total 160177 corpus positions)", 'datetime': '2021-11-19T18:07:05.397942', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}


In [53]:
# dictionary.token2id

In [54]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [121]:
!rm -R index
!mkdir index

In [122]:
# Note: SparseMatrixSimilarity crashes due to lacking memory
#from gensim.test.utils import get_tmpfile
#index_tmpfile = get_tmpfile("index")
#index_tfidf = similarities.Similarity(index_tmpfile, bow_corpus, num_features=len(dictionary))

index_tfidf = similarities.Similarity('/content/index/index_tfidf', bow_corpus, num_features=len(dictionary))

2021-11-19 18:22:43,228 : INFO : starting similarity index under /content/index/index_tfidf


In [123]:
ls -la /content/index

total 8
drwxr-xr-x 2 root root 4096 Nov 19 18:22 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Nov 19 18:22 [01;34m..[0m/


## Test for single tunes

In [124]:
def get_sim_scores(tunes, index):

    df_sim = pd.DataFrame(columns=['reference_title',
                                  'similar_title',
                                  'ref_section', 
                                  'similar_section', 
                                  'score', 
                                  ])

    for tune in tunes:
      print()
      print("-"*50)
      for s1 in title_to_sectionid[tune]:
      
          query = processed_corpus[s1]
          query_bow = dictionary.doc2bow(query)

          # perform a similarity query against the corpus
          similarities = index[query_bow]
          sims = sorted(enumerate(similarities), key=lambda item: -item[1])
          
          print(s1, sectionid_to_section[s1])
          n = 0
          for s2, s2_score in sims:
            
            # store the top N best results
            if n > 40:
                break
            # don't count self-similarity between sections of the same tune
            if s2 not in title_to_sectionid[tune]:
                n += 1
            
                # print(f"\t{s2_score:.3f} {sectionid_to_section[s2]}")

                df_sim.loc[len(df_sim)] = [tune,
                                sectionid_to_title[s2],
                                sectionid_to_section[s1], 
                                sectionid_to_section[s2], 
                                s2_score, 
                                ]
    return df_sim

In [125]:
tunes_eval_list = [
  'Sweet Sue, Just You',
  'These Foolish Things', 
  'Blue Moon',
  'All Of Me',
  "All God's Chillun Got Rhythm",
  'I Got Rhythm',
  'Bye Bye Blackbird',
  'Old Fashioned Love',
  'Dinah',
  'Honeysuckle Rose',
  'Misty'
]

In [126]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index_tfidf)

2021-11-19 18:22:46,133 : INFO : creating sparse index
2021-11-19 18:22:46,135 : INFO : creating sparse matrix from corpus
2021-11-19 18:22:46,138 : INFO : PROGRESS: at document #0/5403



--------------------------------------------------


2021-11-19 18:22:46,452 : INFO : created <5403x1602 sparse matrix of type '<class 'numpy.float32'>'
	with 97814 stored elements in Compressed Sparse Row format>
2021-11-19 18:22:46,455 : INFO : creating sparse shard #0
2021-11-19 18:22:46,464 : INFO : saving index shard to /content/index/index_tfidf.0
2021-11-19 18:22:46,468 : INFO : SparseMatrixSimilarity lifecycle event {'fname_or_handle': '/content/index/index_tfidf.0', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-19T18:22:46.468892', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'saving'}
2021-11-19 18:22:46,474 : INFO : saved /content/index/index_tfidf.0
2021-11-19 18:22:46,477 : INFO : loading SparseMatrixSimilarity object from /content/index/index_tfidf.0
2021-11-19 18:22:46,487 : INFO : SparseMatrixSimilarity lifecycle event {'fname': '/content/index/index_tfidf.0', 'datetime

3752 Sweet Sue, Just You, section1 (A)
3753 Sweet Sue, Just You, section2 (A)
3754 Sweet Sue, Just You, section3 (B)
3755 Sweet Sue, Just You, section4 (A)

--------------------------------------------------
3866 These Foolish Things, section1 (A)
3867 These Foolish Things, section2 (A)
3868 These Foolish Things, section3 (B)
3869 These Foolish Things, section4 (A)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

--------------------------------------------------
1647 I Got R

In [127]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50)
fig.show()

In [128]:
def recommend_tune(df, tune_name):

  df_tune = df.query(f"reference_title == '{tune_name}'")
  ff = df_tune.iloc[:]

  # get the maximum similarity score for each section and store in new column
  ff['max'] = ff.groupby('ref_section')['score'].transform('max')

  # scale the score with the maxmum value of each section
  ff['score_div_max'] = ff['score'] / ff['max']

  # consider only the top N tunes for each group
  ff = ff.sort_values(['ref_section', 'score_div_max'], ascending=[True, False])
  result = ff.groupby('ref_section').head(30)
  
  # if multiple rows from the same similar tune, keep only the similar_title with the highest score_div_max
  result = result.groupby('similar_title').max('score_div_max').sort_values('score_div_max', ascending=False)

  return result, ff

In [129]:
result, details = recommend_tune(df_sim, 'Blue Moon')
result.head(30)

Unnamed: 0_level_0,score,max,score_div_max
similar_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I've Heard That Song Before,0.840168,0.840168,1.0
"Touch Of Your Lips, The",0.936239,0.949495,1.0
Mountain Greenery,0.949495,0.949495,1.0
They All Laughed,0.838398,0.840168,0.997893
Long Ago And Far Away,0.941537,0.949495,0.991619
Jeepers Creepers,0.934512,0.949495,0.990198
All Through The Day,0.82963,0.840168,0.987457
It Could Happen To You,0.868778,0.880281,0.986933
At Last,0.937009,0.949495,0.98685
Heart And Soul,0.932527,0.949495,0.982129


In [130]:
details.query('ref_section == "These Foolish Things, section3 (B)"').head(10)

Unnamed: 0,reference_title,similar_title,ref_section,similar_section,score,max,score_div_max


# SVD based LSI

In [131]:
from gensim import corpora, models, similarities
from collections import defaultdict

In [132]:
frequency = defaultdict(int)
for text in processed_corpus:
    for token in text:
        frequency[token] += 1

data_ngrams = [[token for token in text if frequency[token] > 1] for text in processed_corpus]
dictionary = corpora.Dictionary(data_ngrams)

# doc2bow counts the number of occurences of each distinct word,
# converts the word to its integer word id and returns the result
# as a sparse vector

bow_corpus = [dictionary.doc2bow(text) for text in data_ngrams]
lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=200)  # num_topics can be maximum the size of the number of unique tokens

2021-11-19 18:22:55,952 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-11-19 18:22:56,183 : INFO : built Dictionary(1602 unique tokens: ['Am', 'Am-D7', 'B7', 'B7-E', 'C']...) from 5403 documents (total 159684 corpus positions)
2021-11-19 18:22:56,186 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(1602 unique tokens: ['Am', 'Am-D7', 'B7', 'B7-E', 'C']...) from 5403 documents (total 159684 corpus positions)", 'datetime': '2021-11-19T18:22:56.186693', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-19 18:22:56,319 : INFO : using serial LSI version on this node
2021-11-19 18:22:56,331 : INFO : updating model with new documents
2021-11-19 18:22:56,332 : INFO : preparing a new chunk of documents
2021-11-19 18:22:56,391 : INFO : using 100 extra samples and 2 power iterations
2021-11-19 18:22:56,397 : INFO : 1st phase: constructing 

In [133]:
index_lsi = similarities.Similarity('/content/index/index_lsi', bow_corpus, num_features=len(dictionary))

2021-11-19 18:22:57,393 : INFO : starting similarity index under /content/index/index_lsi


In [139]:
%%time
df_sim = get_sim_scores(tunes_eval_list, index_lsi)


--------------------------------------------------
3752 Sweet Sue, Just You, section1 (A)
3753 Sweet Sue, Just You, section2 (A)
3754 Sweet Sue, Just You, section3 (B)
3755 Sweet Sue, Just You, section4 (A)

--------------------------------------------------
3866 These Foolish Things, section1 (A)
3867 These Foolish Things, section2 (A)
3868 These Foolish Things, section3 (B)
3869 These Foolish Things, section4 (A)

--------------------------------------------------
496 Blue Moon, section1 (A)
497 Blue Moon, section2 (A)
498 Blue Moon, section3 (B)
499 Blue Moon, section4 (A)

--------------------------------------------------
198 All Of Me, section1 (A)
199 All Of Me, section2 (B)
200 All Of Me, section3 (A)
201 All Of Me, section4 (C)

--------------------------------------------------
188 All God's Chillun Got Rhythm, section1 (A)
189 All God's Chillun Got Rhythm, section2 (B)
190 All God's Chillun Got Rhythm, section3 (A)
191 All God's Chillun Got Rhythm, section4 (C)

-----------

In [135]:
df_sim

Unnamed: 0,reference_title,similar_title,ref_section,similar_section,score
0,"Sweet Sue, Just You",My Secret Love,"Sweet Sue, Just You, section1 (A)","My Secret Love, section2 (A)",0.978496
1,"Sweet Sue, Just You",I Want To Be Happy,"Sweet Sue, Just You, section1 (A)","I Want To Be Happy, section2 (A)",0.974397
2,"Sweet Sue, Just You",I Want To Be Happy,"Sweet Sue, Just You, section1 (A)","I Want To Be Happy, section4 (A)",0.974397
3,"Sweet Sue, Just You",Don't Fence Me In,"Sweet Sue, Just You, section1 (A)","Don't Fence Me In, section1 (A)",0.971730
4,"Sweet Sue, Just You",Get Me To The Church On Time,"Sweet Sue, Just You, section1 (A)","Get Me To The Church On Time, section1 (A)",0.969736
...,...,...,...,...,...
2045,Misty,"Blue Room, The","Misty, section4 (A)","Blue Room, The, section4 (C)",0.831235
2046,Misty,Stranger On The Shore,"Misty, section4 (A)","Stranger On The Shore, section0 (nan)",0.830201
2047,Misty,Deed I Do,"Misty, section4 (A)","Deed I Do, section1 (A)",0.829928
2048,Misty,Rockin' Chair,"Misty, section4 (A)","Rockin' Chair, section3 (A)",0.829928


In [138]:
import plotly.express as px
fig = px.histogram(df_sim, x="score", nbins=50)
fig.show()

In [137]:
result, details = recommend_tune(df_sim, 'These Foolish Things')
result.head(30)

Unnamed: 0_level_0,score,max,score_div_max
similar_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
More Than You Know,0.836242,0.836242,1.0
"Way You Look Tonight, The",0.929954,0.929954,1.0
Mountain Greenery,0.89901,0.929954,1.0
"Party's Over, The",0.918547,0.929954,0.987734
I Like The Likes Of You,0.901338,0.929954,0.981706
I Won't Dance,0.893457,0.929954,0.977259
They All Laughed,0.900987,0.929954,0.968851
Heart And Soul,0.899528,0.929954,0.967282
Too Young To Go Steady,0.850118,0.881356,0.964557
Let's Fall In Love,0.888975,0.929954,0.96036
