<a href="https://colab.research.google.com/github/11doris/jazz-maestro/blob/colab_word_embeddings/tune_similarity_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip uninstall gensim -y

Found existing installation: gensim 3.6.0
Uninstalling gensim-3.6.0:
  Successfully uninstalled gensim-3.6.0


In [1]:
!pip install gensim



In [2]:
import gensim
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pprint
import pandas as pd
import numpy as np
from collections import Counter
import plotly.express as px
from tqdm import tqdm 
from gensim.models.doc2vec import Doc2Vec
import pickle
import os

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
print(gensim.__version__)

4.1.2


# Initialization

## Download the Data

Major Triads, Minor Triads and Dom7 chords:

In [5]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=13WKnD2PIZlJA0TjN9-IbFpD-M7eUCgrd' -O input_seq.txt

--2021-11-13 18:55:13--  https://docs.google.com/uc?export=download&id=13WKnD2PIZlJA0TjN9-IbFpD-M7eUCgrd
Resolving docs.google.com (docs.google.com)... 172.217.5.206, 2607:f8b0:4007:814::200e
Connecting to docs.google.com (docs.google.com)|172.217.5.206|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/holbt9lnghfl5e44pc8ae3oghi7muppm/1636829700000/14329102864480165501/*/13WKnD2PIZlJA0TjN9-IbFpD-M7eUCgrd?e=download [following]
--2021-11-13 18:55:14--  https://doc-08-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/holbt9lnghfl5e44pc8ae3oghi7muppm/1636829700000/14329102864480165501/*/13WKnD2PIZlJA0TjN9-IbFpD-M7eUCgrd?e=download
Resolving doc-08-4c-docs.googleusercontent.com (doc-08-4c-docs.googleusercontent.com)... 142.250.68.1, 2607:f8b0:4007:80f::2001
Connecting to doc-08-4c-docs.googleusercontent.com (doc-08-4c-docs.googleuse

Meta Data:

In [6]:
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Qs_SNXKBCeHOZO4VTwIPLR6OsPzEjk6q' -O input_meta.csv

--2021-11-13 18:55:14--  https://docs.google.com/uc?export=download&id=1Qs_SNXKBCeHOZO4VTwIPLR6OsPzEjk6q
Resolving docs.google.com (docs.google.com)... 142.250.72.142, 2607:f8b0:4007:80d::200e
Connecting to docs.google.com (docs.google.com)|142.250.72.142|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/2ffu1p8cmlin53r0m8rvgul468nltdm9/1636829700000/14329102864480165501/*/1Qs_SNXKBCeHOZO4VTwIPLR6OsPzEjk6q?e=download [following]
--2021-11-13 18:55:15--  https://doc-14-4c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/2ffu1p8cmlin53r0m8rvgul468nltdm9/1636829700000/14329102864480165501/*/1Qs_SNXKBCeHOZO4VTwIPLR6OsPzEjk6q?e=download
Resolving doc-14-4c-docs.googleusercontent.com (doc-14-4c-docs.googleusercontent.com)... 142.250.68.1, 2607:f8b0:4007:80f::2001
Connecting to doc-14-4c-docs.googleusercontent.com (doc-14-4c-docs.googleu

In [7]:
path_to_file = '/content/input_seq.txt'

with open(path_to_file) as f: 
  lines = f.read().splitlines()

In [8]:
data = [line.split(' ') for line in lines]

### Meta Data

In [9]:
meta = pd.read_csv('input_meta.csv', sep='\t')
meta.columns

Index(['id', 'file_name', 'title', 'composer', 'year', 'year_truncated',
       'tonality', 'tune_key', 'tune_mode', 'structure', 'num_bars',
       'time_signature', 'cycle_fifths_order', 'style', 'musicbrainz_id',
       'wikidata_id', 'wikidata_allmusic', 'wiki_link', 'wikidata_description',
       'lyricist'],
      dtype='object')

In [10]:
titles = meta.loc[:, ['id', 'title', 'tune_mode']].drop_duplicates()
titles.set_index('id', inplace=True)

In [11]:
assert len(data) == len(titles)
print(len(data))

1812


In [12]:
titles_dict = titles.to_dict()
index_to_title = titles_dict['title']
index_to_title[1170]

'These Foolish Things'

In [13]:
title_to_index = {value: index for index, value in index_to_title.items()}
title_to_index['These Foolish Things']

1170

# Visualize Chord N-Grams



In [14]:
def ngrams(tokens, n=2, sep='-'):
    return [sep.join(ngram) for ngram in zip(*[tokens[i:] for i in range(n)])]

In [15]:
def list_to_ngrams(data, n=2, sep='-'):
  out = []
  for row in data:
    out.append(ngrams(row, n, sep))

  return out

In [16]:
data_ngrams = list_to_ngrams(data, 3)

In [17]:
def raw_chords_to_df(tunes):
  tunes_chords = [item for tune in tunes for item in tune]
  counts = Counter(tunes_chords)
  df = pd.DataFrame(counts.items(),
                    columns=['chord', 'count']).sort_values(by='count', ascending=False)

  return df

In [18]:
df_chords = raw_chords_to_df(data_ngrams)
df_chords.head(10)

Unnamed: 0,chord,count
14,Dm7-G7-C,3344
75,G7-C-C,1630
132,A7-Dm7-G7,1506
37,Em7-A7-Dm7,1132
410,C-C-C,1059
179,D7-G7-C,838
85,G7-G7-C,826
71,Am7-Dm7-G7,663
74,C-G7-C,638
6,Gm7-C7-F,615


In [19]:
df_chords = pd.DataFrame.from_dict(df_chords)
df_chords.sort_values(by=['count'], ascending=False, inplace=True)
df_chords_top = df_chords.query('count > 100')

fig = px.bar(df_chords_top, x='chord', y='count', log_y=True)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

2021-11-13 18:55:19,828 : INFO : NumExpr defaulting to 2 threads.


# Generate Test Data

In [35]:
contrafacts = {
               "26-2": "Confirmation",
               "52nd Street Theme": "I Got Rhythm",
               "Ablution": "All The Things You Are",
               "Anthropology": "I Got Rhythm",
               "Ballade": "As Long As I Live",
               "Bright Mississippi": "Sweet Georgia Brown",
               "C.T.A.": "I Got Rhythm",
               "Celia": "I Got Rhythm",
               "Cottontail": "I Got Rhythm",
               "Countdown": "Tune Up",
               "Dewey Square": "Oh, Lady Be Good",
               "Dexterity": "I Got Rhythm",
               "Dig": "Sweet Georgia Brown",
               "Donna Lee": "Indiana (Back Home Again In)",
               "Don't Be That Way": "I Got Rhythm",
               "Eternal Triangle": "I Got Rhythm",
               "Evidence": "Just You, Just Me",
               "Flintstones": "I Got Rhythm",
               "Four On Six": "Summertime",
               "Freight Train": "Blues For Alice",
               "Good Bait": "I Got Rhythm",
               "Hackensack": "Oh, Lady Be Good",
               "Half Nelson": "Lady Bird",
               "Hot House": "What Is This Thing Called Love",
               "Impressions": "So What",
               "In A Mellow Tone (In A Mellotone)": "Rose Room",
               "In Walked Bud": "Blue Skies",
               "Ko Ko": "Cherokee",
               "Lennie's Pennies": "Pennies From Heaven",   ## Lennie's Pennies is in minor and therefore transposed to Amin... not possible to recognize like that
               # "Let's Call This": "Honeysuckle Rose",
               "Little Rootie Tootie": "I Got Rhythm",
               "Little Willie Leaps": "All God's Chillun Got Rhythm",
               "Lullaby Of Birdland": "Love Me Or Leave Me",
               "Moose The Mooche": "I Got Rhythm",
               "My Little Suede Shoes": "Jeepers Creepers",
               "Oleo": "I Got Rhythm",
               "Ornithology": "How High The Moon",
               "Passport": "I Got Rhythm",
               "Quasimodo (Theme)": "Embraceable You",
               "Rhythm-a-ning": "I Got Rhythm",
               "Room 608": "I Got Rhythm",
               "Salt Peanuts": "I Got Rhythm",
               "Satellite": "How High The Moon",
               "Scrapple From The Apple": "Honeysuckle Rose", # A section
               "Scrapple From The Apple": "I Got Rhythm", # B section
               "Segment": "I Got Rhythm",
               "Seven Come Eleven": "I Got Rhythm",
               "Shaw 'Nuff": "I Got Rhythm",
               "Theme, The": "I Got Rhythm",
               "Tour De Force": "Jeepers Creepers",
               "Wow": "You Can Depend On Me",
               "Yardbird Suite": "Rosetta",
               # following tunes are not from wikipedia:
               "Sweet Sue, Just You": "Honeysuckle Rose",  # A section
               # "All Of Me": "Pennies From Heaven", # bars 25-28 of All of Me are same as bars 17-20 of Pennies From Heaven, but different key!
}

contrafacts_test = []
for reference, compare in contrafacts.items():
  contrafacts_test.append([title_to_index[reference], title_to_index[compare]])
  

In [36]:
contrafacts_test

[[0, 251],
 [3, 1527],
 [30, 63],
 [80, 1527],
 [104, 1372],
 [190, 1136],
 [211, 1527],
 [222, 1527],
 [260, 1527],
 [262, 1219],
 [303, 894],
 [304, 1527],
 [307, 1136],
 [328, 604],
 [318, 1527],
 [360, 1527],
 [367, 670],
 [387, 1527],
 [404, 1726],
 [408, 168],
 [436, 1527],
 [447, 894],
 [448, 682],
 [474, 1269],
 [588, 1061],
 [590, 1677],
 [599, 1404],
 [677, 236],
 [699, 1660],
 [730, 1527],
 [733, 57],
 [761, 752],
 [829, 1527],
 [848, 651],
 [901, 1527],
 [925, 479],
 [940, 1527],
 [977, 352],
 [992, 1527],
 [996, 1527],
 [1007, 1527],
 [1011, 479],
 [1018, 1527],
 [1023, 1527],
 [1032, 1527],
 [1035, 1527],
 [1163, 1527],
 [1208, 651],
 [1304, 1800],
 [1306, 1679],
 [1138, 1515]]

# TF-IDF

## Build Model

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
data_tfidf = [" ".join(i) for i in data_ngrams]
data_tfidf[:5]

['C-Eb7-G# Eb7-G#-B7 G#-B7-E B7-E-G7 E-G7-Gm7 G7-Gm7-C7 Gm7-C7-F C7-F-G#7 F-G#7-C# G#7-C#-E7 C#-E7-Am7 E7-Am7-D7 Am7-D7-Dm7 D7-Dm7-G7 Dm7-G7-C G7-C-Eb7 C-Eb7-G# Eb7-G#-B7 G#-B7-E B7-E-G7 E-G7-Gm7 G7-Gm7-C7 Gm7-C7-F C7-F-Eb7 F-Eb7-G# Eb7-G#-B7 G#-B7-E B7-E-G7 E-G7-C G7-C-Gm7 C-Gm7-C7 Gm7-C7-Bm7 C7-Bm7-E7 Bm7-E7-A E7-A-C7 A-C7-F C7-F-Bbm7 F-Bbm7-Eb7 Bbm7-Eb7-G# Eb7-G#-Dm7 G#-Dm7-G7 Dm7-G7-C G7-C-Eb7 C-Eb7-G# Eb7-G#-B7 G#-B7-E B7-E-G7 E-G7-Gm7 G7-Gm7-C7 Gm7-C7-F C7-F-Eb7 F-Eb7-G# Eb7-G#-B7 G#-B7-E B7-E-G7 E-G7-C',
 'Am7-Am7-Cm7 Am7-Cm7-Cm7 Cm7-Cm7-Eb Cm7-Eb-Eb Eb-Eb-Em7 Eb-Em7-A7 Em7-A7-Dm7 A7-Dm7-Dm7 Dm7-Dm7-Bm7 Dm7-Bm7-Bm7 Bm7-Bm7-Bbm7 Bm7-Bbm7-Bbm7 Bbm7-Bbm7-Fm7 Bbm7-Fm7-Fm7 Fm7-Fm7-E7 Fm7-E7-E7 E7-E7-Fm7 E7-Fm7-Fm7 Fm7-Fm7-C# Fm7-C#-C# C#-C#-Fm7 C#-Fm7-Fm7 Fm7-Fm7-C# Fm7-C#-C#',
 'Am7-C#-Bm7 C#-Bm7-E7 Bm7-E7-Am7 E7-Am7-C# Am7-C#-Bm7 C#-Bm7-E7 Bm7-E7-Cm7 E7-Cm7-F7 Cm7-F7-Bb F7-Bb-G#m7 Bb-G#m7-C#7 G#m7-C#7-F#m7 C#7-F#m7-B7 F#m7-B7-E B7-E-E E-E-E7 E-E7-Am7 E7-Am7-C# Am7-C#-Bm7 C#-Bm7-E7 

In [24]:
tfidf_vectorizer = TfidfVectorizer()
dt = tfidf_vectorizer.fit_transform(data_tfidf)

In [25]:
type(dt)

scipy.sparse.csr.csr_matrix

In [84]:
df_cos = pd.DataFrame(cosine_similarity(tfidf_matrix),
                      columns=titles['title'],
                      index=titles['title']
                      )
df_cos

title,26-2,500 Miles High,502 Blues,52nd Street Theme,9.20 Special,A Ballad,A Beautiful Friendship,A Blossom Fell,A Certain Smile,A Child Is Born,A Felicidade,A Fine Romance,A Flower Is A Lovesome Thing,A Foggy Day,A Ghost Of A Chance,A Kiss To Build A Dream On,A Little Tear,A Lovely Way To Spend An Evening,A Night In Tunisia,A Nightingale Sang In Berkeley Square,A Pretty Girl Is Like A Melody,A Sleepin' Bee,A Smooth One,A Sound For Sore Ears,A Sunday Kind Of Love,A Taste Of Honey,A Time For Love,A Tisket A Tasket,A Weaver Of Dreams,A Wonderful Day Like Today,Ablution,Ac-Cent-Tchu-Ate The Positive,Across The Alley From The Alamo,Adam's Apple,Affirmation,"African Queen, The",Afro Blue,Afro Centric,After You've Gone,After You,...,When It's Sleepy Time Down South,When My Dreamboat Comes Home,When Somebody Thinks You're Wonderful,When The Midnight Choo Choo Leaves For Alabam,When The Moon Comes Over The Mountain,When The Saints Go Marching In,When You're Smiling,Whispering,"White Cliffs Of Dover, The","White Sport Coat And A Pink Carnation, A",Who's Sorry Now,Why Don't You Go Down To New Orleans,Willie The Weeper,Winin' Boy Blues,Wish Me Luck As You Wave Me Goodbye,"Without You For An Inspiration, Dear",Wooden Heart,Working Man Blues,Wrap Your Troubles In Dreams,Yama Yama Man,Yearning,Yellow Dog Blues,"Yes Sir, That's My Baby",Yes! We Have No Bananas,You Always Hurt The One You Love,You Are My Sunshine,You Belong To Me,You Broke Your Promise,You Can Depend On Me,You Meet The Nicest People In Your Dreams,You Must Have Been A Beautiful Baby,"You Tell Me Your Dreams, I'll Tell You Mine",You Were Only Fooling,You're A Real Sweetheart,You're Driving Me Crazy,You're Foolin' Someone,You're Lucky To Me,You're Nobody Till Somebody Loves You,Your Feet's Too Big,Zing Went The Strings Of My Heart
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
26-2,1.000000,0.111764,0.213623,0.321292,0.193126,0.247102,0.268104,0.327810,0.314778,0.169535,0.388903,0.259668,0.201880,0.402508,0.386387,0.257777,0.177972,0.467172,0.090736,0.365390,0.718987,0.299512,0.459669,0.338969,0.319770,0.044873,0.303548,0.271070,0.389645,0.330207,0.535083,0.334294,0.334947,0.304290,0.330155,0.054397,0.114383,0.085960,0.289932,0.417576,...,0.407979,0.371576,0.397886,0.300931,0.345253,0.308970,0.145367,0.407089,0.288436,0.140083,0.201541,0.367471,0.177561,0.164976,0.275750,0.300035,0.325527,0.317329,0.274141,0.351894,0.394415,0.271571,0.345896,0.536116,0.162030,0.391773,0.159949,0.155375,0.272049,0.311482,0.252204,0.167375,0.240757,0.362132,0.357766,0.430585,0.215786,0.146579,0.102567,0.115419
500 Miles High,0.111764,1.000000,0.308800,0.111728,0.126774,0.206469,0.135857,0.075321,0.195516,0.134072,0.220334,0.115953,0.017066,0.119592,0.385305,0.355685,0.145446,0.164412,0.143589,0.320076,0.110700,0.160836,0.036370,0.135843,0.397566,0.057898,0.287456,0.131034,0.224478,0.157595,0.266671,0.081658,0.232776,0.052591,0.128760,0.057787,0.220494,0.237518,0.155346,0.262432,...,0.065289,0.069538,0.055706,0.085088,0.032694,0.000000,0.038057,0.053720,0.039654,0.000000,0.107854,0.000000,0.081755,0.047846,0.084146,0.066939,0.000000,0.000000,0.116414,0.016613,0.000000,0.012402,0.000000,0.024022,0.038379,0.000000,0.015242,0.025631,0.114438,0.018898,0.077826,0.015189,0.145656,0.003525,0.024538,0.030911,0.133645,0.113945,0.000000,0.000000
502 Blues,0.213623,0.308800,1.000000,0.168464,0.235336,0.343072,0.115551,0.000000,0.287100,0.359525,0.533636,0.192603,0.000000,0.099193,0.177529,0.105393,0.051666,0.243559,0.277577,0.591927,0.210666,0.280767,0.000000,0.416900,0.082158,0.142315,0.712924,0.037231,0.433704,0.192368,0.384732,0.000000,0.095170,0.066069,0.246530,0.203470,0.532898,0.529578,0.209625,0.213792,...,0.287009,0.000000,0.064644,0.052608,0.069586,0.000000,0.000000,0.129609,0.028994,0.000000,0.195063,0.000000,0.225196,0.060936,0.164679,0.000000,0.000000,0.000000,0.262041,0.057233,0.116145,0.146671,0.000000,0.139404,0.010103,0.000000,0.000000,0.066227,0.043122,0.000000,0.120359,0.000000,0.193710,0.000000,0.198373,0.069863,0.028315,0.101343,0.199959,0.000000
52nd Street Theme,0.321292,0.111728,0.168464,1.000000,0.160205,0.186956,0.566583,0.729172,0.884869,0.399836,0.727814,0.563742,0.247791,0.716143,0.537907,0.654375,0.525248,0.796998,0.138078,0.501783,0.514722,0.471106,0.555711,0.315821,0.621108,0.209999,0.472217,0.614731,0.607049,0.735095,0.626329,0.797582,0.803878,0.171735,0.839449,0.393684,0.405467,0.000000,0.455546,0.808095,...,0.296463,0.736029,0.562478,0.547797,0.621673,0.457937,0.253665,0.509936,0.467971,0.289720,0.286885,0.550638,0.121057,0.301264,0.503204,0.641155,0.625866,0.598792,0.290865,0.724855,0.335390,0.296935,0.638542,0.321047,0.277113,0.327152,0.242761,0.277483,0.551278,0.562602,0.369179,0.344635,0.369295,0.446082,0.409130,0.665478,0.583389,0.328288,0.212426,0.239046
9.20 Special,0.193126,0.126774,0.235336,0.160205,1.000000,0.135049,0.203359,0.365849,0.104081,0.059001,0.123240,0.355479,0.326204,0.258383,0.393789,0.417615,0.114992,0.214989,0.101912,0.120123,0.223669,0.357073,0.319753,0.125477,0.202413,0.003517,0.062929,0.210759,0.209834,0.226687,0.213862,0.186212,0.145307,0.520538,0.098898,0.151268,0.000000,0.000000,0.227577,0.246179,...,0.077022,0.264253,0.190822,0.249485,0.303037,0.273816,0.152942,0.149164,0.276407,0.074179,0.127769,0.319835,0.077755,0.123488,0.109832,0.229580,0.205894,0.197571,0.060273,0.232147,0.088574,0.544272,0.231408,0.343886,0.224607,0.493226,0.158661,0.071046,0.351263,0.243849,0.282693,0.106926,0.177941,0.421222,0.097663,0.204996,0.157823,0.091347,0.289611,0.063130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You're Foolin' Someone,0.430585,0.030911,0.069863,0.665478,0.204996,0.263134,0.385087,0.793854,0.634384,0.185196,0.323897,0.532458,0.238292,0.541823,0.503090,0.788578,0.430199,0.511749,0.055666,0.309220,0.466865,0.545563,0.543811,0.438278,0.470842,0.000000,0.213426,0.593344,0.237790,0.691034,0.485064,0.847358,0.854198,0.055887,0.251442,0.019663,0.000000,0.000000,0.271119,0.613413,...,0.467867,0.674546,0.692306,0.572908,0.571270,0.597235,0.311276,0.735008,0.489120,0.394702,0.347951,0.700406,0.226282,0.434789,0.625287,0.746786,0.886350,0.759710,0.314426,0.769071,0.766897,0.383266,0.871470,0.514202,0.367779,0.302726,0.345367,0.378031,0.581923,0.711166,0.399262,0.381065,0.483752,0.522247,0.693664,1.000000,0.588676,0.307200,0.306577,0.344995
You're Lucky To Me,0.215786,0.133645,0.028315,0.583389,0.157823,0.546186,0.833700,0.657263,0.683722,0.124012,0.453799,0.642657,0.279912,0.738875,0.635761,0.578758,0.856885,0.591674,0.121851,0.307095,0.590535,0.648442,0.550924,0.257199,0.844643,0.072705,0.274393,0.893020,0.404737,0.885198,0.480229,0.777067,0.643120,0.054181,0.333628,0.011539,0.000000,0.112691,0.582555,0.601993,...,0.265784,0.638929,0.634721,0.694935,0.363277,0.350481,0.384628,0.495281,0.319899,0.243881,0.524680,0.421898,0.150383,0.469519,0.440951,0.807574,0.520145,0.490050,0.333235,0.488054,0.284054,0.238092,0.523437,0.349878,0.319088,0.177652,0.318195,0.233579,0.731382,0.622156,0.482311,0.418664,0.309630,0.343836,0.415567,0.588676,1.000000,0.591584,0.179912,0.202457
You're Nobody Till Somebody Loves You,0.146579,0.113945,0.101343,0.328288,0.091347,0.317722,0.515517,0.360153,0.454522,0.164177,0.304303,0.624036,0.186249,0.438626,0.373762,0.317867,0.516809,0.345664,0.165065,0.201599,0.417631,0.492494,0.355361,0.303728,0.496742,0.064867,0.285229,0.509300,0.298602,0.548415,0.318993,0.436304,0.387732,0.030167,0.199152,0.012644,0.000000,0.057571,0.383760,0.412229,...,0.204658,0.390564,0.501948,0.626229,0.205259,0.175159,0.790526,0.491238,0.180700,0.749796,0.729457,0.222080,0.241641,0.840929,0.326220,0.624089,0.259951,0.290586,0.343093,0.281852,0.141961,0.124206,0.274016,0.304405,0.607659,0.088784,0.555828,0.751180,0.420214,0.376250,0.400944,0.468562,0.421648,0.516142,0.516519,0.307200,0.591584,1.000000,0.089914,0.510913
Your Feet's Too Big,0.102567,0.000000,0.199959,0.212426,0.289611,0.049791,0.121831,0.234787,0.187964,0.071634,0.141421,0.127261,0.087483,0.169434,0.126573,0.243095,0.111329,0.156114,0.009418,0.058348,0.242116,0.549205,0.193387,0.169526,0.133743,0.000000,0.030517,0.177419,0.067261,0.273643,0.114280,0.273451,0.300425,0.101465,0.059192,0.047177,0.000000,0.000000,0.079353,0.210751,...,0.090495,0.211131,0.185256,0.193590,0.194751,0.231010,0.081841,0.156491,0.123650,0.152671,0.134587,0.270917,0.067578,0.168176,0.227247,0.260894,0.342840,0.293856,0.080647,0.297477,0.187227,0.770782,0.337084,0.112360,0.075358,0.117094,0.098609,0.146222,0.153162,0.275079,0.125489,0.147396,0.187115,0.190590,0.174480,0.306577,0.179912,0.089914,1.000000,0.133444


## Test of the Contrafacts tunes

In [49]:
t1 = []
t2 = []
simval = []
for tune1, tune2 in contrafacts_test:
  t1.append(index_to_title[tune1])
  t2.append(index_to_title[tune2])
  simval.append(cosine_similarity(tfidf_matrix[tune1], tfidf_matrix[tune2])[0][0])
  

df_eval_tfidf = pd.DataFrame(zip(t1, t2, simval), columns=['tune1', 'tune2', 'cos_sim'])  
df_eval_tfidf

Unnamed: 0,tune1,tune2,cos_sim
0,26-2,Confirmation,0.574305
1,52nd Street Theme,I Got Rhythm,0.825817
2,Ablution,All The Things You Are,0.87981
3,Anthropology,I Got Rhythm,0.784389
4,Ballade,As Long As I Live,0.647502
5,Bright Mississippi,Sweet Georgia Brown,0.968172
6,C.T.A.,I Got Rhythm,0.704302
7,Celia,I Got Rhythm,0.486457
8,Cottontail,I Got Rhythm,0.912508
9,Countdown,Tune Up,0.712231


# Doc2Vec Embeddings

## Read and Tag Data

In [None]:
import smart_open

def read_corpus(fname, ngram_size, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = ngrams(line.split(), ngram_size, '-')
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

## Doc2Vec Hyperparameters

In [None]:
!rm -R models_doc2vec
!rm -R traindata_doc2vec

rm: cannot remove 'models_doc2vec': No such file or directory
rm: cannot remove 'traindata_doc2vec': No such file or directory


In [None]:
!mkdir models_doc2vec
!mkdir traindata_doc2vec

In [None]:
model_path = './models_doc2vec'
model_prefix = 'autos'
data_path = './traindata_doc2vec'

param_grid = {'d2v': {'variant': [#'pv-dm', 
                                  'pv-dbow'], 
                      'window': [
                                 #2, 3, 
                                 4, 
                                 #10
                                 ],
                      'sample': [0.1],
                      'ngrams': [
                                 #1, 2, 
                                 3, 
                                 #4
                                 ],
                     }
              }
size = 100
epochs = 50

for algo, params in param_grid.items():
    for variant in params['variant']:
        dm = 1 if variant == 'pv-dm' else 0
        for ngram_size in params['ngrams']:
            
            # prepare data with the provided ngram size
            train_corpus = list(read_corpus(path_to_file, ngram_size=ngram_size))

            # evaluate different window sizes
            for window in params['window']:

                # evaluate the effect of different sample sizes
                for sample in params['sample']:

                    # calculate the model
                    if algo == 'd2v':
                        model = gensim.models.doc2vec.Doc2Vec(vector_size=size, 
                                                              window=window,
                                                              dm = dm,
                                                              min_count=5, 
                                                              sample = sample, # threshold for configuring which higher-frequency words are randomly downsampled
                                                              epochs=epochs,
                                                              )
                        model.build_vocab(train_corpus)
                        model.train(train_corpus, 
                                    total_examples=model.corpus_count, 
                                    epochs=model.epochs)

                    else:
                        model = gensim.models.FastText(data_ngrams, vector_size=size, window=window, sg=sg, epochs=50)

                    file_name = f"{model_path}/{model_prefix}_{algo}_{variant}_{ngram_size}_{window}_{sample}"
                    model.save(file_name + '.model')

                    file_name = f"{data_path}/{model_prefix}_{algo}_{variant}_{ngram_size}_{window}_{sample}"
                    with open(file_name + '.train', 'wb') as f:
                          # Pickle the 'data' dictionary using the highest protocol available.
                          pickle.dump(train_corpus, f, pickle.HIGHEST_PROTOCOL)

2021-11-10 09:06:49,470 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dbow,d100,n5,mc5,s0.1,t3)', 'datetime': '2021-11-10T09:06:49.470749', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'created'}
2021-11-10 09:06:49,472 : INFO : collecting all words and their counts
2021-11-10 09:06:49,473 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-11-10 09:06:49,506 : INFO : collected 13442 word types and 2046 unique tags from a corpus of 2046 examples and 89538 words
2021-11-10 09:06:49,508 : INFO : Creating a fresh vocabulary
2021-11-10 09:06:49,530 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2440 unique words (18.152060705252193%% of original 13442, drops 11002)', 'datetime': '2021-11-10T09:06:49.530773', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.

In [None]:
from gensim.models import KeyedVectors

from os import listdir
from os.path import isfile, join

names = [f for f in listdir(model_path) if isfile(join(model_path, f))]
print(names)
models = {}

for name in names:
    file_name = f"{model_path}/{name}"
    print(file_name)
    models[name] = Doc2Vec.load(file_name)

2021-11-10 09:07:01,752 : INFO : loading Doc2Vec object from ./models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model
2021-11-10 09:07:01,760 : INFO : loading dv recursively from ./models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model.dv.* with mmap=None
2021-11-10 09:07:01,761 : INFO : loading wv recursively from ./models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model.wv.* with mmap=None
2021-11-10 09:07:01,763 : INFO : setting ignored attribute cum_table to None
2021-11-10 09:07:01,803 : INFO : Doc2Vec lifecycle event {'fname': './models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model', 'datetime': '2021-11-10T09:07:01.803874', 'gensim': '4.1.2', 'python': '3.7.12 (default, Sep 10 2021, 00:21:48) \n[GCC 7.5.0]', 'platform': 'Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic', 'event': 'loaded'}


['autos_d2v_pv-dbow_3_4_0.1.model']
./models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model


In [None]:
names = [f for f in listdir(data_path) if isfile(join(data_path, f))]
print(names)
data = {}

for name in names:
    file_name = f"{data_path}/{name}"
    print(file_name)
    with open(file_name, 'rb') as f:
        data[name] = pickle.load(f)

['autos_d2v_pv-dbow_3_4_0.1.train']
./traindata_doc2vec/autos_d2v_pv-dbow_3_4_0.1.train


In [None]:
for name, m in models.items():
  print(m)

Doc2Vec(dbow,d100,n5,mc5,s0.1,t3)


In [None]:
def compare_doc_models(models, **kwargs):

    passes = 3

    df = pd.DataFrame()
    names = []
    scores = []
    runs = []

    for p in range(passes):
      print(f"\n*** Run {p}")
      for name, model in models.items():
        ranks = []
        second_ranks = []
        similarities = []

        print(name)

        # load the training corpus data
        train_name = os.path.splitext(name)[0] + '.train'
        train_corpus = data[train_name]
        print(train_corpus[:5])

        # use the model to infer the similarity for a tune
        for doc_id in range(len(train_corpus)):
            inferred_vector = model.infer_vector(train_corpus[doc_id].words)
            sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
            similarities.append(sims)
            rank = [docid for docid, sim in sims].index(doc_id)
            ranks.append(rank)

            second_ranks.append(sims[1])

        counter = Counter(ranks)
        print(counter)

        tunes_sim = counter[0]
        tunes_not_sim = len(train_corpus) - counter[0]
        score = 100 * tunes_not_sim/(tunes_sim+tunes_not_sim)

        print(f"\tTunes similar to themselves in first place: {tunes_sim}")
        print(f"\tTunes not similar in first place: {tunes_not_sim}")
        print(f"\t% of tunes not similar to themselves in first place: {score:.2f}%")

        names.append(name)
        scores.append(score)
        runs.append(p)
      
    df = pd.DataFrame({'name': names, 
                        'score': scores,
                        'pass': runs
                      })

    return df

In [None]:
df_score = compare_doc_models(models)


*** Run 0
autos_d2v_pv-dbow_3_4_0.1.model
[TaggedDocument(words=['CM7-Eb7-G#M7', 'Eb7-G#M7-B7', 'G#M7-B7-EM7', 'B7-EM7-G7', 'EM7-G7-Gm7', 'G7-Gm7-C7', 'Gm7-C7-FM7', 'C7-FM7-G#7', 'FM7-G#7-C#M7', 'G#7-C#M7-E7', 'C#M7-E7-Am7', 'E7-Am7-D7', 'Am7-D7-Dm7', 'D7-Dm7-G7', 'Dm7-G7-CM7', 'G7-CM7-Eb7', 'CM7-Eb7-G#M7', 'Eb7-G#M7-B7', 'G#M7-B7-EM7', 'B7-EM7-G7', 'EM7-G7-Gm7', 'G7-Gm7-C7', 'Gm7-C7-FM7', 'C7-FM7-Eb7', 'FM7-Eb7-G#M7', 'Eb7-G#M7-B7', 'G#M7-B7-EM7', 'B7-EM7-G7', 'EM7-G7-CM7', 'G7-CM7-Gm7', 'CM7-Gm7-C7', 'Gm7-C7-Bm7', 'C7-Bm7-E7', 'Bm7-E7-AM7', 'E7-AM7-C7', 'AM7-C7-FM7', 'C7-FM7-Bbm7', 'FM7-Bbm7-Eb7', 'Bbm7-Eb7-G#M7', 'Eb7-G#M7-Dm7', 'G#M7-Dm7-G7', 'Dm7-G7-CM7', 'G7-CM7-Eb7', 'CM7-Eb7-G#M7', 'Eb7-G#M7-B7', 'G#M7-B7-EM7', 'B7-EM7-G7', 'EM7-G7-Gm7', 'G7-Gm7-C7', 'Gm7-C7-FM7', 'C7-FM7-Eb7', 'FM7-Eb7-G#M7', 'Eb7-G#M7-B7', 'G#M7-B7-EM7', 'B7-EM7-G7', 'EM7-G7-CM7'], tags=[0]), TaggedDocument(words=['Am7-Am7-Cm7', 'Am7-Cm7-Cm7', 'Cm7-Cm7-EbM7', 'Cm7-EbM7-EbM7', 'EbM7-EbM7-Em7b5', 'EbM7-Em7b5-A

In [None]:
pd.pivot_table(df_score, index = 'name', values = 'score', columns='pass')

pass,0,1,2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
autos_d2v_pv-dbow_3_4_0.1.model,2.932551,2.932551,2.737048


In [None]:
import time
t = time.localtime()
timestamp = time.strftime('%Y%m%d_%H%M%S', t)
print(timestamp)

20211110_090824


In [None]:
score_filename = f'score_{timestamp}.csv'
df_score.to_csv(score_filename)

In [None]:
 !zip -r models.zip models_doc2vec/ 

  adding: models_doc2vec/ (stored 0%)
  adding: models_doc2vec/autos_d2v_pv-dbow_3_4_0.1.model (deflated 10%)


In [None]:
from google.colab import files
files.download("models.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
files.download(score_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Assess the best Model

In [None]:
best_model = 'autos_d2v_pv-dbow_3_4_0.1'

In [None]:
model = models[f"{best_model}.model"]

#### Assess self-similarity

In [None]:
ranks = []
second_ranks = []
similarities = []

train_corpus = data[f"{best_model}.train"]

for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    similarities.append(sims)
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
import collections

counter = collections.Counter(ranks)
print(counter)

tunes_sim = counter[0]
tunes_not_sim = len(train_corpus) - counter[0]

print()
print(f"Tunes similar to themselves in first place: {tunes_sim}")
print(f"Tunes not similar in first place: {tunes_not_sim}")
print(f"% of tunes not similar to themselves in first place: {100 * tunes_not_sim/(tunes_sim + tunes_not_sim):.2f}%")

Counter({0: 1988, 1: 38, 2: 5, 3: 3, 7: 2, 4: 2, 372: 1, 1730: 1, 824: 1, 2004: 1, 1646: 1, 5: 1, 157: 1, 1522: 1})

Tunes similar to themselves in first place: 1988
Tunes not similar in first place: 58
% of tunes not similar to themselves in first place: 2.83%


In [None]:
print('Tunes that are similar to other tunes before they are similar to themselves:')
for tune_index, value in enumerate(ranks):
  if value > 0:
    print(f"'{titles.iloc[tune_index, 0]}': {value}")

Tunes that are similar to other tunes before they are similar to themselves:
'Anthropology': 7
'Arise, Her Eyes': 372
'Bags' Groove': 1
'Basin Street Blues': 1
'Bessie's Blues': 1
'Billie's Bounce': 3
'Blue Monk': 2
'Blue Room, The': 1
'Blue Skies': 1
'Blue Sphere': 1
'Bye Bye Blues': 1
'Creole Love Call': 1
'Cyclic Episode': 2
'Get Happy': 1
'I Can't Give You Anything But Love': 1
'Ko Ko': 1
'Lazy River': 1
'Moose The Mooche': 7
'Mr. P.C.': 1
'No Moe': 3
'Now's The Time': 3
'Oleo': 1
'Onmo': 1730
'Pee Wee': 824
'Pinocchio': 2004
'Re Person I Knew': 1646
'Rosetta': 1
'Salt Peanuts': 2
'Shaw 'Nuff': 5
'So What': 1
'Sorcerer, The': 157
'St. Louis Blues': 2
'Struttin' With Some Barbecue': 1
'Sway': 1
'Sweet Georgia Brown': 1
'Tenor Madness': 1
'Theme, The': 4
'Things Ain't What They Used To Be': 1
'Toy Tune': 1522
'Trane’s Blues': 1
'Walkin'': 4
'You Can Depend On Me': 1
'Birth Of The Blues, The': 1
'C'est Si Bon': 1
'Five Foot Two': 1
'Honeysuckle Rose': 1
'I Got Rhythm': 1
'I Scream, Yo

#### Determine Similarity of Tunes

In [None]:
assert(len(model.dv.vectors == len(list(title_to_index.keys()))))

In [None]:
tunes_eval_list = [
  'These Foolish Things', 
  'Blue Moon',
  'Hundred Years From Today, A',
  "All God's Chillun Got Rhythm",
  'I Got Rhythm',
  'Bye Bye Blackbird',
  'Old Fashioned Love',
  'Dinah',
  'Honeysuckle Rose'
]

In [None]:
for tune in tunes_eval_list:
  sim = model.dv.most_similar(title_to_index[tune])

  print()
  print(f"'{tune}' is most similar to:")
  for index, value in sim:
    print(f'\t{value:.3f}: {index_to_title[index]}')



'These Foolish Things' is most similar to:
	0.748: Isn't It A Pity
	0.745: Mimi
	0.703: It's A Lovely Day Today
	0.702: A Foggy Day
	0.700: They Can't Take That Away From Me
	0.698: Long Ago And Far Away
	0.687: Heart And Soul
	0.681: Star-Crossed Lovers, The
	0.680: At Last
	0.679: Wouldn't It Be Loverly

'Blue Moon' is most similar to:
	0.724: Flamingo
	0.722: At Last
	0.709: My Heart Stood Still
	0.707: Jeepers Creepers
	0.687: Sophisticated Lady
	0.687: Teach Me Tonight
	0.682: It Could Happen To You
	0.680: Long Ago And Far Away
	0.679: Misty
	0.676: Here's That Rainy Day

'Hundred Years From Today, A' is most similar to:
	0.636: Chicken
	0.617: Love Is The Sweetest Thing
	0.595: I May Be Wrong (But I Think You're Wonderful)
	0.591: Nightingale Sang In Berkeley Square , A
	0.589: When Somebody Thinks You're Wonderful
	0.583: Save It Pretty Mama
	0.580: Aged And Mellow Blues
	0.578: White Cliffs Of Dover, The
	0.563: Float Me Down The River
	0.562: That's My Weakness Now

'All God

#### Evaluate Similarity for Jazz Contrafacts

A contrafact is a musical composition built using the chord progression of a pre-existing song, but with a new melody and arrangement. Typically the original tune's progression and song form will be reused but occasionally just a section will be reused in the new composition.

https://en.wikipedia.org/wiki/List_of_jazz_contrafacts


In [None]:
contrafacts = {
               "26-2": "Confirmation",
               "52nd Street Theme": "I Got Rhythm",
               "Ablution": "All The Things You Are",
               "Anthropology": "I Got Rhythm",
               "Ballade": "As Long As I Live",
               "Bright Mississippi": "Sweet Georgia Brown",
               "C.T.A.": "I Got Rhythm",
               "Celia": "I Got Rhythm",
               "Cottontail": "I Got Rhythm",
               "Countdown": "Tune Up",
               "Dewey Square": "Oh, Lady Be Good",
               "Dexterity": "I Got Rhythm",
               "Dig": "Sweet Georgia Brown",
               "Donna Lee": "Indiana (Back Home Again In)",
               "Don't Be That Way": "I Got Rhythm",
               "Eternal Triangle": "I Got Rhythm",
               "Evidence": "Just You, Just Me",
               "Flintstones": "I Got Rhythm",
               "Four On Six": "Summertime",
               "Freight Train": "Blues For Alice",
               "Good Bait": "I Got Rhythm",
               "Hackensack": "Oh, Lady Be Good",
               "Half Nelson": "Lady Bird",
               "Hot House": "What Is This Thing Called Love",
               "Impressions": "So What",
               "In A Mellow Tone (In A Mellotone)": "Rose Room",
               "In Walked Bud": "Blue Skies",
               "Ko Ko": "Cherokee",
               "Lennie's Pennies": "Pennies From Heaven",   ## Lennie's Pennies is in minor and therefore transposed to Amin... not possible to recognize like that
               # "Let's Call This": "Honeysuckle Rose",
               "Little Rootie Tootie": "I Got Rhythm",
               "Little Willie Leaps": "All God's Chillun Got Rhythm",
               "Lullaby Of Birdland": "Love Me Or Leave Me",
               "Moose The Mooche": "I Got Rhythm",
               "My Little Suede Shoes": "Jeepers Creepers",
               "Oleo": "I Got Rhythm",
               "Ornithology": "How High The Moon",
               "Passport": "I Got Rhythm",
               "Quasimodo (Theme)": "Embraceable You",
               "Rhythm-a-ning": "I Got Rhythm",
               "Room 608": "I Got Rhythm",
               "Salt Peanuts": "I Got Rhythm",
               "Satellite": "How High The Moon",
               "Scrapple From The Apple": "Honeysuckle Rose", # A section
               "Scrapple From The Apple": "I Got Rhythm", # B section
               "Segment": "I Got Rhythm",
               "Seven Come Eleven": "I Got Rhythm",
               "Shaw 'Nuff": "I Got Rhythm",
               "Theme, The": "I Got Rhythm",
               "Tour De Force": "Jeepers Creepers",
               "Wow": "You Can Depend On Me",
               "Yardbird Suite": "Rosetta",
               # following tunes are not from wikipedia:
               "Sweet Sue, Just You": "Honeysuckle Rose",  # A section
               # "All Of Me": "Pennies From Heaven", # bars 25-28 of All of Me are same as bars 17-20 of Pennies From Heaven, but different key!
}

contrafacts_test = []
for reference, compare in contrafacts.items():
  contrafacts_test.append([title_to_index[reference], title_to_index[compare]])
  

In [None]:
most_sim = model.dv.most_similar(1)
for i in most_sim:
  if 247 == i[0]:
    print("found!")

found!


In [None]:
similarity_found = 0

for tune1, tune2 in contrafacts.items():
  index1 = title_to_index[tune1]
  index2 = title_to_index[tune2]

  sim = model.dv.similarity(index1, index2)
  most_sim = model.dv.most_similar(index1, topn=5)
  
  # check whether the index2 was found among the topn similar tunes
  for i in most_sim:
    if index2 == i[0]:
      similarity_found += 1

  print()
  print(f"'{tune1}' to '{tune2}': {sim}")
  print(f"'{tune1}' is most similar to:")
  for index, value in most_sim:
    print(f'\t{value:.3f}: {index_to_title[index]}')


print()
num_tunes = len(contrafacts)
print(f"Number of tunes tested: {num_tunes}")
print(f"Number of similarities found: {similarity_found}")
print(f"Result: {100 * similarity_found / num_tunes}%")


'26-2' to 'Confirmation': 0.5461316704750061
'26-2' is most similar to:
	0.805: Giant Steps
	0.785: Countdown
	0.779: Satellite
	0.734: I Won't Dance
	0.729: Lady Bird

'52nd Street Theme' to 'I Got Rhythm': 0.6068105101585388
'52nd Street Theme' is most similar to:
	0.816: I've Got My Fingers Crossed
	0.755: Cherry Pink And Apple Blossom White
	0.723: One Sweet Letter From You (alt.)
	0.714: Ol' Man River
	0.713: Stranger On The Shore

'Ablution' to 'All The Things You Are': 0.8708407282829285
'Ablution' is most similar to:
	0.871: All The Things You Are
	0.711: Dolphin, The
	0.709: So Tender
	0.704: Tune Up
	0.698: Unrequited

'Anthropology' to 'I Got Rhythm': 0.6775551438331604
'Anthropology' is most similar to:
	0.998: Passport
	0.998: No Moe
	0.998: Shaw 'Nuff
	0.998: Oleo
	0.998: Theme, The

'Ballade' to 'As Long As I Live': 0.37266454100608826
'Ballade' is most similar to:
	0.723: September In The Rain
	0.677: After You've Gone
	0.639: Crazy Rhythm
	0.613: Under A Blanket Of Bl

In [None]:
model.dv.similarity(0, 1)

0.35312873

#### Download Model

In [None]:
from google.colab import files
files.download(f"{model_path}/{best_model}.model")
files.download(f"{data_path}/{best_model}.train")