#  MelodyMaster
## Notebook03 - song2vec & Multiclass
### Idan Kashani & Or Raphael Bidusa

# Shalom again!
Today we will examine another representation of the data - song2vec, which is based on the word2vec representation.
We will use this representation with several different models, hoping to get a better result than last time with our simple k-NN.

In [203]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import math
import ast
import warnings
from collections import Counter
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings that may point at real problems.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

Loading the dataset and getting the vocabulary.

In [204]:
train_df = pd.read_csv('../train.csv')
print(f'train shape: {train_df.shape}')
# Parse every 'lyrics' cell exactly once (presumably the string form of a list
# of tokens - TODO confirm). Order and repetitions are kept because the
# per-song embeddings computed later iterate over all tokens of a song.
train_df['lyrics_as_list'] = train_df['lyrics'].map(lambda l: ast.literal_eval(str(l)))
# The vocabulary only needs the *distinct* tokens, hence the set union over
# all songs; sorting makes the resulting list deterministic.
vocabulary = sorted(set().union(*train_df['lyrics_as_list']))
print(f"Vocabulary length is: {len(vocabulary)}")

train shape: (8496, 6)
Vocabulary length is: 62178


In [205]:
with open('../words_list_w2v.txt', encoding='utf-8') as f:
    words = f.read().split('\n')
# A file that ends with a newline yields one trailing empty entry. Drop it only
# when it is really there, so a genuine last word is never thrown away.
if words and words[-1] == '':
    words.pop()
vectors = np.load('../words_vectors_w2v.npy')
# Every later step pairs words[i] with vectors[i], so the two must line up.
assert len(words) == len(vectors), "word list and vector matrix are misaligned"

# Strip the tag prefix from the words that carry one
# (presumably part-of-speech tags - TODO confirm).
TAG_PREFIXES = ('NN_', 'VB_', 'JJ_')
words = [w[3:] if len(w) > 3 and w.startswith(TAG_PREFIXES) else w for w in words]

# Sanity check: print any word that still starts with a tag prefix.
for w in words:
    if len(w) > 3 and w.startswith(TAG_PREFIXES):
        print(w)


In [206]:
# Count how many entries of `words` are equal to this particular word.
occurrences = words.count('פסיכותרפיסט')
print(occurrences)

2


In [207]:
from collections import defaultdict

def list_duplicates(seq):
    """Find the items that occur more than once in ``seq``.

    Args:
        seq: Iterable of hashable items.

    Returns:
        A generator of ``(item, positions)`` pairs, one per item that appears
        at least twice, where ``positions`` is the ascending list of indices
        at which the item occurs.
    """
    positions_by_item = defaultdict(list)
    for position, element in enumerate(seq):
        positions_by_item[element].append(position)
    return (
        (element, positions)
        for element, positions in positions_by_item.items()
        if len(positions) >= 2
    )

# (word, positions) pairs for every word with several entries, ordered by word.
dups_indx = sorted(list_duplicates(words))
print(f"There are {len(dups_indx)} types with multiple entries")

There are 4123 types with multiple entries


In [208]:
from collections import OrderedDict

# Keep one entry per word (first occurrence wins, original order preserved).
# NOTE: this cell overwrites `words` and `vectors`, while `dups_indx` refers to
# positions in the *original* lists - run it only once per kernel session.
words = list(OrderedDict.fromkeys(words))
vectors_without_dups = vectors.copy()
to_delete = []
for _, positions in dups_indx:
    positions = sorted(positions)
    # axis=0 averages the duplicate rows component-wise. Without it np.mean
    # collapses all values into one scalar that is then broadcast over the row.
    vectors_without_dups[positions[0]] = np.mean(vectors[positions], axis=0)
    # The first occurrence now holds the merged vector; the rest are dropped.
    to_delete.extend(positions[1:])
vectors = np.delete(vectors_without_dups, to_delete, 0)
# words[i] must still correspond to vectors[i] after the merge.
assert len(words) == len(vectors), "words and vectors went out of sync"

In [209]:
# Number of words and number of vectors, one per line.
print(len(words), len(vectors), sep="\n")

340419
340419


In [210]:
# Forward lookup: word -> its row of `vectors`.
w2v = {word: vec for word, vec in zip(words, vectors)}
# Reverse lookup: exact vector (as a hashable tuple) -> word.
v2w_rep = {tuple(vec): word for vec, word in zip(vectors, words)}

In [211]:
def v2w(vector):
    """Return the word stored in ``v2w_rep`` for exactly this vector.

    The vector is converted to a tuple and used as a dictionary key, so the
    lookup only succeeds for an exact match; any other vector raises
    ``KeyError``.
    """
    key = tuple(vector)
    return v2w_rep[key]

# Round trip: word -> vector -> word.
ahavti_vec = w2v["אהבתיה"]
for shown in (ahavti_vec, v2w(ahavti_vec)):
    print(shown)

[-0.047849 -0.237195  0.296635 -0.014212  0.022151  0.172562 -0.081755
  0.131053 -0.049191  0.166546  0.046406  0.112231 -0.065383 -0.154182
  0.11132   0.357974  0.056559 -0.197837  0.122658  0.065558 -0.176146
  0.124626  0.266774 -0.331449  0.055231 -0.08458   0.033744  0.081236
  0.169651  0.221381  0.004441 -0.244207  0.080133  0.244274 -0.09737
 -0.035503  0.162789  0.126568  0.081791 -0.083357  0.059758  0.087832
  0.079024  0.146633  0.105375  0.160128 -0.129235 -0.158333  0.173166
  0.037226 -0.071079  0.01272  -0.014435 -0.002203  0.242982 -0.220937
  0.001111 -0.070437  0.194238  0.162719  0.067988 -0.159687  0.061967
  0.025371 -0.174582  0.178798  0.029152  0.025532  0.016066  0.246695
 -0.056776 -0.389847 -0.053492 -0.16045  -0.321974 -0.153765  0.041302
 -0.010492 -0.284652  0.126697  0.068541  0.001509 -0.023024 -0.059568
  0.10348   0.244634  0.061512 -0.074103  0.096155  0.170907 -0.092507
  0.150437 -0.040386  0.103263  0.153118  0.073103 -0.051595  0.286848
 -0.036

In [212]:
from scipy.spatial import cKDTree
# Nearest-neighbour index over the embedding matrix. The helpers below use the
# indices returned by tree.query to index `words` directly.
tree = cKDTree(vectors)

def closest_word(vector):
    """Return the word whose vector is the nearest neighbour of ``vector`` in ``tree``."""
    _, nearest = tree.query(vector, k=1)
    return words[nearest]

def k_closest_words(vector, k=5):
    """Return the ``k`` words nearest to ``vector``, closest first.

    Args:
        vector: A single query vector (1-D).
        k: Number of neighbours to look up.

    Returns:
        A list of ``(word, distance)`` tuples in the order returned by
        ``tree.query``. The list is shorter than ``k`` when the tree holds
        fewer than ``k`` points.
    """
    distances, indices = tree.query(vector, k=k)
    # For k == 1 the query returns scalars instead of arrays; lift both to 1-D
    # so the same pairing logic works for every k.
    distances = np.atleast_1d(distances)
    indices = np.atleast_1d(indices).astype(int)
    # Missing neighbours are reported with an infinite distance and an
    # out-of-range index, so they are skipped instead of indexing `words`.
    return [(words[i], d) for i, d in zip(indices, distances) if np.isfinite(d)]

In [213]:
def analogy(x,y,a):
    """Return the vector ``w2v[y] - w2v[x] + w2v[a]``.

    This is the usual word-analogy arithmetic ("x is to y as a is to ?"); the
    result is a vector, not a word. Raises ``KeyError`` if any of the three
    words is missing from ``w2v``.
    """
    offset = w2v[y] - w2v[x]
    return offset + w2v[a]

In [214]:
# Each triple reads "first is to second as third is to ?".
for source, target, query in (("אנגליה", "לונדון", "צרפת"), ("מלך", "מלכה", "נסיך")):
    print(closest_word(analogy(source, target, query)))

פריז
נסיכה


In [215]:
# The 14 nearest neighbours of one word (the word itself comes first, at distance 0).
germany_vec = w2v["גרמניה"]
k_closest_words(germany_vec, k=14)

[('גרמניה', 0.0),
 ('אוסטריה', 9.436663490929567),
 ('שווייץ', 10.70911817768223),
 ('פרוסיה', 10.974586621357817),
 ('פולין', 11.275983929305372),
 ('הולנד', 11.324757442402465),
 ('צרפת', 11.344306222048795),
 ("צ'כוסלובקיה", 11.5207771481726),
 ('הונגריה', 11.54691985215105),
 ('רוסיה', 11.644340376521805),
 ('פינלנד', 11.71177972528518),
 ('איטליה', 11.92222299772081),
 ('ברלין', 11.978732594655956),
 ('בלגיה', 12.067913401246797)]

In [237]:
def song2vec_mean(song, verbose=True):
    """Embed a song as the component-wise mean of its known word vectors.

    Args:
        song: Iterable of tokens; tokens without an entry in ``w2v`` are skipped.
        verbose: When true (the default, matching the previous behaviour),
            print how many tokens of the song had a vector.

    Returns:
        The mean vector over the known tokens. If no token is known, NumPy's
        mean of an empty list is returned, which is NaN.
    """
    # Membership is tested against the dict (hash lookup) rather than the
    # `words` list, and the filtered vectors are collected only once.
    known_vectors = [w2v[w] for w in song if w in w2v]
    if verbose:
        print(len(known_vectors))
    return np.mean(known_vectors, axis=0)

def song2vec_max(song):
    """Embed a song as the component-wise maximum of its known word vectors.

    Args:
        song: Iterable of tokens; tokens without an entry in ``w2v`` are skipped.

    Returns:
        The element-wise maximum over the vectors of the known tokens.

    Raises:
        ValueError: If no token of the song is known, because NumPy cannot
            take the maximum of an empty array.
    """
    # Test membership against the dict (hash lookup) rather than scanning the
    # `words` list once per token.
    known_vectors = [w2v[w] for w in song if w in w2v]
    return np.max(known_vectors, axis=0)

In [246]:
# Take the token list of the first row matching this song name and look at the
# words closest to its mean and max embeddings.
matching_songs = train_df.query("song_name == 'מתי נתנשק'")
lo_yachol = matching_songs["lyrics_as_list"].iloc[0]
mean_embedding = song2vec_mean(lo_yachol)
print(k_closest_words(mean_embedding))
max_embedding = song2vec_max(lo_yachol)
print(k_closest_words(max_embedding))

65
[('לחייך', 3.8804646630631243), ('חידלון', 4.019876706009065), ('דבר-מה', 4.047996808217183), ('מגעיל', 4.144055661598788), ('אמך', 4.200752824215914)]
[('סינית', 28.152123957105488), ('יפנית', 28.429831888516166), ('מושלמת', 28.441094344656186), ('נפוצה', 28.478048915127026), ('טהורה', 28.63744286765999)]


In [264]:
# Set view of the vocabulary, for fast membership tests in the cells below.
words_set = set()
words_set.update(words)

In [268]:
def _known_vectors(tokens):
    """Return the vectors of the tokens that are present in ``words_set``."""
    return [w2v[w] for w in tokens if w in words_set]

# np.mean of an empty list is a scalar NaN, so a song without any known word
# gets NaN here and is removed by the dropna() below. This must happen before
# the max column is computed, because np.max raises on an empty list.
train_df['song2vec_mean'] = train_df['lyrics_as_list'].map(lambda l: np.mean(_known_vectors(l), axis=0))
# NOTE(review): dropna() without `subset` also removes rows that have a NaN in
# any other column - confirm that this is intended.
# .copy() detaches the filtered frame so the assignment below writes to an
# independent DataFrame rather than to a slice of the old one.
train_df = train_df.dropna().copy()
train_df['song2vec_max'] = train_df['lyrics_as_list'].map(lambda l: np.max(_known_vectors(l), axis=0))