In [9]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import pandas as pd
import pickle
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

import torch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [4]:
data_path = os.path.join("..", "data")
music_sent_path = os.path.join(data_path, "music_train.csv")

In [3]:
music_train = pd.read_csv(music_sent_path)

In [4]:
music_train.head()

Unnamed: 0,source,target
0,Every single song on the album is excellent.,Every single song on the album is excellent.
1,Still its a mere hiccup to an outstanding reco...,Still its a mere hiccup to an outstanding reco...
2,I just discovered this gem and cannot believe ...,I just discovered this gem and cannot believe ...
3,The three Dylan songs are beautiful beyond words.,The three Dylan songs are beautiful beyond words.
4,I do not know if the original version has ever...,I do not know if the original version has ever...


In [5]:
music_train_src = list(music_train["source"])

## Indico Features

In [8]:
import indicoio
import config
indicoio.config.api_key = config.INDICO_API_KEY

In [10]:
NUM_SENTENCES = 4000

In [11]:
music_train_src[0]

'Every single song on the album is excellent.'

In [13]:
music_sent_features = indicoio.text_features(music_train_src[:NUM_SENTENCES])

In [17]:
pickle.dump(music_sent_features, open(os.path.join(data_path, "music_sent_features.pkl"), "wb"))
pickle.dump(music_train_src[:NUM_SENTENCES], open(os.path.join(data_path, "music_sent.pkl"), "wb"))

## Load Post-Indico Data

In [5]:
music_sent_features = pickle.load(open(os.path.join(data_path, "music_sent_features.pkl"), "rb"))
music_sent = pickle.load(open(os.path.join(data_path, "music_sent.pkl"), "rb"))

## Cluster Sentence Features

In [6]:
def viz_clusters(sentences, sentence_features, labels, viz_num, rand_num):
    cluster_nums = list(set(labels))
    cluster_nums.sort()
    
    cluster_viz = {}
    
    for c in cluster_nums:
        c_label_idxs = np.where(labels == c)[0]
        #print(c_label_idxs)
        
        c_sent_features = sentence_features[c_label_idxs, ]
        cluster_centroid = np.mean(c_sent_features, axis = 0)
        
        dists = np.linalg.norm(c_sent_features - cluster_centroid, axis = 1)
        closest_sent_idxs = np.argsort(dists)[:viz_num]
        
        # naive
        closest_sents = [sentences[i] for i in c_label_idxs]
        top_sents = [closest_sents[i] for i in closest_sent_idxs]
        
        rand_pos = list(set(range(len(c_label_idxs))).difference(set(closest_sent_idxs)))
        if len(rand_pos) != 0:
            rand_idxs = np.random.choice(rand_pos, min(rand_num, len(rand_pos)), replace = False)
            rand_sents = [closest_sents[i] for i in rand_idxs]
        else:
            rand_sents = []
        
        cluster_viz[c] = top_sents + rand_sents
        
        print(f"Cluster {c}")
        print("===============")
        [print(sent + "\n") for sent in top_sents + rand_sents]
                                     
    return cluster_viz

## DBSCAN

In [83]:
#hyper_eps = [0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.5, 0.75, 0.9]
hyper_eps = [0.75, 0.80, 0.85, 0.9, 0.95, 1.0, 1.1]
#hyper_eps = [0.95]
for ep in hyper_eps:
    clustering = DBSCAN(eps=ep, min_samples=5).fit(music_sent_features)
    print(str(ep) + ":")
    print(set(clustering.labels_))

0.75:
{-1}
0.8:
{-1}
0.85:
{0, 1, 2, -1}
0.9:
{0, -1}


KeyboardInterrupt: 

In [78]:
viz = viz_clusters(music_train_src[:NUM_SENTENCES], music_sent_features, clustering.labels_, 3, 2)

Cluster -1 

I am jaded on music DVD's since Gilmours Remember That Night- which blows everything I have seen out of the water... but this intimate show is worth having.

The vastly underrated B.J. Wilson (RIP) provides some of his most superlative, muscular drumming - why more people don`t acknowledge his skills as a drummer has always irked me to no end - listen to the incredible foundation he provides on 'Bringing Home

Simply put Eidolon does not offer anything that you havn't heard before but 'Nightmare World' is a power-thrash gem that should not be overlooked with razor sharp riffs, heavy drumming, and even beautiful melodies from time to time.

but there is an inflection point here which would characterize later works.  

Avoid if able to resist the challenge of completing the collection.

Cluster 0 

the album does not really remind me of anyone off the top of my head, i guess you could say that the band were fairly unique in some ways, and it is rather diverse.

Dennis De You

In [79]:
viz

{-1: ["I am jaded on music DVD's since Gilmours Remember That Night- which blows everything I have seen out of the water... but this intimate show is worth having.",
  "The vastly underrated B.J. Wilson (RIP) provides some of his most superlative, muscular drumming - why more people don`t acknowledge his skills as a drummer has always irked me to no end - listen to the incredible foundation he provides on 'Bringing Home",
  "Simply put Eidolon does not offer anything that you havn't heard before but 'Nightmare World' is a power-thrash gem that should not be overlooked with razor sharp riffs, heavy drumming, and even beautiful melodies from time to time.",
  'but there is an inflection point here which would characterize later works.  ',
  'Avoid if able to resist the challenge of completing the collection.'],
 0: ['the album does not really remind me of anyone off the top of my head, i guess you could say that the band were fairly unique in some ways, and it is rather diverse.',
  'Den

In [90]:
NUM_CLUSTERS = 100
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(music_sent_features)

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else np.sqrt(distances, out=distances)
  return distances if squared else

In [91]:
viz = viz_clusters(music_train_src[:NUM_SENTENCES], music_sent_features, kmeans.labels_, 3, 2)

Cluster 0 

In the wake of last week's sad news of the passing of guitarist Robbin Crosby, this is a great album to revisit and relive the 80's, the last great decade of music, back when music was more FUN than dark and depressing.  

All in all, this is an unmemorable pop album that at times sounds like it was made in a time-warp, clearly predating the worst aspects of commercial music of the 1980s.

For fans of the David Lee Roth version of Van Halen, this represents the zenith of that version's work and is my personal favourite as it represents the band at its early prime.  

When love comes to town with BB King is one of the best songs of U2's career.

Pink Moon is the one of the most revealing folk/singer-songwriter records out there..which is odd, because the lyrics are impenetrably nonsensical.

Cluster 1 

The booklet alone is worth the purchase price, but the music is excellent as well.

Nonetheless, even at a dollar per song, this is one of the best investments for entertainm

In [92]:
labels = kmeans.labels_
kcluster_nums = list(set(labels))
kcluster_nums.sort()

kcluster_vals = [len(np.where(labels == c)[0]) for c in cluster_nums]
kcluster_vals

[0,
 59,
 19,
 22,
 85,
 52,
 22,
 75,
 101,
 52,
 87,
 15,
 22,
 2,
 65,
 29,
 32,
 73,
 80,
 64,
 19,
 69,
 62]

In [70]:
labels = clustering.labels_
pickle.dump(labels, open(os.path.join(data_path, "music_sent_labels.pkl"), "wb"))

## Generate Anchor, Positive, Negative Examples

In [80]:
cluster_nums = list(set(labels))
cluster_nums.sort()

cluster_weights = [len(np.where(labels == c)[0]) / len(labels)  for c in cluster_nums]

In [81]:
cluster_weights

[0.661,
 0.32725,
 0.0005,
 0.00075,
 0.00075,
 0.00125,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005,
 0.0005]

## K-Means

In [12]:
clustering = AgglomerativeClustering(n_clusters=30).fit(music_sent_features)

In [13]:
viz = viz_clusters(music_sent, music_sent_features, clustering.labels_, 3, 2)

Cluster 0
If you buy the reissue (which you definitely should, and probably will because the original version of this C.D. is very hard to find), you will get six extra songs which make for a great listen and are essential for your collection.

If you want to hear some of The Cure's best live work, get this single at once, it is relatively inexpensive as well.

It is so good, in fact, you can play it back-to-back with Nat King Cole's wonderful version, and enjoy each in an entirely different way.

It wriggles into your skull and camps out.  

So, although the music is great, the $130 pricetag is a bit much.  

Cluster 1
Still, it is an undeniably good song.

This is a very good album.

Overall, this is a great album.

This album is a picture perfect example.

This guy has a nice voice.  

Cluster 2
The list would be long- Dave Van Ronk, Geoff Muldaur, Maria Muldaur, Phil Ochs, Chris Smithers, Joan Baez and on and on.

Standout tracks here include Prison Sex, Flood, Sober, and Bottom.  

## GMM

In [12]:
clustering = GaussianMixture(n_components=30).fit_predict(music_sent_features)

In [13]:
viz = viz_clusters(music_sent, music_sent_features, clustering, 3, 2)

Cluster 0
Probably the 2nd best Type O Negative works after Bloody Kisses,an excellent group of songs from them.

Because of this album along with Innuendo, The Works and A Kind Of Magic, I have come to really appreciate Queen's 70s albums such as A Night

Ask Any Girl is the oldest recording, having first been the B-side of Baby Love in 1964, and was also on their earlier Where Did Our Love Go?

The beginnings of my all-time Favorite Band!

KC & the Sunshine Band was the best band of the Disco era.

Cluster 1
As an Al Stewart fan, I hated to see his career go on this path, but he really lost a lot between the mid-1970s and this record.

This is a great introduction to Phoebe Snow, who is clearly going to get the love and appreciation that she has had coming to her in upcoming years.  

My life, is the one who has some guts, but fall apart and sounds later on "overproduced".

Then that violin just keeps coming back again and again, building in blistering intensity each time he comes ba