# Clustering analysis - Part 2:


In this notebook, we extend our clustering to genres, grouping them based on their words and lyrical structure.


## Inferences:

#### Clusters for genre were as follows 


Cluster 4 words: b'holy', b'holy', b'think', b'country', b'belong', b'wherever', b'belong', b'always', b'e', b'ways',

Cluster 4 genres: Country,

Country songs depict religion and a sense of rustic belongingness.


Cluster 1 words: b'wi', b'glen', b'tm', b'lad', b'na', b'aye', b'ye', b'sae', b'weavers', b'neer',

Cluster 1 genres: Folk

Folk songs depict usage of old-fashioned English words like 'aye' and 'lad'.



In [130]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction

In [131]:
##Importing data and dropping unnecessary columns

data = pd.read_csv('lyrics.csv')
data.head(10)
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(356467, 6)

In [132]:
data = data.dropna()

In [133]:
data = data.iloc[:10000]

In [134]:
data = data.drop('index',axis=1)

In [135]:
data = data.drop('year',axis=1)

In [136]:
data = data.drop('artist',axis=1)

In [137]:
##Removing punctuations

lyrics_list = []

for ix,row in data.iterrows():
    chars_rm = ['\n',',','[',']','.','?','!','(',')',':']
    ly = row['lyrics']
    for char in ly:
        if char in chars_rm:
            ly = ly.replace(char,' ')
        elif char=='\'':
            ly = ly.replace(char,'')
    lyrics_list.append(ly)

In [138]:
data['lyrics'] = lyrics_list

In [139]:
#Accumulating words based on genre

vocab = {}

for ix,row in data.iterrows():
    genre = row['genre']
    if genre in vocab:
        vocab[genre] += row['lyrics']
    else:
        vocab[genre] = row['lyrics']
    

In [140]:
len(vocab)

10

In [141]:
cleaned_data = pd.DataFrame({'Genre':list(set(data['genre'].tolist())), 'Lyrics':[vocab[genre] for genre in set(data['genre'].tolist())]})
cleaned_data.head(10)

Unnamed: 0,Genre,Lyrics
0,Pop,Oh baby how you doing You know Im gonna cut ...
1,Country,Verse 1 When the last breath of life Is gone...
2,Jazz,If Ive had my way Id leave here today Id leave...
3,Hip-Hop,horns chorus Timbo- When you hit me on ...
4,Folk,Rise and fall like the tide My hand goes with...
5,Metal,Gods Of The Mountains Sky Forest And Seas Lan...
6,Electronic,Reverse Behold the finite set of thirteen co...
7,R&B,Little wallflower on the shelf standing by h...
8,Indie,Dont feel so bad Its just the way the wheel t...
9,Rock,A lot of cats are hatin slandering makin bad ...


In [142]:
cleaned_data.shape

(10, 2)

In [143]:
## Please download nltk packages

nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [144]:
stopwords = nltk.corpus.stopwords.words('english')

In [145]:
stemmer = SnowballStemmer("english")

In [146]:
#tokenizing and stemming

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [147]:

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [148]:
totalvocab_stemmed = []
totalvocab_tokenized = []

lyrics_sample = cleaned_data['Lyrics'].iloc[0:500].tolist()
genres_sample = cleaned_data['Genre'].iloc[0:500].tolist()

for l in lyrics_sample:
    allwords_stemmed = tokenize_and_stem(l) # for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) # extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(l)
    totalvocab_tokenized.extend(allwords_tokenized)

In [149]:
len(totalvocab_stemmed)

2191326

In [150]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print(vocab_frame.head())

there are 2191326 items in vocab_frame
      words
oh       oh
babi   baby
how     how
you     you
do    doing


In [151]:
# Note that the result of this block takes a while to show
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(lyrics_sample) #fit the vectorizer to synopses

# (100, 563) means the matrix has 100 rows and 563 columns
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
len(terms)

Wall time: 47.2 s
(10, 85709)


85709

In [152]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tfidf_matrix)

In [153]:
from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 3.19 s


In [154]:
len(clusters)

10

In [155]:
songs = { 'Genre': genres_sample, 'Lyrics': lyrics_sample, 'cluster': clusters }

frame = pd.DataFrame(songs, index = [clusters] , columns = ['Genre', 'cluster'])

frame['cluster'].value_counts() #number of genres per cluster (clusters from 0 to 4)


2    4
5    2
4    1
3    1
1    1
0    1
Name: cluster, dtype: int64

In [156]:
from __future__ import print_function

print("Top terms per cluster:")
print() #add whitespace

top_terms_final = []

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    top_terms = []
    for ind in order_centroids[i, :10]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
        top_terms.append((' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore')))
        
    print() #add whitespace
    print() #add whitespace
    
    top_terms_final.append(top_terms)
    
    print("Cluster %d genres:" % i, end='')
    if type(frame.ix[i]['Genre'])!=str:
        for title in frame.ix[i]['Genre'].values.tolist():
            print(' %s,' % title, end='')
    else:
        print(' %s,' % frame.ix[i]['Genre'], end='')
    print() #add whitespace
    print() #add whitespace

Top terms per cluster:

Cluster 0 words:

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


 b'holy', b'holy', b'doggin', b'think', b'country', b'bow', b'bout', b'belong', b'e', b'belong',

Cluster 0 genres: Country,

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated




Cluster 1 words: b'nigga', b'shit', b'bitch', b'niggaz', b'cent', b'yo', b'hood', b'gangstas', b'hoes', b'tha',

Cluster 1 genres: Hip-Hop,

Cluster 2 words: b'que', b'y', b'el', b'en', b'mi', b'te', b'se', b'tu', b'je', b'por',

Cluster 2 genres: Pop, Metal, Indie, Rock,

Cluster 3 words: b'wi', b'o', b'bau', b'glen', b'lad', b'tm', b'gon', b'na', b'na', b'aye',

Cluster 3 genres: Folk,

Cluster 4 words: b'dem', b'mi', b'di', b'fi', b'nuh', b'gyal', b'seh', b'pon', b'inna', b'nah',

Cluster 4 genres: Electronic,

Cluster 5 words: b'boiler', b'homicide', b'la', b'la', b'confusing', b'oh', b'lets', b'somebody', b'uh', b'dont',

Cluster 5 genres: Jazz, R&B,



In [157]:
similarity_distance = 1 - cosine_similarity(tfidf_matrix)
print(type(similarity_distance))

<class 'numpy.ndarray'>


In [158]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

%time pos = mds.fit_transform(similarity_distance)  # shape (n_components, n_samples)

print(pos.shape)

xs, ys = pos[:, 0], pos[:, 1]
print(type(xs))

Wall time: 14.5 ms
(10, 2)
<class 'numpy.ndarray'>


In [159]:
#set up colors per clusters using a dict
cluster_colors = {0: '#a50026', 1: '#d73027', 2: '#f46d43', 3: '#fdae61', 4: '#fee08b', 5: '#d9ef8b', 6: '#a6d96a', 7: '#66bd63', 8: '#1a9850', 9: '#006837'}

cluster_names = {}

for i in range(0,6):
    cluster_names[i] = str(top_terms_final[i]).replace('b','').replace('"','').replace('[','').replace(']','')


In [160]:
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, genre=genres_sample)) 

print(df[1:10])


        genre  label         x         y
1     Country      0 -0.611834 -0.313061
2        Jazz      5 -0.016958 -0.594096
3     Hip-Hop      1 -0.342277 -0.222395
4        Folk      3  0.470785 -0.508500
5       Metal      2  0.370956  0.338517
6  Electronic      4 -0.128430  0.651392
7         R&B      5 -0.613150  0.344314
8       Indie      2  0.534487  0.208684
9        Rock      2  0.108034 -0.056882


In [161]:
clus_ids = []
for ix,row in df.iterrows():
    clus_ids.append(cluster_names[row['label']])

In [162]:
colormap = {0: '#a50026', 1: '#d73027', 2: '#f46d43', 3: '#fdae61', 4: '#fee08b', 5: '#d9ef8b', 6: '#a6d96a', 7: '#66bd63', 8: '#1a9850', 9: '#006837'}
colors = [colormap[x] for x in df['label']]
df['color'] = colors

In [165]:
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, show, curdoc, push_notebook
from bokeh.models import ColumnDataSource,HoverTool,ColorBar,LabelSet
from bokeh.plotting import figure, show, output_notebook
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from bokeh.layouts import layout
from bokeh.layouts import row
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

output_notebook()

colormap = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'orange', 5: 'black', 6: 'navy', 7: 'pink', 8: 'magenta', 9: 'brown'}
colors = [colormap[x] for x in df['label'].tolist()]

p = figure(title = "Clustering Music Genres Based on Lyrics",tools="hover,lasso_select,pan,wheel_zoom,box_zoom,reset,save")

xs = list(xs)
ys = list(ys)

source = ColumnDataSource(dict(
    x=xs,
    y=ys,
    color=colors,
    label=clus_ids,
    genre = [i for i in df['genre'].tolist()]
))

# scatter plot
t = p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

# text labels
labels = LabelSet(x='x', y='y', text='genre', y_offset=8,
                      text_font_size="6pt", text_color="color",
                      source=source, text_align='center')
#r = p.text(x='x', y='y', text_color='color', text = 'artist', text_alpha=0.8, text_font_size='5pt', source=source, legend = 'artist')

p.select_one(HoverTool).tooltips = [('Cluster', '@label')]
p.add_layout(labels)
#p.add_tools(HoverTool(tooltips=[("Cluster", "@label")]))


In [166]:
show(p)