# Lab 3 — clustering

In [None]:
# Given Imports
import json

import matplotlib.pyplot as plt
import numpy as np
import pickle as pk
from operator import itemgetter

# Import panda that is easier to use than dicts
import pandas as pd
# Import random
import random as rn

# Import bokeh
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool, ResetTool, PanTool, WheelZoomTool, SaveTool
output_notebook()
# Categorial colors
from bokeh.palettes import Dark2_8

%matplotlib inline
plt.style.use("ggplot")

### Exercise 3.11 : Clustering tags

In [None]:
# Read back the object stored in tagCoord.pickle.
# NOTE(review): unpickling can run arbitrary code; only load this file if it comes from a trusted source.
with open("tagCoord.pickle", mode="rb") as pickle_file:
    tagEmbed = pk.load(pickle_file, encoding="utf-8")

In [None]:
#Cluster the data using the k-means clustering algorithm.

# Input: N data points {x1, ..., xn} , given in our pickle.
#        K number of clusters, between 2 and 5 for us.
# Output: We will have mu_k, center of clusters, K times.
#         r_nk, point-cluster assignment indicator. If r_nk = 1, n is in cluster k.

# r_nk = 1 only for k = argmin_k ||xn - mu_k||
# mu_k = sum_over_n(r_nk * xn)/sum_over_n(r_nk)
def k_mean(k, data, give_up = 1000):
    """Cluster points with the k-means (Lloyd) algorithm.

    Parameters
    ----------
    k : int
        Number of clusters. Must satisfy 1 <= k <= number of points.
    data : sequence of tuples
        Each item holds the point's coordinate vector at index 1 (index 0 is
        not used here). All vectors must have the same length.
    give_up : int, optional
        Maximum number of iterations performed before giving up on convergence.

    Returns
    -------
    numpy.ndarray
        For every point, in the order of ``data``, the index in ``range(k)``
        of the cluster it is assigned to.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of points.
    """
    mat = np.asarray([item[1] for item in data], dtype=float)
    print(mat.shape)

    n_points = mat.shape[0]
    if not 1 <= k <= n_points:
        raise ValueError("k must be between 1 and the number of points (%d), got %r" % (n_points, k))

    # Convergence threshold: 1e-5 times the norm of the per-dimension variance
    # of the data (axis=0 is the variance over the points, i.e. the data spread).
    threshold = np.linalg.norm(np.var(mat, axis=0)) * 1e-5

    # Pick k *distinct* data points as initial centroids. Sampling without
    # replacement avoids starting with two identical centroids, which would
    # leave one cluster empty.
    centroids = mat[np.random.choice(n_points, size=k, replace=False)]

    distances = np.zeros((n_points, k))
    closest = np.zeros(n_points, dtype=int)

    # We should loop until convergence, but cap the number of steps just in case.
    for step in range(give_up):

        # Distance between every point and each centroid.
        for i in range(k):
            distances[:, i] = np.linalg.norm(mat - centroids[i, :], axis=1)

        # Index of the closest centroid for each point.
        closest = np.argmin(distances, axis=1)

        # New centroid = center of gravity of the points assigned to it.
        # A cluster that received no point keeps its previous centroid
        # instead of becoming NaN.
        means = centroids.copy()
        for i in range(k):
            members = mat[closest == i, :]
            if members.shape[0] > 0:
                means[i, :] = members.mean(axis=0)

        # Largest displacement of a single centroid (norm taken per centroid,
        # i.e. along the coordinate axis).
        dist = np.max(np.linalg.norm(means - centroids, axis=1))

        # "<=" so that a perfectly stable solution also stops when the
        # threshold itself is 0.
        if dist <= threshold:
            return closest
        centroids = means
    print("Centroids have not stabilized after",give_up,"steps")
    return closest

In [None]:
# k_mean reads the vector at index 1 of each item, so turn the mapping into a list of (key, value) pairs.
data = [*tagEmbed.items()]
# First run: K = 2 clusters.
clustering = k_mean(k=2, data=data)

In [None]:
# Project the points on two chosen directions. Here 0 is "good-bad"; see the categories in dimred.
main_dims = (0, 1)

# One entry per point: its two projected coordinates, its name for the hover
# tooltip, and a categorical color selected by its cluster index.
source = ColumnDataSource(
    data=dict(
        x=[coords[main_dims[0]] for _, coords in data],
        y=[coords[main_dims[1]] for _, coords in data],
        name=[label for label, _ in data],
        color=[Dark2_8[cluster] for cluster in clustering],
    ))

# Show the point's name when the mouse is over a dot.
hover = HoverTool(tooltips=[("Name", "@name")])
tools = [hover, ResetTool(), PanTool(), WheelZoomTool(), SaveTool()]

p = figure(title="Mouse over the dots", tools=tools, plot_width=960, plot_height=360)
p.circle("x", "y", source=source, color="color", size=20, alpha=0.5)
show(p, notebook_handle=True)

### Exercise 3.12 : Clustering movies

In [None]:
# Load every movie record (one JSON document per line) as an RDD of dicts.
# Taken directly from the introduction in dimred.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by the kernel -- confirm before running on a fresh kernel.
data_movies = sc.textFile("/ix/ml-20m/movies.txt").map(json.loads)

# Frame of genres in movie order: one row per movie, one column per genre slot
# (shorter genre lists are padded with None by the DataFrame constructor).
intermediateGenresDF = pd.DataFrame(data_movies.map(itemgetter("genres")).collect())
# Single-column frame (column 0) of all unique genres.
# There are 20 genres, including "No genre listed", once we drop None.
genresDF = pd.DataFrame(pd.DataFrame(intermediateGenresDF.values.flatten()).drop_duplicates().reset_index()[0])
# Drop the None entry (the padding value), as it is not a genre, and renumber the rows.
genresDF = pd.DataFrame(genresDF[~genresDF[0].isnull()].reset_index()[0])

# We need another dataframe that pairs every genre list with its movieId,
# because the movie ids are NOT contiguous and cannot be used as row positions.
genresListsDF = pd.DataFrame(data_movies.map(itemgetter("genres", "movieId")).collect(),columns=["genres", "movieId"])

# Open the pickle holding the movies we want to cluster.
# The file contains a list of tuples of the form: (Id, Movie name)
# NOTE(review): unpickling can run arbitrary code; only load this file if it comes from a trusted source.
with open( "most-rated.pickle", "rb" ) as f:
    ratedMovies = pk.load( f, encoding="utf-8" )
# Put it in a DataFrame for easier use.
ratedMoviesDF = pd.DataFrame(list(ratedMovies), columns=['movieId', 'title'])

In [None]:
# Defining the Jaccard distance for our exercise.
# x1 is a matrix, x2 is a vector. Returns a vector with one distance per row of x1.
def jaccard(x1, x2):
    """Jaccard distance between binary indicator vectors.

    Parameters
    ----------
    x1 : numpy.ndarray
        Matrix of 0/1 indicators, one set per row (a single 1-D vector also works).
    x2 : numpy.ndarray
        1-D vector of 0/1 indicators with as many entries as ``x1`` has columns.

    Returns
    -------
    numpy.ndarray
        ``1 - |A & B| / |A | B|`` for every row A of ``x1`` against B = ``x2``.
        Two empty sets are considered identical (distance 0).
    """
    # For 0/1 indicators the dot product counts the elements present in both sets.
    intersection = np.asarray(x1 @ x2, dtype=float)
    # An element belongs to the union when it is present in at least one set.
    # The size is summed per row (last axis), not over the whole matrix.
    union = np.asarray(np.minimum(x1 + x2, 1).sum(axis=-1), dtype=float)

    # Avoid 0/0 when both sets are empty: their similarity is defined as 1.
    non_empty = union > 0
    similarity = np.where(non_empty, intersection / np.where(non_empty, union, 1.0), 1.0)
    return 1 - similarity

# The k-medioids algorithm implementation. 
def kmedAlgo(k, mat, give_up = 1000):
    """Cluster the rows of ``mat`` with the k-medoids algorithm.

    Distances between rows are computed with ``jaccard``. Each cluster is
    represented by a medoid: the member row with the smallest total distance
    to the other members of its cluster.

    Parameters
    ----------
    k : int
        Number of clusters. Must satisfy 1 <= k <= number of rows.
    mat : numpy.ndarray
        2-D matrix with one point per row.
    give_up : int, optional
        Maximum number of iterations performed before giving up on convergence.

    Returns
    -------
    numpy.ndarray
        For every row of ``mat``, the index in ``range(k)`` of its cluster.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    mat = np.asarray(mat, dtype=float)
    n_points = mat.shape[0]
    if not 1 <= k <= n_points:
        raise ValueError("k must be between 1 and the number of points (%d), got %r" % (n_points, k))

    # Same stopping rule as for k-means: medoids are considered stable when none
    # of them moves by more than 1e-5 times the norm of the per-dimension variance.
    threshold = np.linalg.norm(np.var(mat, axis=0)) * 1e-5

    # Pick k *distinct* rows as initial medoids (sampling without replacement).
    medoids = mat[np.random.choice(n_points, size=k, replace=False)]

    closest = np.zeros(n_points, dtype=int)

    # We should loop until convergence, but cap the number of steps just in case.
    for step in range(give_up):

        # Distance between every point and each medoid.
        distances = np.zeros((n_points, k))
        for i in range(k):
            distances[:, i] = jaccard(mat, medoids[i, :])

        # Index of the closest medoid for each point.
        closest = np.argmin(distances, axis=1)

        # A cluster that received no point keeps its previous medoid.
        new_medoids = medoids.copy()
        for i in range(k):
            # Row positions (in mat) of the members of cluster i.
            members = np.flatnonzero(closest == i)
            if members.size == 0:
                continue
            cluster = mat[members, :]
            # Total distance from each member to all members of the cluster;
            # the new medoid is the member minimizing that total.
            costs = [np.sum(jaccard(cluster, pt)) for pt in cluster]
            new_medoids[i] = mat[members[int(np.argmin(costs))]]

        # Largest displacement of a single medoid (norm taken per medoid).
        dist = np.max(np.linalg.norm(new_medoids - medoids, axis=1))

        # "<=" so that unchanged medoids also stop when the threshold is 0.
        if dist <= threshold:
            return closest
        medoids = new_medoids
    print("Medioids have not stabilized after",give_up,"steps")
    return closest

In [None]:
# Indicator matrix: one row per most-rated movie, one column per genre, all zeros to start with.
# This reuses the approach of the dimred exercise.
matKmed = np.zeros((len(ratedMoviesDF), len(genresDF)))

# Movie ids of the most-rated movies, in row order.
ratedMoviesIndex = pd.Index(ratedMoviesDF.movieId)

# For every movie id, look up its genre list and mark the genres it contains:
# 1 if the genre is listed for that movie, 0 otherwise.
# None entries cannot match because genresDF holds no null value.
for row, mId in enumerate(ratedMoviesIndex):
    # Genre list of the (first) record carrying this movie id.
    movieGenres = genresListsDF.loc[genresListsDF.movieId == mId, "genres"].tolist()[0]
    # Boolean mask over the genres, written into the movie's row (stored as 0.0 / 1.0).
    matKmed[row, :] = np.asarray(genresDF.isin(movieGenres)[0])

# Cluster the movies.
k = 2
clusterKmed = kmedAlgo(k, matKmed)

In [None]:
# Visualize the results: for each cluster, the share (in %) of the movies of
# every genre that ended up in that cluster.
per_genre_total = np.sum(matKmed, 0)
x = range(genresDF.shape[0])
for i in range(0, k):
    # Number of movies of each genre inside cluster i.
    in_cluster = np.sum(matKmed[clusterKmed == i, :], 0)
    # A genre carried by no movie at all gets a 0 % bar instead of NaN (0/0).
    bars = np.divide(
        in_cluster,
        per_genre_total,
        out=np.zeros(per_genre_total.shape, dtype=float),
        where=per_genre_total > 0,
    ) * 100

    f = plt.figure()
    ax = f.add_axes([0.1, 0.1, 0.8, 0.8])
    ax.bar(x, height=bars)

    # Title the figure so the two clusters can be told apart.
    ax.set_title("Cluster %d" % i)
    ax.set_ylabel("Film % of type x")
    ax.set_xticks(x)
    # Pass the rotation as a keyword: recent matplotlib versions no longer
    # accept the font dictionary as a positional argument.
    ax.set_xticklabels(genresDF[0].tolist(), rotation=90)
    # plt.show() renders with the inline backend; Figure.show() warns on non-GUI backends.
    plt.show()

As we can see, and as expected, the frequency of a genre in one cluster is complementary to the frequency in the other cluster.

We can also interpret the clusters using which genres are dominant. In the first cluster, we have mostly action and non-realistic categories (with the exception of “Documentary”). The second cluster has mostly realistic categories, like “Mystery” or “Thriller”.

We could also say that the first cluster is more "for children", having less violent genres, and the second cluster is "for adults", with things like horror and mysteries.