In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_samples, silhouette_score
# from sklearn.metrics import v_measure_score
from sklearn.neighbors import NearestNeighbors

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

import scipy
from scipy.sparse import csgraph
from numpy import linalg as LA

from IPython.display import display
from kneed import KneeLocator

import matplotlib.style as style
style.use("fivethirtyeight")

import sys

In [None]:
# Input table for the clustering experiments. The file name suggests a
# preprocessed, one-hot encoded movie dataset (assumption from the name only).
# Alternative input kept for reference:
# dataset = pd.read_csv('./../Datasets/tmdb_prepro.csv')
dataset_path = './../Datasets/movies46k_prepro_onehot.csv'
dataset = pd.read_csv(dataset_path)

In [None]:
# Feature preparation: drop text columns, standardise, then reduce to 40
# components. The final reduced matrix is bound to `df`, which the
# following cells consume.

# pd.set_option('display.max_columns', 500)
text_columns = ['original_title', 'overview', 'title', 'keywords', 'actors', 'characters', 'original_language']
# text_columns = ['release_date', 'original_title', 'overview', 'id', 'tagline', 'title', 'keywords', 'actors', 'characters', 'original_language']

# Keep each stage under its own name so re-running the cell always starts
# from `dataset` and no name changes type halfway through.
features = dataset.drop(columns=text_columns)
display(features.describe())

# Standardise every remaining column (zero mean, unit variance by default).
# NOTE(review): centring makes the matrix dense even if the input is mostly
# zeros, so the "sparse input" rationale for TruncatedSVD no longer applies
# after this step -- confirm this is intended.
features_scaled = StandardScaler().fit_transform(features)
# display(pd.DataFrame(features_scaled).describe())

## PCA alternative (works well with dense data)
# variance_retention = 0.70
# pca = PCA(variance_retention)
# principalComponents = pca.fit_transform(features_scaled)
# df = pd.DataFrame(principalComponents)

## Truncated SVD (a.k.a. LSA) works directly on sparse data (a lot of zeroes).
# The randomized solver is stochastic: pin random_state so the embedding,
# and therefore eps and the clustering derived from it, are reproducible
# under Restart & Run All.
svd = TruncatedSVD(n_components=40, algorithm='randomized', random_state=42)
df = svd.fit_transform(features_scaled)

display(pd.DataFrame(df))

# k-distance graph (knee method) to determine the DBSCAN `eps`

In [None]:
# k-distance curve: for every point take the distance stored in column
# `kth_neighbor` of the neighbour table, sort those distances ascending and
# plot them. Because the query set is the fitted set, column 0 is normally
# the point itself, so column 10 is the 10th other neighbour.
# NOTE(review): the DBSCAN cells use min_samples=8 while this curve uses
# k=10 -- confirm the mismatch is intentional.
kth_neighbor = 10
nearest_neighbors = NearestNeighbors(n_neighbors=kth_neighbor + 1)
neighbors = nearest_neighbors.fit(df)

distances, indices = neighbors.kneighbors(df)
distances = np.sort(distances[:, kth_neighbor])

fig, ax = plt.subplots(1)

# Force black axes/labels/ticks so the figure stays readable on top of the
# notebook-wide plotting style.
for spine_name in ('bottom', 'left'):
    ax.spines[spine_name].set_color('black')
for axis_obj in (ax.xaxis, ax.yaxis):
    axis_obj.label.set_color('black')
ax.tick_params(axis='both', colors='black')

ax.plot(distances)
ax.set_xlabel("Points")
ax.set_ylabel("Distance")

In [None]:
# Locate the knee of the sorted k-distance curve and use the distance at
# that point as the DBSCAN radius `eps`.
point_index = np.arange(len(distances))
knee = KneeLocator(point_index, distances, S=1, curve='convex', direction='increasing', interp_method='polynomial')

# KneeLocator reports knee=None when it cannot find a knee. Indexing with
# None would silently produce a 2-D array (np.newaxis) instead of a scalar
# eps, so fail here with an explicit message.
if knee.knee is None:
    raise ValueError(
        "KneeLocator found no knee in the k-distance curve; "
        "cannot derive eps. Try a different S or interp_method."
    )

fig, ax = plt.subplots()

# Colour is given once via the keyword only: combining the "b" format
# string with c='black' makes matplotlib warn about a redundant colour
# definition (the keyword wins anyway).
ax.plot(knee.x, knee.y, label="data", color='black')
y_low, y_high = ax.get_ylim()
ax.vlines(knee.knee, y_low, y_high, linestyles="--", label="knee/elbow", color='yellow')

# Black axes/labels/ticks for readability under the notebook-wide style.
for spine_name in ('bottom', 'left'):
    ax.spines[spine_name].set_color('black')
ax.tick_params(axis='both', colors='black')
ax.set_xlabel("Points", color='black')
ax.set_ylabel("Distance", color='black')
ax.set_title("Knee Point", color='black')
ax.legend(loc='best', labelcolor='black')

# x values are 0..n-1, so the knee's x value is directly an index into the
# sorted distances.
eps = distances[int(knee.knee)]
print(eps)

In [None]:
# Fit DBSCAN on the reduced features using the previously computed eps.
dbscan_cluster = DBSCAN(eps=eps, min_samples=8).fit(df)
labels = dbscan_cluster.labels_

# Label -1 is counted as noise, not as a cluster.
noise_mask = labels == -1

# Number of Clusters: distinct labels once noise is excluded
N_clus = len(np.unique(labels[~noise_mask]))
print('Estimated no. of clusters: %d' % N_clus)

# Identify Noise: how many points carry the -1 label
n_noise = int(noise_mask.sum())
print('Estimated no. of noise points: %d' % n_noise)


# Silhouette method to determine number of clusters

In [None]:
# Silhouette score of the DBSCAN clustering.
#
# DBSCAN takes no "number of clusters" argument, so every iteration of the
# former `for n_clusters in range(2, k_range)` loop fitted the same model on
# the same data and produced the same score. The fit and the (quadratic-cost)
# silhouette computation are therefore done once and the score is repeated
# so that `silhouette_avg_n_clusters` keeps the length the plotting cell
# expects (one value per entry of range(2, k_range)).
k_range = 30

clusterer = DBSCAN(eps=eps, min_samples=8)
cluster_labels = clusterer.fit_predict(df)

# silhouette_score needs at least two distinct labels; fail with a message
# that points at the actual cause instead of sklearn's generic error.
if np.unique(cluster_labels).size < 2:
    raise ValueError(
        "DBSCAN produced fewer than 2 distinct labels with eps=%r; "
        "the silhouette score is undefined." % (eps,)
    )

# NOTE(review): noise points (label -1) are scored as if they formed one
# cluster of their own, exactly as before -- confirm this is intended.
silhouette_avg = silhouette_score(df, cluster_labels)

silhouette_avg_n_clusters = [silhouette_avg] * len(range(2, k_range))

In [None]:
# Line plot of the collected silhouette scores against range(2, k_range).
fig, ax = plt.subplots(1)

# Black axes/labels/ticks for readability under the notebook-wide style.
for spine_name in ('bottom', 'left'):
    ax.spines[spine_name].set_color('black')
for axis_obj in (ax.xaxis, ax.yaxis):
    axis_obj.label.set_color('black')
ax.tick_params(axis='both', colors='black')

cluster_counts = range(2, k_range)
ax.plot(cluster_counts, silhouette_avg_n_clusters, '-o', color='y')
# Restrict x ticks to whole numbers.
ax.xaxis.get_major_locator().set_params(integer=True)

ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("silhouette score")
ax.set_title("Silhouette Method", color='black')
plt.show()

In [None]:
# Final DBSCAN model, fitted on the reduced features with the same
# parameters used in the cells above.
dbscan_params = {'eps': eps, 'min_samples': 8}
sc = DBSCAN(**dbscan_params)

sc.fit(df)

# Results (TODO: update — the notes below still describe the k-means baseline, not DBSCAN)
* In this baseline model using k-means clustering, which is not the best clustering algorithm, we can see that the optimal number of clusters is 13, combining the results from the elbow method and the silhouette method.
* We use k-means clustering as the baseline because it is the most standard, simple and basic clustering algorithm.
* Since our data is high-dimensional, and k-means clustering fails on high-dimensional data due to the 'curse of dimensionality', we will now use other clustering algorithms that perform better with high-dimensional data.