## Preliminaries

#### Imports

In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np


from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local project modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Make the project's local modules (kept under ./src) importable from this
# notebook.
PATH_TO_SRC = "./src"

# Append only when the entry is absent, so re-running the cell never
# duplicates it on sys.path.
if sys.path.count(PATH_TO_SRC) == 0:
    sys.path.append(PATH_TO_SRC)

In [4]:
from data_preprocessor import DataPreprocessor
from data_clusterer import DataClusterer

#### Parameters

In [5]:
from config import MOVIES_PATH, GENRES_PATH, TEST_SIZE, RANDOM_STATE, MAX_DF, MIN_DF, STOP_WORDS, N_COMPONENTS, NORMALIZER_COPY, N_CLUSTERS, MAX_ITER, N_INIT

#### Instantiated classes

In [6]:
# Baseline KMeans estimator built from the config hyperparameters.
# random_state is pinned to the shared RANDOM_STATE so centroid initialisation
# (and therefore the resulting clusters) is reproducible across kernel restarts.
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    max_iter=MAX_ITER,
    n_init=N_INIT,
    random_state=RANDOM_STATE,
)

## Preprocessing

In [7]:
# Build the project preprocessor from the movies and genres paths defined in
# config, then preview the first rows of the movies table it exposes.
preprocessor = DataPreprocessor(MOVIES_PATH, GENRES_PATH)
preprocessor.df_movies.head()

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.91
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.57
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.32
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,Interstellar,2014.0,Mankind was born on Earth. It was never meant ...,The adventures of a group of explorers who mak...,169.0,4.32


In [8]:
# Preview the genres table; in the saved output a movie id appears once per
# genre it belongs to.
preprocessor.df_genres.head()

Unnamed: 0,id,genre
0,1000001,Comedy
1,1000001,Fantasy
2,1000001,Adventure
3,1000002,Comedy
4,1000002,Thriller


#### Data Cleaning and Splitting

In [9]:
# Run the preprocessor's cleaning step (it reports the reduced length and the
# remaining columns), then split into train and test sets.
preprocessor.handleDataFrame()
X_train, X_test, y_train, y_test = preprocessor.splitData(TEST_SIZE, RANDOM_STATE)

# Only the free-text description column is fed to the vectorizer.
X_train_desc_only = X_train["description"]

Reduced Length:  540740
Reduced Columns:  Index(['id', 'genre', 'name', 'description', 'rating'], dtype='object')


In [10]:
# Inspect the training descriptions (pandas truncates the display to the
# first and last rows).
X_train_desc_only

585217    In this intimate and surprising documentary, t...
122446    The Stooges, not faring well with their diner,...
229370    The plot revolves around the life of three you...
197995    A young soldier's fear during an artillery att...
398017    Mumbai-based Rajendra Gupta works for Global I...
                                ...                        
110799    In a modern-day adaptation of Charles Dickens'...
266435    A passionate woman, a perfect idea and a vulne...
389542    An anti-hunting short film film made for the H...
132826    The life and career of Clarence Darrow, the no...
122680    Zero finds himself trapped within a universal ...
Name: description, Length: 378518, dtype: object

#### Vectorization

In [11]:
# Vectorize the training descriptions with the MAX_DF / MIN_DF / STOP_WORDS
# settings from config. X_tfidf is a sparse matrix (see the next cell);
# `features` is presumably the vocabulary terms, since it is later passed to
# getClusters — confirm in DataPreprocessor.vectorizeData.
X_tfidf, features = preprocessor.vectorizeData(X_train_desc_only, MAX_DF, MIN_DF, STOP_WORDS)

Vectorized in 7.756 s
# of Samples: 378518
# of Features: 81591
% of Nonzero Entries: 0.010


In [12]:
# Peek at the first row: one description as a sparse vector.
X_tfidf[0]

<1x81591 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

#### Dimensionality Reduction

In [13]:
# Report per-genre counts, then reduce the sparse TF-IDF matrix with LSA.
# `lsa` is kept because it is passed to getClusters further down.
# NOTE(review): `genres` is not used again in the visible cells.
# NOTE(review): the saved output reports only 5.0% explained variance for the
# SVD step — confirm N_COMPONENTS is large enough for the clustering goal.
genres = preprocessor.countGenres()
X_lsa, lsa = preprocessor.reduceDim(X_tfidf, N_COMPONENTS, NORMALIZER_COPY)

# of Genres:  19
Unique Genres:
                     id
genre                  
Action            19194
Adventure          8827
Animation         39939
Comedy            89726
Crime             15459
Documentary      138760
Drama            153192
Family            12476
Fantasy            9163
History            7233
Horror            31020
Music             29414
Mystery            8362
Romance           23825
Science Fiction    9519
TV Movie          10236
Thriller          21435
War                4647
Western            6205
LSA done in 7.316 s
Explained variance of the SVD step: 5.0%


In [14]:
# Sanity check: (rows, reduced dimensions) of the LSA output.
X_lsa.shape

(378518, 50)

## Clustering and Optimizing K

In [None]:
# Alternative approach (inertia per k via the project clusterer), kept for
# reference:
# clusterer = DataClusterer(X_lsa, N_CLUSTERS, MAX_ITER, N_INIT)
# K, inertias = clusterer.computeKMeansInertia()

# The exact silhouette coefficient needs every pairwise distance, i.e. O(n^2)
# work. With ~378k rows in X_lsa that is prohibitively slow, so each model is
# scored on a fixed-size random subsample instead.
SILHOUETTE_SAMPLE_SIZE = 10_000

k_values = range(2, N_CLUSTERS + 1)
silhouettes = []

for k in k_values:
    # Use the same MAX_ITER / N_INIT that are passed to DataClusterer, and a
    # fixed seed, so the sweep is reproducible and comparable across k.
    kmeans = KMeans(
        n_clusters=k,
        max_iter=MAX_ITER,
        n_init=N_INIT,
        random_state=RANDOM_STATE,
    )
    print("Instantiated", k)
    kmeans.fit(X_lsa)
    print("Model fitted")
    silhouettes.append(
        silhouette_score(
            X_lsa,
            kmeans.labels_,
            # Never request more samples than there are rows.
            sample_size=min(SILHOUETTE_SAMPLE_SIZE, X_lsa.shape[0]),
            random_state=RANDOM_STATE,
        )
    )
    print("Silhouette computed")

# Higher silhouette means better-separated clusters; look for the peak.
plt.plot(k_values, silhouettes, 'bx-')
plt.xlabel('K value')
plt.ylabel('Silhouette')
plt.show()

In [33]:
# Re-check the reduced matrix shape before clustering (same check as earlier
# in the notebook).
X_lsa.shape

(378518, 50)

In [18]:
# Wrap the LSA matrix in the project clusterer, passing the cluster count,
# iteration cap and n_init values from config.
clusterer = DataClusterer(X_lsa, N_CLUSTERS, MAX_ITER, N_INIT)

In [19]:
# Fit KMeans with the chosen k, keep the centroids / per-row labels / inertia,
# and print the top terms of each cluster.
# NOTE(review): optimal_k is hard-coded; presumably picked from the k sweep
# above — confirm, and consider moving it to config.
optimal_k = 15
centers, y_kmeans, inertia = clusterer.computeKMeans(optimal_k)
clusterer.getClusters(optimal_k, centers, features, lsa)

Cluster 0: school high student students teacher girls college group class girl friends new year life children friend day young gets way 
Cluster 1: woman young husband man love life mysterious finds meets relationship married falls beautiful gets home men past help story tries 
Cluster 2: series movie short comedy based drama horror directed tv stories animated video films special documentary animation film events set new 
Cluster 3: love young girl father time day wife friends mother people home new couple house night make friend son town years 
Cluster 4: life love new death work way film years time day young documentary people real lives change make journey living takes 
Cluster 5: old year years new life boy girl mother father time friends friend home love lives son make day house parents 
Cluster 6: city new york big lives living takes village way brother music place people night life film help young streets country 
Cluster 7: family life father home son daughter mother young chi

In [24]:
# Attach each training row's cluster label as a new `cluster` column.
X_train_clustered = X_train.assign(cluster=y_kmeans)

# Show the ten best-rated movies that were assigned to cluster 11.
cluster_11_mask = X_train_clustered['cluster'] == 11
cluster_11_movies = X_train_clustered.loc[cluster_11_mask]
cluster_11_movies.sort_values('rating', ascending=False).head(10)

Unnamed: 0,id,genre,name,description,rating,cluster
2720,1002722,Music,Folklore: The Long Pond Studio Sessions,"An intimate concert film, in which Taylor Swif...",4.46,11
736,1000737,Drama,Sunset Boulevard,A hack screenwriter writes a screenplay for a ...,4.44,11
2049,1002051,War,The Battle of Algiers,Tracing the struggle of the Algerian Front de ...,4.38,11
2143,1002145,Action,"Sherlock, Jr.","A film projectionist longs to be a detective, ...",4.36,11
20116,1020161,Action,Gintama: The Very Final,The concluding film to the Gintama animated se...,4.35,11
1100,1001101,Music,Taylor Swift: The Eras Tour,The cultural phenomenon continues on the big s...,4.34,11
2453,1002455,Documentary,Dear Zachary: A Letter to a Son About His Father,"In 2001, Andrew Bagby, a medical resident, is ...",4.33,11
5618,1005622,Documentary,Streetwise,This documentary about teenagers living on the...,4.33,11
3706,1003708,Thriller,Kill Bill: The Whole Bloody Affair,An assassin is shot and almost killed by her r...,4.32,11
527,1000528,Romance,Singin' in the Rain,"In 1927 Hollywood, a silent film production co...",4.32,11


In [34]:
# Sanity check: same row count as X_lsa, plus the added `cluster` column.
X_train_clustered.shape

(378518, 6)

In [31]:
# Persist artifacts to disk. Only the LSA-reduced matrix is written here;
# the centroid and clustered-table exports below are currently disabled.
# np.save('cluster_centers.npy', centers) # Centroids
np.save('vectorized_data.npy', X_lsa) # Data as 'points'
# X_train_clustered.to_csv('clustered_movies.csv') # Data as presented to user

In [None]:
# TODO
# Export the clustered data
# Given a block of text, convert it into an N_COMPONENTS-dim vector (50 in this run) using the same TF-IDF + LSA steps as X_lsa
# Compute the Euclidean distance to each centroid
# Return the cluster with the minimum distance
# Given the returned cluster, return all other movies in the same cluster