## Preliminaries

#### Imports

In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Location of the project's local modules (data_preprocessor is imported
# from here in the next cell).
PATH_TO_SRC = "./src"

# Register the folder on the import path exactly once, so re-running this
# cell never piles up duplicate entries.
if sys.path.count(PATH_TO_SRC) == 0:
    sys.path.append(PATH_TO_SRC)

In [4]:
from data_preprocessor import DataPreprocessor

#### Parameters

In [40]:
# --- Input files (handed to DataPreprocessor) ---
MOVIES_PATH = "./dataset/movies.csv"
GENRES_PATH = "./dataset/genres.csv"

# --- Train/test split (handed to preprocessor.splitData) ---
TEST_SIZE = 0.3
RANDOM_STATE = 42

# --- Vectorization (handed to preprocessor.vectorizeData) ---
# Ignore terms that appear in > 50% of the entries
MAX_DF = 0.5
# Ignore terms that appear in < 5 entries
MIN_DF = 5
STOP_WORDS = "english"

# --- Dimensionality reduction (handed to preprocessor.reduceDim) ---
N_COMPONENTS = 100
NORMALIZER_COPY = False

# --- KMeans settings (n_clusters, max_iter, n_init) ---
N_CLUSTERS = 19
MAX_ITER = 100
N_INIT = 1

#### Instantiated classes

In [53]:
# Project preprocessor built from the two CSV paths; it exposes the loaded
# frames as `df_movies` and `df_genres` and drives the split / vectorize /
# reduce steps used below.
preprocessor = DataPreprocessor(MOVIES_PATH, GENRES_PATH)

# KMeans picks its initial centroids randomly. With a single initialization
# (N_INIT) the resulting clusters would differ on every run, so the estimator
# is seeded with the notebook-wide RANDOM_STATE to keep "Restart & Run All"
# reproducible.
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    max_iter=MAX_ITER,
    n_init=N_INIT,
    random_state=RANDOM_STATE,
)

## Preprocessing

In [54]:
# Peek at the first five rows of the movies table as loaded.
preprocessor.df_movies.head(n=5)

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.91
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.57
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.32
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,Interstellar,2014.0,Mankind was born on Earth. It was never meant ...,The adventures of a group of explorers who mak...,169.0,4.32


In [55]:
# Peek at the first five rows of the genres table (one row per movie id / genre pair).
preprocessor.df_genres.head(n=5)

Unnamed: 0,id,genre
0,1000001,Comedy
1,1000001,Fantasy
2,1000001,Adventure
3,1000002,Comedy
4,1000002,Thriller


#### Vectorization

In [56]:
# Prepare the loaded frames on the preprocessor. Called for its side effects
# (no return value is captured); the cell output suggests it reduces the data
# to the id / genre / name / description / rating columns — confirm in
# src/data_preprocessor.
preprocessor.handleDataFrame()
# Split into train and test portions using the configured test fraction and seed.
X_train, X_test, y_train, y_test = preprocessor.splitData(TEST_SIZE, RANDOM_STATE)
# Vectorize the training portion only. `features` is indexed by term position
# later when printing the top terms per cluster.
# NOTE(review): presumably a TF-IDF matrix plus its vocabulary — confirm.
X_tfidf, features = preprocessor.vectorizeData(X_train, MAX_DF, MIN_DF, STOP_WORDS)

Reduced Length:  540740
Reduced Columns:  Index(['id', 'genre', 'name', 'description', 'rating'], dtype='object')
Vectorized in 7.312 s
# of Samples: 378518
# of Features: 58373
% of Nonzero Entries: 0.014


#### Dimensionality Reduction

In [57]:
# Genre summary from the preprocessor; per the cell output it reports 19
# genres, which matches N_CLUSTERS.
genres = preprocessor.countGenres()
# Reduce the vectorized matrix to N_COMPONENTS dimensions. `lsa` is indexed as
# lsa[0] later to map cluster centers back to term space.
# NOTE(review): presumably an sklearn pipeline whose first step is the SVD — confirm.
X_lsa, lsa = preprocessor.reduceDim(X_tfidf, N_COMPONENTS, NORMALIZER_COPY)

# of Genres:  19
Unique Genres:
                     id
genre                  
Action            19194
Adventure          8827
Animation         39939
Comedy            89726
Crime             15459
Documentary      138760
Drama            153192
Family            12476
Fantasy            9163
History            7233
Horror            31020
Music             29414
Mystery            8362
Romance           23825
Science Fiction    9519
TV Movie          10236
Thriller          21435
War                4647
Western            6205
LSA done in 14.962 s
Explained variance of the SVD step: 8.0%


## Clustering

In [61]:
# Fit KMeans on the reduced matrix and take the cluster assignment of every
# sample in one step. fit_predict is documented as equivalent to fit(X)
# followed by predict(X), but avoids a second assignment pass over the data.
y_kmeans = kmeans.fit_predict(X_lsa)

# Fitted state reused below: centroid coordinates (in the reduced space) and
# the final within-cluster sum of squared distances.
centers = kmeans.cluster_centers_
inertia = kmeans.inertia_

In [62]:
# Map the centroids from the reduced space back through the first step of
# `lsa`, so each centroid coordinate lines up with an entry of `features`.
original_space_centroids = lsa[0].inverse_transform(centers)
# Per centroid: term indices sorted from highest to lowest weight.
order_centroids = original_space_centroids.argsort()[:, ::-1]

# Print one line per cluster listing its ten highest-weighted terms.
for cluster_idx in range(N_CLUSTERS):
    top_terms = "".join(
        f"{features[term_idx]} " for term_idx in order_centroids[cluster_idx, :10]
    )
    print(f"Cluster {cluster_idx}: {top_terms}")

Cluster 0: family life home mother young son years daughter man story 
Cluster 1: people life young film lives world documentary story group different 
Cluster 2: world war ii life film time years end new journey 
Cluster 3: life young friends girl time night gets wife friend finds 
Cluster 4: live band music concert tour rock album video footage special 
Cluster 5: love falls life young girl fall man meets daughter story 
Cluster 6: woman young man husband life love finds mysterious beautiful married 
Cluster 7: new york city life year film young years time home 
Cluster 8: day life time lives young man modern years night work 
Cluster 9: man young life wife finds old home gets mysterious tries 
Cluster 10: old year life years mother girl boy time friend home 
Cluster 11: film directed director feature life history work footage time produced 
Cluster 12: story tells life film love true young years man history 
Cluster 13: short film documentary animated animation experimental comedy f