#### Imports

In [13]:
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus

movies = pd.read_csv('./dataset/movies.csv')

#### Preprocessing

In [14]:
movies.dropna(inplace=True, subset=['description'])

custom_stop_words = {'lives', 'life', 'director', 'directed', 'film', 'films', 'filmaker',}
combined_stop_words = list(ENGLISH_STOP_WORDS.union(custom_stop_words))

# Tokenizing descriptions for dictionary
tokenized_descriptions = [doc.split() for doc in movies['description']]

# Remove stop words and vectorize to bag of words
vectorizer = CountVectorizer(stop_words=combined_stop_words, min_df=100, max_df=0.06)
X = vectorizer.fit_transform(movies['description'])
vocab = vectorizer.get_feature_names_out()

# Convert sparse matrix to Gensim's corpus
corpus = Sparse2Corpus(X, documents_columns=False)
id2word = Dictionary(tokenized_descriptions)

# Normalize data
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1072bad30>>
Traceback (most recent call last):
  File "/Users/biboy/Library/Python/3.9/lib/python/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

##### Determining Range of Topics for LDA using Log Likelihoods

In [None]:
# log_likelihoods = []
# n_topics_options = range(5, 31)  # Trying from 5 to 30 topics
# for n_topics in n_topics_options:
#     print("Starting LDA", n_topics)
#     lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
#     print("Model insantiated")
#     lda.fit(X_normalized)
#     print("Model fitted")
#     ldaScore = lda.score(X_normalized)
#     log_likelihoods.append(ldaScore)
#     print("LDA", n_topics, ":", ldaScore)

# # Plot log likelihoods
# plt.figure(figsize=(12, 6))
# plt.plot(n_topics_options, log_likelihoods, marker='o')
# plt.xlabel('Number of Topics')
# plt.ylabel('Log Likelihood')
# plt.title('Log Likelihood by Number of Topics')
# plt.show()

##### Determining Model with highest Coherence score from range of Topics

In [None]:
selected_range = range(6,8)  # Selecting a range based on the log likelihood plot (TBD)
coherence_scores = []
models = []
for n_topics in selected_range:
    print("Starting", n_topics)
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, passes=10, random_state=0)
    print("Model instantiated")
    models.append(lda_model)
    
    # Calculate coherence score
    cm = CoherenceModel(model=lda_model, texts=tokenized_descriptions, dictionary=id2word, coherence='c_v')
    coherence = cm.get_coherence()
    print("LDA", n_topics, ":", coherence)
    coherence_scores.append(coherence) 

    # Print top words per topic
    n_top_words = 15

    def print_top_words(model, n_top_words):
        for idx, topic in model.print_topics(-1):
            print(f"Topic #{idx}: {topic}")

    print("Top words per topic:")
    print_top_words(lda_model, n_top_words)


# Plot coherence scores
plt.figure(figsize=(12, 6))
plt.plot(list(selected_range), coherence_scores, marker='o')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Topic Coherence by Number of Topics')
plt.show()

# Select the model with the highest coherence score
best_topic_n = selected_range[np.argmax(coherence_scores)]
best_lda_model = models[np.argmax(coherence_scores)]
lda_transformed = best_lda_model.get_document_topics(corpus, minimum_probability=0)

Starting 6
Model instantiated




LDA 6 : 0.32938387200324193
Top words per topic:
Topic #0: 0.042*"brutish;" + 0.013*"home/gallery" + 0.010*"geeky," + 0.009*"Rubens)," + 0.008*"childen" + 0.008*"'Nenek" + 0.006*"pinch." + 0.006*"z'n" + 0.005*"ensemble," + 0.005*"shootout."
Topic #1: 0.014*"Koyama," + 0.012*"umutsuzluğu," + 0.012*"Tibbett" + 0.010*"Damnation" + 0.008*"drag--" + 0.007*"Disappear" + 0.007*"1felco" + 0.007*"École" + 0.007*"sour-sweet" + 0.006*"incense."
Topic #2: 0.007*"dream—or" + 0.005*"Time"-Cinderella," + 0.005*"Priam" + 0.004*"hibakusha," + 0.004*"Marschall" + 0.004*"Zembla," + 0.004*"Ibaraki" + 0.004*"Wareheim's" + 0.003*"Moullet." + 0.003*"mysteriously"
Topic #3: 0.016*"Tuschinski" + 0.006*"screenwriters." + 0.005*"Auno" + 0.004*"Samford" + 0.004*"largo" + 0.004*"Babli" + 0.004*"couldn't'" + 0.004*"iced." + 0.004*"incense." + 0.004*"Merced"
Topic #4: 0.013*"pinch." + 0.010*"Tuschinski" + 0.009*"1felco" + 0.009*"beers" + 0.009*"Koyama," + 0.008*"throat-clenching" + 0.007*"Figueres" + 0.007*"Samford"

KeyboardInterrupt: 

#### Fitting to K-means Model and Determining Optimal Clusters

In [None]:
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(lda_transformed)
    score = silhouette_score(lda_transformed, kmeans.labels_)
    silhouette_scores.append(score)

# Plot silhouette scores
plt.figure(figsize=(12, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.show()

# Select the optimal number of clusters
optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
kmeans_final = KMeans(n_clusters=optimal_clusters, random_state=0)
kmeans_final.fit(lda_transformed)

#### Results and Plots

In [None]:
movies['Cluster'] = kmeans_final.labels_

# Plot clusters
plt.figure(figsize=(10, 5))
plt.hist(movies['Cluster'], bins=optimal_clusters, alpha=0.7, rwidth=0.85)
plt.xlabel('Cluster')
plt.ylabel('Number of Movies')
plt.title('Distribution of Movies Across Clusters')
plt.xticks(range(optimal_clusters))
plt.show()