#### Imports

In [2]:
import sys
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.matutils import Sparse2Corpus

# Load the movie table; its 'description' column is the text that the cells
# below vectorize and feed to the topic models.
movies = pd.read_csv('./dataset/movies.csv')

#### Preprocessing

In [None]:
# Discard movies that have no description; they cannot be vectorized.
movies.dropna(subset=['description'], inplace=True)

# Bag-of-words counts over the descriptions, with English stop words removed.
# X is a sparse document-term matrix: one row per movie, one column per term.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(movies['description'])
vocab = vectorizer.get_feature_names_out()

# Gensim-side view of the same data: a column-index -> term lookup and a
# streamed corpus. documents_columns=False tells gensim that the documents
# are the rows of X.
id2word = dict(enumerate(vocab))
corpus = Sparse2Corpus(X, documents_columns=False)

# Row-normalized copy of the counts (Normalizer with its default settings).
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)


##### Determining Range of Topics for LDA using Log Likelihoods

In [None]:
# Sweep the number of LDA topics and record each fitted model's approximate
# log-likelihood, so a promising topic range can be read off the plot.
#
# LDA is a generative model of word counts, so it is fitted and scored on the
# raw bag-of-words matrix X. The row-normalized X_normalized shrinks every
# document to a fraction of a "word", which lets the priors dominate and makes
# the scores incomparable with the gensim models trained on raw counts below.
# NOTE: the score is computed on the training data itself, so treat it as a
# rough guide rather than a held-out estimate.
log_likelihoods = []
n_topics_options = range(5, 31)  # Trying from 5 to 30 topics
for n_topics in n_topics_options:
    print("Starting LDA", n_topics)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    print("Model insantiated")
    lda.fit(X)
    print("Model fitted")
    ldaScore = lda.score(X)
    log_likelihoods.append(ldaScore)
    print("LDA", n_topics, ":", ldaScore)

# Plot log likelihoods
plt.figure(figsize=(12, 6))
plt.plot(n_topics_options, log_likelihoods, marker='o')
plt.xlabel('Number of Topics')
plt.ylabel('Log Likelihood')
plt.title('Log Likelihood by Number of Topics')
plt.show()

##### Determining Model with highest Coherence score from range of Topics

In [None]:
# Train one gensim LDA model per candidate topic count, score each with c_v
# coherence, and keep the best one together with its document-topic matrix.
selected_range = range(2,5)  # Selecting a range based on the log likelihood plot (TBD)

# c_v coherence looks tokens up through `dictionary.token2id`, which a plain
# {id: token} dict does not provide, so wrap the mapping in a gensim Dictionary.
coherence_dictionary = Dictionary.from_corpus(corpus, id2word=id2word)

# Tokenize the descriptions exactly as the vectorizer did (lower-casing, token
# pattern, stop-word removal) so the tokens match the vocabulary. A bare
# str.split() leaves capitalised and punctuated tokens that never match.
# Built once here because it does not depend on the number of topics.
analyzer = vectorizer.build_analyzer()
texts = [analyzer(doc) for doc in movies['description']]

coherence_scores = []
models = []
for n_topics in selected_range:
    lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=n_topics, passes=10, random_state=0)
    models.append(lda_model)
    cm = CoherenceModel(model=lda_model, texts=texts, dictionary=coherence_dictionary, coherence='c_v')
    coherence_scores.append(cm.get_coherence())

# Plot coherence scores
plt.figure(figsize=(12, 6))
plt.plot(list(selected_range), coherence_scores, marker='o')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Topic Coherence by Number of Topics')
plt.show()

# Select the model with the highest coherence score
best_index = int(np.argmax(coherence_scores))
best_topic_n = selected_range[best_index]
best_lda_model = models[best_index]

# get_document_topics yields, per document, a list of (topic_id, probability)
# pairs and may omit topics with negligible probability. Scatter the pairs
# into a dense (n_documents, n_topics) array so downstream estimators receive
# a regular 2-D feature matrix.
lda_transformed = np.zeros((X.shape[0], best_topic_n))
doc_topic_stream = best_lda_model.get_document_topics(corpus, minimum_probability=0)
for doc_index, doc_topics in enumerate(doc_topic_stream):
    for topic_id, probability in doc_topics:
        lda_transformed[doc_index, topic_id] = probability

#### Fitting to K-means Model and Determining Optimal Clusters

In [None]:
# Fit K-means on the document-topic features for every candidate cluster
# count and record the silhouette score of each clustering.
cluster_counts = range(2, 11)
silhouette_scores = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(lda_transformed)
    score = silhouette_score(lda_transformed, kmeans.labels_)
    silhouette_scores.append(score)

# Plot silhouette scores
plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.show()

# Pick the cluster count with the highest silhouette score (first one on a
# tie) and refit a final model with it.
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_clusters = cluster_counts[best_position]
kmeans_final = KMeans(n_clusters=optimal_clusters, random_state=0).fit(lda_transformed)

#### Results and Plots

In [None]:
# Attach each movie's cluster label and plot how many movies fall in each
# cluster.
movies['Cluster'] = kmeans_final.labels_

# Cluster labels are the integers 0..optimal_clusters-1. Use bin edges at
# -0.5, 0.5, ..., optimal_clusters-0.5 so every bar is centred on its label
# and lines up with the integer ticks. Passing bins=optimal_clusters spreads
# the edges over [0, optimal_clusters-1] and shifts the bars off the ticks.
cluster_bin_edges = np.arange(optimal_clusters + 1) - 0.5

# Plot clusters
plt.figure(figsize=(10, 5))
plt.hist(movies['Cluster'], bins=cluster_bin_edges, alpha=0.7, rwidth=0.85)
plt.xlabel('Cluster')
plt.ylabel('Number of Movies')
plt.title('Distribution of Movies Across Clusters')
plt.xticks(range(optimal_clusters))
plt.show()