In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import string
import nltk
from nltk.corpus import stopwords

# Probe the NLTK English stop-word list once at start-up. If the lookup
# fails with LookupError, download the "stopwords" corpus so that the later
# stopwords.words(...) call in clean_plots can succeed.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

def remove_stops(text, stops):
    """Normalise a piece of text and drop stop words.

    Processing steps, in order:

    1. Lower-case the text and delete every ASCII punctuation character
       (characters are removed, not replaced by a space, so "well-known"
       becomes "wellknown").
    2. Delete every digit character (anything for which ``str.isdigit`` is
       true).
    3. Split on whitespace and discard tokens found in ``stops``.

    Digits are removed *before* tokenising. Stripping them after the tokens
    were re-joined left doubled / leading / trailing spaces wherever a purely
    numeric token had been (e.g. "agent 007 returns" -> "agent  returns") and
    let tokens such as "the1" slip past the stop-word filter as "the".

    Args:
        text: The string to clean.
        stops: Container of lower-case stop words; only ``in`` membership
            tests are performed on it (a ``set`` is the efficient choice).

    Returns:
        The surviving tokens joined by single spaces. May be an empty string.
    """
    # Lower-case and strip punctuation in one pass.
    normalised = text.lower().translate(str.maketrans("", "", string.punctuation))

    # Drop digit characters before splitting so numeric-only tokens vanish
    # completely instead of leaving empty gaps in the joined result.
    normalised = "".join(ch for ch in normalised if not ch.isdigit())

    return " ".join(token for token in normalised.split() if token not in stops)

def clean_plots(docs):
    """Clean every document in ``docs`` with ``remove_stops``.

    The NLTK English stop-word list is loaded once, turned into a set, and
    reused for all documents.

    Args:
        docs: Iterable of text documents.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    english_stops = set(stopwords.words("english"))

    cleaned = []
    for plot_text in docs:
        cleaned.append(remove_stops(plot_text, english_stops))
    return cleaned

# Read the plot records from disk, then pull out the identifier column and
# the raw plot text; the text is cleaned before vectorisation.
plots_json = pd.read_json("./results/plots.json")
ids = plots_json['id']
plots = plots_json['plot']
cleaned_plots = clean_plots(plots)

In [None]:
# --- TF-IDF features --------------------------------------------------------
# Vocabulary is capped at the 100 highest-ranked uni/bi/tri-grams; terms that
# occur in more than 80% of the plots or in fewer than 5 plots are ignored.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=100,
    max_df=0.8,
    min_df=5,
    ngram_range=(1, 3),
    stop_words="english",
)

vectors = vectorizer.fit_transform(cleaned_plots)

feature_names = vectorizer.get_feature_names_out()

# Dense copies are kept under their original names for any later cell that
# reads them; note they materialise the full (n_docs x n_features) matrix.
dense = vectors.todense()
denselist = dense.tolist()

# For each plot, the vocabulary terms that carry a positive TF-IDF weight.
all_keywords = [
    [feature_names[col] for col, weight in enumerate(doc_weights) if weight > 0]
    for doc_weights in denselist
]

# --- K-means clustering -----------------------------------------------------
true_k = 12
# Fixed seed: with n_init=1 the result depends entirely on one random
# initialisation, so without it cluster ids (and both CSVs below) change on
# every run.
random_state = 42

model = KMeans(
    n_clusters=true_k,
    init="k-means++",
    max_iter=100,
    n_init=1,
    random_state=random_state,
)

model.fit(vectors)

# Per cluster, feature indices sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# Same vocabulary as feature_names; kept as a separate name for later cells.
terms = feature_names

cluster_labels = model.labels_

# --- Persist results --------------------------------------------------------
id_to_cluster = pd.DataFrame({'id': ids, 'cluster': cluster_labels})
id_to_cluster.to_csv('results/id_to_cluster.csv', index=False)

# One row per cluster with its ten heaviest centroid terms.
rows = [
    {
        'Id': cluster_id,
        'Name': f'Cluster {cluster_id}',
        'Top Terms': ', '.join(terms[ind] for ind in order_centroids[cluster_id, :10]),
    }
    for cluster_id in range(true_k)
]

# Build the frame directly from the rows instead of concatenating onto an
# empty DataFrame (a deprecated pandas pattern that also degrades dtypes).
cluster_terms = pd.DataFrame(rows, columns=['Id', 'Name', 'Top Terms'])
cluster_terms.to_csv('results/cluster_terms.csv', index=False)

In [6]:
# Display the vocabulary array produced by the fitted TF-IDF vectorizer.
terms

array(['agent', 'american', 'army', 'attempt', 'battle', 'begins', 'best',
       'bond', 'boy', 'called', 'child', 'city', 'crew', 'crime',
       'dangerous', 'daughter', 'day', 'death', 'detective', 'discover',
       'discovers', 'earth', 'evil', 'falls', 'family', 'father', 'fight',
       'finds', 'forced', 'forces', 'friend', 'friends', 'future', 'game',
       'girl', 'goes', 'group', 'help', 'high', 'high school', 'home',
       'human', 'james', 'john', 'killer', 'learns', 'life', 'lives',
       'love', 'make', 'makes', 'man', 'meets', 'mother', 'murder',
       'mysterious', 'new', 'new york', 'new york city', 'old', 'order',
       'past', 'people', 'plan', 'planet', 'police', 'powerful', 'real',
       'relationship', 'rescue', 'return', 'save', 'school', 'secret',
       'sent', 'sets', 'small', 'son', 'star', 'stop', 'story', 'takes',
       'team', 'teenager', 'time', 'town', 'train', 'tries', 'true',
       'trying', 'war', 'way', 'wife', 'woman', 'work', 'world', 'ye