In [28]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy.cluster.vq import kmeans

In [30]:
# Read the movie plot CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('data/movies_plot.csv')
df.head()

Unnamed: 0,Title,Plot
0,The Ballad of Cable Hogue,"Cable Hogue is isolated in the desert, awaitin..."
1,Monsters vs. Aliens,"In the far reaches of space, a planet explodes..."
2,The Bandit Queen,Zarra Montalvo is the daughter of an American ...
3,Broken Arrow,Major Vic Deakins (John Travolta) and Captain ...
4,Dolemite,Dolemite is a pimp and nightclub owner who is ...


In [31]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 2)

In [32]:
# Count missing values per column before using the text downstream.
df.isna().sum()

Title    0
Plot     0
dtype: int64

In [33]:
# Pull the plot summaries out of the frame as a plain Python list of strings.
plots = df['Plot'].tolist()
len(plots)

1000

In [34]:
def remove_noise(text, stop_words = ENGLISH_STOP_WORDS):
    """Tokenize ``text`` and return its cleaned, lowercased tokens.

    Each token produced by ``word_tokenize`` has every character outside
    ``A-Z``, ``a-z`` and ``0-9`` removed. Tokens that end up one character
    long (or empty), or whose lowercase form is in ``stop_words``, are dropped.

    Args:
        text: The string to tokenize.
        stop_words: Container of lowercase words to exclude.

    Returns:
        A list of lowercase tokens, in their original order.
    """
    kept = []
    for raw in word_tokenize(text):
        # Strip punctuation and any other non-alphanumeric characters.
        stripped = re.sub('[^A-Za-z0-9]+', '', raw)
        if len(stripped) <= 1:
            continue
        lowered = stripped.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

In [35]:
# Build TF-IDF features from the plots using the custom tokenizer.
# - max_df=0.8 / min_df=0.1: ignore terms appearing in more than 80% or fewer
#   than 10% of the documents.
# - max_features=50: keep only the 50 most frequent remaining terms.
# - token_pattern=None: the default regex pattern is never used once a custom
#   tokenizer is supplied; passing None states that explicitly and avoids
#   scikit-learn's "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=50,
    min_df=0.1,
    tokenizer=remove_noise,
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(plots)



In [36]:
num_clusters = 2
top_terms_per_cluster = 3  # how many highest-weighted terms to show per cluster
kmeans_seed = 0            # fixed seed so the clustering is repeatable on re-run

# Generate cluster centers through the kmeans function.
# toarray() returns a plain ndarray; todense() returns an np.matrix, a legacy
# type that newer SciPy array-handling code paths may refuse.
# The seed pins the random centroid initialisation so the top terms printed
# below do not change between runs.
cluster_centers, distortion = kmeans(
    tfidf_matrix.toarray(), num_clusters, seed=kmeans_seed
)

# Generate terms from the tfidf_vectorizer object
terms = tfidf_vectorizer.get_feature_names_out()

# Iterate over the centers actually returned instead of range(num_clusters):
# kmeans can return fewer centroids than requested when a cluster ends up with
# no observations, and indexing by position would then raise IndexError.
for center in cluster_centers:
    # Map each term to its weight in this center, then rank terms by weight.
    center_terms = dict(zip(terms, center))
    sorted_terms = sorted(center_terms, key=center_terms.get, reverse=True)
    print(sorted_terms[:top_terms_per_cluster])

['father', 'new', 'man']
['film', 'life', 'new']
