In [23]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import nltk
from nltk.corpus import stopwords
import string
import fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import HDBSCAN

nltk.download("stopwords")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to /home/woleek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/woleek/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Load data

In [24]:
df = pd.read_csv('../data/movie_plots.csv')
df.head(1)

Unnamed: 0,label,text
0,film noir,The film tells the story of Elizabeth (Colbert...


# Preprocess text

In [25]:
def preprocess_texts(text):
    # remove extra whitespace
    text = text.strip()
    text = " ".join(text.split())
    
    # lowercase text
    text = text.lower()

    # tokenize text
    tokens = nltk.word_tokenize(text)

    # remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # remove stopwords
    tokens = [token for token in tokens if token not in stopwords.words("english")]
    return " ".join(tokens)

In [26]:
df['cleaned_text'] = df['text'].apply(preprocess_texts)
df.head(1)

Unnamed: 0,label,text,cleaned_text
0,film noir,The film tells the story of Elizabeth (Colbert...,film tells story elizabeth colbert john welles...


# Text embeddings

### FastText

In [27]:
ft = fasttext.load_model("../models/cc.en.300.bin")

In [28]:
ft_df = df.copy()

ft_df["ft_embeddings"] = df["cleaned_text"].apply(  
    lambda x: ft.get_sentence_vector(x)
)
ft_df.head(1)

Unnamed: 0,label,text,cleaned_text,ft_embeddings
0,film noir,The film tells the story of Elizabeth (Colbert...,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611..."


In [29]:
ft_embed_arr = np.array(ft_df["ft_embeddings"].to_list())

In [30]:
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(ft_embed_arr)

ft_df['text'] = ft_df['text'].apply(lambda x: " ".join(x.split()[:5]) + "...")
ft_df["x-tsne"] = X_tsne[:, 0]
ft_df["y-tsne"] = X_tsne[:, 1]
ft_df.head(1)

Unnamed: 0,label,text,cleaned_text,ft_embeddings,x-tsne,y-tsne
0,film noir,The film tells the story...,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",-1.394166,1.630814


In [31]:
# Create interactive scatter plot
fig = px.scatter(
    ft_df, x='x-tsne', y='y-tsne', color='label',
    hover_data={'text': True, 'label': True},
    title='Sentences t-SNE Visualization with Fasttext Embeddings', width=1200, height=1200
)

# Show plot
fig.show()

### TF-IDF

In [14]:
vectorizer = TfidfVectorizer(max_features=300)

In [16]:
tfidf_df = df.copy()

X = vectorizer.fit_transform(tfidf_df["cleaned_text"])
print(f" Shape of the TF-IDF matrix: {X.shape}")

tfidf_df["embeddings"] = [X[i].toarray()[0] for i in range(X.shape[0])]

tfidf_emb_arr = np.array(tfidf_df['embeddings'].tolist())

 Shape of the TF-IDF matrix: (300, 300)


In [20]:
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(tfidf_emb_arr)

tfidf_df['text'] = tfidf_df['text'].apply(lambda x: " ".join(x.split()[:5]) + "...")

# Add t-SNE results to the DataFrame
tfidf_df["x-tsne"] = X_tsne[:, 0]
tfidf_df["y-tsne"] = X_tsne[:, 1]

# Create interactive scatter plot
fig_tfidf = px.scatter(
    tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="label",
    hover_data={"text": True, "label": True},
    title="Sentences t-SNE Visualization with TF-IDF Embeddings",
    width=1200,
    height=1200,
)
# Show plot

fig_tfidf.show()

### HDBSCAN

In [42]:
hdbscan = HDBSCAN(
    min_cluster_size=5, metric="euclidean", cluster_selection_method="eom"
)

In [43]:
ft_df["cluster"] = hdbscan.fit_predict(ft_embed_arr)
ft_df.head(1)

Unnamed: 0,label,text,cleaned_text,ft_embeddings,x-tsne,y-tsne,cluster
0,film noir,The film tells the story...,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",-1.394166,1.630814,0


In [49]:
# Create interactive scatter plot
ft_df["cluster"] = ft_df["cluster"].astype(str)
fig = px.scatter(
    ft_df, x='x-tsne', y='y-tsne', color='cluster',
    hover_data={'text': True, 'label': True, "cluster": True},
    title='Sentences t-SNE Visualization with FastText Embeddings and HDBSCAN Clustering', width=1200, height=1200
)

# Show plot
fig.show()

In [46]:
tfidf_df['cluster'] = hdbscan.fit_predict(tfidf_emb_arr)

In [50]:
tfidf_df['cluster'] = tfidf_df['cluster'].astype(str)
fig_cluster = px.scatter(
    tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="cluster",
    hover_data={"text": True, "label": True, "cluster": True},
    title="Sentences t-SNE Visualization with TF-IDF Embeddings and HDBSCAN Clustering",
    width=1200,
    height=1200,
)
fig_cluster.show()