### Natural Language Processing Final Exam
#### Question 1) Text processing, feature extraction and representation by using both TF and TF-IDF schemes

In [None]:
#%pip install Pillow
#%pip install --upgrade Pillow
#%pip install WordCloud
#%pip install --upgrade WordCloud
#%pip uninstall Pillow
#%pip install --upgrade Pillow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import string
import warnings

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud


warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the exam dataset from Exam_NLP.csv (path is relative to the notebook's working directory)
# into `df`, the raw frame that later cells copy from. Show its shape and the first rows.
df = pd.read_csv('Exam_NLP.csv')
print(df.shape)
df.head()

### Task 1: Data Preprocessing
#### Access the columns, then through printing and visualization, understand the meaning in each column.

In [None]:
# Work on a copy so the raw frame `df` stays untouched, then list the column dtypes.
df_NLP_q1 = df.copy()
print(df_NLP_q1.dtypes)

In [None]:
# Parse 'release_date' into pandas datetimes so the `.dt` accessor can be used on it later.
df_NLP_q1['release_date'] = pd.to_datetime(df_NLP_q1['release_date'])

In [None]:
# Display the distinct values found in the 'original_language' column.
df_NLP_q1['original_language'].unique()

In [None]:
# Tabulate how many movies exist per original language (most frequent first),
# with human-readable column headers, and print the table.
language_counts = (
    df_NLP_q1['original_language']
    .value_counts()
    .reset_index()
    .set_axis(['Original Language', 'Number of Movies'], axis=1)
)

print(language_counts)

In [None]:
# Plot - top 10 languages
#
# The dominant language gets its own narrow panel so that it does not dwarf the
# other nine, which share the wide panel on the right.

language_counts = df_NLP_q1['original_language'].value_counts()
top_language = language_counts.idxmax()
top_languages = language_counts.nlargest(10).index

# Languages for the right-hand panel: the top 10 minus the dominant one.
# Derived from the data instead of hard-coding 'en'.
other_languages = [lang for lang in top_languages if lang != top_language]

# Set the style for the plot
sns.set(style="whitegrid")

# Set up subplots
fig, axs = plt.subplots(1, 2, figsize=(18, 8), gridspec_kw={'width_ratios': [1, 4]})

# Left panel: the single most frequent language
is_top_language = df_NLP_q1['original_language'] == top_language
sns.countplot(x='original_language', data=df_NLP_q1[is_top_language], palette='viridis', ax=axs[0])
axs[0].set_title(f'{top_language.capitalize()} ({int(is_top_language.sum())})', fontsize=16)

# Right panel: the remaining languages, ordered by frequency.
# `order` excludes the top language so no empty slot is reserved for it.
is_other_language = df_NLP_q1['original_language'].isin(other_languages)
sns.countplot(
    x='original_language',
    data=df_NLP_q1[is_other_language],
    palette='viridis',
    order=other_languages,
    ax=axs[1],
)
axs[1].set_title('Other Languages', fontsize=16)

# Rotate x-axis labels for better readability
axs[1].tick_params(axis='x', rotation=90, labelsize=14)

# Add the movie count on top of every bar (as an integer, skipping empty/NaN patches)
for ax in axs:
    for p in ax.patches:
        bar_height = p.get_height()
        if not bar_height > 0:
            continue
        ax.annotate(
            f'{int(bar_height)}',
            (p.get_x() + p.get_width() / 2., bar_height),
            ha='center', va='baseline', fontsize=20,
        )

# Add labels and title
fig.suptitle('Top 10 Original Languages', fontsize=18)
axs[0].set_xlabel('Original Language')
axs[1].set_xlabel('Original Language')
axs[0].set_ylabel('Number of Movies')
axs[1].set_ylabel('Number of Movies')

axs[0].tick_params(axis='y', labelsize=14)
axs[1].tick_params(axis='y', labelsize=14)

# Tighten the horizontal padding around the bars of the right-hand panel
axs[1].margins(x=0)

# Adjust layout for better spacing (leave room for the suptitle)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
# Render floats with two fixed decimals (no scientific notation) in every pandas display
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary statistics of the 'revenue' column, printed in plain decimal form
revenue_stats = df_NLP_q1['revenue'].describe()
print(revenue_stats)

In [None]:
# Plot - Average Budget and Revenue Over Release Years with Variance Shadows

# Make sure release_date is a datetime and derive the release year from it
df_NLP_q1['release_date'] = pd.to_datetime(df_NLP_q1['release_date'])
df_NLP_q1['release_year'] = df_NLP_q1['release_date'].dt.year

# Plot styling and canvas
sns.set(style="whitegrid")
plt.figure(figsize=(12, 6))

# One mean line per money column; the shaded band is one standard deviation
for money_column, legend_label in (('budget', 'Average Budget'), ('revenue', 'Average Revenue')):
    sns.lineplot(
        x='release_year',
        y=money_column,
        data=df_NLP_q1,
        label=legend_label,
        err_style="band",
        lw=2,
        errorbar='sd',
    )

# Labels, title and legend
plt.xlabel('Release Year')
plt.ylabel('Amount (in billions)')
plt.title('Average Budget and Revenue Over Release Years with Variance Shadows')
plt.legend()
plt.show()

In [None]:
# Plot - Average Budget and Revenue Over Release Years with Min-Max Shadows

# Make sure release_date is a datetime and derive the release year from it
df_NLP_q1['release_date'] = pd.to_datetime(df_NLP_q1['release_date'])
df_NLP_q1['release_year'] = df_NLP_q1['release_date'].dt.year

# Set the style for the plot
sns.set(style="whitegrid")

# Set the figure size
plt.figure(figsize=(12, 6))

# One mean line per money column. The band is a 100% percentile interval, i.e. it
# spans the yearly minimum to the yearly maximum, which is what the title promises
# (the band was previously a standard-deviation band, duplicating the plot above).
for money_column, legend_label in (('budget', 'Average Budget'), ('revenue', 'Average Revenue')):
    sns.lineplot(
        x='release_year',
        y=money_column,
        data=df_NLP_q1,
        label=legend_label,
        lw=2,
        err_style="band",
        errorbar=('pi', 100),
        err_kws={'alpha': 0.2},
    )

# Add labels and title
plt.xlabel('Release Year')
plt.ylabel('Amount (in billions)')
plt.title('Average Budget and Revenue Over Release Years with Min-Max Shadows')
plt.legend()
plt.show()

### Then create a new column, name it as ‘description’ by concatenating the strings from two columns: tagline and overview.

In [None]:
# Count the missing values in each of the two text columns ('tagline' and 'overview').
df_NLP_q1[['tagline', 'overview']].isnull().sum()

In [None]:
# Print the (rows, columns) shape of the working frame and display its first 10 rows.
print(df_NLP_q1.shape)
df_NLP_q1.head(10)

In [None]:
# Count the rows where 'tagline' AND 'overview' are both missing
# (isnull().all(axis=1) is True only when every listed column is null in that row).
df_NLP_q1[['tagline', 'overview']].isnull().all(axis=1).sum()

In [None]:
# Create the 'description' column from 'tagline' and 'overview', dropping rows with missing values in BOTH columns

# Drop rows with missing values in both 'tagline' and 'overview' columns
df_NLP_q1 = df_NLP_q1.dropna(subset=['tagline', 'overview'], how='all')

# Create the 'description' column.
# A missing tagline or overview is replaced by an empty string BEFORE concatenation;
# casting NaN with astype(str) would otherwise inject the literal word "nan" into the text.
df_NLP_q1['description'] = (
    df_NLP_q1['tagline'].fillna('').astype(str)
    + ' '
    + df_NLP_q1['overview'].fillna('').astype(str)
).str.strip()

# Check for the shape of the DataFrame
print(df_NLP_q1.shape)

# Display the head of the 'description' column
print(df_NLP_q1['description'].head())

In [None]:
# Count the missing values in the newly created 'description' column.
df_NLP_q1['description'].isnull().sum()

### Task 2: Text Preprocessing
#### Convert words in ‘description’ to lower case, remove white space, remove words from stop_words (from the nltk package), remove special characters (such as ‘\n’) and add any other necessary processing.

In [None]:
# Create a text_processing function.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Normalise one movie description for vectorisation.

    Steps, in order: lowercase, collapse whitespace, drop NLTK English stop
    words, strip ASCII punctuation and digits, lemmatise each token with
    WordNet, and drop the placeholder token 'nan'.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. a float NaN) yields ''.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase; split() with no argument also removes newlines, tabs and repeated spaces
    tokens = text.lower().split()

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)

    # Remove special characters (punctuation) and numbers
    text = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )

    # Lemmatize using WordNetLemmatizer
    tokens = [lemmatizer.lemmatize(word) for word in text.split()]

    # Remove NaN placeholders as WHOLE tokens only. A substring replace of 'nan'
    # would also corrupt real words such as "finance" or "tenant".
    tokens = [word for word in tokens if word != 'nan']

    return ' '.join(tokens)

# Apply text processing to the 'description' column
df_NLP_q1['processed_description'] = df_NLP_q1['description'].apply(process_text)

# Check the shape of the DataFrame
print(df_NLP_q1.shape)

# Output the first 10 rows of the processed text
print(df_NLP_q1[['processed_description']].head(10))

### Task 3: TF and TF-IDF representation on ‘description’: for each sample in the dataset, generate the TF and TF-IDF representations based on the ‘description’ column.

In [None]:
# Create tf vector (bag-of-words term counts)

# Corpus: one processed description string per movie
movie_descriptions = df_NLP_q1['processed_description'].copy()
n_samples = movie_descriptions.shape[0]  # Number of samples (documents)
n_features = 3000  # Vocabulary size cap

# Count vectoriser limited to the n_features most frequent terms,
# with scikit-learn's built-in English stop-word list applied on top
count_vectorizer = CountVectorizer(max_features=n_features, stop_words="english")

# Learn the vocabulary and build the sparse document-term count matrix
tf_vector = count_vectorizer.fit_transform(movie_descriptions)
tf_feature_names = count_vectorizer.get_feature_names_out()

# Explore the TF vectors: container type, shape, first 50 terms, dense view
for tf_summary in (type(tf_vector), tf_vector.shape, tf_feature_names[:50], tf_vector.toarray()):
    print(tf_summary)


In [None]:
# Create TF-IDF vectors

# Corpus: one processed description string per movie
movie_descriptions = df_NLP_q1['processed_description'].copy()
n_samples = movie_descriptions.shape[0]  # Number of samples (documents)
n_features = 3000  # Vocabulary size cap

# TF-IDF vectoriser limited to the n_features most frequent terms,
# with scikit-learn's built-in English stop-word list applied on top
tfidf_vectorizer = TfidfVectorizer(max_features=n_features, stop_words="english")

# Learn vocabulary + IDF weights and build the sparse document-term TF-IDF matrix
tfidf = tfidf_vectorizer.fit_transform(movie_descriptions)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Explore the TF-IDF vectors: container type, shape, first 50 terms, dense view
for tfidf_summary in (type(tfidf), tfidf.shape, tfidf_feature_names[:50], tfidf.toarray()):
    print(tfidf_summary)

### 2.2 Topic Modeling (10 Marks)
Use TF and TF-IDF representation generated in task 2.1 to perform topic modelling. Select and compare two topic modelling algorithms from LDA, Truncated SVD, Word2Vec or any other topic modelling algorithms, and then analyze the results.

We choose to compare LDA and Truncated SVD.

In [None]:
### 1) LDA : Latent Dirichlet Allocation

# Number of topics; reused by later cells
n_topics = 10

# Hyper-parameters of the online variational LDA; the seed is fixed for repeatable topics
lda_settings = dict(
    n_components=n_topics,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=42,
)
LDA_model = LatentDirichletAllocation(**lda_settings)

# Fit on the TF-IDF document-term matrix (the fitted estimator is displayed)
LDA_model.fit(tfidf)

In [None]:
### 2) TruncatedSVD with TFIDF vectorizer

TOTAL_TOPICS = 10  # we assume the optimal number of topics
top_terms = 30     # number of key terms kept per topic

# TruncatedSVD defaults to a randomized solver, so the seed is fixed to make
# the topics reproducible under Restart & Run All.
svd = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=100, random_state=42)

# Document-topic matrix, wrapped in a DataFrame (one row per movie, one column per topic)
svd_topic_vectors = pd.DataFrame(svd.fit_transform(tfidf))

# Topic-term matrix: one row of term loadings per topic
topic_terms = svd.components_

# Key terms per topic, ranked by absolute loading
vocabulary = np.array(tfidf_feature_names)
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
# Signed loadings of those key terms, gathered row by row
topic_keyterm_weights = np.take_along_axis(topic_terms, topic_key_term_idxs, axis=1)

# Display the document-topic vectors rounded to 3 decimals
# (as the last expression of the cell, so the rounded table is actually shown)
svd_topic_vectors.round(3)

In [None]:
def plot_top_words(model, feature_names, n_top_words, title, importance_threshold=0.05, filename='topics.png'):
    """Draw one horizontal bar chart of the top words for every topic of `model`.

    Parameters
    ----------
    model : fitted estimator exposing `components_` (topics x terms)
    feature_names : sequence of str
        Vocabulary; position i names column i of `model.components_`.
    n_top_words : int
        Number of highest-weighted words shown per topic.
    title : str
        Figure-level title.
    importance_threshold : float, default 0.05
        Topics whose largest weight is below this value are not drawn.
    filename : str, default 'topics.png'
        Path the figure is saved to. Pass a distinct name per call to avoid
        overwriting the previous figure.

    Side effects: saves the figure to `filename` and shows it.
    """
    n_model_topics = model.components_.shape[0]
    n_cols = 5
    # Ceiling division: enough rows for every topic (10 topics -> the 2 x 5 grid)
    n_rows = max(1, -(-n_model_topics // n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), sharex=True, squeeze=False)
    axes = axes.flatten()

    # Set once, so the title is present even if every topic is skipped
    fig.suptitle(title, fontsize=40)

    for topic_idx, topic in enumerate(model.components_):
        ax = axes[topic_idx]
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        # Normalise by the total absolute weight: identical to a plain sum for
        # non-negative models, and it keeps the sign and scale sane for models
        # whose components can be negative.
        total_weight = np.abs(weights).sum()

        # Hide topics that are too weak (or empty) instead of leaving a blank axes
        if weights.max() < importance_threshold or total_weight == 0:
            ax.set_visible(False)
            continue

        weights = weights / total_weight

        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        ax.grid(False)  # Remove the background grid
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # Hide grid cells that have no topic assigned
    for unused_ax in axes[n_model_topics:]:
        unused_ax.set_visible(False)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.20, hspace=0.2)
    plt.savefig(filename)
    plt.show()
    
# Plot the topics for LDA BOTH with TF and TF-IDF
plot_top_words(LDA_model, tfidf_feature_names, n_top_words=10, title="Topics in LDA model with TFIDF vector")

# LDA_model was fitted on the TF-IDF matrix only. The TF plot needs its own model
# fitted on the TF counts (same hyper-parameters), otherwise it just repeats the plot above.
LDA_model_tf = LatentDirichletAllocation(**LDA_model.get_params()).fit(tf_vector)
plot_top_words(LDA_model_tf, tf_feature_names, n_top_words=10, title="Topics in LDA model with TF vector")

In [None]:
# Plot the topics for TruncatedSVD with TF-IDF
# (seeded: the default randomized solver is otherwise not reproducible)
svd_tfidf = TruncatedSVD(n_components=10, n_iter=100, random_state=42)
svd_tfidf_topic_vectors = svd_tfidf.fit_transform(tfidf)
plot_top_words(svd_tfidf, tfidf_feature_names, n_top_words=10, title="Topics in TruncatedSVD model with TFIDF vector")

# Plot the topics for TruncatedSVD with TF
# TruncatedSVD accepts sparse input directly, so the count matrix is not densified.
svd_tf = TruncatedSVD(n_components=10, n_iter=100, random_state=42)
svd_tf_topic_vectors = svd_tf.fit_transform(tf_vector)
plot_top_words(svd_tf, tf_feature_names, n_top_words=10, title="Topics in TruncatedSVD model with TF vector")

# Keep the names later cells refer to; as before, they end up pointing at the TF fit
svd, svd_topic_vectors = svd_tf, svd_tf_topic_vectors

In [None]:
### 4) Non-Negative Matrix Factorization(NMF):

# Fit NMF on the TF-IDF matrix
nmf_model = NMF(n_components=n_topics, init='random', random_state=42)
nmf_topic_vectors = nmf_model.fit_transform(tfidf)

# Display top words for each topic (TF-IDF model)
plot_top_words(nmf_model, tfidf_feature_names, n_top_words=10, title="Topics in nmf_model with TFIDF vector")

# The TF plot needs a model fitted on the TF counts; reusing nmf_model would
# only redraw the TF-IDF topics under a different title.
nmf_model_tf = NMF(n_components=n_topics, init='random', random_state=42)
nmf_tf_topic_vectors = nmf_model_tf.fit_transform(tf_vector)
plot_top_words(nmf_model_tf, tf_feature_names, n_top_words=10, title="Topics in nmf_model with TF vector")

### Model Evaluation


In [None]:
# Topic perplexity


# Create a function to compute perplexity
def compute_perplexity_score(model, vectors):
    """Return a perplexity-style score of `model` on the document-term matrix `vectors`.

    * LatentDirichletAllocation: the model's own `perplexity(vectors)`.
    * TruncatedSVD / NMF: these models have no perplexity method, so the score is
      exp(reconstruction_error / n_documents), where the error is the norm of
      (vectors - transform(vectors) @ components_). It is a reconstruction-error
      proxy and is not on the same scale as the LDA perplexity.

    Raises
    ------
    ValueError
        If `model` is none of the three supported estimator types.
    """
    if isinstance(model, LatentDirichletAllocation):
        return model.perplexity(vectors)

    if not isinstance(model, (TruncatedSVD, NMF)):
        raise ValueError("Unsupported model type")

    # Project the documents onto the topics and map them back to term space
    document_topics = model.transform(vectors)
    reconstructed = document_topics @ model.components_
    error = np.linalg.norm(vectors - reconstructed)
    return np.exp(error / vectors.shape[0])


# Score every (model family, representation) pair.
# Each score must come from a model fitted on the SAME representation it is
# evaluated on. The shared LDA_model / svd / nmf_model objects are each fitted
# on a single matrix only, so a fresh copy with identical hyper-parameters is
# fitted per pair before scoring.
perplexity_cases = [
    ("LDA_TF", LDA_model, tf_vector),
    ("LDA_TFIDF", LDA_model, tfidf),
    ("TruncatedSVD_TF", svd, tf_vector),
    ("TruncatedSVD_TFIDF", svd, tfidf),
    ("NMF_TF", nmf_model, tf_vector),
    ("NMF_TFIDF", nmf_model, tfidf),
]

perplexity_scores = {}
for case_name, template_model, case_vectors in perplexity_cases:
    fitted_model = type(template_model)(**template_model.get_params()).fit(case_vectors)
    perplexity_scores[case_name] = compute_perplexity_score(fitted_model, case_vectors)
    print(f"Perplexity Score for {case_name}: {perplexity_scores[case_name]}")

# Individual names kept for the cells that refer to them
perplexity_score_lda_tf = perplexity_scores["LDA_TF"]
perplexity_score_lda_tfidf = perplexity_scores["LDA_TFIDF"]
perplexity_score_svd_tf = perplexity_scores["TruncatedSVD_TF"]
perplexity_score_svd_tfidf = perplexity_scores["TruncatedSVD_TFIDF"]
perplexity_score_nmf_tf = perplexity_scores["NMF_TF"]
perplexity_score_nmf_tfidf = perplexity_scores["NMF_TFIDF"]

In [None]:
# Plot - Perplexity Score over the models

# Set the seaborn style
sns.set_style('whitegrid')

# Perplexity scores, taken from the variables computed in the previous cell
# (instead of numbers pasted from an earlier run, which go stale on re-execution)
perplexity_tf = [perplexity_score_lda_tf, perplexity_score_svd_tf, perplexity_score_nmf_tf]  # LDA, SVD, NMF for TF
perplexity_tfidf = [perplexity_score_lda_tfidf, perplexity_score_svd_tfidf, perplexity_score_nmf_tfidf]  # LDA, SVD, NMF for TF-IDF

# Names of the models
names = ['LDA', 'Truncated SVD', 'NMF']

# Set the positions and width for the bars
positions = np.arange(len(names))
width = 0.35

# Plotting the bars
fig, ax = plt.subplots(figsize=(8, 6))

# Create bars for TF
tf_per = ax.bar(positions - width/2, perplexity_tf, width, label='TF', color='darkseagreen')

# Create bars for TF-IDF
tfidf_per = ax.bar(positions + width/2, perplexity_tfidf, width, label='TF-IDF', color='#4361EE')

# [https://stackoverflow.com/questions/58325443/how-to-annotate-bar-chart-with-values-different-to-those-from-get-height]
def text_label(bars_charts):
    """Write each bar's height (rounded to 2 decimals) just above it on the current `ax`."""
    for i in bars_charts:
        height = i.get_height()
        ax.annotate('{}'.format(round(height, 2)),
                    xy=(i.get_x() + i.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom',
                    fontsize=10)

# Adding titles and labels
ax.set_title('Perplexity Score')
ax.set_xlabel('Models')
ax.set_ylabel('Perplexity Score (log scale)')
ax.set_xticks(positions)
ax.set_xticklabels(names)
ax.tick_params(axis='y', labelsize=10)
ax.tick_params(axis='x', labelsize=10)
ax.yaxis.label.set_size(10)
ax.xaxis.label.set_size(10)
ax.title.set_size(11)
# Log scale: the LDA values are orders of magnitude larger than the SVD/NMF ones
ax.set_yscale('log')
ax.grid(False)
ax.legend()
text_label(tf_per)
text_label(tfidf_per)

# Change y-axis labels to normal numbers
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Topic Stability score


# Ignore convergence warnings
warnings.simplefilter("ignore", ConvergenceWarning)

# Function to compute topic stability
def compute_topic_stability(model, vectors, n_runs=10):
    """Estimate how stable the topics of `model` are across independent fits.

    A fresh copy of `model` (same hyper-parameters) is fitted `n_runs` times on
    `vectors`, each time with a different `random_state` when the estimator has
    that parameter. For every pair of runs, each topic of the first run is
    matched to its most similar topic of the second run (absolute cosine
    similarity of the topic-word rows); the score is the mean of those
    best-match similarities over all pairs of runs.

    The caller's `model` is used only as a template and is NOT refitted.

    Parameters
    ----------
    model : estimator with `get_params()` whose fit exposes `components_`
    vectors : document-term matrix to fit on
    n_runs : int, default 10

    Returns
    -------
    float
        Higher means more stable. NaN when fewer than two runs are requested,
        because stability needs at least one pair of runs to compare.
    """
    topic_word_matrices = []
    for run in range(n_runs):
        params = model.get_params()
        # Vary the seed per run; with one fixed seed every run would be identical
        if 'random_state' in params:
            params['random_state'] = run
        fitted = type(model)(**params).fit(vectors)
        topic_word_matrices.append(fitted.components_)

    if len(topic_word_matrices) < 2:
        return float('nan')

    pair_scores = []
    for first in range(len(topic_word_matrices)):
        for second in range(first + 1, len(topic_word_matrices)):
            # abs(): treat a component and its negation as the same topic direction
            similarity = np.abs(cosine_similarity(topic_word_matrices[first], topic_word_matrices[second]))
            # Best match in the second run for every topic of the first run
            pair_scores.append(similarity.max(axis=1).mean())

    return float(np.mean(pair_scores))

# Models, the matrices they are scored on, and the label printed for each pair
models = [LDA_model, LDA_model, svd, svd, nmf_model, nmf_model]
vectors = [tf_vector, tfidf, tf_vector, tfidf, tf_vector, tfidf]
stability_labels = ["LDA_TF", "LDA_TFIDF", "TruncatedSVD_TF", "TruncatedSVD_TFIDF", "NMF_TF", "NMF_TFIDF"]

# Compute and print the topic stability score of every pair, in order
for model, vector, model_name in zip(models, vectors, stability_labels):
    stability_score = compute_topic_stability(model, vector)
    print(f"Topic Stability for {model_name}: {stability_score}")

In [None]:
# Plot - Topic stability score

# Scores per model family (order: LDA, SVD, NMF), one list per representation
topic_stability_tf = [0.2409324219514124, 0.09999999999999998, 0.1449234941037762]
topic_stability_tfidf = [0.6862930148616094, 0.10000000000000005, 0.178338990363186]

names = ['LDA', 'Truncated SVD', 'NMF']

# Grouped bars: TF on the left half of each slot, TF-IDF on the right half
positions = np.arange(len(names))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 6))
tf_topic_stability = ax.bar(positions - width/2, topic_stability_tf, width, label='TF', color='darkseagreen')
tfidf_topic_stabilty = ax.bar(positions + width/2, topic_stability_tfidf, width, label='TF-IDF', color='#4361EE')

# [https://stackoverflow.com/questions/58325443/how-to-annotate-bar-chart-with-values-different-to-those-from-get-height]
def text_label(bars_charts):
    """Annotate every bar with its height, rounded to 2 decimals, slightly above the bar."""
    for bar in bars_charts:
        bar_top = bar.get_height()
        ax.annotate(f'{round(bar_top, 2)}',
                    xy=(bar.get_x() + bar.get_width() / 2, bar_top),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom',
                    fontsize=10)

# Titles, axis labels and tick labels
ax.set_title('Topic stability score', fontsize=11)
ax.set_xlabel('Models', fontsize=10)
ax.set_ylabel('Topic stability score', fontsize=10)
ax.set_xticks(positions)
ax.set_xticklabels(names)
ax.tick_params(axis='both', labelsize=10)

# Linear y-axis, no grid, legend with default settings
ax.set_yscale('linear')
ax.grid(False)
ax.legend()

# Value labels on both bar groups
for bar_group in (tf_topic_stability, tfidf_topic_stabilty):
    text_label(bar_group)

# Show y tick values as plain numbers
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter("{x}"))

plt.tight_layout()
plt.show()

In [None]:
# Topic Diversity


def compute_topic_diversity(model, feature_names, n_top_words=10):
    """Average pairwise Jaccard similarity between the topics' top-word sets.

    NOTE: the returned value is a SIMILARITY, so a LOWER value means the topics
    share fewer top words, i.e. they are MORE diverse.

    Parameters
    ----------
    model : object exposing `components_` (topics x terms array)
    feature_names : sequence of str (list or numpy array)
        Vocabulary; position i names column i of `model.components_`.
    n_top_words : int, default 10
        Number of highest-weighted words that represent each topic.

    Returns
    -------
    float
        Mean Jaccard similarity over all topic pairs; 0.0 when the model has
        fewer than two topics (there is no pair to compare).
    """
    # Accept plain lists as well as arrays: the fancy indexing below needs an array
    vocabulary = np.asarray(feature_names)

    # Top words of each topic, as sets
    top_word_sets = [
        set(vocabulary[topic.argsort()[-n_top_words:][::-1]])
        for topic in model.components_
    ]

    # Pairwise Jaccard similarity between the sets of top words
    similarities = []
    for i in range(len(top_word_sets)):
        for j in range(i + 1, len(top_word_sets)):
            union = top_word_sets[i] | top_word_sets[j]
            if not union:
                # Both sets empty (e.g. n_top_words=0): nothing shared, nothing to divide
                similarities.append(0.0)
                continue
            similarities.append(len(top_word_sets[i] & top_word_sets[j]) / len(union))

    if not similarities:
        return 0.0

    # Calculate average similarity
    return sum(similarities) / len(similarities)

# List of models, the matrix each label refers to, and the vocabulary of that matrix
models = [LDA_model, LDA_model, svd, svd, nmf_model, nmf_model]
vectors = [tf_vector, tfidf, tf_vector, tfidf, tf_vector, tfidf]
model_names = ["LDA_TF", "LDA_TFIDF", "TruncatedSVD_TF", "TruncatedSVD_TFIDF", "NMF_TF", "NMF_TFIDF"]
vocabularies = [tf_feature_names, tfidf_feature_names] * 3

# Loop to compute and print only topic diversity scores.
# Each score is computed on a copy of the model fitted on the representation named
# in its label, using that representation's vocabulary. Scoring the shared model
# object twice would print the same number for the TF and the TF-IDF label.
for model, vector, vocabulary_for_vector, model_name in zip(models, vectors, vocabularies, model_names):
    fitted_model = type(model)(**model.get_params()).fit(vector)
    topic_diversity = compute_topic_diversity(fitted_model, vocabulary_for_vector)
    print(f"Topic Diversity for {model_name}: {topic_diversity}")

In [None]:
# Plot - Topic diversity score

# Scores per model family (order: LDA, SVD, NMF), one list per representation
topic_diversity_tf = [0.014035087719298244, 0.1181504960658727, 0.008187134502923975]
topic_diversity_tfidf = [0.014035087719298244, 0.1181504960658727, 0.008187134502923975]

names = ['LDA', 'Truncated SVD', 'NMF']

# Grouped bars: TF on the left half of each slot, TF-IDF on the right half
positions = np.arange(len(names))
width = 0.35

fig, ax = plt.subplots(figsize=(8, 6))
tf_topic_diver = ax.bar(positions - width/2, topic_diversity_tf, width, label='TF', color='darkseagreen')
tfidf_topic_diver = ax.bar(positions + width/2, topic_diversity_tfidf, width, label='TF-IDF', color='#4361EE')

# [https://stackoverflow.com/questions/58325443/how-to-annotate-bar-chart-with-values-different-to-those-from-get-height]
def text_label(bars_charts):
    """Annotate every bar with its height, rounded to 2 decimals, slightly above the bar."""
    for bar in bars_charts:
        bar_top = bar.get_height()
        ax.annotate(f'{round(bar_top, 2)}',
                    xy=(bar.get_x() + bar.get_width() / 2, bar_top),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom',
                    fontsize=10)

# Titles, axis labels and tick labels
ax.set_title('Topic diversity score', fontsize=11)
ax.set_xlabel('Models', fontsize=10)
ax.set_ylabel('Topic diversity score', fontsize=10)
ax.set_xticks(positions)
ax.set_xticklabels(names)
ax.tick_params(axis='both', labelsize=10)

# Linear y-axis, no grid, legend with default settings
ax.set_yscale('linear')
ax.grid(False)
ax.legend()

# Value labels on both bar groups
for bar_group in (tf_topic_diver, tfidf_topic_diver):
    text_label(bar_group)

plt.tight_layout()
plt.show()

# Evaluation of models
Evaluating the performance of topic modeling, whether based on Term Frequency (TF) or Term Frequency-Inverse Document Frequency (TF-IDF), involves assessing the quality and coherence of the generated topics. Here are several common methods for evaluating topic modeling performance:

Perplexity:

Perplexity is a commonly used metric for assessing topic model performance. It measures how well the model predicts a held-out test set. Lower perplexity values indicate better performance. However, it's important to note that perplexity may not always correlate perfectly with the human interpretability of topics.


Coherence Score:

Coherence scores measure the semantic similarity between high-scoring words in a topic. Higher coherence scores suggest more coherent topics. Common coherence measures include Pointwise Mutual Information (PMI) and Normalized Pointwise Mutual Information (NPMI). Libraries like Gensim provide functions to compute coherence scores.

Manual Inspection:

Ultimately, the interpretability of topics is crucial. Manually inspecting the generated topics allows you to assess whether they make sense and align with your expectations. This involves looking at the top words in each topic and determining if they form coherent and meaningful themes.

Visualization:

Visualization techniques, such as t-SNE or UMAP, can be used to project high-dimensional topic distributions into 2D or 3D space. This allows you to visually inspect the separation and grouping of topics.

Topic Stability:

Topic stability assesses how consistent topics are across different runs or subsets of the data. This can be measured using techniques like Jaccard similarity or topic overlap.

Human Evaluation:

Conducting surveys or getting domain experts to evaluate the quality of topics is a valuable but resource-intensive approach. Experts can provide qualitative feedback on the relevance and coherence of topics.

Topic Diversity:

Assess whether topics cover a diverse range of themes or if they are too similar. A good topic model should capture diverse aspects of the dataset.
It's essential to use a combination of these evaluation methods, as no single metric can fully capture the quality of generated topics. Additionally, the choice of evaluation metrics may depend on the specific goals of your analysis and the characteristics of your data.

### 2.3. Analysis Task: Searching for similar movies (30 Marks):

Assume you would like to find similar movies as ‘Harry Potter and the Half-Blood Prince’ based
on the given dataset, what would you do? Assume you already know that the user DOES NOT like
‘Harry Potter and the Half-Blood Prince’, then which movies would you suggest the user to watch?
Then write a report using “An example of possible report structure” shown above. Please introduce
your solution, where minimum information should be provided as follows:
1. Details on each step and expected inputs/outputs of each step
2. Major algorithm to be used to solve this problem
3. The results
4. Analysis on the results
Be noted that visualization should be used when exploring the data or illustrating the results.


# To find the similar movies to 'Harry Potter and the Half-Blood Prince'

The following are the steps:
    1. Create a DataFrame with the movie title and description
    2. Tokenize the description
    3. Vectorize the description (TF-IDF)
    4. Compute Similarity (cosine similarity)
    5. Retrieve the similar movies, or dissimilar movies 



In [None]:
# Steps 1-3 were already carried out in the earlier tasks:
#   1. a DataFrame holding the movie title and description
#   2. the processed (tokenised/cleaned) description
#   3. the TF-IDF vectorisation of that description
tfidf_matrix = tfidf

# 4. Compute Similarity (cosine similarity) between every pair of movies.
#    With a single argument the matrix is compared against itself.
cosine_similarities = cosine_similarity(tfidf_matrix)

# 5. Retrieve the similar movies, or dissimilar movies
def get_similar_movies(movie_title, cosine_similarities, titles):
    """Rank every other movie by cosine similarity to `movie_title`.

    Parameters
    ----------
    movie_title : str
        Title of the query movie; must be present in `titles`.
    cosine_similarities : 2-D array-like
        Pairwise similarity matrix; row/column i corresponds to `titles[i]`.
    titles : list of str
        Movie titles in the same order as the rows of `cosine_similarities`.

    Returns
    -------
    list of (title, score)
        All movies except the query itself, most similar first.

    Raises
    ------
    ValueError
        If `movie_title` is not in `titles` (raised by `list.index`).
    """
    idx = titles.index(movie_title)

    # Exclude the query by its INDEX. Dropping the first sorted entry is wrong
    # whenever another movie ties with the query (identical description) or the
    # query row is all zeros, because the query is then not guaranteed to rank first.
    candidate_scores = [
        (i, score)
        for i, score in enumerate(cosine_similarities[idx])
        if i != idx
    ]
    candidate_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [(titles[i], score) for i, score in candidate_scores]

# Example: Find similar movies to 'Harry Potter and the Half-Blood Prince'
movie_title = 'Harry Potter and the Half-Blood Prince'

# The similarity matrix was built from df_NLP_q1 (rows with no tagline AND no overview
# were dropped from it), so the titles must come from df_NLP_q1 as well. Titles taken
# from the raw `df` are shifted relative to the matrix rows whenever a row was dropped.
similar_movies = get_similar_movies(movie_title, cosine_similarities, df_NLP_q1['title'].tolist())

# Print the top similar movies
print(f"Movies similar to '{movie_title}':")
for title, score in similar_movies[:20]:
    print(f"{title} (Similarity Score: {score:.3f})")

In [None]:
# Plot - WordCloud top titles

# Title -> similarity score mapping used as the "frequency" of each title
movie_frequencies = dict(similar_movies)

# Build the cloud (at most 20 titles are drawn) from the score mapping
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=20,
)
wordcloud.generate_from_frequencies(movie_frequencies)

# Render the cloud as an image without axes
plt.figure(figsize=(15, 7.5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top 20 Movies Similar to "Harry Potter and the Half-Blood Prince"')
plt.show()

In [None]:
# If we know that the audience likes 'Harry Potter and the Half-Blood Prince',
# it is logical to focus on the Harry Potter franchise.

# Vocabulary (words) of the TF-IDF vectorizer
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# TF-IDF matrix of the movie descriptions
tfidf_matrix = tfidf

# Pairwise cosine similarity between all movies (matrix compared against itself)
cosine_similarities = cosine_similarity(tfidf_matrix)

# Retrieve the similar movies, including at least one more Harry Potter movie
def get_similar_movies(movie_title, cosine_similarities, titles, num_movies=5, franchise_keyword='harry potter'):
    """Recommend up to `num_movies` movies, listing same-franchise titles first.

    Movies whose title contains `franchise_keyword` (case-insensitive) come
    first, most similar first. If there are fewer than `num_movies` of them,
    the list is padded with the most similar movies from outside the franchise.

    Parameters
    ----------
    movie_title : str
        Title of the query movie; must be present in `titles`.
    cosine_similarities : 2-D array-like
        Pairwise similarity matrix; row/column i corresponds to `titles[i]`.
    titles : list of str
        Movie titles in the same order as the rows of `cosine_similarities`.
    num_movies : int, default 5
        Maximum number of recommendations returned.
    franchise_keyword : str, default 'harry potter'
        Substring that identifies a franchise title.

    Returns
    -------
    list of (title, score)

    Raises
    ------
    ValueError
        If `movie_title` is not in `titles` (raised by `list.index`).
    """
    idx = titles.index(movie_title)
    keyword = franchise_keyword.lower()

    # Rank every OTHER movie. The query is excluded by index, because it is not
    # guaranteed to sort first when another movie ties with it.
    similar_scores = sorted(
        ((i, score) for i, score in enumerate(cosine_similarities[idx]) if i != idx),
        key=lambda x: x[1],
        reverse=True,
    )

    # Franchise movies, most similar first
    similar_movies = [(titles[i], score) for i, score in similar_scores if keyword in titles[i].lower()]

    # Pad with the most similar non-franchise movies if the franchise alone is not enough
    additional_movies = [(titles[i], score) for i, score in similar_scores if keyword not in titles[i].lower()]
    similar_movies += additional_movies[:max(0, num_movies - len(similar_movies))]

    return similar_movies[:num_movies]

# Example: franchise-first recommendations for 'Harry Potter and the Half-Blood Prince'
movie_title = 'Harry Potter and the Half-Blood Prince'

# Titles in the row order of the similarity matrix
titles_in_matrix_order = df_NLP_q1['title'].tolist()
similar_movies = get_similar_movies(movie_title, cosine_similarities, titles_in_matrix_order)

# Print every recommended movie with its similarity score
print(f"Movies similar to '{movie_title}':")
for recommended_title, similarity in similar_movies:
    print(f"{recommended_title} (Similarity Score: {similarity:.3f})")

In [None]:
# In case the viewer DOES NOT like the movie, we can recommend the most dissimilar movies instead.

# 5. Retrieve the dissimilar movies
# Example: Find dissimilar movies to 'Harry Potter and the Half-Blood Prince'
hated_movie_title = 'Harry Potter and the Half-Blood Prince'

# Titles must come from df_NLP_q1: the similarity matrix was built from it, so only
# its rows line up with the matrix rows (the raw `df` may contain extra rows).
matrix_titles = df_NLP_q1['title'].tolist()

# Find the index of the hated movie in the titles list
hated_movie_index = matrix_titles.index(hated_movie_title)

# Dissimilarity = cosine distance (1 - cosine similarity), largest first.
# The hated movie is excluded by its index: it has the SMALLEST dissimilarity and
# therefore sorts last, so dropping the first sorted entry would discard the most
# dissimilar movie instead of the hated one.
dissimilar_scores = sorted(
    (
        (i, 1.0 - similarity)
        for i, similarity in enumerate(cosine_similarities[hated_movie_index])
        if i != hated_movie_index
    ),
    key=lambda x: x[1],
    reverse=True,
)
dissimilar_movies = [(matrix_titles[i], score) for i, score in dissimilar_scores]

# Print the top dissimilar movies
print(f"Movies dissimilar to '{hated_movie_title}':")
for title, score in dissimilar_movies[:15]:
    print(f"{title} (Dissimilarity Score: {score:.3f})")