In [None]:
%%capture
# Run the project setup script inside this kernel; %%capture silences whatever it prints.
# NOTE(review): later cells use `df`, `path` and `pd` without defining or importing them here —
# presumably full_setup.py provides them; confirm.
%run full_setup.py

In [None]:
from tfidf_corpus_dictionary import get_tfidf_tokendocs_corpus_dict
from gensim.models import LdaModel, LsiModel, CoherenceModel
from sklearn.decomposition import NMF, PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# One call to the project helper produces the five objects the rest of the notebook consumes.
(
    tfidf_matrix,
    feature_names,
    tokenized_docs,
    corpus,
    dictionary,
) = get_tfidf_tokendocs_corpus_dict(
    df,
    max_df=0.5,
    min_df=5,
    max_features=1000,
)

In [None]:
from coherence_by_topics import coherence_by_topics
from coherence_by_words import coherence_by_words

In [None]:
# Topic counts to compare; each value is passed as `n` to coherence_by_topics below.
topics = [5, 10, 20, 50]

In [None]:
# Coherence results for every topic count in `topics`, keyed by that count.
evaluation_by_topics = {}

for n_topics in topics:
    # Store the result directly; the previous temporary was named `metrics_words`,
    # a misleading leftover from the per-word evaluation loop.
    evaluation_by_topics[n_topics] = coherence_by_topics(
        n=n_topics,
        corpus=corpus,
        dictionary=dictionary,
        texts=tokenized_docs,
        feature_names=feature_names,
        tfidf=tfidf_matrix,
    )

In [None]:
# Coherence results recorded for the 5-topic run.
evaluation_by_topics[5]

In [None]:
# Coherence results recorded for the 10-topic run.
evaluation_by_topics[10]

In [None]:
# Coherence results recorded for the 20-topic run.
evaluation_by_topics[20]

In [None]:
# Coherence results recorded for the 50-topic run.
evaluation_by_topics[50]

In [None]:
# Word-count settings to compare; each value is passed as `n` to coherence_by_words below.
# NOTE(review): the TF-IDF cell above caps max_features at 1000 — confirm that n = 10000
# is a meaningful setting for coherence_by_words.
words = [10, 100, 1000, 10000]

In [None]:
# Coherence results for every setting in `words`, keyed by that setting.
evaluation_by_words = {}

for n_words in words:
    evaluation_by_words[n_words] = coherence_by_words(df, n=n_words)

In [None]:
# Coherence results recorded for n = 10 words.
evaluation_by_words[10]

In [None]:
# Coherence results recorded for n = 100 words.
evaluation_by_words[100]

In [None]:
# Coherence results recorded for n = 1000 words.
evaluation_by_words[1000]

In [None]:
# Coherence results recorded for n = 10000 words.
evaluation_by_words[10000]

In [None]:
def tables(evaluation, type: str):
    """Render each coherence result set as a small table image and save it as a PNG.

    Parameters
    ----------
    evaluation : dict
        Maps a setting value ``n`` (for example a number of topics) to the rows
        of the table, each row being a ``(model, coherence)`` pair.
    type : str
        What ``n`` counts (the notebook passes ``'topics'`` and ``'words'``).
        It appears in the figure title and in the file name. The name shadows
        the built-in ``type``; it is kept so existing calls keep working.

    Side effects
    ------------
    Writes ``table_<n>_<type>.png`` into ``<path>/results`` for every entry of
    ``evaluation``; the folder is created when it does not exist yet.

    NOTE(review): ``path`` and ``pd`` are read from the notebook namespace —
    presumably provided by ``full_setup.py``; confirm.
    """
    results_folder = os.path.join(path, "results")
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(results_folder, exist_ok=True)

    for n, values in evaluation.items():
        table_df = pd.DataFrame(values, columns=['Model', 'Coherence'])

        fig, ax = plt.subplots(figsize=(4, 3))
        try:
            ax.axis('off')  # only the table should be visible

            # The column names act as the first (header) row of the cell grid.
            cell_data = [table_df.columns.tolist()] + table_df.values.tolist()
            table = ax.table(cellText=cell_data, loc='center', cellLoc='center',
                             colLabels=None, edges='open')

            # Row 0 holds the column names: render it in bold.
            for (row, _col), cell in table.get_celld().items():
                if row == 0:
                    cell.set_text_props(fontweight='bold')

            # Size every column to its content instead of hard-coding the indices.
            table.auto_set_column_width(list(range(len(table_df.columns))))
            table.set_fontsize(13)
            table.scale(1, 2)  # double the row height for readability

            ax.set_title(f'Coherence with {n} {type}', fontsize=18, y=0.95)

            filename = os.path.join(results_folder, f'table_{n}_{type}.png')
            fig.savefig(filename, bbox_inches='tight', pad_inches=0.1)
        finally:
            # Always release the figure, even when rendering or saving fails.
            plt.close(fig)

In [None]:
# Save one PNG table per topic count (files named table_<n>_topics.png).
tables(evaluation_by_topics, 'topics')

In [None]:
# Save one PNG table per word setting (files named table_<n>_words.png).
tables(evaluation_by_words, 'words')

In [None]:
def plots(evaluation, type: str, figures_folder=None):
    """Draw, save and show one coherence bar chart per entry of ``evaluation``.

    Parameters
    ----------
    evaluation : dict
        Maps a setting value ``n`` (for example a number of topics) to a
        sequence of ``(model, coherence)`` pairs.
    type : str
        What ``n`` counts (the notebook passes ``'topics'`` and ``'words'``).
        It appears in the chart title and in the file name. The name shadows
        the built-in ``type``; it is kept so existing calls keep working.
    figures_folder : str, optional
        Directory that receives the images. Defaults to ``<path>/figures``.

    Side effects
    ------------
    Writes ``coherence_evaluation_<n>_<type>.png`` into ``figures_folder``
    (created when missing) and displays each chart.

    NOTE(review): the default folder used to be a hard-coded absolute path in one
    user's OneDrive. It is now derived from the notebook-level ``path`` exactly as
    ``tables`` does for its results folder — confirm ``path`` is the project root,
    or pass ``figures_folder`` explicitly. ``path`` and ``pd`` are presumably
    provided by ``full_setup.py``.
    """
    if figures_folder is None:
        figures_folder = os.path.join(path, 'figures')
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(figures_folder, exist_ok=True)

    for n, metrics in evaluation.items():
        # Build the frame straight from the (model, coherence) pairs; unlike
        # zip(*metrics) this does not raise when the result list is empty.
        plot_df = pd.DataFrame(list(metrics), columns=['Model', 'Coherence Value'])

        fig, ax = plt.subplots(figsize=(8, 5))

        # hue mirrors x so each model gets its own colour from the palette.
        sns.barplot(x='Model', y='Coherence Value', data=plot_df,
                    hue='Model', palette='viridis', ax=ax)

        ax.set_xlabel('Model')
        ax.set_ylabel('Coherence Value')
        ax.set_title(f'Coherence Evaluation for {n} {type}')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        fig.tight_layout()

        # Explicit extension: the output format no longer depends on matplotlib's default.
        save_path = os.path.join(figures_folder, f'coherence_evaluation_{n}_{type}.png')
        fig.savefig(save_path)

        plt.show()
        # Release the figure so repeated iterations do not accumulate open figures.
        plt.close(fig)

In [None]:
# Draw and save one coherence bar chart per topic count.
plots(evaluation_by_topics, 'topics')

In [None]:
# Draw and save one coherence bar chart per word setting.
plots(evaluation_by_words, 'words')

In [None]:
from display_topics import display_topics

Now we fit the LDA model using the number of topics that yielded the highest coherence in the evaluation above (set as `num_topics` in the next cell).

In [None]:
# Fit gensim's LDA on the bag-of-words corpus; random_state pins the result for re-runs.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    alpha='symmetric',
    eta='auto',
    passes=5,
    random_state=1,
)

In [None]:
# Show the topics of the fitted LDA model via the project helper display_topics.
# NOTE(review): the model was built with `dictionary` but `feature_names` is passed here —
# confirm display_topics maps term ids consistently for gensim models.
display_topics('LDA', lda_model, feature_names)

Next, we do the same for LSA (gensim's `LsiModel`), using the same number of topics.

In [None]:
# Fit gensim's LSI (LSA) model on the same corpus with the same number of topics.
lsi_model = LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=5,
    random_seed=1,
)

In [None]:
# Show the topics of the fitted LSI (LSA) model via the project helper display_topics.
display_topics('LSA', lsi_model, feature_names)

In [None]:
# Factorise the TF-IDF matrix into 5 components; fit() returns the fitted estimator itself.
nmf_estimator = NMF(n_components=5, random_state=1)
nmf_model = nmf_estimator.fit(tfidf_matrix)

In [None]:
# Show the components of the fitted NMF model via the project helper display_topics.
display_topics('NMF', nmf_model, feature_names)

In [None]:
# Densify the TF-IDF matrix when it is sparse; dense input is passed through unchanged.
if sparse.issparse(tfidf_matrix):
    tfidf_matrix_dense = tfidf_matrix.todense()
else:
    tfidf_matrix_dense = tfidf_matrix

# np.asarray turns the result into a plain ndarray.
tfidf_matrix_array = np.asarray(tfidf_matrix_dense)

# Centre every column on zero by subtracting its mean.
# NOTE(review): scikit-learn's PCA also centres its input during fit, so this step looks
# redundant for the fit itself — confirm whether the centred matrix is needed elsewhere.
mean_tfidf = tfidf_matrix_array.mean(axis=0)
centered_tfidf_matrix = tfidf_matrix_array - mean_tfidf

# fit() returns the fitted estimator itself.
pca_estimator = PCA(n_components=5, random_state=1)
pca_model = pca_estimator.fit(centered_tfidf_matrix)

In [None]:
# Show the components of the fitted PCA model via the project helper display_topics.
display_topics('PCA', pca_model, feature_names)

In [None]:
# Fit a 5-component Gaussian random projection on the TF-IDF matrix; fit() returns the
# fitted estimator itself, and random_state pins the projection for re-runs.
rp_estimator = GaussianRandomProjection(n_components=5, random_state=1)
rp_model = rp_estimator.fit(tfidf_matrix)

In [None]:
# Show the components of the fitted random projection via the project helper display_topics.
display_topics('RP', rp_model, feature_names)