## Project overview

The goal of the project is to evaluate how different models perform in the task of topic modelling.
The models used by the paper are:
1. Latent Dirichlet Allocation (LDA)
2. Latent Semantic Analysis (LSA)
3. Non-Negative Matrix Factorization (NMF)
4. Principal Component Analysis (PCA)
5. Random Projection (RP)

The evaluation performed by the authors consisted of examining the models' performance when changing the **number of topics** or the **number of words**, keeping all the other parameters fixed.

The authors didn't specify which parameters were fixed while performing the evaluation, so we had to choose our own settings.

## Importing and preprocessing
The data comes from the 20-Newsgroups dataset, available online, which contains texts on a variety of topics. Importing it required preprocessing steps such as:
- removal of special characters using regular expressions
- tokenization
- lemmatization with the aid of Part Of Speech

Below is the code used for importing and preprocessing:

```python
import pandas as pd
import os
import re, string
from nltk.corpus import words, wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

```

```python
def import_data(path):
    """Load the newsgroup folders found under *path* into a DataFrame.

    Every sub-directory of *path* is treated as one newsgroup folder and
    every file inside it as one document. Each folder is also assigned one of
    six coarse topics ('religion', 'computer', 'science', 'politics', 'misc',
    'recreation') according to its position in the alphabetically sorted
    folder list. Folders beyond position 19 keep their own name as topic.

    Args:
        path: Directory containing one sub-directory per newsgroup. A leading
            ``~`` is expanded to the user's home directory.

    Returns:
        A DataFrame with one row per file and the columns 'File_Name',
        'Content' (converted to the pandas "string" dtype), 'Folder' and
        'Topic'.

    Raises:
        IndexError: If *path* holds fewer than 20 sub-directories, because
            the topic mapping indexes positions 0 to 19.
    """
    path = os.path.expanduser(path)

    # os.listdir() returns entries in arbitrary order, but the mapping below
    # is positional, so the folder list has to be sorted. Non-directory
    # entries are skipped so that a stray file cannot shift the indices.
    folders = sorted(entry for entry in os.listdir(path)
                     if os.path.isdir(os.path.join(path, entry)))

    conversion = {'religion': [folders[i] for i in [0, 15, 19]],
                  'computer': [folders[i] for i in range(1, 6)],
                  'science': [folders[i] for i in range(11, 15)],
                  'politics': [folders[i] for i in range(16, 19)],
                  'misc': [folders[6]],
                  'recreation': [folders[i] for i in range(7, 11)]}

    # Invert the mapping once: folder name -> coarse topic.
    topic_by_folder = {folder: topic
                       for topic, members in conversion.items()
                       for folder in members}

    column_names = ['File_Name', 'Content', 'Folder', 'Topic']

    # Collect plain rows and build the DataFrame once; concatenating a
    # one-row frame per file copies the whole frame on every iteration.
    rows = []
    for folder in folders:
        folder_path = os.path.join(path, folder)
        topic = topic_by_folder.get(folder, folder)

        # Sorted so that the row order is reproducible across platforms.
        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                continue

            with open(file_path, 'r', encoding='latin-1') as file:
                content = file.read()

            rows.append({'File_Name': file_name, 'Content': content,
                         'Folder': folder, 'Topic': topic})

    df = pd.DataFrame(rows, columns=column_names)
    df['Content'] = df['Content'].astype("string")

    return df

```

```python
# Extra stop words applied on top of NLTK's English list. They are written
# without apostrophes/dots/hyphens because punctuation is stripped before
# the stop-word filter runs.
_EXTRA_STOP_WORDS = frozenset([
    'hi', 'thanks', 'lot', 'dont', 'article', 'everyone', 'anyone',
    'someone', 'nothing',
    'something', 'anything', 'everybody', 'somebody', 'anybody',
    'please', 'ask', 'people', 'university',
    'question', 'yeah', 'shouldnt', 'theyre', 'thing', 'theyll', 'didnt', 'sorry', 'hey',
    'oh', 'thats', 'thank', 'cannot', 'right', 'would', 'one', 'get', 'know', 'like', 'use', 'go',
    'think', 'make', 'say', 'see', 'also', 'could', 'well', 'want', 'way', 'take', 'find', 'need', 'try',
    'much', 'come', 'many', 'may', 'give', 'really', 'tell', 'two', 'still', 'read', 'might', 'write',
    'never', 'look', 'sure', 'day', 'even', 'new', 'time', 'good', 'first', 'keep', 'since', 'last',
    'long', 'fact', 'must', 'cant', 'another', 'little', 'without', 'csutexasedu', 'nntppostinghost',
    'im', 'seem', 'replyto', 'let', 'group', 'call',
])


def preprocess(doc):
    """Clean one raw document and return it as a space-separated string of lemmas.

    The steps are: remove e-mail addresses, drop every character that is
    neither a word character nor whitespace (underscores are dropped too),
    lowercase, tokenize, lemmatize with the help of POS tags, and filter out
    stop words.

    Args:
        doc: Raw document text.

    Returns:
        The cleaned document: remaining lemmas joined by single spaces.
    """
    # Remove email addresses
    doc = re.sub(r'\b\S*@\S*\.\S*\b', '', doc)

    # Keep only word characters and whitespace. Note that digits ARE kept
    # (\w matches them). Every punctuation mark and bracket is removed here;
    # the underscore is the only punctuation character \w would let through,
    # so it is listed explicitly.
    doc = re.sub(r'[^\w\s]|_', '', doc)

    # Lowercase and strip
    doc = doc.lower().strip()

    # Tokenize, then POS-tag so the lemmatizer knows each word's class
    pos_tags = pos_tag(word_tokenize(doc))

    # Map the first letter of the POS tag to the WordNet POS constant
    tag_map = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }

    # Lemmatize tokens; tags outside tag_map fall back to NOUN
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, tag_map.get(pos[0], wordnet.NOUN))
                         for token, pos in pos_tags]

    # Filter stop words out of the lemmatized tokens. A set gives constant-time
    # membership tests instead of scanning a list for every token.
    stop_words = set(stopwords.words('english')) | _EXTRA_STOP_WORDS
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

    # Recreate the document
    return ' '.join(filtered_tokens)


```

```python
# Location of the dataset folder (presumably a placeholder to be adjusted locally).
path = R"~\path"

# Read every document into a DataFrame.
df = import_data(path)

# Derive the cleaned text and store it next to the raw content.
clean_content = df['Content'].apply(preprocess)
df['Clean_Content'] = clean_content

```

## TFIDF and other objects
Once the dataset had been imported and preprocessed, a TF-IDF matrix was constructed as the input matrix for the models. To do this, it was convenient to write another function that could be called from the notebook. Besides the TF-IDF matrix, for a given set of parameters the function also returns the objects required for topic modelling with the gensim package, which is used for some of the models.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, matutils
from nltk.tokenize import word_tokenize

def get_tfidf_tokendocs_corpus_dict(df, max_df, min_df, max_features): #0.5, 5, 5000
    """Build the TF-IDF matrix and the gensim objects derived from it.

    Args:
        df: DataFrame with a 'Clean_Content' column holding the preprocessed
            documents.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, tokenized_docs, corpus,
        dictionary)``: the fitted TF-IDF matrix, the vectorizer's feature
        names, the documents as token lists (needed by the coherence
        function), the gensim corpus built from the matrix, and the gensim
        Dictionary built from that corpus.
    """
    # convert text into a list of documents
    documents = df['Clean_Content'].tolist()

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                 norm='l2', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # this is a list of documents with tokens
    # it's needed for the coherence function
    tokenized_docs = [word_tokenize(document) for document in documents]

    # Convert TF-IDF matrix to Gensim corpus (the matrix is transposed first;
    # NOTE(review): gensim's Sparse2Corpus is understood to read documents
    # from columns by default - confirm against the gensim docs)
    corpus = matutils.Sparse2Corpus(tfidf_matrix.transpose())

    # vocabulary_ maps word -> column index; gensim wants index -> word
    id2word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Convert the document-term matrix to a gensim Dictionary
    dictionary = corpora.Dictionary.from_corpus(corpus, id2word=id2word)

    return tfidf_matrix, feature_names, tokenized_docs, corpus, dictionary


# Shared model inputs, built once with a 1000-word vocabulary.
(tfidf_matrix,
 feature_names,
 tokenized_docs,
 corpus,
 dictionary) = get_tfidf_tokendocs_corpus_dict(df,
                                               max_df=0.5,
                                               min_df=5,
                                               max_features=1000)

```

## Models evaluation
The models were evaluated by comparing their coherence, a measure of topic quality. We replicated the analysis of the paper by comparing the models across different numbers of topics (5, 10, 20, 50) and different numbers of words (10, 100, 1000, 10000).

To perform the evaluation, we wrote functions that ran the models repeatedly, once for each specified parameter value.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, matutils
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel, LsiModel, CoherenceModel
from sklearn.decomposition import NMF, PCA
from sklearn.random_projection import GaussianRandomProjection
from scipy import sparse
import numpy as np

# Numbers of topics compared in the evaluation by topics.
topics = [5, 10, 20, 50]
# Vocabulary sizes (passed on as the vectorizer's max_features) compared in
# the evaluation by words.
words = [10, 100, 1000, 10000]

```

### Evaluation by topics:
```python
def coherence_by_topics(n: int, corpus, dictionary, texts, feature_names, tfidf):
    """Fit LDA, LSA, NMF, PCA and RP with *n* topics and score each with c_v coherence.

    Args:
        n: Number of topics (gensim models) / components (sklearn models).
        corpus: gensim corpus used to train LDA and LSA.
        dictionary: gensim Dictionary passed to the models and to
            CoherenceModel.
        texts: Tokenized documents passed to CoherenceModel.
        feature_names: Words indexed like the entries of each fitted
            component (i.e. like the columns of *tfidf*).
        tfidf: TF-IDF matrix (scipy sparse or array-like) used to fit NMF,
            PCA and RP.

    Returns:
        A list of ``(model_name, coherence)`` tuples in the order LDA, LSA,
        NMF, PCA, RP, with the coherence rounded to 4 decimals.
    """
    models = ['LDA', 'LSA', 'NMF', 'PCA', 'RP']

    coherence = []

    for model_name in models:
        if model_name in ('LDA', 'LSA'):
            # gensim models: CoherenceModel reads the topics from the model.
            if model_name == 'LDA':
                model = LdaModel(corpus=corpus, id2word=dictionary,
                                 num_topics=n,
                                 alpha='symmetric', eta='auto', passes=5,
                                 random_state=1)
            else:
                model = LsiModel(corpus, id2word=dictionary,
                                 num_topics=n, random_seed=1)

            scorer = CoherenceModel(model=model,
                                    dictionary=dictionary,
                                    texts=texts,
                                    coherence='c_v')

        else:
            # sklearn models: topics are read off the fitted components_.
            if model_name == 'NMF':
                model = NMF(n_components=n,
                            random_state=1, max_iter=600).fit(tfidf)

            elif model_name == 'RP':
                model = GaussianRandomProjection(n_components=n,
                                                 random_state=1).fit(tfidf)

            else:  # 'PCA'
                # PCA is fitted on a dense array. The conditional must stay
                # on one logical line (it was previously split across lines
                # without parentheses, which is a SyntaxError).
                dense_tfidf = (tfidf.toarray() if sparse.issparse(tfidf)
                               else np.asarray(tfidf))

                # Centering with respect to the columns
                centered_tfidf = dense_tfidf - dense_tfidf.mean(axis=0)

                model = PCA(n_components=n,
                            random_state=1).fit(centered_tfidf)

            # For each component, all words ranked by decreasing weight. The
            # full ranking is passed; any top-N cut is left to CoherenceModel.
            ranked_words = [[feature_names[k]
                             for k in component.argsort()[::-1]]
                            for component in model.components_]

            scorer = CoherenceModel(topics=ranked_words,
                                    texts=texts, dictionary=dictionary,
                                    coherence='c_v')

        coherence.append(round(scorer.get_coherence(), 4))

    return list(zip(models, coherence))

```

### Evaluation by words:
```python
def coherence_by_words(df, n, num_topics=5):
    """Fit LDA, LSA, NMF, PCA and RP on an *n*-word vocabulary and score each with c_v coherence.

    Args:
        df: DataFrame with a 'Clean_Content' column holding the preprocessed
            documents.
        n: Vocabulary size, passed to ``TfidfVectorizer(max_features=n)``.
        num_topics: Number of topics / components for every model
            (defaults to 5).

    Returns:
        A list of ``(model_name, coherence)`` tuples in the order LDA, LSA,
        NMF, PCA, RP, with the coherence rounded to 4 decimals.
    """
    models = ['LDA', 'LSA', 'NMF', 'PCA', 'RP']

    documents = df['Clean_Content'].tolist()

    # The inputs below do not depend on the model, so they are built once
    # instead of once per model.

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=5,
                                 norm='l2', max_features=n)
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # this is a list of documents with tokens
    # it's needed for the coherence function
    texts = [word_tokenize(document) for document in documents]

    # Convert TF-IDF matrix to Gensim corpus
    corpus = matutils.Sparse2Corpus(tfidf_matrix.transpose())

    # vocabulary_ maps word -> column index; gensim wants index -> word
    id2word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Convert the document-term matrix to a gensim Dictionary
    dictionary = corpora.Dictionary.from_corpus(corpus, id2word=id2word)

    coherence = []

    for model_name in models:
        if model_name in ('LDA', 'LSA'):
            # gensim models: CoherenceModel reads the topics from the model.
            if model_name == 'LDA':
                model = LdaModel(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 alpha='symmetric',
                                 eta='auto', passes=5,
                                 random_state=1)
            else:
                model = LsiModel(corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 random_seed=1)

            scorer = CoherenceModel(model=model,
                                    dictionary=dictionary,
                                    texts=texts,
                                    coherence='c_v')

        else:
            # sklearn models: topics are read off the fitted components_.
            if model_name == 'NMF':
                model = NMF(n_components=num_topics,
                            random_state=1,
                            max_iter=600).fit(tfidf_matrix)

            elif model_name == 'RP':
                model = GaussianRandomProjection(n_components=num_topics,
                                                 random_state=1).fit(tfidf_matrix)

            else:  # 'PCA'
                # PCA is fitted on a dense array. The conditional must stay
                # on one logical line (it was previously split across lines
                # without parentheses, which is a SyntaxError).
                dense_tfidf = (tfidf_matrix.toarray()
                               if sparse.issparse(tfidf_matrix)
                               else np.asarray(tfidf_matrix))

                # Centering with respect to the columns
                centered_tfidf = dense_tfidf - dense_tfidf.mean(axis=0)

                model = PCA(n_components=num_topics,
                            random_state=1).fit(centered_tfidf)

            # For each component, all words ranked by decreasing weight. The
            # full ranking is passed; any top-N cut is left to CoherenceModel.
            ranked_words = [[feature_names[k]
                             for k in component.argsort()[::-1]]
                            for component in model.components_]

            scorer = CoherenceModel(topics=ranked_words,
                                    texts=texts,
                                    dictionary=dictionary,
                                    coherence='c_v')

        coherence.append(round(scorer.get_coherence(), 4))

    return list(zip(models, coherence))

```

### Storing results
```python
from coherence_by_topics import coherence_by_topics
from coherence_by_words import coherence_by_words

# Coherence scores per number of topics: {n_topics: [(model, score), ...]}
evaluation_by_topics = {}

for n_topics in topics:
    evaluation_by_topics[n_topics] = coherence_by_topics(
        n=n_topics,
        corpus=corpus,
        dictionary=dictionary,
        texts=tokenized_docs,
        feature_names=feature_names,
        tfidf=tfidf_matrix,
    )

# Coherence scores per vocabulary size: {n_words: [(model, score), ...]}
evaluation_by_words = {}

for n_words in words:
    evaluation_by_words[n_words] = coherence_by_words(df, n=n_words)

```

### Results for evaluation by topics:
```python
# Each lookup is followed by a string literal holding the output that was
# recorded for it. All statements start at column 0: the last lookup was
# indented by one space, which Python rejects as an unexpected indent.
evaluation_by_topics[5]
'''
[('LDA', 0.5179),
 ('LSA', 0.5469),
 ('NMF', 0.6479),
 ('PCA', 0.5464),
 ('RP', 0.2019)]'''

evaluation_by_topics[10]
'''
[('LDA', 0.5264),
 ('LSA', 0.4587),
 ('NMF', 0.6736),
 ('PCA', 0.4517),
 ('RP', 0.2012)]'''

evaluation_by_topics[20]
'''
[('LDA', 0.3957),
 ('LSA', 0.382),
 ('NMF', 0.633),
 ('PCA', 0.3656),
 ('RP', 0.2071)]'''

evaluation_by_topics[50]
'''
 [('LDA', 0.3593),
 ('LSA', 0.3138),
 ('NMF', 0.5271),
 ('PCA', 0.3193),
 ('RP', 0.2112)]'''

```

### Results for evaluation by words:
```python
# Each lookup is followed by a string literal holding the output that was
# recorded for it. All statements start at column 0: three of the lookups
# were indented by one space, which Python rejects as an unexpected indent.

# NOTE(review): every model gets the same score for 10 words - presumably
# because with a 10-word vocabulary each topic ranks the same 10 words;
# confirm before drawing conclusions from this row.
evaluation_by_words[10]
'''
[('LDA', 0.4661),
 ('LSA', 0.4661),
 ('NMF', 0.4661),
 ('PCA', 0.4661),
 ('RP', 0.4661)]'''

evaluation_by_words[100]
'''
[('LDA', 0.4183),
 ('LSA', 0.3723),
 ('NMF', 0.4723),
 ('PCA', 0.387),
 ('RP', 0.3258)]'''

evaluation_by_words[1000]
'''
[('LDA', 0.5179),
 ('LSA', 0.5469),
 ('NMF', 0.6479),
 ('PCA', 0.5464),
 ('RP', 0.2019)]'''

evaluation_by_words[10000]
'''
[('LDA', 0.4755),
 ('LSA', 0.4596),
 ('NMF', 0.7112),
 ('PCA', 0.625),
 ('RP', 0.6706)]'''
