In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Latent Semantic Analysis (LSA)

**Sources:**
- [Latent Semantic Analysis (LSA)](https://blog.marketmuse.com/glossary/latent-semantic-analysis-definition/#:~:text=Latent%20Semantic%20Analysis%20is%20a,relationships%20between%20terms%20and%20concepts)
- https://www.datacamp.com/tutorial/discovering-hidden-topics-python

**Latent Semantic Analysis (LSA)** is a Natural Language Processing (NLP) method that analyzes relationships between a set of documents and the terms they contain. It uses the **Singular Value Decomposition (SVD)** method to scan unstructured data and find hidden relationships between terms and concepts.
LSA is commonly used in NLP and in Information Retrieval (IR). 
By reducing the dimensionality of the term-document matrix, LSA enables the identification of hidden semantic concepts, facilitating tasks such as **document classification** and **text summarization**.

### How does LSA work?

1. Create Term-Document Matrix: First, a term-document matrix is created, where rows represent terms (words) and columns represent documents. Each entry in the matrix indicates the frequency of a term in a particular document.

2. Apply Singular Value Decomposition (SVD): LSA applies the SVD to the term-document matrix, decomposing it into three matrices: U, Σ, and V^T. U represents the relationship between terms and concepts, Σ contains the singular values, and V^T represents the relationship between documents and concepts.

3. Dimensionality Reduction: LSA then reduces the dimensionality of the original matrix by keeping only the top k singular values in Σ, together with the corresponding columns of U and rows of V^T. This process helps to capture the most important underlying patterns and relationships in the data.

4. Capture Latent Semantic Structure: By examining the relationships between the terms and documents in the reduced-dimensional space, LSA identifies the latent semantic structure, allowing for the discovery of underlying concepts or topics that are present across the document collection.


In [None]:
# !pip install gensim

In [None]:
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

# Load data from file
def load_data(file_path):
    """Read a text file and return its lines as documents plus short titles.

    Each line of the file is treated as one document. Leading and trailing
    whitespace is stripped from every line; blank lines are kept as empty
    documents, so both returned lists have one entry per line of the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(documents_list, titles)`` tuple of equal-length lists, where each
        title is the first 100 characters of the matching document.
    """
    documents_list = []
    titles = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as fin:
        # Iterate the file object directly instead of loading every line
        # into memory first with readlines().
        for line in fin:
            text = line.strip()
            documents_list.append(text)
            # Slicing already clamps to the string length.
            titles.append(text[:100])
    return documents_list, titles


# Preprocess data
def preprocess_data(doc_set):
    """Tokenize, stop-word filter and stem every document in ``doc_set``.

    Each document is lower-cased, split into ``\\w+`` tokens, stripped of
    NLTK's English stop words, and the remaining tokens are Porter-stemmed.

    Args:
        doc_set: Iterable of raw document strings.

    Returns:
        A list with one list of stemmed tokens per input document, in the
        same order as ``doc_set``.
    """
    word_tokenizer = RegexpTokenizer(r"\w+")
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def _clean(document):
        # Lower-case before tokenizing so the stop-word lookup matches.
        words = word_tokenizer.tokenize(document.lower())
        return [stemmer.stem(word) for word in words if word not in stop_words]

    return [_clean(document) for document in doc_set]


# Create dictionary and doc_term_matrix
def prepare_corpus(doc_clean):
    """Build the gensim dictionary and bag-of-words corpus for the documents.

    Args:
        doc_clean: List of tokenized documents (each a list of tokens).

    Returns:
        A ``(dictionary, doc_term_matrix)`` tuple: the ``corpora.Dictionary``
        built from ``doc_clean`` and a list holding the ``doc2bow`` result
        for every document, in input order.
    """
    vocabulary = corpora.Dictionary(doc_clean)
    bags_of_words = list(map(vocabulary.doc2bow, doc_clean))
    return vocabulary, bags_of_words


# Create LSA model
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """Fit an LSI model on the documents and print its topics.

    Args:
        doc_clean: List of tokenized documents (each a list of tokens).
        number_of_topics: Number of topics to fit and to print.
        words: Number of words to show for each printed topic.

    Returns:
        The fitted ``LsiModel``.
    """
    vocabulary, corpus = prepare_corpus(doc_clean)
    fitted = LsiModel(corpus, num_topics=number_of_topics, id2word=vocabulary)

    print("LSA Model:")
    topic_rows = fitted.print_topics(num_topics=number_of_topics, num_words=words)
    # Topic indices are shifted by one so the printed numbering starts at 1.
    for idx, topic in topic_rows:
        print(f"Topic-{idx + 1}: {topic}")

    return fitted


# coherence values
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """Fit one LSI model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, stop, step)``.

    Args:
        dictionary: gensim dictionary used as ``id2word`` for every model.
        doc_term_matrix: Bag-of-words corpus the models are fitted on.
        doc_clean: Tokenized documents passed to the coherence model as texts.
        stop: Exclusive upper bound of the topic-count range.
        start: First topic count to try.
        step: Increment between successive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple of parallel lists: the
        fitted models and their coherence scores, in topic-count order.
    """
    model_list, coherence_values = [], []
    for topic_count in range(start, stop, step):
        lsa = LsiModel(doc_term_matrix, num_topics=topic_count, id2word=dictionary)
        model_list.append(lsa)

        scorer = CoherenceModel(
            model=lsa,
            texts=doc_clean,
            dictionary=dictionary,
            coherence="c_v",
        )
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values


# coherence graph
def plot_graph(doc_clean, start, stop, step):
    """Plot the coherence score against the number of topics.

    Builds the dictionary and corpus from ``doc_clean``, computes a coherence
    score for every topic count in ``range(start, stop, step)``, and shows
    the resulting line chart.

    Args:
        doc_clean: List of tokenized documents (each a list of tokens).
        start: First topic count to evaluate.
        stop: Exclusive upper bound of the topic counts to evaluate.
        step: Increment between successive topic counts.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # Only the scores are plotted; the fitted models are not needed here.
    _, coherence_values = compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start, step)
    x = range(start, stop, step)
    # Attach the label to the line itself. Passing a bare string to
    # plt.legend() is interpreted as a sequence of one-character labels,
    # which made the legend read just "c".
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc="best")
    plt.show()

# Settings for the final model: how many topics to fit and how many words
# to print for each topic.
number_of_topics, words = 7, 10

# Read the corpus from disk, then tokenize, filter and stem it.
document_list, titles = load_data("data/lsa.txt")
clean_text = preprocess_data(document_list)

# Plot coherence for every topic count in range(start, stop, step).
start = 2
stop = 12
step = 1
plot_graph(clean_text, start, stop, step)

# Fit the final LSA model and print its topics.
model = create_gensim_lsa_model(clean_text, number_of_topics, words)