In [1]:
# Step 1: Import required libraries
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from fastapi import FastAPI
from pydantic import BaseModel
import nest_asyncio
import uvicorn
import os

# Ensure the necessary NLTK data files are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

app = FastAPI()
# Define the corpus
corpus = """Hi there! Can you help me understand the concept of black holes?

Absolutely! A black hole is a region in space where the gravitational pull is so strong that not even light can escape from it. This happens when a massive star collapses under its own gravity, compressing all its mass into a very small volume. The boundary around a black hole beyond which nothing can escape is called the event horizon.

That's fascinating! What happens if someone falls into a black hole?

If someone were to fall into a black hole, they would experience a process called "spaghettification." As they approach the black hole, the gravitational pull would become stronger on their feet than on their head, stretching them out lengthwise. Eventually, they would be pulled apart. However, from the perspective of an outside observer, they would appear to slow down and fade away as they approach the event horizon, never actually seeming to cross it.

Wow, that's intense! Can black holes be detected directly?

Not directly, because they emit no light. However, we can detect them indirectly by observing their effects on nearby stars and gas. For example, if a star orbits a seemingly empty region of space, the presence of a black hole can be inferred. Additionally, black holes can sometimes emit powerful jets of radiation when they consume nearby matter, which can also be detected.

I've heard about supermassive black holes. How are they different from regular black holes?

Supermassive black holes are much larger than regular black holes, with masses ranging from millions to billions of times that of our Sun. They are typically found at the centers of galaxies, including our own Milky Way. Their formation is still a topic of research, but they may form through the merging of smaller black holes and the accumulation of vast amounts of matter over time.

That's so interesting! Is there any way to escape a black hole once you’re inside?

Unfortunately, once you cross the event horizon of a black hole, escape is impossible due to the immense gravitational pull. Even light cannot escape, which is why black holes are "black." The escape velocity inside the event horizon exceeds the speed of light, making it the ultimate point of no return.

Thanks for the explanation! Can you recommend a good book or movie about black holes?

Sure! For books, you might enjoy "A Brief History of Time" by Stephen Hawking, which covers black holes among other fascinating topics in cosmology. For movies, "Interstellar" is a great choice. It features a lot of scientifically accurate depictions of black holes, thanks to the input from physicist Kip Thorne.

Great, I'll check those out. Thanks for your help!

You're welcome! If you have any more questions about black holes or anything else, feel free to ask. Enjoy your reading and movie watching!
"""
# Preprocess the text

def preprocess(text):
            # Tokenize, remove stop words, and lemmatize
            return [lemmatizer.lemmatize(word) for word in text.lower().split() if word not in stop_words]

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """
    Clean and preprocess a document.

    Args:
        doc (str): The document to be cleaned.

    Returns:
        str: The cleaned and preprocessed document.

    """
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Split the corpus into individual statements
statements = corpus.split('\n\n')
clean_corpus = [clean(statement).split() for statement in statements]

# Create a dictionary and document-term matrix
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]



[nltk_data] Downloading package stopwords to C:\Users\Aayush
[nltk_data]     Rehal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Aayush
[nltk_data]     Rehal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Aayush
[nltk_data]     Rehal\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
from fastapi import HTTPException

@app.post("/predict/")
async def predict_topic(text_request: TextRequest):
    try:
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        
        def preprocess(text):
            # Tokenize, remove stop words, and lemmatize
            return [lemmatizer.lemmatize(word) for word in text.lower().split() if word not in stop_words]

        text_data = text_request.text
        processed_text = preprocess(text_data)
        dictionary = corpora.Dictionary([processed_text])
        doc_term_matrix = [dictionary.doc2bow(processed_text)]
        
        lda_model = LdaModel(doc_term_matrix, num_topics=text_request.num_topics, id2word=dictionary, passes=50)
        topics = lda_model.print_topics(num_topics=text_request.num_topics, num_words=text_request.num_words)
        
        topics_dict = {i: topic for i, topic in topics}
        return topics_dict
    except Exception as e:
        raise HTTPException(status_code=500, detail="An error occurred while processing the request")

if __name__ == "__main__":
    nest_asyncio.apply()
    uvicorn.run(app, host="127.0.0.1", port=8001)  # Change the port number if necessary


NameError: name 'TextRequest' is not defined

Focus: LSI focuses on relationships between words, LDA focuses on uncovering hidden topics.
Method: LSI uses math to find patterns, LDA uses probability to figure out topics.
Results: LSI helps find similar documents, LDA helps understand document content.