In [5]:
import pickle
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load LDA Model
with open('../static/model/lda_model.pickle', 'rb') as f:
    lda_model = pickle.load(f)

# Load Dictionary (id2word)
with open('../static/model/id2word.pickle', 'rb') as f:
    id2word = pickle.load(f)

In [15]:
# Function to predict topic using LDA model
def get_topic(text, lda_model, id2word):
    # Preprocess the input text and convert it to BOW format
    bow_vector = id2word.doc2bow(text.split())
    
    # Get topic probabilities for the input text
    topic_probabilities = lda_model.get_document_topics(bow_vector)
    
    # Select the topic with the highest probability
    topic_id, prob = max(topic_probabilities, key=lambda x: x[1])
    
    # Get topic keywords
    topic_keywords = lda_model.show_topic(topic_id, topn=5)
    return [word for word, prob in topic_keywords]

In [16]:
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [17]:
# Function to generate a sentence from GPT-2 based on the topic keywords
def generate_topic_sentence(keywords):
    # Convert the list of keywords into a string as a prompt
    prompt = " ".join(keywords)
    
    # Tokenize the prompt for GPT-2
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate text continuation
    outputs = model.generate(inputs["input_ids"], max_length=50, num_return_sequences=1)
    
    # Decode the generated text
    generated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_sentence

In [18]:
# Example usage
text_input = "Food is any substance consumed by an organism for nutritional support. Food is usually of plant, animal."  # Replace with your actual input text
topic_keywords = get_topic(text_input, lda_model, id2word)
generated_sentence = generate_topic_sentence(topic_keywords)

print("Topic Keywords:", topic_keywords)
print("Generated Topic Sentence:", generated_sentence)

IndexError: index 472159 is out of bounds for axis 1 with size 388950

In [12]:
# Print all vocabulary words with their IDs
for word_id, word in id2word.items():
    print(f"ID: {word_id}, Word: {word}")


ID: 0, Word: access
ID: 1, Word: accessibility
ID: 2, Word: across_globe
ID: 3, Word: amalgamation
ID: 4, Word: analyse
ID: 5, Word: analysis
ID: 6, Word: anywhere
ID: 7, Word: archaeology
ID: 8, Word: article
ID: 9, Word: assist
ID: 10, Word: beautiful
ID: 11, Word: become
ID: 12, Word: capture
ID: 13, Word: certain
ID: 14, Word: change
ID: 15, Word: cloud
ID: 16, Word: cloud_compute
ID: 17, Word: combine
ID: 18, Word: come
ID: 19, Word: company
ID: 20, Word: competition
ID: 21, Word: continuous
ID: 22, Word: conventional
ID: 23, Word: create
ID: 24, Word: creation
ID: 25, Word: crime
ID: 26, Word: currently
ID: 27, Word: data
ID: 28, Word: database
ID: 29, Word: datum
ID: 30, Word: decade
ID: 31, Word: demand
ID: 32, Word: development
ID: 33, Word: disseminate
ID: 34, Word: domain
ID: 35, Word: drawback
ID: 36, Word: dream
ID: 37, Word: dynamic
ID: 38, Word: earth
ID: 39, Word: easier
ID: 40, Word: easily
ID: 41, Word: easy
ID: 42, Word: editing
ID: 43, Word: efficient
ID: 44, Word: 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [13]:
import pickle
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load LDA model and dictionary
with open('../static/model/lda_model.pickle', 'rb') as f:
    lda_model = pickle.load(f)

with open('../static/model/id2word.pickle', 'rb') as f:
    id2word = pickle.load(f)

# Function to get the most probable topic
def get_most_probable_topic(lda_model, bow):
    topics = lda_model.get_document_topics(bow)
    most_probable_topic = max(topics, key=lambda x: x[1])
    return most_probable_topic[0]

# Example document (Bag of Words format)
example_bow = id2word.doc2bow("Food is any substance consumed by an organism for nutritional support. Food is usually of plant, animal.".split())

# Get the most probable topic for the example document
most_probable_topic_id = get_most_probable_topic(lda_model, example_bow)

# Get the words associated with the most probable topic
topic_words = lda_model.show_topic(most_probable_topic_id, topn=10)
topic_words_str = ' '.join([word for word, prob in topic_words])

# Load GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Generate a meaningful topic description
input_ids = tokenizer.encode(topic_words_str, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Most Probable Topic ID: {most_probable_topic_id}")
print(f"Topic Words: {topic_words_str}")
print(f"Generated Topic Description: {generated_text}")

IndexError: index 472159 is out of bounds for axis 1 with size 388950

In [21]:
import pickle
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load LDA model and dictionary
with open('../static/model/lda_model.pickle', 'rb') as f:
    lda_model = pickle.load(f)

with open('../static/model/id2word.pickle', 'rb') as f:
    id2word = pickle.load(f)

# Function to get the most probable topic
def get_most_probable_topic(lda_model, bow):
    vocab = lda_model.id2word.token2id
    filtered_bow = [(word_id, count) for word_id, count in bow if word_id in vocab]
    
    if not filtered_bow:
        print("No valid words found in the BOW representation.")
        return None  # Return None if no valid words are present
    
    topics = lda_model.get_document_topics(filtered_bow)
    print("Document Topics:", topics)  # Debugging line to see all topic probabilities
    most_probable_topic = max(topics, key=lambda x: x[1])
    return most_probable_topic[0]


# Example document
document = "Climate change is a pressing global issue that affects ecosystems, weather patterns, and human health. Governments and organizations worldwide are implementing policies to reduce carbon emissions and promote renewable energy sources. Public awareness and education on environmental conservation are crucial for fostering sustainable practices. Technological advancements in green energy and waste management are also playing a significant role in mitigating the impacts of climate change."
# Convert document to BoW format, filtering out words not in the dictionary
example_bow = id2word.doc2bow([word for word in document.split() if word in id2word.token2id])

# Get the most probable topic for the example document
most_probable_topic_id = get_most_probable_topic(lda_model, example_bow)

# Get the words associated with the most probable topic
topic_words = lda_model.show_topic(most_probable_topic_id, topn=10)
topic_words_str = ' '.join([word for word, prob in topic_words])

# Load GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Generate a meaningful topic description
input_ids = tokenizer.encode(topic_words_str, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Most Probable Topic ID: {most_probable_topic_id}")
print(f"Topic Words: {topic_words_str}")
print(f"Generated Topic Description: {generated_text}")

No valid words found in the BOW representation.


MemoryError: Unable to allocate 13.8 TiB for an array with shape (5, 388950, 5, 388950) and data type float32