### Build a language generation model that can generate headline for the given input document. (Perform syntactic and semantic analysis on the input)

In [5]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_document(document):
    # Basic preprocessing: lowercasing and removing non-alphanumeric characters
    document = document.lower()
    document = re.sub(r'[^a-zA-Z0-9\s]', '', document)
    return document

def extract_keywords_textrank(document, top_n=5):
    # Preprocess the document
    preprocessed_document = preprocess_document(document)

    # Tokenize the document into words
    words = word_tokenize(preprocessed_document)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Calculate word frequencies
    word_freq = Counter(words)

    # Tokenize the document into sentences
    sentences = sent_tokenize(preprocessed_document)

    # Calculate sentence scores based on word frequencies
    sentence_scores = {sentence: sum(word_freq[word] for word in word_tokenize(sentence)) for sentence in sentences}

    # Sort sentences by score and get top sentences
    top_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Extract keywords from top sentences
    keywords = [word for sentence, score in top_sentences for word in word_tokenize(sentence)]

    return keywords

# Example input document
input_document = """
Scientists have discovered a new species of frog in the Amazon rainforest. The frog, named "Amazonia Emeraldensis", has bright green skin and a distinctive croak. This discovery sheds light on the biodiversity of the region and highlights the importance of conservation efforts in the Amazon.
"""

# Extract keywords using TextRank
keywords = extract_keywords_textrank(input_document)

print("Keywords:", keywords)


Keywords: ['scientists', 'have', 'discovered', 'a', 'new', 'species', 'of', 'frog', 'in', 'the', 'amazon', 'rainforest', 'the', 'frog', 'named', 'amazonia', 'emeraldensis', 'has', 'bright', 'green', 'skin', 'and', 'a', 'distinctive', 'croak', 'this', 'discovery', 'sheds', 'light', 'on', 'the', 'biodiversity', 'of', 'the', 'region', 'and', 'highlights', 'the', 'importance', 'of', 'conservation', 'efforts', 'in', 'the', 'amazon']


[nltk_data] Downloading package punkt to /home/bagiya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bagiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def generate_headline(keywords, model, tokenizer, max_sequence_len):
  """
  Generates a headline based on a list of keywords using the LSTM model.

  Args:
      keywords: A list of keywords representing the key points of the document.
      model: The trained LSTM model for text generation.
      tokenizer: The tokenizer object used for text processing.
      max_sequence_len: Maximum sequence length for the model.

  Returns:
      A generated headline string.
  """

  # Combine keywords with a separator for context
  seed_text = " ".join(keywords)

  # Generate text using the function you provided
  generated_text = generate_text(seed_text, next_words=10, model=model, max_sequence_len=max_sequence_len)  # Adjust next_words as needed

  # Extract the first sentence as the headline (assuming generated text is a sentence)
  headline = generated_text.split(".")[0].strip()

  return headline.capitalize()

# Assuming you have a function `extract_keywords(text)` to get keywords from text
text = "..."  # Replace with your text data
keywords = extract_keywords(text)

# ... (Load your trained LSTM model and tokenizer)

generated_headline = generate_headline(keywords, model, tokenizer, max_sequence_len)

print(f"Input Text keywords: {keywords}")
print(f"Generated Headline: {generated_headline}")

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load English tokenizer, tagger, parser, NER, and stop words
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stopwords and punctuation
    cleaned_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(cleaned_tokens)

def generate_headline(input_text):
    # Preprocess input text
    preprocessed_text = preprocess_text(input_text)
    
    # Perform semantic analysis
    doc = nlp(preprocessed_text)
    
    # Extract important entities
    entities = [entity.text for entity in doc.ents]
    
    # Extract keywords or important phrases
    keywords = [token.text for token in doc if not token.is_stop and token.pos_ != 'PUNCT']
    
    # Generate headline based on extracted entities and keywords
    headline = " ".join(entities[:3]) + " ".join(keywords[:3])
    
    return headline.capitalize()

# Example input document
input_document = """
    Natural language processing (NLP) is a subfield of linguistics, 
    computer science, and artificial intelligence concerned with the interactions 
    between computers and human language, in particular how to program computers 
    to process and analyze large amounts of natural language data.
"""

# Generate headline for the input document
headline = generate_headline(input_document)
print("Generated Headline:", headline)

[nltk_data] Downloading package punkt to /home/bagiya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/bagiya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Generated Headline: Natural language processing
