In [1]:
# Import Libraries
import re
import spacy
import nltk
from nltk.corpus import stopwords
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [2]:
# Download NLTK stopwords (only once)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load Models
nlp_spacy = spacy.load("en_core_web_sm")            # You can replace with transformer-based model later
# Load MiniLM a single time and hand the same instance to KeyBERT, instead of
# letting KeyBERT load a second copy of the identical 'all-MiniLM-L6-v2' weights.
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but very strong embeddings
kw_model = KeyBERT(model=embedder)

# MiniLM: A very lightweight sentence transformer that fits easily on your laptop (384-dimensional vector)

# 1. Text Cleaning Function
def clean_text(text):
    """Lowercase text, drop special characters and remove English stop words.

    Apostrophes are kept until after the stop-word lookup so that contraction
    entries in ``stop_words`` (e.g. "don't") can still match; stripping them
    first would turn "don't" into "dont", which is not a stop word.

    Args:
        text: Raw input string.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    # Lowercase and normalise the typographic apostrophe to the ASCII one.
    text = text.lower().replace("\u2019", "'")
    # Remove special characters, except apostrophes (handled below).
    text = re.sub(r"[^a-z0-9'\s]", '', text)

    kept = []
    for word in text.split():
        bare = word.replace("'", '')
        if not bare:
            continue  # token consisted of apostrophes only
        # Check the raw token, the token without surrounding quotes, and the
        # apostrophe-free form, so both "don't" and a quoted 'the' are removed.
        if stop_words.isdisjoint((word, word.strip("'"), bare)):
            kept.append(bare)
    return ' '.join(kept)

# 2. Intent Detection (Simple Rule-based)
def detect_intent(text):
    """Classify the user's intent with a simple keyword rule.

    A keyword only counts when it appears at the start of a word, so
    inflected forms such as "trips" or "travelling" still match while
    unrelated words that merely contain a keyword ("strip", "detour") do not.

    Args:
        text: Raw user input.

    Returns:
        "Travel Roadmap Generation" if a travel keyword is found, otherwise
        "Unknown Intent (Expand Later)".
    """
    travel_keywords = ["visit", "travel", "trip", "tour", "vacation", "places"]
    # This can be replaced later with a fine-tuned classifier or Gemini via API.
    # \b anchors each keyword at a word start; re.escape keeps keywords literal.
    pattern = r"\b(?:" + "|".join(re.escape(word) for word in travel_keywords) + r")"
    if re.search(pattern, text.lower()):
        return "Travel Roadmap Generation"
    return "Unknown Intent (Expand Later)"

# 3. Named Entity Recognition (NER)
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and list its named entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in document order.
    """
    found = []
    for span in nlp_spacy(text).ents:
        found.append((span.text, span.label_))
    return found

# 4. Keyword Extraction (KeyBERT)
def extract_keywords(text, top_n=5):
    """Return the ``top_n`` keywords KeyBERT extracts from ``text``.

    The result is passed through unchanged from ``kw_model.extract_keywords``.
    """
    return kw_model.extract_keywords(text, top_n=top_n)

# 5. Embedding Generation
def generate_embedding(text):
    """Encode ``text`` with the shared sentence-transformer and return the result."""
    return embedder.encode(text)

# MASTER FUNCTION: Run Full Pipeline
def run_pipeline(user_input):
    """Run every pipeline stage on ``user_input`` and print each result.

    Stages run in order: cleaning, intent detection, entity extraction,
    keyword extraction and embedding generation. Every stage receives the
    original ``user_input``; the cleaned text is only printed. Nothing is
    returned.
    """
    print("Original Input:", user_input)

    # Each stage is evaluated right before its own line is printed.
    print("\nCleaned Text:", clean_text(user_input))
    print("\nDetected Intent:", detect_intent(user_input))
    print("\nExtracted Entities:", extract_entities(user_input))
    print("\nExtracted Keywords:", extract_keywords(user_input))

    # Only the shape of the embedding is shown, not the vector itself.
    vector = generate_embedding(user_input)
    print("\nGenerated Embedding Shape:", vector.shape)

# Test Example
# Run the demo only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    # Sample travel query used to exercise every stage of the pipeline.
    user_prompt = "I want to visit Kandy. What are the places I can travel?"
    run_pipeline(user_prompt)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Input: I want to visit Kandy. What are the places I can travel?

Cleaned Text: want visit kandy places travel

Detected Intent: Travel Roadmap Generation

Extracted Entities: [('Kandy', 'PERSON')]

Extracted Keywords: [('kandy', 0.5729), ('visit', 0.4209), ('travel', 0.3944), ('places', 0.3555), ('want', 0.1613)]

Generated Embedding Shape: (384,)
