In [1]:
# Import Libraries
import re
import spacy
import nltk
from nltk.corpus import stopwords
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [2]:
# Download NLTK stopwords (only once)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load Models
nlp_spacy = spacy.load("en_core_web_sm")            # You can replace with transformer-based model later
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but very strong embeddings

# MiniLM: A very lightweight sentence transformer that fits easily on your laptop (384-dimensional vector)

# 1. Text Cleaning Function
def clean_text(text):
    text = text.lower()                         # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    tokens = text.split()                       # split into words
    tokens = [word for word in tokens if word not in stop_words]    # remove stopwords
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# 2. Intent Detection (Simple Rule-based)
def detect_intent(text):
    travel_keywords = ["visit", "travel", "trip", "tour", "vacation", "places"] 
    # can be replace this later with a fine-tuned classifier or use Gemini via API
    if any(word in text.lower() for word in travel_keywords):
        return "Travel Roadmap Generation"
    else:
        return "Unknown Intent (Expand Later)"

# 3. Named Entity Recognition (NER)
def extract_entities(text):
    doc = nlp_spacy(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# 4. Keyword Extraction (KeyBERT)
def extract_keywords(text, top_n=5):
    keywords = kw_model.extract_keywords(text, top_n=top_n)
    return keywords

# 5. Embedding Generation
def generate_embedding(text):
    embedding = embedder.encode(text)
    return embedding

# MASTER FUNCTION: Run Full Pipeline
def run_pipeline(user_input):
    print("Original Input:", user_input)
    cleaned = clean_text(user_input)
    print("\nCleaned Text:", cleaned)
    
    intent = detect_intent(user_input)
    print("\nDetected Intent:", intent)
    
    entities = extract_entities(user_input)
    print("\nExtracted Entities:", entities)
    
    keywords = extract_keywords(user_input)
    print("\nExtracted Keywords:", keywords)
    
    embedding = generate_embedding(user_input)
    print("\nGenerated Embedding Shape:", embedding.shape)

# Test Example
if __name__ == "__main__":
    user_prompt = "I want to visit Kandy. What are the places I can travel?"
    run_pipeline(user_prompt)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Input: I want to visit Kandy. What are the places I can travel?

Cleaned Text: want visit kandy places travel

Detected Intent: Travel Roadmap Generation

Extracted Entities: [('Kandy', 'PERSON')]

Extracted Keywords: [('kandy', 0.5729), ('visit', 0.4209), ('travel', 0.3944), ('places', 0.3555), ('want', 0.1613)]

Generated Embedding Shape: (384,)


In [3]:
# Test Example
if __name__ == "__main__":
    user_prompt = "I am plan to visit Colombo and Kalutara next week. What are the possible beatiful places that i can travel to get very good food expereince?"
    run_pipeline(user_prompt)

Original Input: I am plan to visit Colombo and Kalutara next week. What are the possible beatiful places that i can travel to get very good food expereince?

Cleaned Text: plan visit colombo kalutara next week possible beatiful places travel get good food expereince

Detected Intent: Travel Roadmap Generation

Extracted Entities: [('Colombo', 'PERSON'), ('Kalutara', 'PERSON'), ('next week', 'DATE')]

Extracted Keywords: [('colombo', 0.5895), ('places', 0.392), ('food', 0.3478), ('visit', 0.289), ('kalutara', 0.2885)]

Generated Embedding Shape: (384,)


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [5]:
# Download NLTK stopwords (only once)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load Models
# nlp_spacy = spacy.load("en_core_web_sm")            # You can replace with transformer-based model later
kw_model = KeyBERT(model='all-MiniLM-L6-v2')
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight but very strong embeddings

# MiniLM: A very lightweight sentence transformer that fits easily on your laptop (384-dimensional vector)

# 1. Text Cleaning Function
def clean_text(text):
    text = text.lower()                         # convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special characters
    tokens = text.split()                       # split into words
    tokens = [word for word in tokens if word not in stop_words]    # remove stopwords
    cleaned_text = ' '.join(tokens)
    return cleaned_text

# 2. Intent Detection (Simple Rule-based)

# Load your fine-tuned model (replace this path with your model path or HF repo)
# tokenizer_intent = BertTokenizer.from_pretrained("bert-base-uncased")
# model_intent = BertForSequenceClassification.from_pretrained("your-finetuned-intent-model")

# def detect_intent(text):
#     inputs = tokenizer_intent(text, return_tensors="pt", truncation=True, padding=True)
#     with torch.no_grad():
#         outputs = model_intent(**inputs)
#     probs = torch.nn.functional.softmax(outputs.logits, dim=1)
#     predicted_class = torch.argmax(probs, dim=1).item()
    
#     label_map = {0: "Travel Roadmap Generation", 1: "Learning Roadmap", 2: "Job Preparation", 3: "Unknown"}
#     return label_map.get(predicted_class, "Unknown")

# 3. Named Entity Recognition (NER)
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")

def extract_entities(text):
    ner_results = ner_pipeline(text)
    return [(ent['word'], ent['entity_group']) for ent in ner_results]

# 4. Keyword Extraction (KeyBERT)
def extract_keywords(text, top_n=None):
    word_count = len(text.split())
    top_n = top_n if top_n else min(10, max(3, word_count // 3))
    return kw_model.extract_keywords(text, top_n=top_n)

# 5. Embedding Generation
def generate_embedding(text):
    embedding = embedder.encode(text)
    return embedding

# MASTER FUNCTION: Run Full Pipeline
def run_pipeline(user_input):
    print("Original Input:", user_input)
    cleaned = clean_text(user_input)
    print("\nCleaned Text:", cleaned)
    
    # intent = detect_intent(user_input)
    # print("\nDetected Intent:", intent)
    
    entities = extract_entities(user_input)
    print("\nExtracted Entities:", entities)
    
    keywords = extract_keywords(user_input)
    print("\nExtracted Keywords:", keywords)
    
    embedding = generate_embedding(user_input)
    print("\nGenerated Embedding Shape:", embedding.shape)

# Test Example
if __name__ == "__main__":
    user_prompt = "I want to visit Kandy. What are the places I can travel?"
    run_pipeline(user_prompt)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Original Input: I want to visit Kandy. What are the places I can travel?

Cleaned Text: want visit kandy places travel

Extracted Entities: [('Kandy', 'LOC')]

Extracted Keywords: [('kandy', 0.5729), ('visit', 0.4209), ('travel', 0.3944), ('places', 0.3555)]

Generated Embedding Shape: (384,)


In [6]:
# Test Example
if __name__ == "__main__":
    user_prompt = "I want to visit Kandy and colombo. What are the places I can travel?"
    run_pipeline(user_prompt)

Original Input: I want to visit Kandy and colombo. What are the places I can travel?

Cleaned Text: want visit kandy colombo places travel

Extracted Entities: [('Kandy', 'LOC'), ('co', 'LOC')]

Extracted Keywords: [('colombo', 0.6636), ('visit', 0.4042), ('places', 0.3634), ('travel', 0.3561)]

Generated Embedding Shape: (384,)
