<a href="https://colab.research.google.com/github/Atabak-Touri/GUI-project/blob/main/Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import tensorflow as tf

# Load JSON from file or paste the JSON data directly if running in Colab
with open('thd_content.json', 'r') as f:
    data = json.load(f)

# Create a list of tuples: (title, content, url)
entries = [(item['title'], item['content'], item['url']) for item in data]

# Create a tf.data.Dataset
dataset = tf.data.Dataset.from_tensor_slices(entries)

# Let's preview the dataset
for title, content, url in dataset.take(1):
    print(f"Title: {title.numpy().decode()}")
    print(f"Content: {content.numpy().decode()[:300]}...")
    print(f"URL: {url.numpy().decode()}")


Title: Deggendorf Institute of Technology
Content: Welcome to our award-winning university where staff and students boldly develop, study, research, teach and innovate. We are a place of endless opportunities and personal stories, created by each individual member of our inspirational university community. Discover our world and let us inspire you, ...
URL: https://www.th-deg.de/en


In [None]:
!pip install -U sentence-transformers scikit-learn



In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Extract just the content to embed
contents = [entry[1] for entry in entries]  # index 1 = content field

# Generate embeddings (this may take a few seconds)
embeddings = model.encode(contents, show_progress_bar=True)

# Save embeddings to file
np.save('thd_embeddings.npy', embeddings)

# Also save titles and urls for later retrieval
with open('thd_metadata.json', 'w') as f:
    json.dump([{'title': e[0], 'url': e[2]} for e in entries], f)

print("✅ Embeddings saved to 'thd_embeddings.npy'")
print("✅ Metadata saved to 'thd_metadata.json'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Embeddings saved to 'thd_embeddings.npy'
✅ Metadata saved to 'thd_metadata.json'


In [None]:
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity

# Load embeddings
embeddings = np.load('thd_embeddings.npy')

# Load metadata (titles and URLs)
with open('thd_metadata.json', 'r') as f:
    metadata = json.load(f)


In [None]:
def search_documents(query, k=3):
    # Embed the query using the same model
    query_embedding = model.encode([query])

    # Compute cosine similarity
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Get top-k indices
    top_k_idx = similarities.argsort()[-k:][::-1]

    results = []
    for idx in top_k_idx:
        results.append({
            "score": float(similarities[idx]),
            "title": metadata[idx]['title'],
            "url": metadata[idx]['url'],
            "content": entries[idx][1]  # get original content
        })

    return results


In [None]:
# Try a test query
results = search_documents("How can I apply to DIT as an international student?", k=3)

# Display results
for i, res in enumerate(results, 1):
    print(f"\n🔹 Result {i}")
    print(f"Title: {res['title']}")
    print(f"Score: {res['score']:.4f}")
    print(f"URL: {res['url']}")
    print(f"Excerpt: {res['content'][:300]}...")



🔹 Result 1
Title: Applications | DIT
Score: 0.5387
URL: https://www.th-deg.de/en/apply
Excerpt: To begin your studies at the Deggendorf Institute of Technology, apply online only during the application period. During the application process, we will carefully check to see if you can be admitted to DIT. If you are successful, you will receive an offer and will be requested to complete online en...

🔹 Result 2
Title: Welcome Centre | DIT
Score: 0.5144
URL: https://www.th-deg.de/en/welcome-centre
Excerpt: The Welcome Centre is the first point of contact for international students and prospective international students. We provide guidance with entry and residence formalities, tips for planning your student life on campus, plus support for all other necessary organisation. Inquiries:welcome@th-deg.de ...

🔹 Result 3
Title: FAQ | DIT
Score: 0.4945
URL: https://www.th-deg.de/en/study-with-us/advice-support/faq-potentials
Excerpt: Please find the current application periods on ourapplication

In [None]:
!pip install -U transformers accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load Falcon-1B (small enough for Colab free)
model_id = "tiiuae/falcon-rw-1b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
falcon = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,  # required for Falcon
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
falcon.eval().to("cuda" if torch.cuda.is_available() else "cpu")




FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(50304, 2048)
    (h): ModuleList(
      (0-23): 24 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (query_key_value): FalconLinear(in_features=2048, out_features=6144, bias=True)
          (dense): FalconLinear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=2048, out_features=8192, bias=True)
          (act): GELU(approximate='none')
          (dense_4h_to_h): FalconLinear(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2048, out_feature

In [None]:
def build_prompt(query, retrieved_docs):
    context = "\n\n".join([doc['content'][:1000] for doc in retrieved_docs])  # Keep it short and relevant
    prompt = f"""You are a helpful university assistant chatbot. Use the context below to answer the user's question clearly and concisely.

Context:
{context}

User Question:
{query}

Answer:"""
    return prompt
def generate_answer(query, top_k_docs):
    prompt = build_prompt(query, top_k_docs)

    # Tokenize and send to GPU if available
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(falcon.device)

    with torch.no_grad():
        outputs = falcon.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Clean the output (remove prompt prefix if needed)
    return generated[len(prompt):].strip()


In [None]:
query = "What support is available for international students at THD?"
top_docs = search_documents(query, k=3)
answer = generate_answer(query, top_docs)

print("🤖 Chatbot Answer:\n")
print(answer)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


🤖 Chatbot Answer:

nternational students can use the following services:
- Application support
- International student support
- International student services
- International student support and counseling
- International student counseling
- International student services for exchange students
- International student services for international students
- International student services for prospective students
- International student services for international students
- International student services for prospective students
- International student services for international students
- International student services for international students
- International student services for prospective students
- International student services for international students
- International student services for prospective students
- International student services for prospective students
- International student services for international students
- International student services for international st

In [None]:
!pip install gradio




In [None]:
def chatbot_interface(user_query):
    try:
        top_docs = search_documents(user_query, k=3)
        answer = generate_answer(user_query, top_docs)

        # Optional: Add clickable sources
        sources = "\n\nSources:\n" + "\n".join(
            [f"- [{doc['title']}]({doc['url']})" for doc in top_docs]
        )

        return answer + sources

    except Exception as e:
        # Return error to the frontend
        return f"⚠️ An error occurred: {str(e)}"


In [15]:
import gradio as gr

gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(lines=2, placeholder="Ask me something about THD..."),
    outputs="text",
    title="🎓 THD Assistant Chatbot",
    description="Ask me anything about studying at THD! I’ll search official info and respond using AI."
).launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://104f91c9541225a899.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


