In [1]:
!pip install transformers accelerate datasets faiss-cpu sentence-transformers evaluate

# Mount drive from access to
from google.colab import drive
drive.mount('/content/drive')

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, evaluate
Successfully installed evaluate-0.4.6 faiss-cpu-1.12.0
Mounted at /content/drive


### Load Dataset from Google Drive

In [2]:

import os
import pandas as pd

# Path to your UWE Data folder
folder_path = "/content/drive/MyDrive/Colab Notebooks/UWE Data"

# Read all .md files in the folder
documents = []
filenames = []

for filename in os.listdir(folder_path):
    if filename.endswith(".md"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            content = file.read()
            documents.append(content)
            filenames.append(filename)

# Create a DataFrame
df = pd.DataFrame({
    'filename': filenames,
    'content': documents
})

df.head()



Unnamed: 0,filename,content
0,Undergraduate_study.md,# Undergraduate study - Courses and applying |...
1,Postgraduate_study.md,# Postgraduate study - Courses and applying | ...
2,Internship_study.md,# Error\n\n
3,Funding_scholarship.md,# Funding and scholarships - Courses and apply...
4,Careers.md,# Career development - Study | UWE Bristol\n\n...


### Model Selection
- Mistral 7B with Hugging Face Model ID of mistralai/Mistral-7B-Instruct-v0.1
- Falcon 7B with Hugging Face Model ID of tiiuae/falcon-7b-instruct
- Nous Hermes 2	with Hugging Face Model ID of NousResearch/Nous-Hermes-2-Mistral-7B-DPO

In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Embed and Index Documents
embedder = SentenceTransformer('all-MiniLM-L6-v2')
documents = df['content'].tolist()
doc_embeddings = embedder.encode(documents, convert_to_numpy=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Build FAISS index
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

### Define Retrieval + Generation Function

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import time, psutil

def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", torch_dtype=torch.float16
    )
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return pipe

def retrieve_context(query, k=3):
    query_embedding = embedder.encode([query])
    D, I = index.search(query_embedding, k)
    return [documents[i] for i in I[0]]

def generate_answer(pipe, query, context_docs):
    context = "\n".join(context_docs)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    start_time = time.time()
    response = pipe(prompt, max_new_tokens=200)[0]['generated_text']
    duration = time.time() - start_time
    return response, duration


###  Evaluate Models

In [None]:
#from huggingface_hub import login
#login()

In [6]:
import time
import torch
import psutil
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Use smaller models (e.g., flan-t5-base)
models = {
    "Flan-T5-Base": "google/flan-t5-base"
}

queries = [
    "What is the main insight in row 12?",
    "Summarize findings for case study B.",
]

results = []

def load_model(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def retrieve_context(query):
    # Replace this with real retrieval logic using your markdown dataset
    return "This is a dummy context from the markdown content related to the query."

def generate_answer(pipe, query, context):
    full_prompt = f"Context: {context}\n\nQuestion: {query}"
    start = time.time()
    output = pipe(full_prompt, max_new_tokens=100)[0]['generated_text']
    end = time.time()
    return output, round(end - start, 2)

for name, model_id in models.items():
    try:
        pipe = load_model(model_id)
        for query in queries:
            context_docs = retrieve_context(query)
            response, inference_time = generate_answer(pipe, query, context_docs)
            ram_used = psutil.virtual_memory().used / 1e9
            results.append({
                "Model": name,
                "Query": query,
                "Response": response,
                "Inference Time (s)": inference_time,
                "RAM Used (GB)": round(ram_used, 2),
            })
    except torch.cuda.OutOfMemoryError:
        print(f"⚠️ CUDA OOM for model {name}. Try a smaller model or run on CPU.")
    except Exception as e:
        print(f"⚠️ Error running model {name}: {e}")

# Output results
df_results = pd.DataFrame(results)
print(df_results)



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


          Model                                 Query  \
0  Flan-T5-Base   What is the main insight in row 12?   
1  Flan-T5-Base  Summarize findings for case study B.   

                         Response  Inference Time (s)  RAM Used (GB)  
0  The main insight in row 12 is:                1.26           6.11  
1                   Case study B.                0.11           6.11  


### Load All Markdown Files as a Corpus

In [7]:
import os

# Path to your markdown files
markdown_folder = "/content/drive/MyDrive/Colab Notebooks/UWE Data"

# Load all .md files
documents = {}

for filename in os.listdir(markdown_folder):
    if filename.endswith(".md"):
        filepath = os.path.join(markdown_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            documents[filename] = f.read()

### Simple Context Retrieval Function

In [8]:
def retrieve_context(query, top_n=1):
    scored_docs = []
    for name, content in documents.items():
        score = query.lower() in content.lower()
        if score:
            scored_docs.append((name, content))

    # Return top matching document(s)
    if not scored_docs:
        return "No relevant context found."
    return "\n\n".join(doc for _, doc in scored_docs[:top_n])


In [9]:
!pip install -q sentence-transformers faiss-cpu

In [10]:
from sentence_transformers import SentenceTransformer
import os

markdown_folder = "/content/drive/MyDrive/Colab Notebooks/UWE Data"
raw_chunks = []

# Load and chunk files
for filename in os.listdir(markdown_folder):
    if filename.endswith(".md"):
        filepath = os.path.join(markdown_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
            # Basic chunking by double newline or split every 200 words
            paragraphs = text.split("\n\n")
            for para in paragraphs:
                if len(para.strip()) > 20:
                    raw_chunks.append((filename, para.strip()))


In [11]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and good for semantic search

texts = [chunk for _, chunk in raw_chunks]
embeddings = model.encode(texts, show_progress_bar=True)


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
import faiss
import numpy as np

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))


In [13]:
def semantic_retrieve_context(query, top_k=3):
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)

    results = []
    for idx in indices[0]:
        filename, chunk = raw_chunks[idx]
        results.append(f"From {filename}:\n{chunk}")

    return "\n\n".join(results)


In [14]:
context_docs = semantic_retrieve_context(query)


In [15]:
context_docs

'From Undergraduate_study.md:\n"This work gave me an opportunity to work with a real client and get feedback like I will when I work in the industry after I graduate."\n\nFrom Postgraduate_study.md:\n# Postgraduate study - Courses and applying | UWE Bristol\n\nFrom Careers.md:\nIf you\'re considering postgraduate study, we have a number of resources on Career Toolkit to help you take your next step.'

In [16]:
import pandas as pd
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,Model,Query,Response,Inference Time (s),RAM Used (GB)
0,Flan-T5-Base,What is the main insight in row 12?,The main insight in row 12 is:,1.26,6.11
1,Flan-T5-Base,Summarize findings for case study B.,Case study B.,0.11,6.11


In [17]:
!pip install -q gradio


### Define the Chatbot Function

In [18]:
import gradio as gr

chat_history = []

def chatbot_interface(query):
    # Retrieve relevant context from your markdown files
    context = semantic_retrieve_context(query)

    # Prepare prompt
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate answer
    response = pipe(prompt, max_new_tokens=256, do_sample=False)[0]['generated_text']

    # Maintain chat history
    chat_history.append((query, response))
    return chat_history


### Launch Gradio Chatbot

In [19]:
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 RAG Chatbot (Markdown + Falcon-7B)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your question:")
    clear = gr.Button("Clear")

    def respond(message):
        chat = chatbot_interface(message)
        return "", chat

    msg.submit(respond, [msg], [msg, chatbot])
    clear.click(lambda: [], None, chatbot)

demo.launch()


  chatbot = gr.Chatbot()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8ef4656e560ba9933d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


