In [1]:
!pip install python-docx pdfplumber beautifulsoup4 html2text --quiet


In [2]:
import os
import xml.etree.ElementTree as ET


In [3]:
def extract_summary_from_xml(xml_path):
    """Return the text content of an XML file as one space-separated string.

    Parses ``xml_path`` and visits the root element and all of its
    descendants. Each element's ``.text`` is stripped of surrounding
    whitespace; empty results are dropped and the remaining pieces are joined
    with single spaces. Text following a child's closing tag (``.tail``) is
    not collected.
    """
    document_root = ET.parse(xml_path).getroot()
    stripped = ((node.text or "").strip() for node in document_root.iter())
    return " ".join(piece for piece in stripped if piece)


In [4]:
def load_all_summaries(folder_path):
    """Collect the text of every ``*.abssumm.xml`` file directly inside a folder.

    Returns a list of ``{"filename", "text"}`` dicts, one per matching file
    whose extracted text is non-empty, in the order ``os.listdir`` yields
    them. Sub-folders are not searched.
    """
    matching = (
        name for name in os.listdir(folder_path) if name.endswith(".abssumm.xml")
    )
    extracted = (
        (name, extract_summary_from_xml(os.path.join(folder_path, name)))
        for name in matching
    )
    return [{"filename": name, "text": body} for name, body in extracted if body]



## 📥 Added Support for Loading Multi-format Files (PDF, DOCX, TXT, HTML)

This section loads documents in several formats (PDF, DOCX, HTML, plain text and Markdown) and converts each one to plain text, ready for chunking and embedding.


In [5]:

import os
import pdfplumber
from docx import Document
from bs4 import BeautifulSoup
import html2text

def extract_text_from_pdf(path):
    """Read a PDF with pdfplumber and return its pages' text joined by newlines.

    A page for which ``extract_text()`` returns a falsy value contributes an
    empty string.
    """
    page_texts = []
    with pdfplumber.open(path) as document:
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

def extract_text_from_docx(path):
    """Return the text of each entry in a .docx document's ``paragraphs``, one per line."""
    paragraphs = Document(path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_from_html(path):
    """Convert a UTF-8 HTML file to text via BeautifulSoup and html2text.

    The file is parsed with the ``html.parser`` backend, re-serialised with
    ``prettify()``, and that markup is handed to ``html2text.html2text``.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        pretty_markup = BeautifulSoup(handle, "html.parser").prettify()
        converted = html2text.html2text(pretty_markup)
    return converted

def extract_text_from_txt(path):
    """Return the full contents of a UTF-8 text file as a single string."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def read_any_file(filepath):
    """Extract plain text from a file, choosing the reader by its extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.html`` have dedicated readers; ``.txt`` and ``.md`` are read as plain
    text.

    Raises:
        ValueError: if the extension is none of the above.
    """
    _, suffix = os.path.splitext(filepath)
    suffix = suffix.lower()
    if suffix == ".pdf":
        return extract_text_from_pdf(filepath)
    if suffix == ".docx":
        return extract_text_from_docx(filepath)
    if suffix == ".html":
        return extract_text_from_html(filepath)
    if suffix in (".txt", ".md"):
        return extract_text_from_txt(filepath)
    raise ValueError(f"Unsupported file type: {suffix}")

def load_documents_from_folder(folder_path):
    """Read every entry directly inside ``folder_path`` into a list of records.

    Each entry that ``read_any_file`` can handle becomes a
    ``{"filename", "text"}`` dict. Any exception raised while reading an
    entry is reported on stdout and that entry is left out of the result.
    """
    loaded = []
    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        try:
            record = {"filename": entry, "text": read_any_file(entry_path)}
        except Exception as err:
            print(f"❌ Could not read {entry}: {err}")
        else:
            loaded.append(record)
    return loaded


In [None]:

# # Use the same path format as in your existing notebook
# doc_folder = "ami_public_manual_1.6.2/abstractive"
# summaries = load_documents_from_folder(doc_folder)

# print(f"Loaded {len(summaries)} documents")
# print(summaries[0]['filename'])
# print(summaries[0]['text'][:500])


In [36]:
import os
import pdfplumber
import html2text
import xml.etree.ElementTree as ET
from docx import Document
from bs4 import BeautifulSoup

# ── file readers (one per supported format) ───────────────────────────────────
def extract_text_from_pdf(path):
    """Return the text of all pages of a PDF, separated by newlines.

    Pages are read with pdfplumber; a page whose ``extract_text()`` result is
    falsy is represented by an empty string.
    """
    with pdfplumber.open(path) as document:
        raw_texts = [page.extract_text() for page in document.pages]
    return "\n".join(text if text else "" for text in raw_texts)

def extract_text_from_docx(path):
    """Return the text of a .docx document's ``paragraphs``, newline-separated."""
    lines = []
    for paragraph in Document(path).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def extract_text_from_html(path):
    """Convert a UTF-8 HTML file to text via BeautifulSoup and html2text.

    Args:
        path: Location of the HTML file.

    Returns:
        The string produced by ``html2text.html2text`` for the prettified
        markup of the parsed document.
    """
    # Open through a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path, encoding="utf-8") as handle:
        soup = BeautifulSoup(handle, "html.parser")
    return html2text.html2text(soup.prettify())

def extract_text_from_txt(path):
    """Return the full contents of a UTF-8 text file.

    Args:
        path: Location of the text file.

    Returns:
        The decoded file contents as one string.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(path, encoding="utf-8") as handle:
        return handle.read()

def extract_text_from_xml(path):
    """Return the text content of an XML file as one space-separated string.

    Args:
        path: Location of the XML file.

    Returns:
        The stripped ``.text`` of the root element and every descendant,
        joined with single spaces. Elements whose text is missing or consists
        only of whitespace contribute nothing.
    """
    root = ET.parse(path).getroot()
    pieces = (element.text.strip() for element in root.iter() if element.text)
    # Indentation between tags shows up as whitespace-only ``.text``; dropping
    # the resulting empty strings avoids stray leading/double spaces.
    return " ".join(piece for piece in pieces if piece)

def read_any_file(filepath):
    """Extract plain text from a file, picking the reader from its extension.

    The extension is matched case-insensitively: ``.pdf``, ``.docx``,
    ``.html`` and ``.xml`` have dedicated readers, while ``.txt``, ``.md``
    and ``.css`` are read as plain text.

    Raises:
        ValueError: if the extension is not one of those listed.
    """
    suffix = os.path.splitext(filepath)[1].lower()
    if suffix == ".pdf":
        reader = extract_text_from_pdf
    elif suffix == ".docx":
        reader = extract_text_from_docx
    elif suffix == ".html":
        reader = extract_text_from_html
    elif suffix in {".txt", ".md", ".css"}:
        reader = extract_text_from_txt
    elif suffix == ".xml":
        reader = extract_text_from_xml
    else:
        raise ValueError(f"Unsupported file type: {suffix}")
    return reader(filepath)

# ── recursive loader ───────────────────────────────────────────────────────────
def load_all_documents(root_folder):
    """Recursively read every readable file below ``root_folder``.

    Returns a list of ``{"filename", "text"}`` dicts in which ``filename`` is
    the file's path relative to ``root_folder``. A file for which reading (or
    the relative-path computation) raises is reported on stdout and skipped.
    """
    collected = []
    for current_dir, _subdirs, names in os.walk(root_folder):
        for absolute in (os.path.join(current_dir, name) for name in names):
            try:
                body = read_any_file(absolute)
                relative = os.path.relpath(absolute, root_folder)
            except Exception as err:
                print(f"Skipped {absolute}: {err}")
                continue
            collected.append({"filename": relative, "text": body})
    return collected

# ── use it ─────────────────────────────────────────────────────────────────────
# Corpus location: taken from the AMI_ROOT environment variable when it is set,
# so the notebook is not tied to one machine; otherwise the original local
# path is used.
root = os.environ.get("AMI_ROOT", r"F:\rag-ami\ami_public_manual_1.6.2")
summaries = load_all_documents(root)
print(f"Loaded {len(summaries)} files:")
# Preview the relative paths of the first ten loaded documents.
for doc in summaries[:10]:
    print(" ", doc["filename"])

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Loaded 5161 files:
  00README_MANUAL.txt
  AMI-metadata.xml
  LICENCE.txt
  manifest_1.7.html
  MANIFEST_MANUAL.txt
  resource.xml
  abstractive\Akshat Mishra Resume DS .pdf
  abstractive\Arsalan_Anwar_Resume (1).pdf
  abstractive\ES2002a.abssumm.xml
  abstractive\ES2002b.abssumm.xml


In [23]:
!pip install transformers --quiet


In [24]:
from transformers import GPT2TokenizerFast

# Load the pretrained "gpt2" fast tokenizer. In this notebook it is used only by
# chunk_text, which counts the tokens of each word to decide where a chunk ends.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


In [37]:
def chunk_text(text, max_tokens=500, count_tokens=None):
    """Split text into whitespace-delimited chunks of at most ``max_tokens`` tokens.

    Words are never split. They are accumulated greedily, and a new chunk is
    started when adding the next word would exceed the budget. A single word
    that is larger than ``max_tokens`` on its own becomes its own
    (over-budget) chunk.

    Args:
        text: Text to split; runs of whitespace collapse to single spaces.
        max_tokens: Token budget per chunk.
        count_tokens: Optional callable mapping a word to its token count.
            When omitted, the length of the module-level ``tokenizer``'s
            encoding of the word (without special tokens) is used.

    Returns:
        A list of non-empty chunk strings; ``[]`` for empty or
        whitespace-only text.
    """
    if count_tokens is None:
        def _default_count(word):
            return len(tokenizer.encode(word, add_special_tokens=False))
        counter = _default_count
    else:
        counter = count_tokens

    chunks = []
    current_words = []
    current_tokens = 0

    for word in text.split():
        word_tokens = counter(word)
        # Flush only when there is something to flush; otherwise an oversized
        # first word would produce an empty leading chunk.
        if current_words and current_tokens + word_tokens > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            current_tokens = 0
        current_words.append(word)
        current_tokens += word_tokens

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


In [38]:
# Split every loaded document into token-bounded chunks and flatten them into
# one list of records that remember the source file and the chunk's position.
# NOTE(review): the embedding model used further down (all-MiniLM-L6-v2) is
# documented to truncate inputs beyond 256 word pieces, so 500-token chunks may
# be only partly embedded — confirm and consider a smaller max_tokens.
chunked_data = []

for summary in summaries:
    source_name = summary["filename"]
    chunks = chunk_text(summary["text"], max_tokens=500)
    chunked_data.extend(
        {"filename": source_name, "chunk_id": position, "text": piece}
        for position, piece in enumerate(chunks)
    )

print(f"Total chunks created: {len(chunked_data)}")
print(chunked_data[0])


Total chunks created: 3630
{'filename': '00README_MANUAL.txt', 'chunk_id': 0, 'text': "AMI Manual Annotations release 1.7 Date: 16th June 2014 Built by: Jonathan Kilgour Contact: amicorpus@amiproject.org Please read LICENCE.txt before using this data. Please quote the release number in any correspondence. The annotation data is in a format ready to be used directly by NXT. Download and further information here: http://www.ltg.ed.ac.uk/NITE/ This data requires NXT 1.4.1 or later. To use this data with AMI media files, make sure the signals you have downloaded from http://corpus.amiproject.org/ are in a directory called 'signals' under this directory. ------------------------ Changes in public release 1.7 from 1.6 Only one change: transcription files for non-scenario meetings updated to include more accurate and complete timings so that scripts to extract timing information do not return NaN (not a number) results. ------------------------ Changes in public release 1.6 from 1.5 For full 

In [27]:
pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [39]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by indexing and querying; the checkpoint
# name selects the "all-MiniLM-L6-v2" model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def get_local_embedding(text):
    """Embed ``text`` with the module-level ``embedding_model`` and return what ``encode`` produces."""
    vector = embedding_model.encode(text)
    return vector

In [40]:
# Embed all chunk texts with a single batched call: passing the whole list lets
# the model process many texts per forward pass instead of one call per chunk.
texts = [item["text"] for item in chunked_data]
# ``embeddings`` is indexable per chunk, so ``embeddings[0]`` and
# ``np.array(embeddings)`` downstream keep working.
embeddings = embedding_model.encode(texts, show_progress_bar=True)


In [41]:
import faiss
import numpy as np

# Stack the embeddings into one float32 matrix and add it to a flat L2 index
# (faiss.IndexFlatL2) whose width is the embedding dimension.
embedding_matrix = np.array(embeddings).astype("float32")
dimension = embedding_matrix.shape[1]  # should be 384 for MiniLM
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

print(f"FAISS index contains {index.ntotal} vectors")


FAISS index contains 3630 vectors


In [42]:
!pip install requests --quiet

In [43]:
import requests

# The API key is read from the environment so that no credential is stored in
# the notebook. SECURITY: a literal key was previously committed on this line;
# treat that key as compromised and revoke it in the Together.ai dashboard.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "")

def generate_answer(prompt, model="mistralai/Mixtral-8x7B-Instruct-v0.1"):
    """Send ``prompt`` to the Together.ai completions endpoint and return the generated text.

    Args:
        prompt: Full prompt text to complete.
        model: Together.ai model identifier.

    Returns:
        The ``text`` of the first choice in the response, or the string
        ``"No output returned."`` when the response is not JSON or does not
        contain a choice (the problem is printed).

    Raises:
        RuntimeError: if ``TOGETHER_API_KEY`` is empty.
        requests.RequestException: on connection failure or timeout.
    """
    # Fail early with a clear message instead of sending an unauthenticated request.
    if not TOGETHER_API_KEY:
        raise RuntimeError(
            "TOGETHER_API_KEY is not set; export it in the environment "
            "before starting the kernel."
        )

    url = "https://api.together.xyz/v1/completions"
    headers = {
        "Authorization": f"Bearer {TOGETHER_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": 512,
        "temperature": 0.3,
        "top_p": 0.9
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, json=payload, timeout=60)

    # Error pages (e.g. from a proxy) may not be JSON; report them instead of
    # letting the decode error escape.
    try:
        result = response.json()
    except ValueError:
        print("❌ Error parsing Together.ai response:")
        print(f"HTTP {response.status_code}: {response.text[:500]}")
        return "No output returned."

    # API-level errors arrive as JSON without a usable "choices" list.
    try:
        return result["choices"][0]["text"]
    except (KeyError, IndexError, TypeError):
        print("❌ Error parsing Together.ai response:")
        print(result)
        return "No output returned."



In [44]:
def search_faiss(query, top_k=5):
    """Return the texts of the indexed chunks nearest to ``query``.

    Args:
        query: Query string; embedded with ``get_local_embedding``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk texts from ``chunked_data``, in the order the
        index returns them. Fewer are returned when the index holds fewer
        than ``top_k`` vectors.
    """
    query_vector = np.asarray([get_local_embedding(query)], dtype="float32")
    _distances, neighbour_ids = index.search(query_vector, top_k)
    # FAISS pads missing neighbours with -1; without this filter -1 would
    # silently index the *last* chunk in ``chunked_data``.
    return [chunked_data[i]["text"] for i in neighbour_ids[0] if i >= 0]


In [45]:
def ask_rag_agent(query):
    """Answer ``query`` using the five chunks retrieved by ``search_faiss``.

    The retrieved chunk texts are separated by blank lines, placed into a
    fixed prompt template together with the question, and the resulting
    prompt is passed to ``generate_answer``, whose output is returned.
    """
    context = "\n\n".join(search_faiss(query, top_k=5))
    return generate_answer(f"""You are a helpful meeting assistant.
Based on the following context from AMI meeting summaries, answer the question.

Context:
{context}

Question:
{query}

Answer:""")


In [46]:
# Run one question through the RAG pipeline and print the generated answer.
response = ask_rag_agent("What is the similarity between Arsalans and Akshats Resume?")
print(response)



Arsalan and Akshat have both worked on developing music recommendation systems and have experience with machine learning algorithms such as KNN and K-means. Additionally, they have both used Python and SQL in their professional experiences. Arsalan has built a music recommendation system for Spotify, while Akshat has designed a music recommendation system for the NYU Data Science Bootcamp. Both of them have achieved high accuracy in personalized suggestions and have enhanced user engagement through advanced clustering and similarity matching algorithms.


In [19]:
# Compound question naming two meetings (ES2002b and TS3010b); both parts are
# answered from the same set of retrieved chunks.
response = ask_rag_agent("Who was the main speaker in ES2002b and What were the usability concerns in TS3010b")
print(response)


The main speaker in ES2002b was the User Interface Designer, who presented the major components of the interface design, dividing the interface into two parts: voice commands and buttons. The Marketing Expert was also a key speaker, as she reported on research which shows that users think most remotes are ugly, easily lost and bad for RSI. Audio settings are rarely used, and the power, channel and volume buttons are used most often. The remote should be user-friendly and have a good look and feel.

The usability concerns in TS3010b were that remotes were too difficult to use, users want fancier and more ergonomic designs, shock protection, voice recognition, and LCD screens. The Project Manager also announced a new requirement that the remote is only to control televisions. The group decided to eliminate the LCD screen and voice recognition from the design due to time and cost restraints. They also decided to include a previous channel change button to the standard remote buttons, and

In [26]:
# Ask for the action items of meeting TS3010b and who is responsible for each.
response = ask_rag_agent("TS3010b give the action items of this meeting, and who is responsible for what and write the name of person wherever possible?")
print(response)


Action Items from the Meeting:

1. Gather more information for the next meeting, the functional design meeting. (All Participants)
2. Decide on the inclusion of speech recognition in the design. (All Participants)
3. Determine the target group and features to attract them. (All Participants)
4. Discuss and decide on the buttons for the remote control. (All Participants)
5. Create the design of the remote control. (To be decided in the next meeting)
6. Industrial Designer: Work on the working design and technical function.
7. User Interface Designer: Work on the working design and functional design.
8. Marketing Manager: Look for user requirement specifications such as friendliness, selling price, and profit.
9. Consider the possibility of a touch screen, LCD, and other functions. (All Participants)
10. Fill out the questionnaire. (All Participants)
11. Receive specific instructions for the next meeting by email. (All Participants)
