In [None]:
import os
import sys
import re
import time
import wave
import hashlib
from uuid import uuid4
from datetime import datetime

import numpy as np
import pygame
import pdfplumber
# import pytesseract
# import soundfile as sf
from scipy.io import wavfile
from IPython.display import display, Audio
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4

from pinecone import Pinecone, ServerlessSpec
from langchain import hub
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai

from kokoro import KPipeline



In [25]:
sys.path.append("../my_apis")
import apis

In [23]:
# Sanity check: print the path of the Python interpreter this kernel is running on.
# NOTE(review): `sys` is already imported in the first cell, so this re-import is redundant.
import sys
print(sys.executable)


c:\Users\user\Desktop\myproai\myai_pro\Scripts\python.exe


In [27]:
# --- Credentials: read from the local `apis` module and exported as environment variables ---
pinecone_api_key = apis.pinecone_Api_vector_db
gemini_api_key = apis.gemini_key
os.environ.update(
    {
        "GOOGLE_API_KEY": gemini_api_key,
        "PINECONE_API_KEY": pinecone_api_key,
    }
)

# --- Pinecone index and Gemini model names ---
Index_name = "myproaiassistant"
gemini_model_for_embaddings = "models/embedding-001"
gemini_model_for_query = "gemini-2.5-pro"

# --- Input document ---
# Alternative input: r"../my_documents/(Book)Linear Algebra and its Application.pdf"
file_path = r"../my_documents/pp_week1.pdf"

# Fresh random identifier for this run (not referenced by the cells shown here).
unique_id = uuid4()

print(file_path)

../my_documents/pp_week1.pdf


In [28]:
# === Step 1: Extract text from PDF ===
def Extract_ext_from_PDF(file_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        file_path: Path of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The text of each page, in page order, each followed by a newline.
        A page for which ``extract_text()`` yields nothing (``None`` or an
        empty string) contributes only its newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # `extract_text()` can yield None for a page with no extractable text;
        # `or ""` keeps the concatenation below from raising TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join once instead of growing a string inside the loop.
    return "".join(f"{text}\n" for text in page_texts)


# Run the extraction once; later cells read the notebook-level `pdf_text`.
pdf_text = Extract_ext_from_PDF(file_path=file_path)

In [29]:
# === Step 2: Create chunker with overlap ===


def chunk_and_overlap(c_size, c_overlap, text=None):
    """Split text into overlapping, newline-delimited chunks and print each one.

    Args:
        c_size: Maximum number of characters per chunk (``chunk_size``).
        c_overlap: Number of characters of overlap between chunks
            (``chunk_overlap``).
        text: Text to split. When omitted, the notebook-level ``pdf_text`` is
            used, so existing two-argument calls behave as before.

    Returns:
        The list of chunks returned by ``CharacterTextSplitter.split_text``.
    """
    # Fall back to the global only when the caller did not supply text explicitly.
    source_text = pdf_text if text is None else text

    text_splitter = CharacterTextSplitter(
        separator="\n",  # split on newlines
        chunk_size=c_size,  # max characters per chunk
        chunk_overlap=c_overlap,  # overlap between chunks
        length_function=len,
    )

    chunks = text_splitter.split_text(source_text)

    # Show every chunk, numbered from 1.
    for i, chunk in enumerate(chunks, start=1):
        print(f"--- Chunk {i} ---")
        print(chunk)
        print()
    return chunks

# 700-character chunks with a 100-character overlap.
chunks = chunk_and_overlap(c_size=700, c_overlap=100)

--- Chunk 1 ---
Ms. Kainat Anjum(kainat@biit.edu.pk) Professional Ethics Whatsapp # 03215023609
(SSH 307)
Page number 1
Text book & References:
Engineering Ethics by Charles B. Fleddermann (Fourth edition)
Managing Business Ethics, Straight Talk About How To Do It Right by Linda K. Trevino&
Katherine A. Nelson
Lesson 01-02
OBJECTIVES:
• To introduce “ethics” and “professional ethics”
• To let students know about certain key concepts in professional Practices/Ethics • To
understand code of conduct in professional set up
LIST OF TOPICS TO BE COVERED:
1. Key Concepts in Professional Ethics
2. The Importance of Ethical Conduct in Business
3. Code of Ethics

--- Chunk 2 ---
2. The Importance of Ethical Conduct in Business
3. Code of Ethics
Ms. Kainat Anjum(kainat@biit.edu.pk) Professional Ethics Whatsapp # 03215023609
(SSH 307)
Page number 2
Ethics:
“A branch of philosophy concerned with that which is deemed acceptable in human
behaviour, with what is good or bad, right or wrong in human co

In [30]:
def generating_embeddings(my_model, my_pc_Api=None):
    """Build the Gemini embeddings client for the given model.

    Args:
        my_model: Embedding model name, e.g. ``"models/embedding-001"``.
        my_pc_Api: Unused. Accepted only so existing calls that pass the
            Pinecone key keep working.

    Returns:
        A ``GoogleGenerativeAIEmbeddings`` instance for ``my_model``.
    """
    # The Pinecone key is deliberately NOT forwarded: `pinecone_api_key` is not an
    # option of the Google embeddings client, and a Pinecone secret has no business
    # being handed to it. Authentication is expected to come from the
    # GOOGLE_API_KEY environment variable set in the configuration cell
    # (NOTE(review): confirm against the installed langchain-google-genai version).
    return GoogleGenerativeAIEmbeddings(model=my_model)


# Notebook-level embeddings client, reused by the indexing and upsert cells.
embeddings = generating_embeddings(
    my_model=gemini_model_for_embaddings,
    my_pc_Api=os.environ["PINECONE_API_KEY"],
)

In [31]:
# Creating index


def creating_Index(my_pc, my_IN, cloud="aws", region="us-east-1"):
    """Ensure a serverless Pinecone index exists and print its current stats.

    Args:
        my_pc: Pinecone client; must provide ``list_indexes``, ``create_index``
            and ``Index``.
        my_IN: Name of the index to look up or create.
        cloud: Cloud provider for a newly created index (default ``"aws"``).
        region: Region for a newly created index (default ``"us-east-1"``).

    Returns:
        The same client object that was passed in as ``my_pc``.
    """
    if my_IN not in my_pc.list_indexes().names():
        # The notebook-level `embeddings` client may not expose a `dimension`
        # attribute; if it does not, measure the vector size by embedding a
        # short probe string.
        dimension = getattr(embeddings, "dimension", None)
        if dimension is None:
            dimension = len(embeddings.embed_query("dimension probe"))

        my_pc.create_index(
            name=my_IN,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    # Show the index state before this run upserts anything
    # (it is not necessarily empty if the index already existed).
    print("Index before upsert:")
    print(my_pc.Index(my_IN).describe_index_stats())
    print("\n")
    return my_pc


# Notebook-level Pinecone client; the index is created on first use if missing.
pc = creating_Index(
    my_pc=Pinecone(api_key=os.environ["PINECONE_API_KEY"]),
    my_IN=Index_name,
)

Index before upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'myproaivectors': {'vector_count': 18}},
 'total_vector_count': 18,
 'vector_type': 'dense'}




In [32]:
# Embed and upsert each chunk as a distinct record in the given namespace ("myproaivectors" in the call below)


def distinct_record(my_ns, my_IN):
    """Embed every chunk and upsert it into a Pinecone namespace, one record per chunk.

    Reads the notebook-level ``chunks`` (texts to store), ``embeddings``
    (embedding client) and ``pc`` (Pinecone client).

    Args:
        my_ns: Namespace the records are written to.
        my_IN: Name of the target index; also the index whose stats are printed.

    Returns:
        The ``PineconeVectorStore`` returned by ``from_documents``.
    """
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Content-derived IDs: identical text always maps to the same record ID.
    ids = [hashlib.md5(doc.page_content.encode()).hexdigest() for doc in documents]

    docsearch = PineconeVectorStore.from_documents(
        documents=documents,
        index_name=my_IN,
        embedding=embeddings,
        namespace=my_ns,
        ids=ids,
    )

    # Pause before reading stats — presumably to give the index time to reflect
    # the upsert.
    time.sleep(5)

    # Report the index that was actually written to (`my_IN`), not the global
    # `Index_name`, so the stats stay correct when a different index is passed.
    print("Index after upsert:")
    print(pc.Index(my_IN).describe_index_stats())
    print("\n")
    time.sleep(2)
    return docsearch


# Notebook-level vector store handle, used by the retrieval cell further down.
docsearch = distinct_record(my_ns="myproaivectors", my_IN=Index_name)

Index after upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'myproaivectors': {'vector_count': 18}},
 'total_vector_count': 18,
 'vector_type': 'dense'}




In [33]:
def checking_records(my_IN, my_ns):
    """Print a stored record for each batch of IDs listed in a namespace.

    Uses the notebook-level ``pc`` client. Each item yielded by
    ``index.list(namespace=...)`` is indexed with ``[0]`` — presumably it is a
    batch (page) of record IDs — and only that first ID is queried.

    Args:
        my_IN: Name of the index to inspect.
        my_ns: Namespace to list and query.
    """
    pinecone_index = pc.Index(my_IN)

    # Same lookup options for every query: nearest single match, with the
    # stored vector values and metadata included.
    lookup_options = {
        "namespace": my_ns,
        "top_k": 1,
        "include_values": True,
        "include_metadata": True,
    }

    for id_batch in pinecone_index.list(namespace=my_ns):
        first_id = id_batch[0]
        match = pinecone_index.query(id=first_id, **lookup_options)
        print(match)
        print("\n")


# Spot-check what is stored in the namespace that was just upserted.
checking_records(my_IN=Index_name, my_ns="myproaivectors")

{'matches': [{'id': '13646d9595edd371e63289210e28c6c9',
              'metadata': {'text': 'e.g. Medical and Dental Council, Police '
                                   'Service Code of Conduct, Estate Agents '
                                   'Code\n'
                                   'Ms. Kainat Anjum(kainat@biit.edu.pk) '
                                   'Professional Ethics Whatsapp # '
                                   '03215023609\n'
                                   '(SSH 307)\n'
                                   'Page number 7\n'
                                   'of Conduct. Codes may not be exhaustive '
                                   'and may not include all the rules and\n'
                                   'regulations that apply to every situation. '
                                   'The contents therefore have to be viewed '
                                   'within\n'
                                   'the framework of company policies, '
              

In [34]:
def save_to_simple_pdf(question, answer, filename="query_result.pdf"):
    """Write ``answer`` and a generation timestamp to an A4 PDF using reportlab.

    Args:
        question: Currently not rendered; accepted so existing callers keep
            working.
        answer: Plain text to render. Newlines become line breaks; characters
            that are special in reportlab paragraph markup (``&``, ``<``,
            ``>``) are escaped so they appear literally.
        filename: Output path handed to ``SimpleDocTemplate``.
    """
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(filename, pagesize=A4)
    styles = getSampleStyleSheet()

    # Paragraph parses its text as XML-like markup, so raw "<" or "&" in the
    # answer (common in LLM output) would break or garble the document.
    # Escape first, then insert the <br/> tags that must stay as markup.
    formatted_answer = escape(answer).replace("\n", "<br/>")

    story = [
        Paragraph(formatted_answer, styles["Normal"]),
        Spacer(1, 20),
        # Timestamp footer
        Paragraph(
            f"<i>Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</i>",
            styles["Italic"],
        ),
    ]

    doc.build(story)
    print(f" PDF saved: {filename}")

In [35]:
def relevant_knowledge(my_g_api_k, my_ns, my_k, my_g_qmodel, task_instruction):
    """Answer a task from the vector store, refine it with Gemini, and save it as a PDF.

    Steps:
        1. Run a retrieval chain over the notebook-level ``docsearch`` store
           to get a first-pass answer for ``task_instruction``.
        2. Send that answer, together with the task and a restrictive system
           prompt, to the chat model.
        3. Print the final answer, prompt for a file name, and write the
           answer to ``../my_pdf_files/<name>.pdf`` via ``save_to_simple_pdf``.

    Args:
        my_g_api_k: Google API key passed to ``ChatGoogleGenerativeAI``.
        my_ns: Pinecone namespace to search.
        my_k: Number of documents the retriever should return (``k``).
        my_g_qmodel: Chat model name.
        task_instruction: The user's task or question.

    Returns:
        tuple: ``(final_result, my_file)`` — the final chat-model response and
        the file name entered at the prompt (without extension).
    """
    # Retrieval QA prompt from LangChain Hub
    retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

    # Retriever over the Pinecone vector store
    retriever = docsearch.as_retriever(search_kwargs={"namespace": my_ns, "k": my_k})

    # One client serves both steps: the original built two with identical settings.
    llm = ChatGoogleGenerativeAI(
        model=my_g_qmodel, temperature=0.0, google_api_key=my_g_api_k
    )

    combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
    retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

    # Step 1: first-pass answer grounded in the retrieved chunks
    retrieval_result = retrieval_chain.invoke({"input": task_instruction})
    context = retrieval_result["answer"]

    # Step 2: system + user prompt. The user prompt embeds `context`; previously
    # it interpolated the repr of the whole result dict while `context` went unused.
    messages = [
        (
            "system",
            "You are a helpful assistant that helps professors with tasks like making quizzes, assignments, or summaries. **Strictly use only the retrieved content and do not extract contact information related to Ms. Kainat Anjum**",
        ),
        (
            "user",
            f"Based only on the following content:\n\n{context}\n\nTask: {task_instruction}",
        ),
    ]

    # Step 3: final LLM call
    final_result = llm.invoke(messages)
    print(final_result.content)

    # Save the answer to a PDF named by the user
    my_file = input("Enter the name of your file: ")
    save_to_simple_pdf(
        task_instruction, final_result.content, f"../my_pdf_files/{my_file}.pdf"
    )

    return final_result, my_file


# Retrieve with k=18 (the namespace held 18 vectors in the recorded run) and
# keep both the model response and the chosen PDF name for later cells.
final_result, my_file = relevant_knowledge(
    my_g_api_k=gemini_api_key,
    my_ns="myproaivectors",
    my_k=18,
    my_g_qmodel=gemini_model_for_query,
    task_instruction="extract the text from page number 2 of given pdf.",
)



2. The Importance of Ethical Conduct in Business
3. Code of Ethics

(SSH 307)

**Ethics:**
“A branch of philosophy concerned with that which is deemed acceptable in human behaviour, with what is good or bad, right or wrong in human conduct in pursuit of goals and aims.”

Ethics explores the nature of rights, of moral responsibilities, and of how to go about addressing an ethical problem.

Engineering ethics is the field of applied ethics which examines and sets standards for engineers' obligations to the public, their clients, employers and the profession and is appropriate.
 PDF saved: ../my_pdf_files/page2_context.pdf


In [36]:
# Text-to-speech: synthesize the final answer with Kokoro, preview each segment,
# then save the whole thing as a single WAV file.
pipeline = KPipeline(lang_code="a")  # Use appropriate language code
generator = pipeline(final_result.content, voice="am_onyx", speed=1)

audio_data = []

# Each generated item is unpacked as (gs, ps, audio); in the recorded output
# `gs` is the text segment and `ps` its phoneme string.
for i, (gs, ps, audio) in enumerate(generator):
    print(f"Chunk {i}: gs={gs}, ps={ps}")
    # Only the first segment autoplays; the rest can be played manually.
    display(Audio(data=audio, rate=24000, autoplay=i == 0))
    audio_data.append(audio)

# Combine, normalize and save the audio
if audio_data:
    # Concatenate all audio segments
    combined_audio = np.concatenate(audio_data)

    # Normalize the peak to 0.9 of full scale to leave some headroom
    max_val = np.max(np.abs(combined_audio))
    if max_val > 0:
        combined_audio = combined_audio / max_val * 0.9

    # scipy's wavfile.write stores the array's dtype as-is (a float array gives a
    # floating-point WAV), so convert explicitly to 16-bit PCM for broad
    # player compatibility.
    pcm_audio = (np.clip(combined_audio, -1.0, 1.0) * 32767).astype(np.int16)

    Audio_file_name = input("Enter the file name")
    my_audio_file_path = rf"../my_audios/{Audio_file_name}"
    wavfile.write(f"{my_audio_file_path}.wav", 24000, pcm_audio)
    print("Audio generated successfully:")

    # Display the final combined audio
    display(Audio(f"{my_audio_file_path}.wav"))
else:
    print("No audio data generated. Check your pipeline/text input.")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Chunk 0: gs=2. The Importance of Ethical Conduct in Business, ps=tˈu ði ɪmpˈɔɹtᵊns ʌv ˈɛθəkəl kˈɑndˌʌkt ɪn bˈɪznəs


Chunk 1: gs=3. Code of Ethics, ps=θɹˈi kˈOd ʌv ˈɛθɪks


Chunk 2: gs=(SSH 307), ps=(ˌɛsˌɛsˈAʧ θɹˈi hˈʌndɹəd sˈɛvən)


Chunk 3: gs=**Ethics:**, ps=ˈɛθɪks:ˈæstəɹɹˌɪsk


Chunk 4: gs=“A branch of philosophy concerned with that which is deemed acceptable in human behaviour, with what is good or bad, right or wrong in human conduct in pursuit of goals and aims.”, ps=“ɐ bɹˈænʧ ʌv fəlˈɑsəfi kənsˈɜɹnd wɪð ðˈæt wˌɪʧ ɪz dˈimd əksˈɛptəbᵊl ɪn hjˈumən bəhˈAvjəɹ, wɪð wˌʌt ɪz ɡˈʊd ɔɹ bˈæd, ɹˈIt ɔɹ ɹˈɔŋ ɪn hjˈumən kˈɑndˌʌkt ɪn pəɹsˈut ʌv ɡˈOlz ænd ˈAmz.”


Chunk 5: gs=Ethics explores the nature of rights, of moral responsibilities, and of how to go about addressing an ethical problem., ps=ˈɛθɪks ɪksplˈɔɹz ðə nˈAʧəɹ ʌv ɹˈIts, ʌv mˈɔɹᵊl ɹəspˌɑnsəbˈɪləTiz, ænd ʌv hˌW tə ɡˌO əbˈWt ədɹˈɛsɪŋ ɐn ˈɛθəkəl pɹˈɑbləm.


Chunk 6: gs=Engineering ethics is the field of applied ethics which examines and sets standards for engineers' obligations to the public, their clients, employers and the profession and is appropriate., ps=ˌɛnʤənˈɪɹɪŋ ˈɛθɪks ɪz ðə fˈild ʌv əplˈId ˈɛθɪks wˌɪʧ ɪɡzˈæmənz ænd sˈɛts stˈændəɹdz fɔɹ ˌɛnʤənˈɪɹz ˌɑbləɡˈAʃənz tə ðə pˈʌblɪk, ðɛɹ klˈIənts, ɪmplˈYjəɹz ænd ðə pɹəfˈɛʃən ænd ɪz əpɹˈOpɹiət.


Audio generated successfully:


In [37]:
#Moviepy

#Extract headings

def extract_headings_from_text(text):
    """Return the text enclosed in each ``**...**`` pair, in order of appearance.

    Matching is non-greedy and does not cross line breaks, so each heading
    must open and close on the same line.
    """
    bold_span = re.compile(r"\*\*(.*?)\*\*")
    return [match.group(1) for match in bold_span.finditer(text)]
# Example usage after retrieval

def headings():
    """Save the ``**...**`` headings of the latest answer to a PDF, one per line.

    Reads the notebook-level ``final_result``, prompts for a base file name,
    and writes ``../my_pdf_files/<name>.pdf`` via ``save_to_simple_pdf``.
    """
    # Turn the list of headings into a newline-separated string.
    heading_lines = "\n".join(extract_headings_from_text(final_result.content))

    pdf_name = input("Enter the name of your file: ")
    save_to_simple_pdf("", heading_lines, f"../my_pdf_files/{pdf_name}.pdf")

# Export the headings of the current `final_result`; prompts for the output file name.
headings()





 PDF saved: ../my_pdf_files/headings.pdf


In [None]:
# Add headings to frame


def create_headings_frame(headings_list, output_file="headings_frame.mp4"):
    """Render the headings as a single text clip and write it to a video file.

    The headings are joined one per line into a white, centered 1280x720
    caption clip with its duration set to 5, then written at 24 fps.

    NOTE(review): `TextClip` and `CompositeVideoClip` are not imported in the
    visible cells — presumably they come from moviepy; confirm the import
    before running this cell.

    Args:
        headings_list: Iterable of heading strings.
        output_file: Path of the video file to write.
    """
    # One heading per line
    caption = "\n".join(headings_list)

    clip_options = {
        "fontsize": 50,
        "color": "white",
        "size": (1280, 720),  # resolution
        "method": "caption",  # line wrapping
        "align": "center",
    }

    # Build the text clip and give it a duration so it can be rendered as video.
    heading_clip = TextClip(caption, **clip_options).set_duration(5)

    # No explicit background is supplied; the original author expected it to be black.
    video = CompositeVideoClip([heading_clip])

    # Save the video file
    video.write_videofile(output_file, fps=24)


# Example usage:
# Example usage with a hand-written list of headings.
headings_list = ["Ethics", "Code of Conduct", "Moral Responsibilities"]
create_headings_frame(headings_list=headings_list, output_file="headings_frame.mp4")