In [352]:
from google.cloud import documentai
from google.oauth2 import service_account
from google.auth import load_credentials_from_file
import os
import re
import requests
from PyPDF2 import PdfReader, PdfWriter
from bs4 import BeautifulSoup
import pdfplumber  # for improved OCR if needed
import timeit
from collections import defaultdict
from typing import Dict, List
import tiktoken  # OpenAI's tokenization library
import json
import openai
from googlesearch import search
import unicodedata
import urllib.parse

<h1>Chunk data from lecture presentations</h1>

In [11]:

def split_pdf(input_pdf_path,file_name, max_pages=1):
    """
    Split a PDF into smaller chunks of max_pages.
    """
    reader = PdfReader(input_pdf_path)
    chunks = []
    for i in range(0, len(reader.pages), max_pages):
        writer = PdfWriter()
        for j in range(i, min(i + max_pages, len(reader.pages))):
            writer.add_page(reader.pages[j])
        chunk_path = f"./chunks/chunk_{i // max_pages + 1}_{file_name.split('.')[0]}.pdf"
        with open(chunk_path, "wb") as f:
            writer.write(f)
        chunks.append(chunk_path)
    return chunks

In [65]:
credentials = service_account.Credentials.from_service_account_file('coms-6998-applied-llm-class-4e98f4f7a361.json')
client = documentai.DocumentProcessorServiceClient(credentials=credentials)


In [109]:
all_files_to_extract_data_from = os.listdir('./lecture_pdfs')

In [110]:
all_chunks = []
for file_name in all_files_to_extract_data_from:
    file_directory = "./lecture_pdfs"
    pdf_path = os.path.join(file_directory, file_name)
    chunks = split_pdf(pdf_path,file_name)
    all_chunks = all_chunks + chunks

<h1>Extract text and links from chunks from lectures</h1>

In [191]:
def extract_text_links(text):
    links = []
    text = text.replace('-\n',"")
    page_links = re.findall(r'(https?://\S+)', text)
    links.extend(page_links)
    page_links = re.findall(r'(http?://\S+)', text)
    links.extend(page_links)
    return links

In [224]:
def get_document_extraction(file_name,project_id = "coms-6998-applied-llm-class",location = "us",processor_id = "398fd74279aa6748"):
    with open(file_name, "rb") as f:
        content = f.read()
    raw_document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    # Make the request
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    response = client.process_document(request=request)
    document = response.document
    text = document.text
    links = extract_text_links(text)
    return text, links

In [136]:
def fetch_and_clean_text(url):
    """
    Fetches and cleans text from the given URL.
    :param url: The URL to fetch text from.
    :return: Cleaned text or an error message.
    """
    try:
        # Make an HTTP GET request
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract the main text content
        # We can focus on specific tags (e.g., <p>, <div>) or use the whole text
        text_elements = soup.find_all(["p", "div"])
        text = " ".join(element.get_text() for element in text_elements)
        
        # Clean the text
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = text.strip()  # Remove leading/trailing whitespace
        
        # Handle empty text scenario
        if not text:
            return f"Error: No extractable text found at {url}"
        return text
    
    except requests.exceptions.RequestException as e:
        # Handle HTTP and connection errors
        return f"Error: Unable to fetch content from {url}. Exception: {e}"
    except Exception as e:
        # Handle other unexpected errors
        return f"Error: Unexpected error while processing {url}. Exception: {e}"

In [137]:
def process_links(links):
    """
    Processes a list of links, extracting and cleaning text content.
    :param links: List of URLs.
    :return: Dictionary with URLs as keys and cleaned text (or error messages) as values.
    """
    results = {}
    for url in links:
        print(f"Processing: {url}")
        text = fetch_and_clean_text(url)
        results[url] = text
    return results

In [208]:
all_texts_with_links = [value['text'] for key,value in all_data.items() if len(value['links'])>0]

In [130]:
all_processed_chunks = list(all_data.keys())
start = timeit.default_timer()
for i,chunk in enumerate(all_chunks):
    if chunk not in all_processed_chunks:
        text, links = get_document_extraction(chunk)
        all_data[chunk] = {'text':text,'links':links}
        if i%5 ==0:
            end = timeit.default_timer()
            print(i, end-start, chunk)
            start = timeit.default_timer()

45 6.449538166999446 ./chunks/chunk_46_Lecture-12-Columbia.pdf
50 7.764267583000219 ./chunks/chunk_51_Lecture-12-Columbia.pdf
55 7.30011899999954 ./chunks/chunk_56_Lecture-12-Columbia.pdf
60 8.476970249999795 ./chunks/chunk_61_Lecture-12-Columbia.pdf
65 8.502008792000197 ./chunks/chunk_66_Lecture-12-Columbia.pdf
70 8.57982845800052 ./chunks/chunk_71_Lecture-12-Columbia.pdf
75 7.840684208000312 ./chunks/chunk_76_Lecture-12-Columbia.pdf
80 7.657551166999838 ./chunks/chunk_81_Lecture-12-Columbia.pdf
85 7.173694291999709 ./chunks/chunk_86_Lecture-12-Columbia.pdf
90 8.033632041000601 ./chunks/chunk_91_Lecture-12-Columbia.pdf
95 8.131241334000151 ./chunks/chunk_96_Lecture-12-Columbia.pdf
100 8.513085292000142 ./chunks/chunk_101_Lecture-12-Columbia.pdf
105 7.703719500000261 ./chunks/chunk_106_Lecture-12-Columbia.pdf
110 9.373608582999623 ./chunks/chunk_111_Lecture-12-Columbia.pdf
115 8.92286362499999 ./chunks/chunk_116_Lecture-12-Columbia.pdf
120 7.5701725419994546 ./chunks/chunk_121_Lecture-

670 6.929717415999221 ./chunks/chunk_16_Lecture-6-columbia-Fall2024.pdf
675 7.669259667000006 ./chunks/chunk_21_Lecture-6-columbia-Fall2024.pdf
680 7.963793916999748 ./chunks/chunk_26_Lecture-6-columbia-Fall2024.pdf
685 7.476365084000463 ./chunks/chunk_4_Lecture-10-Columbia.pdf
690 8.1687207089999 ./chunks/chunk_9_Lecture-10-Columbia.pdf
695 8.90650420899965 ./chunks/chunk_14_Lecture-10-Columbia.pdf
700 9.140342749999945 ./chunks/chunk_19_Lecture-10-Columbia.pdf
705 7.98733508299938 ./chunks/chunk_24_Lecture-10-Columbia.pdf
710 8.115165457999865 ./chunks/chunk_29_Lecture-10-Columbia.pdf
715 8.156138875000579 ./chunks/chunk_34_Lecture-10-Columbia.pdf
720 9.232960582999112 ./chunks/chunk_39_Lecture-10-Columbia.pdf
725 8.515497958000196 ./chunks/chunk_44_Lecture-10-Columbia.pdf
730 7.679936417000135 ./chunks/chunk_49_Lecture-10-Columbia.pdf
735 7.841302666000047 ./chunks/chunk_54_Lecture-10-Columbia.pdf
740 8.02238770799977 ./chunks/chunk_59_Lecture-10-Columbia.pdf
745 7.95550374999948 ./

In [214]:
all_data_cleaned = {}
for key,value in all_data.items():
    if len(value['links'])>0:
        all_data_cleaned[key] = {'text':value['text'],'links':extract_text_links(value['text'])}
    else:
        all_data_cleaned[key] = value

In [245]:
# # Specify the file name of the JSON file
# file_name = "data_from_presentations.json"

# # Load the JSON file
# with open(file_name, "r") as json_file:
#     data = json.load(json_file)

In [215]:
all_processed_chunks = list(all_data_cleaned.keys())

<h1>Aggregating all links from class presentations and HW</h1>

In [216]:
all_links = []
for extracted_data in list(list(all_data_cleaned.values())):
    all_links = all_links + extracted_data['links']

In [218]:
cleaned_extracted_data = {key:value for key, value in extracted_data.items() if len(value)>=1000}

In [221]:
all_files_to_extract_data_from = os.listdir('./HWs')

In [222]:
all_hw_chunks = []
for file_name in all_files_to_extract_data_from:
    file_directory = "./HWs"
    pdf_path = os.path.join(file_directory, file_name)
    chunks = split_pdf(pdf_path,file_name, max_pages = 15)
    all_hw_chunks = all_hw_chunks + chunks

In [225]:
# all_hw_data = {}

In [226]:
all_processed_chunks = list(all_hw_data.keys())
start = timeit.default_timer()
for i,chunk in enumerate(all_hw_chunks):
    if chunk not in all_processed_chunks:
        text, links = get_document_extraction(chunk)
        all_hw_data[chunk] = {'text':text,'links':links}
        if i%5 ==0:
            end = timeit.default_timer()
            print(i, end-start, chunk)
            start = timeit.default_timer()

0 3.847637875000146 ./chunks/chunk_1_HW4-PDF.pdf


In [227]:
all_hw_links = []
for extracted_data in list(all_hw_data.values()):
    all_hw_links = all_hw_links + extracted_data['links']

<h1>Finding new relevant links, by mining topics from the syllabus and finding relevant blog posts links</h1>

In [339]:
text,links = get_document_extraction('./Syllabus/Fall 2024 Syllabus-columbia-110524.pdf')

In [344]:
try:
    start = timeit.default_timer()
    all_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"I am currently taking a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
    {"role": "user", "content": f"I want you to extract all topics I will learn from this class: {text}."},
    {"role": "user", "content": f"Please make sure to only extract topics related to Machine Learning, Large Language Models, Computer Science, and Software Engineering topics"},
    {"role": "user", "content": "Please format the output as a list topics. Here is an example: ['model parallelism','Devops principles in machine learning']"},
    {"role": "user", "content": f"Please return nothing else other than a string version of the list"}
    ]
    response = openai.ChatCompletion.create(
    model="gpt-4o",
    max_tokens = 8000,
    messages=all_messages
    )
    course_topics = response['choices'][0]['message']['content']
    course_topics_cleaned = clean_q_a_string_json(course_topics)

except Exception as e:
    print(e)

In [347]:
def parse_google_search_results_html(response):
    if response.status_code == 200:
        data = response.json()  # Parse the JSON response
        html_content = data.get("body", "")  # Get the raw HTML from the "body" key
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, "lxml")
        # Dictionary to hold the results
        results_dict = {}
        # Loop through search result elements - adjust as necessary
        for result in soup.find_all("div", class_="g"):  # "g" is the common class for Google search results
            link_tag = result.find("a", href=True)
            title_tag = result.find("h3")
            if link_tag and title_tag:
                url = link_tag["href"]
                title = title_tag.get_text()
                results_dict[url] = title
        return results_dict
    else:
        print(f"Error: Received status code {response.status_code}")
        print(response.text)
        return {}

In [374]:
def get_google_search_results(query,api_token = "0fbec085971dc1ca50b111c6433d49bd989a57b81344bfb508754d9687d19efa"):
    search_url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
    url = "https://api.brightdata.com/request"
    payload = {
        "zone": "serp_api3",  # Replace with your actual zone if different
        "url": search_url,
        "format": "json"
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_token}"
    }

    # Make the request
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    all_search_results = parse_google_search_results_html(response)
    return all_search_results

In [375]:
def sanitize_query_string(query):
    # Normalize the query to decompose special characters
    normalized = unicodedata.normalize("NFD", query)
    # Encode to ASCII, ignoring any non-ASCII characters
    ascii_encoded = normalized.encode("ascii", "ignore").decode("utf-8")
    # URL-encode the sanitized query string
    return urllib.parse.quote_plus(ascii_encoded)

In [380]:
query = f"Blog post explaining {course_topics_cleaned[1]} in Deep Learning, Machine Learning, Computer Science, or Software Engineering "
linkedin_url = None
#     print(query)
sanitized_query = sanitize_query_string(query)
results = get_google_search_results(sanitized_query)

In [384]:
all_results = {}
start = timeit.default_timer()
for i,topic in enumerate(course_topics_cleaned):
    query = f"Blog post explaining {topic} in Deep Learning, Machine Learning, Computer Science, or Software Engineering "
    linkedin_url = None
    #     print(query)
    sanitized_query = sanitize_query_string(query)
    results = get_google_search_results(sanitized_query)
    num_articles= 0
    for key,value in results.items():
        if num_articles<=4:
            all_results[key] = value
            num_articles+=1
        else:
            break
    end = timeit.default_timer()
    if i%5 ==0:
        print(i,end-start,len(list(all_results.keys())))

0 1.8749121250002645 5
5 17.543654875000357 29
10 31.217416375002358 54
15 63.02728124999703 79
20 73.60367716699693 100
25 88.57625995900162 123
30 99.52590387500095 148
35 110.69694012500258 171
40 139.45502395900257 195
45 155.12103224999737 215
50 164.55748754199885 239
55 179.83256741699734 262
60 190.92883874999825 283
65 202.32500008399802 305
70 212.8582381669985 329
75 225.31525770900043 354
80 237.91018229199835 373
85 250.4848908749991 391
90 260.39152995900076 415
95 274.1067696250029 437
100 283.8554647089986 459
105 298.89148462499725 480


In [390]:
all_google_blog_links = list(all_results.keys())

<h1>Extracting all text from links</h1>

In [217]:
extracted_data = process_links(all_links)

Processing: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Processing: https://arxiv.org/abs/2205.14135
Processing: https://ai.stanford.edu/blog/longer-sequencesnext-leap-ai/
Processing: https://github.com/vllm-project/vllm
Processing: https://vllm.ai
Processing: https://arxiv.org/abs/2309.06180
Processing: https://discord.gg/jz7wjKhh6g
Processing: https://docs.nvidia.com/datacenter/tesla/mig-userguide/index.html
Processing: https://huggingface.co/blog/trl-peft
Processing: https://arxiv.org/pdf/2202.05924
Processing: https://splab.sdu.edu.cn/G
Processing: https://research.google/blog/pathways-languagemodel-palm-scaling-to-540-billion-parameters-for-breakthrough-performance/
Processing: https://arxiv.org/pdf/2202.05924
Processing: https://www.youtube.com/watch?v=EnJ7qX9fkcU
Processing: https://jvns.ca/blog/2016/10/10/what-even-is-a-container/
Processing: https://kubernetes.io/
Processing: https://cloud.google.com/kube

In [228]:
extracted_hw_data = process_links(all_hw_links)

Processing: https://dustinstansbury.github.io/theclevermachine/bias-variance-tradeoff.
Processing: https://arxiv.org/pdf/1611.03530.pdf.
Processing: https://arxiv.org/abs/1506.01186.
Processing: https://arxiv.org/pdf/1611.03530.pdf
Processing: https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutionalneural-networks.pdf
Processing: https://arxiv.org/pdf/1409.1556.pdf
Processing: https://arxiv.org/pdf/1409.4842.pdf
Processing: https://github.com/qfgaohao/pytorch-ssd
Processing: https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html
Processing: https://github.com/onnx/tutorials/blob/master/tutorials/OnnxRuntimeServerSSDModel.ipynb
Processing: https://storage.googleapis.com/openimages/web/index.html
Processing: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/
Processing: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
Processing: https://cs231n.github.io/transfer-learning/
Processing: http://host.robots.ox.ac.uk/pascal/VO

In [392]:
extracted_google_blog_data = process_links(all_google_blog_links)

Processing: https://arize.com/blog/understanding-bias-in-ml-models/
Processing: https://medium.com/@sruthy.sn91/addressing-bias-in-machine-learning-techniques-and-ethical-considerations-fe9d9532d657
Processing: https://www.scalablepath.com/machine-learning/bias-machine-learning
Processing: https://www.wovenware.com/blog/2020/07/3-bias-machine-learning/
Processing: https://www.encora.com/insights/a-short-discussion-on-bias-in-machine-learning
Processing: https://www.simplilearn.com/tutorials/machine-learning-tutorial/bias-and-variance
Processing: https://www.bmc.com/blogs/bias-variance-machine-learning/
Processing: https://data-science-blog.com/blog/2020/11/02/bias-and-variance-in-machine-learning/
Processing: http://varianceexplained.org/r/ds-ml-ai/
Processing: https://towardsai.net/p/l/mastering-the-bias-variance-dilemma-a-guide-for-machine-learning-practitioners
Processing: http://research.google/blog/a-new-lens-on-understanding-generalization-in-deep-learning/
Processing: https://do

Processing: https://cyborgcodes.medium.com/what-is-early-stopping-in-deep-learning-eeb1e710a3cf
Processing: https://www.machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
Processing: https://towardsdatascience.com/early-stopping-why-did-your-machine-learning-model-stop-training-c6b1d64e009e
Processing: https://insights.daffodilsw.com/blog/what-is-data-augmentation-in-deep-learning
Processing: https://aws.amazon.com/what-is/data-augmentation/
Processing: https://www.f22labs.com/blogs/what-is-data-augmentation/
Processing: https://medium.com/@saiwadotai/the-essential-guide-to-data-augmentation-in-deep-learning-f66e0907cdc8
Processing: https://gretel.ai/technical-glossary/what-is-data-augmentation
Processing: https://medium.com/udemy-engineering/delivering-ai-ml-products-efficiently-the-single-node-machine-learning-workflow-bad1389410af
Processing: https://www.enthought.com/blog/a-beginners-guide-to-deep-learning/
Processing: https

Processing: https://cloud.google.com/discover/deep-learning-vs-machine-learning
Processing: https://medium.com/@markpalatucci/deep-learning-in-the-cloud-vs-on-premises-machines-d9707ddfec22
Processing: https://aws.amazon.com/blogs/machine-learning/
Processing: https://aws.amazon.com/what-is/deep-learning/
Processing: https://aws.amazon.com/blogs/architecture/lets-architect-learn-about-machine-learning-on-aws/
Processing: https://www.whizlabs.com/blog/aws-deep-learning/
Processing: https://k21academy.com/amazon-web-services/aws-ml/deep-learning/
Processing: https://techcommunity.microsoft.com/tag/software%20engineering?nodeId=board%3AEducatorDeveloperBlog
Processing: https://opensource.microsoft.com/blog/topic/deep-learning/
Processing: https://learn.microsoft.com/en-us/community/content/get-started-machine-learning
Processing: https://blog.acolyer.org/2019/07/08/software-engineering-for-machine-learning/
Processing: https://www.microsoft.com/en-us/research/project/deep-program-understa

Processing: https://vente.medium.com/mlperf-vs-my-neural-net-training-time-nightmare-1a0a5ee624b6?source=post_internal_links---------4----------------------------
Processing: https://www.analyticsvidhya.com/blog/2019/11/comprehensive-guide-attention-mechanism-deep-learning/
Processing: https://www.kdnuggets.com/2021/01/attention-mechanism-deep-learning-explained.html
Processing: https://medium.com/@prakhargannu/attention-mechanism-in-deep-learning-simplified-d6a5830a079d
Processing: https://www.unthinkable.co/blog/exploring-the-concept-of-attention-mechanism-in-deep-learning/
Processing: https://insights.daffodilsw.com/blog/what-is-the-attention-mechanism-in-deep-learning
Processing: https://blogs.nvidia.com/blog/what-is-a-transformer-model/
Processing: https://www.datacamp.com/tutorial/how-transformers-work
Processing: https://www.turing.com/kb/brief-introduction-to-transformers-and-their-power
Processing: https://blog.nelhage.com/post/transformers-for-software-engineers/
Processing: 

Processing: https://medium.com/pinterest-engineering/understanding-pins-through-keyword-extraction-40cf94214c18
Processing: https://www.seoclarity.net/blog/machine-learning-and-seo-16591/
Processing: https://blog.google/products/search/search-language-understanding-bert/
Processing: https://softwaredoug.com/blog/2024/06/25/what-ai-engineers-need-to-know-search
Processing: https://www.quora.com/What-is-a-great-blog-for-machine-learning
Processing: https://encord.com/blog/embeddings-machine-learning/
Processing: https://medium.com/@alok.g.v/understanding-embedding-machine-learning-6b0712242bef
Processing: https://developers.google.com/machine-learning/crash-course/embeddings
Processing: https://aws.amazon.com/what-is/embeddings-in-machine-learning/
Processing: https://www.reddit.com/r/learnmachinelearning/comments/tfpl7c/a_deep_dive_into_word_embeddings_nlp/
Processing: https://medium.com/@aikho/deep-learning-in-information-retrieval-part-ii-dense-retrieval-1f9fecb47de9
Processing: https

Processing: https://www.linkedin.com/posts/optimumai_peft-newsletter-ai-activity-7201972096032272384-uGEa
Processing: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499
Processing: https://www.traceloop.com/blog/evaluating-model-performance-with-the-rouge-metric-a-comprehensive-guide
Processing: https://www.linkedin.com/advice/1/what-rouge-score-how-can-you-use-evaluate-nlp-euj9e
Processing: https://towardsdatascience.com/to-rouge-or-not-to-rouge-6a5f3552ea45
Processing: https://medium.com/free-code-camp/what-is-rouge-and-how-it-works-for-evaluation-of-summaries-e059fb8ac840
Processing: https://kantanmtblog.com/2015/07/14/understanding-bleu-for-machine-translation/
Processing: https://www.traceloop.com/blog/demystifying-the-bleu-metric
Processing: https://kvashee.medium.com/understanding-mt-quality-bleu-scores-9a19ed20526d
Processing: https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213
Processing: http

In [393]:
links_dict = defaultdict(lambda: "")
for key,value in cleaned_extracted_data.items():
    links_dict[key] = value
for key,value in cleaned_extracted_hw_data.items():
    links_dict[key] = value
for key,value in extracted_google_blog_data.items():
    links_dict[key] = value
cleaned_links_dict = {key:value for key, value in links_dict.items() if len(value)>=1000}

<h1>Chunking scraped data from links for VDB</h1>

In [394]:
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using a regex-based sentence tokenizer.
    """
    sentence_endings = re.compile(r'(?<=[.!?]) +')  # Match end of sentence followed by space
    return sentence_endings.split(text)

def chunk_text_by_sentence(text: str, max_tokens: int, tokenizer) -> List[str]:
    """
    Chunk text into pieces of max_tokens length, ensuring chunks do not cut sentences.
    
    Args:
        text (str): The input text to chunk.
        max_tokens (int): The maximum number of tokens per chunk.
        tokenizer: The tokenizer instance for tokenizing the text.
    
    Returns:
        List[str]: A list of text chunks.
    """
    sentences = split_into_sentences(text)
    chunks = []
    current_chunk = []

    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = tokenizer.encode(sentence)
        if current_tokens + len(sentence_tokens) <= max_tokens:
            current_chunk.append(sentence)
            current_tokens += len(sentence_tokens)
        else:
            # Complete the current chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            # Start a new chunk
            current_chunk = [sentence]
            current_tokens = len(sentence_tokens)

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def chunk_documents_by_sentence(documents: Dict[str, str], max_tokens: int = 500) -> Dict[str, List[str]]:
    """
    Chunk the text of multiple documents into smaller pieces, ensuring no sentence is cut.
    
    Args:
        documents (Dict[str, str]): A dictionary with document IDs as keys and text as values.
        max_tokens (int): The maximum number of tokens per chunk.
    
    Returns:
        Dict[str, List[str]]: A dictionary with document IDs as keys and lists of chunked text as values.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")  # Use the tokenizer compatible with OpenAI models
    chunked_documents = {}
    
    for doc_id, text in documents.items():
        chunked_documents[doc_id] = chunk_text_by_sentence(text, max_tokens, tokenizer)
    
    return chunked_documents

In [395]:
chunked_links_dict = chunk_documents_by_sentence(cleaned_links_dict)

In [396]:
file_name = "data_from_embedded_links.json"
with open(file_name, "w") as json_file:
    json.dump(chunked_links_dict, json_file, indent=4)

<h1>Pulling Q and A docs from Quizlet</h1>

In [409]:
all_q_and_a_docs_final = all_q_and_a_docs + all_q_and_a_docs_2

In [241]:
file_name = "all_q_and_a_docs_final.json"
with open(file_name, "w") as json_file:
    json.dump(all_q_and_a_docs_final, json_file, indent=4)

In [402]:
all_q_and_a_docs_final

[{'input': 'Large Language Model',
  'output': 'A type of foundation model applied specifically to text with the ability to understand and generate human language, enabling applications such as translation, summarization, and question-answering. Foundation Model: Pre-trained on large amounts of unlabeled and self-supervised data for very general use cases.'},
 {'input': 'Transformer',
  'output': 'A type of neural network architecture designed for handling sequences of data, particularly in natural language processing tasks. Transformers are known for their self-attention mechanism, which allows them to weigh the importance of different parts of an input sequence. They learn context and track relationships in sequential data like words in a sentence.'},
 {'input': 'Pretraining',
  'output': 'The initial phase of training a large language model, during which the model learns general language patterns and structures from a vast corpus of text data.'},
 {'input': 'Fine tuning',
  'output'

<h1>Building new Q and A set from scraped links text</h1>

In [397]:
import ast
import numpy as np

In [398]:
all_embedded_blogs = list(cleaned_links_dict.values())

In [399]:
def clean_q_a_string_json(text):
    clean_response = text.strip('```python\n').strip('```')
    try:
        quiz_data = ast.literal_eval(clean_response)
        return quiz_data
    except Exception as e:
        print("Error parsing the response:", e)
        return []

In [None]:
# all_q_a = []
for key,value in links_dict.items():
    try:
        start = timeit.default_timer()
        all_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"I am trying to create a dataset of quiz questions and answers I can use to fine-tune a model. I want you to create that set of up to 10 quiz questions and answers using the data I give you below"},
        {"role": "user", "content": f"Here is the data I want you to make quiz questions and answers from: {value}."},
        {"role": "user", "content": f"Please make sure to only make questions related to Machine Learning, Large Language Models, Computer Science, and Software Engineering topics"},
        {"role": "user", "content": "Please format the output as a list of python dictionaries where each dictionary represents one question answer pair. Here is an example of the structure [{'question':extracted question, 'answer':extracted answer}]"},
        {"role": "user", "content": f"Please return nothing else other than a string version of the python dictionary"}
        ]
        response = openai.ChatCompletion.create(
        model="gpt-4o",
        max_tokens = 8000,
        messages=all_messages
        )
        q_a_json_text = response['choices'][0]['message']['content']
        q_a_list = clean_q_a_string_json(q_a_json_text)
        all_q_a = all_q_a + q_a_list
        end = timeit.default_timer()
        print(end-start,key,q_a_list,len(all_q_a))
    except Exception as e:
        print(e)

In [410]:
# all_q_and_a_docs_final = []
for q_a in all_q_a:
    all_keys = q_a.keys()
    if ('question' in all_keys)&('answer' in all_keys):
        all_q_and_a_docs_final.append({'input':q_a['question'],'output':q_a['answer']})

In [414]:
all_q_and_a_docs_final_cleaned = np.array([q_a if "?" in q_a['input'] else {"input":f"What is {q_a['input']}?","output":q_a['output']} for q_a in all_q_and_a_docs_final ])

In [418]:
all_indices = np.arange(0,len(all_q_and_a_docs_final_cleaned))
train_indices = np.random.choice(all_indices, size = int(len(all_q_and_a_docs_final_cleaned)*.7))
test_indices = np.array([index for index in all_indices if index not in train_indices])

In [429]:
training_data = all_q_and_a_docs_final_cleaned[train_indices]
test_data = all_q_and_a_docs_final_cleaned[test_indices]
test_data_list = list(test_data)

In [430]:
file_name = "./Test Data/test_data.json"
with open(file_name, "w") as json_file:
    json.dump(test_data_list, json_file, indent=4)

In [424]:
output_file = "./Fine Tuning Data/training_data.jsonl"
with open(output_file, 'w') as outfile:
    for line in training_data:
        try:
            # Parse the JSON line
            # Create the required structure
            transformed = {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": line.get("input", "")}]
                    },
                    {
                        "role": "model",
                        "parts": [{"text": line.get("output", "")}]
                    }
                ]
            }
            # Write the transformed JSON object as a line
            outfile.write(json.dumps(transformed) + "\n")
        except Exception as e:
            print(f"Error processing line: {line.strip()}\nError: {e}")

In [425]:
output_file = "./Fine Tuning Data/test_data.jsonl"
with open(output_file, 'w') as outfile:
    for line in test_data:
        try:
            # Parse the JSON line
            # Create the required structure
            transformed = {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": line.get("input", "")}]
                    },
                    {
                        "role": "model",
                        "parts": [{"text": line.get("output", "")}]
                    }
                ]
            }
            # Write the transformed JSON object as a line
            outfile.write(json.dumps(transformed) + "\n")
        except Exception as e:
            print(f"Error processing line: {line.strip()}\nError: {e}")

In [431]:
openai_formatted_data = []
for entry in training_data:
    formatted_entry = {
        "messages": [
            {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
            {"role": "user", "content": entry["input"]},
            {"role": "assistant", "content": entry["output"]}
        ]
    }
    openai_formatted_data.append(formatted_entry)

In [432]:
with open("./Fine Tuning Data/openai_training_data.jsonl", "w") as f:
    for entry in openai_formatted_data:
        f.write(json.dumps(entry) + "\n")

In [433]:
response = openai.File.create(
    file=open("./Fine Tuning Data/openai_training_data.jsonl", "rb"),
    purpose='fine-tune'
)

In [435]:
file_id = response['id']

In [439]:
file_id

'file-TM899BA8CGhM4sLZNiaaQE'

In [438]:
from openai import OpenAI

ImportError: cannot import name 'OpenAI' from 'openai' (/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages/openai/__init__.py)

In [436]:
fine_tune_response = openai.FineTune.create(
    training_file=file_id,
    model="gpt-4o-mini-2024-07-18"  # Or another base model like 'curie', 'babbage', 'ada'
)

InvalidRequestError: Unknown request URL: POST /v1/fine-tunes. Please check the URL for typos, or see the docs at https://platform.openai.com/docs/api-reference/.

In [259]:
all_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"I am trying to create a dataset of quiz questions and answers I can use to fine-tune a model. I want you to create that set of up to 10 quiz questions and answers using the data I give you below"},
    {"role": "user", "content": f"Here is the data I want you to make quiz questions and answers from: {all_embedded_blogs[0]}."},
    {"role": "user", "content": "Please format the output as a list of python dictionaries where each dictionary represents one question answer pair. Here is an example of the structure [{'question':extracted question, 'answer':extracted answer}]"},
    {"role": "user", "content": f"Please return nothing else other than a string version of the python dictionary"}
]

In [260]:
response = openai.ChatCompletion.create(
    model="gpt-4o",
    max_tokens = 8000,
    messages=all_messages
)

In [263]:
q_a_json_text = response['choices'][0]['message']['content']

In [265]:
clean_response = q_a_json_text.strip('```python\n').strip('```')

# Step 2: Safely parse the string into a Python list
try:
    quiz_data = ast.literal_eval(clean_response)
    print(quiz_data)
except Exception as e:
    print("Error parsing the response:", e)

[{'question': 'What is the average token-to-word ratio for a 750-word English document in LLMs?', 'answer': '1.3:1, meaning a 750-word document is approximately 1000 tokens.'}, {'question': 'How much can be saved by appending "Be Concise" to a prompt when using an LLM?', 'answer': '40-90% of the tokens can be saved.'}, {'question': 'What is the typical cost ratio of using GPT-4 compared to GPT-3.5 Turbo?', 'answer': 'The cost ratio is approximately 50:1.'}, {'question': 'What is the typical cost ratio of generating text with GPT-3.5 Turbo versus looking it up with OpenAI embedding?', 'answer': 'The cost ratio is 5:1.'}, {'question': 'What is the cost ratio of OpenAI embedding services to self-hosted embedding?', 'answer': 'The cost ratio is approximately 10:1.'}, {'question': 'What is the cost ratio of serving a fine-tuned model versus a base model on OpenAI?', 'answer': 'The cost ratio is 6:1.'}, {'question': 'How much does it typically cost to train a 13 billion parameter model on 1.

In [43]:
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextGenerationModel
from vertexai.generative_models import GenerativeModel, SafetySetting, Part

In [49]:
generation_config = {
    "max_output_tokens": 1024,
    "temperature": 0.2,
    "top_p": 0.8,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]


In [19]:
input_text = "Explain the concept of gradient descent in simple terms."


In [45]:
credentials, project_id = load_credentials_from_file("./GSuite Text Extraction Creds/vertex_ai_key.json")
vertexai.init(credentials=credentials,project="90458358443", location="us-central1")


In [46]:
model = GenerativeModel(
    "projects/90458358443/locations/us-central1/endpoints/326380131100655616",
    system_instruction=["You are a helpful tutor for the class - Applied Large Language Models and Natural Language Processing"]
)

In [51]:
chat = model.start_chat(response_validation=False)

In [52]:
chat.send_message(
        ["""What is a RAG process?"""],
        generation_config=generation_config,
        safety_settings=safety_settings
    )

candidates {
  content {
    role: "model"
    parts {
      text: "RAG stands for Retrieval Augmented Generation.  It\'s a technique that combines the strengths of large language models (LLMs) with the ability to access and process external knowledge.\n\nHere\'s a breakdown of the process:\n\n1. **Retrieval:** The RAG process begins with a user query.  The query is then used to search a knowledge base.  The knowledge base can be a database, a file system, or a cloud storage service.  The search results are then used to retrieve relevant documents.\n\n2. **Augmentation:** The retrieved documents are then used to augment the user query.  This means that the user query is modified to include information from the retrieved documents.  This is done by adding the retrieved documents to the user query.\n\n3. **Generation:** The augmented query is then used to generate a response.  This is done by using a large language model (LLM).  The LLM is used to generate a response that is based on the

In [42]:
def multiturn_generate_content():
    vertexai.init(project="90458358443", location="us-central1")
    model = GenerativeModel(
        "projects/90458358443/locations/us-central1/endpoints/326380131100655616",
        system_instruction=["You are a helpful tutor for the class - Applied Large Language Models and Natural Language Processing"]
    )
    chat = model.start_chat()

In [41]:
# Load the fine-tuned Gemini model using get_tuned_model
tuned_model_name = f"projects/{project_id}/locations/us-central1/models/326380131100655616"
gemini_model = TextGenerationModel.get_tuned_model(tuned_model_name=tuned_model_name)


NotFound: 404 The Model does not exist.

In [37]:
endpoint_id = "326380131100655616"
endpoint = aiplatform.Endpoint(endpoint_name=f"projects/{project_id}/locations/us-central1/endpoints/{endpoint_id}")

In [38]:
# Run inference
prompt = "What is Retrieval Augmented Generation (RAG)?"
instances = [{"content": prompt}]  # Ensure the input format matches your model's schema

response = endpoint.predict(instances)

FailedPrecondition: 400 Gemini cannot be accessed through Vertex Predict/RawPredict API. Please follow https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-multimodal for Gemini usage.

In [18]:
safety_settings = [
    SafetySetting(
        category="HARM_CATEGORY_HARASSMENT",
        threshold=1,  # 1 is the most restrictive; adjust as needed
    )
]

In [23]:
dir(gemini_model)[-30:]

['_gapic_compute_tokens',
 '_gapic_compute_tokens_async',
 '_gapic_count_tokens',
 '_gapic_count_tokens_async',
 '_generate_content',
 '_generate_content_async',
 '_generate_content_streaming',
 '_generate_content_streaming_async',
 '_generation_config',
 '_labels',
 '_llm_utility_async_client',
 '_llm_utility_client',
 '_location',
 '_model_name',
 '_parse_response',
 '_prediction_async_client',
 '_prediction_client',
 '_prediction_resource_name',
 '_prepare_request',
 '_safety_settings',
 '_system_instruction',
 '_tool_config',
 '_tools',
 'compute_tokens',
 'compute_tokens_async',
 'count_tokens',
 'count_tokens_async',
 'generate_content',
 'generate_content_async',
 'start_chat']

In [24]:
response = gemini_model.generate_content(
    prompt=input_text,
    temperature=0.7,  # Controls randomness; lower is less random
    max_output_tokens=256,  # Limit on output length
    top_p=0.8,  # Controls diversity via nucleus sampling
    top_k=40,  # Controls diversity via token sampling
    safety_settings=safety_settings  # Optional, set this if needed
)

TypeError: _GenerativeModel.generate_content() got an unexpected keyword argument 'prompt'

In [318]:
endpoint = tuned_model.deploy(
    machine_type="n1-standard-4",  # Choose an appropriate machine type
    min_replica_count=1,
    max_replica_count=1
)

Model does not support deployment. See https://cloud.google.com/vertex-ai/docs/reference/rpc/google.cloud.aiplatform.v1#google.cloud.aiplatform.v1.Model.FIELDS.repeated.google.cloud.aiplatform.v1.Model.DeploymentResourcesType.google.cloud.aiplatform.v1.Model.supported_deployment_resources_types


In [319]:
print("Model deployed to endpoint:", endpoint.name)

Model deployed to endpoint: 3785355751153729536


In [320]:
input_text = "What is Retrieval Augmented Generation (RAG)?"
instances = [{"content": input_text}]

response = endpoint.predict(instances)

FailedPrecondition: 400 Gemini cannot be accessed through Vertex Predict/RawPredict API. Please follow https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/quickstart-multimodal for Gemini usage.