In [240]:
from google.cloud import documentai
from google.oauth2 import service_account
import os
import re
import requests
from PyPDF2 import PdfReader, PdfWriter
from bs4 import BeautifulSoup
import pdfplumber  # for improved OCR if needed
import timeit
from collections import defaultdict
from typing import Dict, List
import tiktoken  # OpenAI's tokenization library
import json

In [None]:
pinecone_api_key = "pcsk_68gSai_Jvtbix4qBZ8Z3ccEr6HincaEb4ewKM5GzK23pbEkfNZ6UoNAN2jJimXGg3iyYpg"

In [64]:

def split_pdf(input_pdf_path,file_name, max_pages=1):
    """
    Split a PDF into smaller chunks of max_pages.
    """
    reader = PdfReader(input_pdf_path)
    chunks = []
    for i in range(0, len(reader.pages), max_pages):
        writer = PdfWriter()
        for j in range(i, min(i + max_pages, len(reader.pages))):
            writer.add_page(reader.pages[j])
        chunk_path = f"./chunks/chunk_{i // max_pages + 1}_{file_name.split('.')[0]}.pdf"
        with open(chunk_path, "wb") as f:
            writer.write(f)
        chunks.append(chunk_path)
    return chunks

In [65]:
credentials = service_account.Credentials.from_service_account_file('coms-6998-applied-llm-class-4e98f4f7a361.json')
client = documentai.DocumentProcessorServiceClient(credentials=credentials)


In [109]:
all_files_to_extract_data_from = os.listdir('./lecture_pdfs')

In [110]:
all_chunks = []
for file_name in all_files_to_extract_data_from:
    file_directory = "./lecture_pdfs"
    pdf_path = os.path.join(file_directory, file_name)
    chunks = split_pdf(pdf_path,file_name)
    all_chunks = all_chunks + chunks

In [191]:
def extract_text_links(text):
    links = []
    text = text.replace('-\n',"")
    page_links = re.findall(r'(https?://\S+)', text)
    links.extend(page_links)
    page_links = re.findall(r'(http?://\S+)', text)
    links.extend(page_links)
    return links

In [224]:
def get_document_extraction(file_name,project_id = "coms-6998-applied-llm-class",location = "us",processor_id = "398fd74279aa6748"):
    with open(file_name, "rb") as f:
        content = f.read()
    raw_document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    # Make the request
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    response = client.process_document(request=request)
    document = response.document
    text = document.text
    links = extract_text_links(text)
    return text, links

In [136]:
def fetch_and_clean_text(url):
    """
    Fetches and cleans text from the given URL.
    :param url: The URL to fetch text from.
    :return: Cleaned text or an error message.
    """
    try:
        # Make an HTTP GET request
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract the main text content
        # We can focus on specific tags (e.g., <p>, <div>) or use the whole text
        text_elements = soup.find_all(["p", "div"])
        text = " ".join(element.get_text() for element in text_elements)
        
        # Clean the text
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
        text = text.strip()  # Remove leading/trailing whitespace
        
        # Handle empty text scenario
        if not text:
            return f"Error: No extractable text found at {url}"
        return text
    
    except requests.exceptions.RequestException as e:
        # Handle HTTP and connection errors
        return f"Error: Unable to fetch content from {url}. Exception: {e}"
    except Exception as e:
        # Handle other unexpected errors
        return f"Error: Unexpected error while processing {url}. Exception: {e}"

In [137]:
def process_links(links):
    """
    Processes a list of links, extracting and cleaning text content.
    :param links: List of URLs.
    :return: Dictionary with URLs as keys and cleaned text (or error messages) as values.
    """
    results = {}
    for url in links:
        print(f"Processing: {url}")
        text = fetch_and_clean_text(url)
        results[url] = text
    return results

In [208]:
all_texts_with_links = [value['text'] for key,value in all_data.items() if len(value['links'])>0]

In [130]:
all_processed_chunks = list(all_data.keys())
start = timeit.default_timer()
for i,chunk in enumerate(all_chunks):
    if chunk not in all_processed_chunks:
        text, links = get_document_extraction(chunk)
        all_data[chunk] = {'text':text,'links':links}
        if i%5 ==0:
            end = timeit.default_timer()
            print(i, end-start, chunk)
            start = timeit.default_timer()

45 6.449538166999446 ./chunks/chunk_46_Lecture-12-Columbia.pdf
50 7.764267583000219 ./chunks/chunk_51_Lecture-12-Columbia.pdf
55 7.30011899999954 ./chunks/chunk_56_Lecture-12-Columbia.pdf
60 8.476970249999795 ./chunks/chunk_61_Lecture-12-Columbia.pdf
65 8.502008792000197 ./chunks/chunk_66_Lecture-12-Columbia.pdf
70 8.57982845800052 ./chunks/chunk_71_Lecture-12-Columbia.pdf
75 7.840684208000312 ./chunks/chunk_76_Lecture-12-Columbia.pdf
80 7.657551166999838 ./chunks/chunk_81_Lecture-12-Columbia.pdf
85 7.173694291999709 ./chunks/chunk_86_Lecture-12-Columbia.pdf
90 8.033632041000601 ./chunks/chunk_91_Lecture-12-Columbia.pdf
95 8.131241334000151 ./chunks/chunk_96_Lecture-12-Columbia.pdf
100 8.513085292000142 ./chunks/chunk_101_Lecture-12-Columbia.pdf
105 7.703719500000261 ./chunks/chunk_106_Lecture-12-Columbia.pdf
110 9.373608582999623 ./chunks/chunk_111_Lecture-12-Columbia.pdf
115 8.92286362499999 ./chunks/chunk_116_Lecture-12-Columbia.pdf
120 7.5701725419994546 ./chunks/chunk_121_Lecture-

670 6.929717415999221 ./chunks/chunk_16_Lecture-6-columbia-Fall2024.pdf
675 7.669259667000006 ./chunks/chunk_21_Lecture-6-columbia-Fall2024.pdf
680 7.963793916999748 ./chunks/chunk_26_Lecture-6-columbia-Fall2024.pdf
685 7.476365084000463 ./chunks/chunk_4_Lecture-10-Columbia.pdf
690 8.1687207089999 ./chunks/chunk_9_Lecture-10-Columbia.pdf
695 8.90650420899965 ./chunks/chunk_14_Lecture-10-Columbia.pdf
700 9.140342749999945 ./chunks/chunk_19_Lecture-10-Columbia.pdf
705 7.98733508299938 ./chunks/chunk_24_Lecture-10-Columbia.pdf
710 8.115165457999865 ./chunks/chunk_29_Lecture-10-Columbia.pdf
715 8.156138875000579 ./chunks/chunk_34_Lecture-10-Columbia.pdf
720 9.232960582999112 ./chunks/chunk_39_Lecture-10-Columbia.pdf
725 8.515497958000196 ./chunks/chunk_44_Lecture-10-Columbia.pdf
730 7.679936417000135 ./chunks/chunk_49_Lecture-10-Columbia.pdf
735 7.841302666000047 ./chunks/chunk_54_Lecture-10-Columbia.pdf
740 8.02238770799977 ./chunks/chunk_59_Lecture-10-Columbia.pdf
745 7.95550374999948 ./

In [214]:
all_data_cleaned = {}
for key,value in all_data.items():
    if len(value['links'])>0:
        all_data_cleaned[key] = {'text':value['text'],'links':extract_text_links(value['text'])}
    else:
        all_data_cleaned[key] = value

In [245]:
# Specify the file name of the JSON file
file_name = "data_from_presentations.json"

# Load the JSON file
with open(file_name, "r") as json_file:
    data = json.load(json_file)

In [215]:
all_processed_chunks = list(all_data_cleaned.keys())

In [216]:
all_links = []
for extracted_data in list(list(all_data_cleaned.values())):
    all_links = all_links + extracted_data['links']

In [217]:
extracted_data = process_links(all_links)

Processing: https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model
Processing: https://arxiv.org/abs/2205.14135
Processing: https://ai.stanford.edu/blog/longer-sequencesnext-leap-ai/
Processing: https://github.com/vllm-project/vllm
Processing: https://vllm.ai
Processing: https://arxiv.org/abs/2309.06180
Processing: https://discord.gg/jz7wjKhh6g
Processing: https://docs.nvidia.com/datacenter/tesla/mig-userguide/index.html
Processing: https://huggingface.co/blog/trl-peft
Processing: https://arxiv.org/pdf/2202.05924
Processing: https://splab.sdu.edu.cn/G
Processing: https://research.google/blog/pathways-languagemodel-palm-scaling-to-540-billion-parameters-for-breakthrough-performance/
Processing: https://arxiv.org/pdf/2202.05924
Processing: https://www.youtube.com/watch?v=EnJ7qX9fkcU
Processing: https://jvns.ca/blog/2016/10/10/what-even-is-a-container/
Processing: https://kubernetes.io/
Processing: https://cloud.google.com/kube

In [218]:
cleaned_extracted_data = {key:value for key, value in extracted_data.items() if len(value)>=1000}

In [221]:
all_files_to_extract_data_from = os.listdir('./HWs')

In [222]:
all_hw_chunks = []
for file_name in all_files_to_extract_data_from:
    file_directory = "./HWs"
    pdf_path = os.path.join(file_directory, file_name)
    chunks = split_pdf(pdf_path,file_name, max_pages = 15)
    all_hw_chunks = all_hw_chunks + chunks

In [225]:
# all_hw_data = {}

In [226]:
all_processed_chunks = list(all_hw_data.keys())
start = timeit.default_timer()
for i,chunk in enumerate(all_hw_chunks):
    if chunk not in all_processed_chunks:
        text, links = get_document_extraction(chunk)
        all_hw_data[chunk] = {'text':text,'links':links}
        if i%5 ==0:
            end = timeit.default_timer()
            print(i, end-start, chunk)
            start = timeit.default_timer()

0 3.847637875000146 ./chunks/chunk_1_HW4-PDF.pdf


In [227]:
all_hw_links = []
for extracted_data in list(all_hw_data.values()):
    all_hw_links = all_hw_links + extracted_data['links']

In [228]:
extracted_hw_data = process_links(all_hw_links)

Processing: https://dustinstansbury.github.io/theclevermachine/bias-variance-tradeoff.
Processing: https://arxiv.org/pdf/1611.03530.pdf.
Processing: https://arxiv.org/abs/1506.01186.
Processing: https://arxiv.org/pdf/1611.03530.pdf
Processing: https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutionalneural-networks.pdf
Processing: https://arxiv.org/pdf/1409.1556.pdf
Processing: https://arxiv.org/pdf/1409.4842.pdf
Processing: https://github.com/qfgaohao/pytorch-ssd
Processing: https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html
Processing: https://github.com/onnx/tutorials/blob/master/tutorials/OnnxRuntimeServerSSDModel.ipynb
Processing: https://storage.googleapis.com/openimages/web/index.html
Processing: http://host.robots.ox.ac.uk/pascal/VOC/voc2007/
Processing: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
Processing: https://cs231n.github.io/transfer-learning/
Processing: http://host.robots.ox.ac.uk/pascal/VO

In [229]:
cleaned_extracted_hw_data = {key:value for key, value in extracted_hw_data.items() if len(value)>=1000}

In [230]:
links_dict = defaultdict(lambda: "")
for key,value in cleaned_extracted_data.items():
    links_dict[key] = value
for key,value in cleaned_extracted_hw_data.items():
    links_dict[key] = value

In [231]:
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences using a regex-based sentence tokenizer.
    """
    sentence_endings = re.compile(r'(?<=[.!?]) +')  # Match end of sentence followed by space
    return sentence_endings.split(text)

def chunk_text_by_sentence(text: str, max_tokens: int, tokenizer) -> List[str]:
    """
    Chunk text into pieces of max_tokens length, ensuring chunks do not cut sentences.
    
    Args:
        text (str): The input text to chunk.
        max_tokens (int): The maximum number of tokens per chunk.
        tokenizer: The tokenizer instance for tokenizing the text.
    
    Returns:
        List[str]: A list of text chunks.
    """
    sentences = split_into_sentences(text)
    chunks = []
    current_chunk = []

    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = tokenizer.encode(sentence)
        if current_tokens + len(sentence_tokens) <= max_tokens:
            current_chunk.append(sentence)
            current_tokens += len(sentence_tokens)
        else:
            # Complete the current chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            # Start a new chunk
            current_chunk = [sentence]
            current_tokens = len(sentence_tokens)

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def chunk_documents_by_sentence(documents: Dict[str, str], max_tokens: int = 500) -> Dict[str, List[str]]:
    """
    Chunk the text of multiple documents into smaller pieces, ensuring no sentence is cut.
    
    Args:
        documents (Dict[str, str]): A dictionary with document IDs as keys and text as values.
        max_tokens (int): The maximum number of tokens per chunk.
    
    Returns:
        Dict[str, List[str]]: A dictionary with document IDs as keys and lists of chunked text as values.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")  # Use the tokenizer compatible with OpenAI models
    chunked_documents = {}
    
    for doc_id, text in documents.items():
        chunked_documents[doc_id] = chunk_text_by_sentence(text, max_tokens, tokenizer)
    
    return chunked_documents

In [232]:
chunked_links_dict = chunk_documents_by_sentence(links_dict)

In [243]:
file_name = "data_from_embedded_links.json"
with open(file_name, "w") as json_file:
    json.dump(chunked_links_dict, json_file, indent=4)

In [238]:
all_q_and_a_docs_final = all_q_and_a_docs + all_q_and_a_docs_2

In [241]:
file_name = "all_q_and_a_docs_final.json"
with open(file_name, "w") as json_file:
    json.dump(all_q_and_a_docs_final, json_file, indent=4)

In [239]:
all_q_and_a_docs_final

[{'input': 'Large Language Model',
  'output': 'A type of foundation model applied specifically to text with the ability to understand and generate human language, enabling applications such as translation, summarization, and question-answering. Foundation Model: Pre-trained on large amounts of unlabeled and self-supervised data for very general use cases.'},
 {'input': 'Transformer',
  'output': 'A type of neural network architecture designed for handling sequences of data, particularly in natural language processing tasks. Transformers are known for their self-attention mechanism, which allows them to weigh the importance of different parts of an input sequence. They learn context and track relationships in sequential data like words in a sentence.'},
 {'input': 'Pretraining',
  'output': 'The initial phase of training a large language model, during which the model learns general language patterns and structures from a vast corpus of text data.'},
 {'input': 'Fine tuning',
  'output'

In [233]:
chunked_links_dict['https://github.com/ray-project/llm-numbers#1-mb-gpu-memory-required-for-1-token-of-output-with-a-13b-parameter-model']

['Skip to content Navigation Menu Toggle navigation Sign in Product GitHub Copilot Write better code with AI Security Find and fix vulnerabilities Actions Automate any workflow Codespaces Instant dev environments Issues Plan and track work Code Review Manage code changes Discussions Collaborate outside of code Code Search Find more, search less Explore All features Documentation GitHub Skills Blog Solutions By company size Enterprises Small and medium teams Startups By use case DevSecOps DevOps CI/CD View all use cases By industry Healthcare Financial services Manufacturing Government View all industries View all solutions Resources Topics AI DevOps Security Software Development View all Explore Learning Pathways White papers, Ebooks, Webinars Customer Stories Partners Open Source GitHub Sponsors Fund open source developers The ReadME Project GitHub community articles Repositories Topics Trending Collections Enterprise Enterprise platform AI-powered developer platform Available add-ons

In [164]:
cleaned_extracted_hw_data

{'https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-': 'NeurIPS Proceedings Search ImageNet Classification with Deep Convolutional Neural Networks Part of Advances in Neural Information Processing Systems 25 (NIPS 2012) Bibtex Metadata Paper Supplemental Authors Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton Abstract We trained a large, deep convolutional neural network to classify the 1.3 million high-resolution images in the LSVRC-2010 ImageNet training set into the 1000 different classes. On the test data, we achieved top-1 and top-5 error rates of 39.7\\% and 18.9\\% which is considerably better than the previous state-of-the-art results. The neural network, which has 60 million parameters and 500,000 neurons, consists of five convolutional layers, some of which are followed by max-pooling layers, and two globally connected layers with a final 1000-way softmax. To make training faster, we used non-saturating neurons and a very efficient GPU impl

In [235]:
len(list(chunked_links_dict.keys()))

48