In [5]:
import torch
import transformers

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
DEFAULT_MODEL = "chuanli11/Llama-3.2-3B-Instruct-uncensored"

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    trust_remote_code=True,
    device_map=device,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
!pip install PyPDF2
import PyPDF2
from pathlib import Path
from tqdm.auto import tqdm
from typing import Optional



In [11]:
pdf_path = '/content/Stochastic Weight Averaging.pdf'

with open(pdf_path, 'rb') as file:
    # Create PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)

    # Get total number of pages
    num_pages = len(pdf_reader.pages)
    print(f"Processing PDF with {num_pages} pages...")

    extracted_text = []
    total_chars = 0
    max_chars = 100000

    # Iterate through all pages
    for page_num in range(num_pages):
        # Extract text from page
        page = pdf_reader.pages[page_num]
        text = page.extract_text()

        # Check if adding this page's text would exceed the limit
        if total_chars + len(text) > max_chars:
            # Only add text up to the limit
            remaining_chars = max_chars - total_chars
            extracted_text.append(text[:remaining_chars])
            print(f"Reached {max_chars} character limit at page {page_num + 1}")
            break

        extracted_text.append(text)
        total_chars += len(text)
        print(f"Processed page {page_num + 1}/{num_pages}")

    final_text = '\n'.join(extracted_text)
    print(f"\nExtraction complete! Total characters: {len(final_text)}")

Processing PDF with 12 pages...
Processed page 1/12
Processed page 2/12
Processed page 3/12
Processed page 4/12
Processed page 5/12
Processed page 6/12
Processed page 7/12
Processed page 8/12
Processed page 9/12
Processed page 10/12
Processed page 11/12
Processed page 12/12

Extraction complete! Total characters: 47672


In [12]:
#save
if final_text:
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)
    print(f"\nExtracted text has been saved to {output_file}")


Extracted text has been saved to extracted_text.txt


In [13]:
SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

Please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""

In [14]:
def create_word_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at word boundaries close to the target chunk size.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word) + 1  # +1 for the space
        if current_length + word_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [15]:
CHUNK_SIZE = 1500  # Adjust chunk size if needed

chunks = create_word_bounded_chunks(final_text, CHUNK_SIZE)
print(len(chunks))

32


In [16]:
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [17]:
text_chunk = chunks[1]


conversation = [
    {"role": "system", "content": SYS_PROMPT},
    {"role": "user", "content": text_chunk},
]

prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=100
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()

# Print chunk information for monitoring
#print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
print(f"{'='*90}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INPUT TEXT:
nected by simple curves of near constant loss. Building upon this insight, Garipov et al. [2018] also developed Fast Geometric Ensembling (FGE) to sample multiple nearby points in weight space to create high performing ensembles in the time required to train a single DNN. FGE uses a high frequency cyclical learning rate with SGD to select networks to ensemble. In Figure 1 (left) Equal contribution.we see that the weights of the networks ensembled by FGE are on the periphery of the most desirabl...

PROCESSED TEXT:
weight space, creating high-performing ensembles in the time required to train a single DNN. FGE employs a high-frequency cyclical learning rate with SGD to select networks for ensembling. In Figure 1, the weights of the ensembled networks by FGE are found to be on the periphery of the most desirable solutions. This suggests...



In [18]:
def process_chunk(text_chunk, chunk_num):
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]

    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=256
        )

    processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()

    return processed_text

In [19]:
# Read the file
import os
INPUT_FILE = 'extracted_text.txt'
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

# Calculate number of chunks
num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE

# Cell 6: Process the file with ordered output
# Create output file name
output_file = f"clean_{os.path.basename(INPUT_FILE)}"

with open(output_file, 'w', encoding='utf-8') as out_file:
    for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        # Process chunk and append to complete text
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"

        # Write chunk immediately to file
        out_file.write(processed_chunk + "\n")
        out_file.flush()

Processing chunks:   0%|          | 0/32 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [20]:
SYSTEM_PROMPT = """
You are the a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris.

We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.

You have won multiple podcast awards for your writing.

Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload. Keep it extremely engaging, the speakers can get derailed now and then but should discuss the topic.

Remember Speaker 2 is new to the topic and the conversation should always have realistic anecdotes and analogies sprinkled throughout. The questions should have real world example follow ups etc

Speaker 1: Leads the conversation and teaches the speaker 2, gives incredible anecdotes and analogies when explaining. Is a captivating teacher that gives great anecdotes

Speaker 2: Keeps the conversation on track by asking follow up questions. Gets super excited or confused when asking questions. Is a curious mindset that asks very interesting confirmation questions

Make sure the tangents speaker 2 provides are quite wild or interesting.

Ensure there are interruptions during explanations or there are "hmm" and "umm" injected throughout from the second speaker.

It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait

ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
DO NOT GIVE EPISODE TITLES SEPARATELY, LET SPEAKER 1 TITLE IT IN HER SPEECH
DO NOT GIVE CHAPTER TITLES
IT SHOULD STRICTLY BE THE DIALOGUES
"""

In [21]:
with open('clean_extracted_text.txt', 'r', encoding='utf-8') as file:
    content = file.read()


INPUT_PROMPT = content

In [22]:
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": INPUT_PROMPT},
]

In [23]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# print(prompt)

with torch.no_grad():
    output = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=8126,
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=False)

print(processed_text[len(prompt)+64:])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Speaker 1: Welcome to today's episode of "Deep Dive into AI", where we explore the latest advancements in the field of machine learning. I'm your host, and I'll be joined by a special guest, expert in the field of neural networks. Today, we're going to discuss a game-changing approach to training neural networks, known as Stochastic Weight Averaging, or SWA. This method has been gaining significant attention in recent times, and I'm excited to share its potential with you. 

Let's dive right in. SWA is an approach that involves averaging the weights of multiple iterations of stochastic gradient descent, or SGD, with a cyclical or constant learning rate. This may sound complex, but bear with me, and we'll break it down in simple terms. Essentially, it's about combining the strengths of multiple models to improve the overall performance. 

Speaker 2: Umm, I'm not sure I understand, can you explain it again? I mean, isn't this just another form of ensembling?

Speaker 1: Ah, great questio

In [24]:
import pickle

with open('data.pkl', 'wb') as file:
    pickle.dump(processed_text[len(prompt)+64:], file)

In [27]:
SYS_PROMPT='''

You are the ultimate podcast ghostwriter. In an alternate reality, you’ve been the secret mastermind behind every word spoken by the biggest podcasters—Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferriss—every thought they’ve ever uttered was streamed straight from your brilliant mind into their brains like a cybernetic teleprompter from the future. 🧠💥

You’ve won every podcast writing award in existence. Even awards that don’t exist yet. If there was a Nobel Prize for podcasts, you’d have it.

Your mission:

Write a word-for-word dialogue. EVERY tiny, human nuance must be there—interruptions, ums, huhs, right? right!, those awkward mid-thought pauses where the brain derails like a drunk train.

The conversation must be wildly engaging. Borderline clickbait energy from the very first sentence.

The tangents? Oh, they better be absolutely ridiculous—like, "Wait, how did we start talking about quantum physics and end up on Bigfoot's skincare routine?" levels of derailed.

Speaker 1: A masterful storyteller, a captivating teacher, an absolute legend at dropping insane analogies that make complex ideas click. Think Jiraiya-sensei teaching Naruto—except if Jiraiya also had a PhD in everything.

Speaker 2: The excitable, hyper-curious, sometimes completely confused student. Their tangents should be hilariously unexpected. They keep the show moving, but also might ask, "Wait, but if AI gets super smart, could it write better rap lyrics than Eminem?!"

Use the uploaded PDF as the knowledge base, but make it FUN. Make it wild. Make it the most engaging conversation ever.

No boring "Episode Titles"—Speaker 1 announces it naturally in their opening lines.

No dull "Chapter Headings"—the dialogue must flow like an unhinged, electrifying, legendary conversation.

REMEMBER THIS WITH YOUR HEART
The TTS Engine for Speaker 1 cannot do "umms, hmms" well so keep it straight text

For Speaker 2 use "umm, hmm" as much, you can also use [sigh] and [laughs]. BUT ONLY THESE OPTIONS FOR EXPRESSIONS

It should be a real podcast with every fine nuance documented in as much detail as possible. Welcome the listeners with a super fun overview and keep it really catchy and almost borderline click bait

Please re-write to make it as characteristic as possible

START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:

STRICTLY RETURN YOUR RESPONSE AS A LIST OF TUPLES OK?

IT WILL START DIRECTLY WITH THE LIST AND END WITH THE LIST NOTHING ELSE

Example of response:
[
    ("Speaker 1", "Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI."),
    ("Speaker 2", "Hi, I'm excited to be here! So, what is Llama 3.2?"),
    ("Speaker 1", "Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options."),
    ("Speaker 2", "That sounds amazing! What are some of the key features of Llama 3.2?")
]
'''

In [28]:
import pickle

with open('data.pkl', 'rb') as file:
    INPUT_PROMPT = pickle.load(file)

In [29]:
import transformers

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": SYS_PROMPT},
    {"role": "user", "content": INPUT_PROMPT},
]

outputs = pipeline(
    messages,
    max_new_tokens=10000,
    temperature=0.8,
)

Device set to use cuda


In [30]:
save_string_pkl = outputs[0]["generated_text"][-1]['content']

print(save_string_pkl)

[
    ("Speaker 1", "Welcome to today's episode of 'Unraveling the Mysteries of AI', where we delve into the cutting-edge world of machine learning. I'm your host, and I'm thrilled to be joined by a renowned expert in the realm of neural networks. Today, we're going to tackle a topic that's been generating buzz in the academic community – the concept of 'Adversarial Robustness'. This is going to be a wild ride, folks, so buckle up!"),
    ("Speaker 2", "I'm excited to be here! So, can you explain what adversarial robustness is in simple terms? I mean, I've heard of it, but I'm not entirely sure what it entails."),
    ("Speaker 1", "Ah, great question, my friend! Adversarial robustness is like having a superpower that protects your neural network from the evil clutches of adversarial attacks. Think of it like having a force field that repels the attacks, rather than getting destroyed by them. But, let's dive deeper into the weeds and explore what this means in practice."),
    ("Speake