<a href="https://colab.research.google.com/github/BraedynL0530/PortfolioWebsite/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# README Summary Generator for Training Data
# Checkpoints to Google Drive so Colab doesn't cut off training-data generation (training happens later, once the model is done)


# SETUP

# Mount Google Drive for checkpointing
from google.colab import drive
drive.mount('/content/drive')

# Install dependencies
!pip install -q transformers torch accelerate

# CONFIGURATION


import json
import os
from pathlib import Path
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model / batching settings
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
BATCH_SIZE = 5  # READMEs handled per outer-loop step
CHECKPOINT_INTERVAL = 10  # Save every N summaries

# Everything is read from and written to one folder on the mounted Drive
DRIVE_BASE = '/content/drive/MyDrive/readme_training'
INPUT_FILE = DRIVE_BASE + '/training_data.json'  # From Go scraper
CHECKPOINT_FILE = DRIVE_BASE + '/summaries_checkpoint.json'
OUTPUT_FILE = DRIVE_BASE + '/summaries_final.json'

# Make sure the working folder exists before anything is written into it
os.makedirs(DRIVE_BASE, exist_ok=True)

print(f"‚úÖ Using model: {MODEL_NAME}")
print(f"‚úÖ Checkpointing every {CHECKPOINT_INTERVAL} summaries")
print(f"‚úÖ Drive path: {DRIVE_BASE}")

# LOAD MODEL

print("\nüì¶ Loading model...")

# Keyword arguments forwarded unchanged to AutoModelForCausalLM.from_pretrained
_model_load_kwargs = dict(
    device_map="auto",
    torch_dtype=torch.float16,  # half-precision weights
    low_cpu_mem_usage=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_model_load_kwargs)

# Reuse the end-of-sequence token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("‚úÖ Model loaded!")


# LOAD DATA


def _repo_key(entry):
    """Return the (owner, repo_name) pair used to identify one README record.

    repo_name alone is not unique: two owners can publish repositories with the
    same name, and keying on the name only would skip the second one on resume.
    """
    return (entry.get('owner'), entry['repo_name'])


print("\nüìÇ Loading README data...")

# Load scraped READMEs (explicit UTF-8 so decoding does not depend on the locale)
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    readmes_data = json.load(f)

print(f"‚úÖ Loaded {len(readmes_data)} READMEs")

# Load existing checkpoint if available; a missing file simply means a fresh run
try:
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        processed_summaries = json.load(f)
    processed_names = {s['repo_name'] for s in processed_summaries}
    print(f"üìã Resuming from checkpoint: {len(processed_summaries)} already done")
except FileNotFoundError:
    processed_summaries = []
    processed_names = set()
    print("üìã Starting fresh (no checkpoint found)")

# Filter out already processed records, matching on owner + repo name
processed_keys = {_repo_key(s) for s in processed_summaries}
remaining = [r for r in readmes_data if _repo_key(r) not in processed_keys]
print(f"üìä Remaining to process: {len(remaining)}")

# ============================================================
# SUMMARY GENERATOR
# ============================================================

def generate_summary(readme_text, repo_name="", max_readme_length=2000):
    """Generate a 2-3 sentence summary of a README.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        readme_text: Raw README text. Text beyond ``max_readme_length``
            characters is dropped and replaced by "...".
        repo_name: Accepted so existing call sites keep working; it is not
            used when building the prompt.
        max_readme_length: Maximum number of README characters put in the prompt.

    Returns:
        The generated continuation as one line: newlines are replaced by
        spaces and surrounding whitespace is stripped.
    """

    # Truncate by characters first so the prompt stays small before tokenization
    if len(readme_text) > max_readme_length:
        readme_text = readme_text[:max_readme_length] + "..."

    prompt = f"""Summarize this GitHub README in exactly 2-3 sentences. Focus on what the project does and its key features.

README:
{readme_text}

Summary:"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )

    # The returned sequence starts with the prompt tokens. Slice them off and
    # decode only what was generated: splitting the decoded text on "Summary:"
    # breaks as soon as the README or the generation contains that marker, and
    # the decoded prompt is not guaranteed to match the original string length.
    prompt_token_count = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_token_count:]
    summary = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    # Collapse to a single line
    return summary.replace('\n', ' ').strip()

# PROCESS IN BATCHES WITH CHECKPOINTING


def _save_checkpoint(path, summaries):
    """Write ``summaries`` as JSON to ``path`` via a temp file and a rename.

    Writing to a sibling temp file first means an interrupted write leaves the
    previous checkpoint in place instead of a truncated JSON file.
    """
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(summaries, f, indent=2)
    os.replace(tmp_path, path)


print("\nüöÄ Starting summary generation...\n")

# Number of summaries contained in the most recent checkpoint on disk.
# Comparing against this (instead of testing len % CHECKPOINT_INTERVAL == 0)
# keeps checkpoints coming after a failed README or a resume from an
# unaligned count, both of which shift the total off exact multiples.
last_saved_count = len(processed_summaries)

for i in tqdm(range(0, len(remaining), BATCH_SIZE)):
    batch = remaining[i:i+BATCH_SIZE]

    for readme_data in batch:
        try:
            summary = generate_summary(
                readme_data['readme'],
                readme_data['repo_name']
            )

            # Add summary to data
            readme_data['summary'] = summary
            processed_summaries.append(readme_data)

            # Print progress
            print(f"\n‚úÖ {readme_data['owner']}/{readme_data['repo_name']}")
            print(f"   üìù {summary[:100]}...")

        except Exception as e:
            # Best effort: report the failure and move on to the next README
            print(f"\n‚ùå Failed {readme_data['repo_name']}: {e}")
            continue

    # Save a checkpoint once at least CHECKPOINT_INTERVAL new summaries exist
    if len(processed_summaries) - last_saved_count >= CHECKPOINT_INTERVAL:
        _save_checkpoint(CHECKPOINT_FILE, processed_summaries)
        last_saved_count = len(processed_summaries)
        print(f"\nüíæ Checkpoint saved: {len(processed_summaries)} summaries")

#
# FINAL SAVE
#

print("\nüíæ Saving final results...")

# The same list goes to the checkpoint first (in case of issues) and then to
# the final output file.
for destination in (CHECKPOINT_FILE, OUTPUT_FILE):
    with open(destination, 'w') as f:
        json.dump(processed_summaries, f, indent=2)

print(f"""
‚ú® COMPLETE! ‚ú®

üìä Stats:
   - Total summaries: {len(processed_summaries)}
   - Checkpoint: {CHECKPOINT_FILE}
   - Final output: {OUTPUT_FILE}

üéØ Next steps:
   1. Download {OUTPUT_FILE} from Google Drive
   2. Use it to train your PyTorch model
   3. Ship your portfolio!
""")

#
# PREVIEW RESULTS
#

print("\nüìã Sample summaries:")

# Show at most the first three processed entries
preview_count = min(3, len(processed_summaries))
for i in range(preview_count):
    item = processed_summaries[i]
    print(f"\n{i+1}. {item['owner']}/{item['repo_name']} ({item['stars']} ‚≠ê)")
    print(f"   {item['summary']}")

MessageError: Error: credential propagation was unsuccessful

In [None]:
"""import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from dataclasses import  dataclass
torch.manual_seed(42)

#GPT-2 Example, resuse :3
class config:
  vocab_size: int #unique words
  block_size: int #how far back(context) it can see, memory/ how many tokens back
  n_layer: int = 12 # stacked blocks, more layers more reasoning more train time
  n_head: int = 12 # attentions per layer, how many "heads" looking for a new pattern
  n_embd: int = 768 #size of vector for each token
  dropout: float = 0.1  #prevents overfitting by stopping random paths"""


# Word token embedding (wte) demo.
# The import block at the top of this cell is disabled inside a string literal,
# so the names used below are imported here.
import torch
import torch.nn as nn

# nn layer: a learnable lookup table holding one vector per vocabulary entry
vocab_size = 10
n_embed = 4


token_embedding_tabel = nn.Embedding(vocab_size, n_embed)

print("shape:", token_embedding_tabel.weight.shape)
print("weight:",token_embedding_tabel.weight)

# Word positional embedding (wpe)
# - makes a unique vector for each position; it is another neural-network layer
# - word and position vectors live in the same n_embed-dimensional space
# - adding them creates a unique point in that space
# - lets the model learn that "the" at position 1 is not the same as "the" at
#   position 12


B, T, C = 2,5,n_embed # batch, time / sequence length (tokens), channels (dimensions / n_embed)
block_size = 8



poesition_embeding_tabel = nn.Embedding(block_size, C)

# input data: random token ids of shape (B, T)
idx = torch.randint(0, vocab_size, (B,T))

# token embeddings: one row of the token table per id -> (B, T, C)
tok_emb = token_embedding_tabel(idx)

# positions for the current sequence length: counts up from 0, e.g. T=5 gives
# tensor([0, 1, 2, 3, 4]), which selects the rows of the position table -> (T, C)
pos = torch.arange(0, T, dtype=torch.long)
pos_emb = poesition_embeding_tabel(pos)

# element-wise sum; pos_emb (T, C) broadcasts over the batch dimension,
# e.g. tok = [1, 5, 8, ...] + pos = [-.2, -.3, -4, ...] -> x = [.8, 4.7, 4, ...]
x = tok_emb + pos_emb


# Notes:
# - nn.Embedding creates and stores the vectors; torch.arange says which rows to return
# - what one sequence in a batch represents (a fixed number of tokens, a README,
#   a paragraph, a sentence, ...) is a design choice
# - T is the sequence length and block_size is the maximum sequence length;
#   T can be smaller, but block_size is the limit

print(token_embedding_tabel.weight.shape)
print(poesition_embeding_tabel.weight.shape)
print("final emb:",x)



shape: torch.Size([10, 4])
weight: Parameter containing:
tensor([[ 1.9269,  1.4873,  0.9007, -2.1055],
        [ 0.6784, -1.2345, -0.0431, -1.6047],
        [-0.7521,  1.6487, -0.3925, -1.4036],
        [-0.7279, -0.5594, -0.7688,  0.7624],
        [ 1.6423, -0.1596, -0.4974,  0.4396],
        [-0.7581,  1.0783,  0.8008,  1.6806],
        [ 0.0349,  0.3211,  1.5736, -0.8455],
        [ 1.3123,  0.6872, -1.0892, -0.3553],
        [-1.4181,  0.8963,  0.0499,  2.2667],
        [ 1.1790, -0.4345, -1.3864, -1.2862]], requires_grad=True)
torch.Size([10, 4])
torch.Size([8, 4])
final emb: tensor([[[-1.5953,  0.1559,  2.6121,  1.8412],
         [-0.3606, -0.3840,  0.6163,  0.3166],
         [ 0.1099,  1.3950, -2.1651,  0.1804],
         [ 2.4877,  1.2483, -1.5419, -1.1271],
         [-0.6128,  1.3094,  0.8095,  1.5383]],

        [[ 0.3418, -1.3568,  0.4250, -1.1257],
         [ 1.0457, -1.0591,  1.3421, -2.0505],
         [-1.9606,  1.7861, -0.2751,  2.2163],
         [ 1.8538, -0.6734, -0.495

In [None]:
from torch.autograd import forward_ad
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from dataclasses import  dataclass
torch.manual_seed(42)

#config
@dataclass
class config:
  """Hyperparameters of the GPT-2 style model.

  Declared as a dataclass so instances can be built with keyword arguments,
  e.g. ``config(vocab_size=..., block_size=...)``; the two fields without a
  default are required.
  """
  vocab_size: int  # number of unique tokens
  block_size: int  # context length: how many tokens back the model can see
  n_layer: int = 12  # number of stacked blocks; more layers means more training time
  n_head: int = 12  # attention heads per layer
  n_embd: int = 768  # size of the vector for each token
  dropout: float = 0.1  # dropout probability, used against overfitting



"""
Self-attention: part 1 of the transformer.
Q, K, V = query, key, value. They use the two embeddings to learn different meanings for words, giving different vectors even to the same word.
Below is the theory; the class further down is the optimized version: it condenses the three projections into one big vector and splits it. Other than that it is nearly identical, just more efficient.
"""

"""
#learnable compenets
q_prog = nn.Linear(C, C, bias =False)
k_prog = nn.Linear(C, C, bias =False)
v_prog = nn.Linear(C, C, bias =False)

#weights
q_prog.weight.data = torch.randn(C,C)
k_prog.weight.data = torch.randn(C,C)
v_prog.weight.data = torch.randn(C,C)

#preform projection
q = q_prog(x)
k = k_prog(x)
v = v_prog(x)

scores = q @ k.transpose(-2,-1)
print("scores",scores)



Attention(Q,K,V) = softmax(QK^T / sqrt(d_k)) V

d_k = k.size(-1)  # last dimension of k
scaled_scores = scores / math.sqrt(d_k)
attention_weights = F.softmax(scaled_scores, dim=-1)  # normalize over the key axis so each query's weights sum to 1
print("scaled scores", scaled_scores)
print("scaled scores -> percentages", attention_weights)

# aggreation Last part of attention!
output = attention_weights @ v
print("output!:",output)

"""

# Core logic for MultiHead
class CausualSelfAttention(nn.Module):
  """Multi-head causal self-attention with a fused q/k/v projection.

  Input and output are both (B, T, C) with C == config.n_embd. Each position
  attends only to itself and to earlier positions.
  """

  def __init__(self, config :config):
    super().__init__()
    # the embedding is split evenly across the heads
    assert config.n_embd % config.n_head == 0
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False) # fused layer: q, k and v from one matmul

    self.register_buffer( # part of causal masking
        "bias",# buffer name
        # lower-triangular ones: entry [i, j] is 1 exactly when j <= i
        torch.tril(torch.ones(config.block_size,config.block_size))
        .view(1,1, config.block_size, config.block_size)
    )

    self.c_proj = nn.Linear(config.n_embd,config.n_embd)

  def forward(self, x):
    """Apply causal multi-head attention to x of shape (B, T, C)."""
    B, T, C = x.size()
    head_dim = C // self.n_head

    # project once -> split into three (B, T, C) tensors
    qkv = self.c_attn(x)
    q, k, v = qkv.split(C, dim=2)

    # reshape into heads: (B, T, C) -> (B, n_head, T, head_dim)
    q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
    k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
    v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

    # scaled dot-product scores: (B, n_head, T, T)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
    att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf")) # prevents it from seeing future tokens
    att = F.softmax(att, dim=-1)

    # aggregate values with the attention weights
    y = att @ v

    # merge heads back to (B, T, C)
    y = y.transpose(1, 2).contiguous().view(B, T, C)

    # final projection
    y = self.c_proj(y)
    return y


class MLP(nn.Module):
  """Feed-forward part of a block: expand to 4x width, GELU, project back, dropout."""

  def __init__(self, config :config):
    super().__init__()
    width = config.n_embd
    hidden_width = 4 * width
    self.fc = nn.Linear(width, hidden_width) # expansion: more room to combine features
    self.proj = nn.Linear(hidden_width, width) # back to n_embd so the result can be added to the attention output
    self.drop = nn.Dropout(config.dropout) # probability comes from config.dropout

  def forward(self, x):
    # GELU between the two linear layers keeps them from collapsing into one linear map
    hidden = F.gelu(self.fc(x))
    return self.drop(self.proj(hidden))

class Block(nn.Module): # residual connection
  """One transformer block: LayerNorm + attention, then LayerNorm + MLP.

  Each sub-layer is wrapped in a residual connection, i.e. the block computes
  f(x) + x instead of f(x), so what earlier layers produced is carried forward.
  """

  def __init__(self, config : config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    # must match the attention class name as it is spelled in its definition
    self.attn = CausualSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    self.mlp = MLP(config)

  def forward(self, x):
    # attention sub-layer on the normalized input, added back onto x (the "+")
    x = x +self.attn(self.ln_1(x))

    # feed-forward sub-layer, again with a residual add
    x = x +self.mlp(self.ln_2(x))

    return x

In [1]:
class NLP(nn.Module):
  """Decoder-only language model.

  Token and position embeddings are summed, passed through ``config.n_layer``
  Blocks and a final LayerNorm, then projected to vocabulary logits by a
  linear head whose weight is shared with the token embedding.
  """

  def __init__(self, config: config):
    super().__init__()
    self.block_size = config.block_size

    # Input: sizes come from the config, not from notebook globals
    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.block_size, config.n_embd)
    self.drop = nn.Dropout(config.dropout)

    # Processing: a stack of blocks; data flows through them sequentially
    self.h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])

    # Output layers
    self.ln_f = nn.LayerNorm(config.n_embd) # final layer norm
    # language model head: a raw score for every possible next token at every position
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False)

    # weight tying: the output head and the token embedding share one matrix
    self.lm_head.weight = self.wte.weight

  def forward(self, idx, targets= None): # idx = input ids; targets = inputs shifted one position to the left
    """Return ``(logits, loss)`` for token ids ``idx`` of shape (B, T).

    ``logits`` has shape (B, T, vocab_size). ``loss`` is the mean cross-entropy
    over all B*T positions when ``targets`` (same shape as ``idx``) is given,
    otherwise None.
    """
    B, T = idx.size()
    assert T <= self.block_size, f"sequence length {T} exceeds block_size {self.block_size}"

    # embeddings -> dropout -> blocks -> final layer norm
    pos = torch.arange(T, dtype=torch.long, device=idx.device)
    x = self.drop(self.wte(idx) + self.wpe(pos)) # (T, C) positions broadcast over the batch
    for block in self.h:
      x = block(x)
    x = self.ln_f(x)

    logits = self.lm_head(x)

    loss = None
    if targets is not None:
      # flatten to (B*T, vocab) vs (B*T,); reshape also accepts non-contiguous
      # targets such as a slice like ids[:, 1:]
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.reshape(-1))

    return logits, loss

NameError: name 'nn' is not defined

In [2]:
# Pseudo-code / placeholders (not runnable yet)
# readme_ids = tokenizer(for README or whatever in data)
# summary_ids = tokenizer(same as above)
# targets_shifted = summary_ids[:, 1:]
# logits, loss = NLP(idx = readme_ids, targets = targets_shifted)