<a href="https://colab.research.google.com/github/BraedynL0530/PortfolioWebsite/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# README Summary Generator - FIXED VERSION
# Key fixes: Progress visibility, faster model, better error handling

# SETUP
from google.colab import drive
drive.mount('/content/drive')

!pip install -q transformers torch accelerate safetensors

# CONFIGURATION
import json
import os
import sys
from pathlib import Path
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Force output flush for Colab
sys.stdout.flush()

# Paths
DRIVE_BASE = '/content/drive/MyDrive/readme_training'
INPUT_FILE = f'{DRIVE_BASE}/training_data.json'
CHECKPOINT_FILE = f'{DRIVE_BASE}/summaries_checkpoint.json'
OUTPUT_FILE = f'{DRIVE_BASE}/summaries_final.json'

os.makedirs(DRIVE_BASE, exist_ok=True)

# CRITICAL: Use a smaller, faster model for Colab
# Pick ONE (uncomment it):

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # RECOMMENDED: Fast, no auth
# MODEL_NAME = "microsoft/phi-2"  # Good quality, medium speed
# MODEL_NAME = "google/flan-t5-base"  # Very fast, different architecture
# MODEL_NAME = "TheBloke/vicuna-7B-1.1-HF"  # Slow but higher quality

BATCH_SIZE = 1  # Process one at a time for better progress tracking
CHECKPOINT_INTERVAL = 5  # Save every 5 summaries

print(f"‚úÖ Model: {MODEL_NAME}")
print(f"‚úÖ Checkpoint every: {CHECKPOINT_INTERVAL}")
print(f"‚úÖ Drive path: {DRIVE_BASE}")
sys.stdout.flush()

# LOAD MODEL
print("\nüì¶ Loading model... (this may take 2-5 minutes)")
sys.stdout.flush()

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("‚úÖ Model loaded successfully!")
    print(f"‚úÖ Using device: {model.device}")
    sys.stdout.flush()

except Exception as e:
    print(f"‚ùå MODEL LOAD FAILED: {e}")
    print("Try using: TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    sys.stdout.flush()
    raise

# LOAD DATA
print("\nüìÇ Loading README data...")
sys.stdout.flush()

with open(INPUT_FILE, 'r') as f:
    readmes_data = json.load(f)

print(f"‚úÖ Loaded {len(readmes_data)} READMEs")
sys.stdout.flush()

# Load checkpoint
try:
    with open(CHECKPOINT_FILE, 'r') as f:
        processed_summaries = json.load(f)
    processed_indices = {s['id'] for s in processed_summaries}
    print(f"üìã Resuming: {len(processed_summaries)} already done")
except FileNotFoundError:
    processed_summaries = []
    processed_indices = set()
    print("üìã Starting fresh")

sys.stdout.flush()

# Add unique IDs if missing
for i, item in enumerate(readmes_data):
    if 'id' not in item:
        item['id'] = i

remaining = [r for r in readmes_data if r['id'] not in processed_indices]
print(f"üìä Remaining: {len(remaining)}")
sys.stdout.flush()

# SUMMARY GENERATOR
def generate_summary(readme_text, max_length=1500):
    """Generate summary with better error handling"""

    # Truncate
    if len(readme_text) > max_length:
        readme_text = readme_text[:max_length] + "..."

    # Improved prompt for better technical summaries
    prompt = f"""Write a technical summary of this GitHub project as a single paragraph with 3-4 complete sentences.

Your summary should cover:
- What the project does and its main purpose
- Technologies used (programming languages, frameworks, libraries) - only mention what's in the README
- Key features or implementation details

Do NOT use numbered lists or bullet points. Write in natural paragraph form.

README:
{readme_text}

Summary:"""

    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,  # Reduced for speed
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract summary - now looking for "Technical Summary:"
        if "Technical Summary:" in full_output:
            summary = full_output.split("Technical Summary:")[-1].strip()
        elif "Summary:" in full_output:
            summary = full_output.split("Summary:")[-1].strip()
        else:
            summary = full_output[len(prompt):].strip()

        # Clean up
        summary = summary.replace('\n', ' ').strip()

        # Remove any trailing incomplete sentences (but keep full summary)
        # Only trim if it's unreasonably long (over 1000 chars)
        if len(summary) > 1000:
            # Try to cut at last sentence
            last_period = summary[:1000].rfind('.')
            if last_period > 500:
                summary = summary[:last_period + 1]

        return summary

    except Exception as e:
        print(f"Generation error: {e}")
        return f"Error generating summary: {str(e)[:100]}"

# MAIN PROCESSING LOOP
print("\nüöÄ Starting generation...\n")
print("=" * 60)
sys.stdout.flush()

for i, readme_data in enumerate(remaining):
    try:
        print(f"\n[{i+1}/{len(remaining)}] Processing: {readme_data.get('repo_name', 'Unknown')}")
        sys.stdout.flush()

        # Generate
        summary = generate_summary(readme_data['readme'])

        # Save result
        readme_data['summary'] = summary
        processed_summaries.append(readme_data)

        # Show FULL summary for first 5, then preview for rest
        if i < 5:
            print(f"   ‚úì FULL: {summary}")
        else:
            preview = summary[:100] + "..." if len(summary) > 100 else summary
            print(f"   ‚úì {preview}")
        sys.stdout.flush()

        # Checkpoint
        if (i + 1) % CHECKPOINT_INTERVAL == 0:
            with open(CHECKPOINT_FILE, 'w') as f:
                json.dump(processed_summaries, f, indent=2)
            print(f"\nüíæ CHECKPOINT SAVED: {len(processed_summaries)} summaries")
            print("=" * 60)
            sys.stdout.flush()

    except Exception as e:
        print(f"\n‚ùå FAILED {readme_data.get('repo_name', 'Unknown')}: {e}")
        sys.stdout.flush()
        continue

# FINAL SAVE
print("\n" + "=" * 60)
print("üíæ Saving final results...")
sys.stdout.flush()

with open(CHECKPOINT_FILE, 'w') as f:
    json.dump(processed_summaries, f, indent=2)

with open(OUTPUT_FILE, 'w') as f:
    json.dump(processed_summaries, f, indent=2)

print(f"""
‚ú® COMPLETE! ‚ú®

üìä Stats:
   Total: {len(processed_summaries)} summaries
   Checkpoint: {CHECKPOINT_FILE}
   Final: {OUTPUT_FILE}

üéØ Next: Download from Google Drive and train your model!
""")
sys.stdout.flush()

# PREVIEW
print("\nüìã Sample summaries:")
for i, item in enumerate(processed_summaries[:3]):
    print(f"\n{i+1}. {item.get('repo_name', 'Unknown')} ({item.get('stars', 0)} ‚≠ê)")
    print(f"   {item['summary']}")
sys.stdout.flush()

Mounted at /content/drive
‚úÖ Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
‚úÖ Checkpoint every: 5
‚úÖ Drive path: /content/drive/MyDrive/readme_training

üì¶ Loading model... (this may take 2-5 minutes)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

‚úÖ Model loaded successfully!
‚úÖ Using device: cuda:0

üìÇ Loading README data...
‚úÖ Loaded 6317 READMEs
üìã Resuming: 6317 already done
üìä Remaining: 0

üöÄ Starting generation...


üíæ Saving final results...

‚ú® COMPLETE! ‚ú®

üìä Stats:
   Total: 6317 summaries
   Checkpoint: /content/drive/MyDrive/readme_training/summaries_checkpoint.json
   Final: /content/drive/MyDrive/readme_training/summaries_final.json

üéØ Next: Download from Google Drive and train your model!


üìã Sample summaries:

1. Unknown (0 ‚≠ê)
   The ReplayWeb.pageApp is a web-based viewer that allows users to browse and playback web archives. The project was created as a side-project to learn web development and to create a tool that could be useful for web archiving. The project consists of a desktop tool that can browse and playback web archives, and a server-side implementation that allows users to create and manage their own web archives. The project uses the latest version of React

2. Unknown (0

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from dataclasses import  dataclass
torch.manual_seed(42)
"""#GPT-2 Example, resuse :3
class config:
  vocab_size: int #unique words
  block_size: int #how far back(context) it can see, memory/ how many tokens back
  n_layer: int = 12 # stacked blocks, more layers more reasoning more train time
  n_head: int = 12 # attentions per layer, how many "heads" looking for a new pattern
  n_embd: int = 768 #size of vector for each token
  dropout: float = 0.1  #prevents overfitting by stopping random paths"""


# Word token embedding, wte
# nn layer
vocab_size = 10
n_embed = 4


token_embedding_tabel = nn.Embedding(vocab_size, n_embd)

print("shape:", token_embedding_tabel.weight.shape)
print("weight:",token_embedding_tabel.weight)

# Word posetional embedding, wpe
"""
makes a unique vector for each poestion
vectors for each posetions, another nueral network layer
word and poestion exist in the same n_embd dimesion space
adding them creates a  unique point in the said space
learns that "the" in the beginning postion 1 isnt the same as "the" at posetion 12, distictions
"""


B, T, C = 2,5,n_embed #batch time/sequence length(tokens) channels(dimesions/n_embed)
block_size = 8
vocab_size = 10



poesition_embeding_tabel = nn.Embedding(block_size, C)

#input data
idx = torch.randint(0, vocab_size, (B,T))

#token emb data before
tok_emb = token_embedding_tabel(idx)

#after / poesitions for latest sequence length
pos = torch.arange(0, T, dtype=torch.long) #just counts up from 0 and says this is index 1 2 3... for each sequnce ex T=5 returns a tensor or ([0,1,2,3,4]) helps pos tabel look uip
pos_emb = poesition_embeding_tabel(pos)

x = tok_emb + pos_emb #makes a vector with both data ex tok=[1,5,8...] pos = [-.2, .3,-4...] x =[.8, 4.7, 4]


"""
nn.Embeddings create and store vectors,
torch.arrange tells what rows to return
batches are a single sequence, could be a set token ex 128,  a Readme for a project example, paragraph senante etc MY CHOICE AS TO WHAT, T is sequence length, blocksize is max sequnce
can go under but block_size is the limit
"""

print(token_embedding_tabel.weight.shape)
print(poesition_embeding_tabel.weight.shape)
print("final emb:",x)



NameError: name 'n_embd' is not defined

In [2]:
from torch.autograd import forward_ad
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from dataclasses import  dataclass
torch.manual_seed(42)

#config
@dataclass
class config:
  vocab_size: int #unique words
  block_size: int #how far back(context) it can see, memory/ how many tokens back
  n_layer: int   # stacked blocks, more layers more reasoning more train time
  n_head: int   # attentions per layer, how many "eyes" looking for a new pattern
  n_embd: int   #size of vector for each token
  dropout: float   #prevents overfitting by stopping random paths
  pad_token_id: int = 50256



"""
self attention: part 1 of transformer
Q K V, query key value. helps use the two embeddings to learn diffrent meanings for words and give the diffrent vectors even if the same word
below is theory class is optimized, it condences the prjections into one huge vector and splits. other than that its nearly identical just more efficent
"""

"""
#learnable compenets
q_prog = nn.Linear(C, C, bias =False)
k_prog = nn.Linear(C, C, bias =False)
v_prog = nn.Linear(C, C, bias =False)

#weights
q_prog.weight.data = torch.randn(C,C)
q_prog.weight.data = torch.randn(C,C)
q_prog.weight.data = torch.randn(C,C)

#preform projection
q = q_prog(x)
k = k_prog(x)
v = v_prog(x)

scores = q @ k.transpose(-2,-1)
print("scores",scores)



Attention(Q,K,V)=softmax(‚ÄãQK^‚ä§/dk‚Äã‚Äã)V

d_k = k.size(-1)#last dimesion of
scaled_scores = scores / math.sqrt(d_k)
attention_weights = F.softmax(scaled_scores, dim=1)
print("scaled scores", scaled_scores)
print("scaled scores -> percentages", attention_weights)

# aggreation Last part of attention!
output = attention_weights @ v
print("output!:",output)

"""

# Core logic for MultiHead
class CausalSelfAttention(nn.Module):
  def __init__(self, config :config):
    super().__init__()
    assert config.n_embd % config.n_head == 0
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False) # Fuzed layer = more efficent

    self.register_buffer( # part of causal masking
        "bias",# buffer name
        torch.tril(torch.ones(config.block_size,config.block_size))
        .view(1,1, config.block_size, config.block_size)
    )

    self.c_proj = nn.Linear(config.n_embd,config.n_embd)

  def forward(self, x):
    B, T, C = x.size()
    head_dim = C // self.n_head

    # project once -> split
    qkv = self.c_attn(x)
    q, k, v = qkv.split(C, dim=2)

    # reshape into heads
    q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
    k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
    v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

    # attention
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
    att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf")) # prevents it from seeing future tokens
    att = F.softmax(att, dim=-1)

    # aggregate :3
    y = att @ v

    # merge heads
    y = y.transpose(1, 2).contiguous().view(B, T, C)

    # final projection
    y = self.c_proj(y)
    return y


class MLP(nn.Module):
  def __init__(self, config :config):
    super().__init__()
    self.fc = nn.Linear(config.n_embd, 4 * config.n_embd) #expands dimestions, think of it as more room to think / combining features
    self.proj = nn.Linear(4 * config.n_embd, config.n_embd) # condenses back so it can be added back to attetion
    self.drop = nn.Dropout(config.dropout) #refer to config

  def forward(self, x):
    x = self.fc(x)
    x = F.gelu(x) # makes x nonlinear so fc and proj dont just merge into one straight line
    x =self.proj(x)
    x = self.drop(x)

    return x

class Block(nn.Module): #residual connection
  def __init__(self, config : config): #litterly just does f(x) + x instead of f(x) so mlp dosesnt relearn it takes the learned/trained data and keeps it
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    self.mlp = MLP(config)

  def forward(self, x):
    # focus (the "+")
    x = x +self.attn(self.ln_1(x))

    x = x +self.mlp(self.ln_2(x))

    return x

In [7]:
class NLP(nn.Module):
  def __init__(self, config: config):
    super().__init__()
    # Input
    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.block_size, config.n_embd)
    self.drop = nn.Dropout(config.dropout)
    self.config = config
    self.pad_token_id = config.pad_token_id



    # Processing, makes a stack/block / LAYER for deeper understanding
    # Data flows through sequncesnsy so more refined/better understanding
    self.h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])

    #output layers
    self.ln_f = nn.LayerNorm(config.n_embd) # final layer norm
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias = False) #language model head, parrel prediction(linear) makes raw score for each possible next token , good for training, and throws away the
    #rest(all but last vector) if not traning
    # Above makes raw score for each possible next token


    self.lm_head.weight = self.wte.weight

  def forward(self, idx, targets= None): #idx = input targets = inputs shifter one pos to left
    B, T = idx.size()

    assert T <= self.config.block_size, f"Sequence length {T} exceeds block_size {self.config.block_size}"

    tok_emb = self.wte(idx)
    pos = torch.arange(T, device=idx.device).unsqueeze(0)
    pos_emb = self.wpe(pos)
    x = self.drop(tok_emb + pos_emb)

    # Process through transformer blocks
    for block in self.h:
        x = block(x)

    # Final layer norm
    x = self.ln_f(x)

    # Output logits
    logits = self.lm_head(x)

    logits = logits[:, :-1, :]
    targets = targets[:, 1:]

    loss = None
    if targets is not None:
      print(f"    Logits shape: {logits.shape}")
      print(f"    Targets shape: {targets.shape}")
      print(f"    Logits min/max: {logits.min().item():.4f}/{logits.max().item():.4f}")
      print(f"    Targets min/max: {targets.min().item()}/{targets.max().item()}")

      logits_flat = logits.reshape(-1, logits.size(-1)) #flattens for learning
      targets_flat = targets.reshape(-1)

      print(f"    Flattened logits: {logits_flat.shape}")
      print(f"    Flattened targets: {targets_flat.shape}")

      loss = F.cross_entropy(logits_flat, targets_flat) #flattened outsied to debug, just gets loss

      print(f"    Computed loss: {loss.item():.6f}")

    return logits, loss

  @torch.no_grad()
  def summarize(self, idx, max_new_tokens, tempature = 1.0, top_k =0): # <1 rare words more likely, >1 more common words
    # Crop if too long
    if idx.size(1) > self.config.block_size:
      idx_cond = idx[:, -self.config.block_size:]
    else:
      idx_cond = idx

    logits, _ = self(idx_cond)
    logits = logits[:, -1, :] / temperature  # Get last token predictions

    probs = F.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)  # Fixed typo

    idx = torch.cat((idx, next_token), dim=1)

    return idx

In [None]:
from transformers import GPT2Tokenizer

#device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

with open(OUTPUT_FILE) as file:
    data = json.load(file)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer vocab_size: {len(tokenizer)}")  # Should print 50257

config_instance = config(
    vocab_size=len(tokenizer),  # This MUST be 50257
    block_size=512,
    n_layer=6,
    n_head=12,
    n_embd=768,
    dropout=0.2,
    pad_token_id=tokenizer.pad_token_id
)

print(f"Config vocab_size: {config_instance.vocab_size}")  # Verify it's 50257

model = NLP(config_instance).to(device)

print(f"Model wte shape: {model.wte.weight.shape}")  # Should be (50257, 768)


optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)





BATCH_SIZE = 8
NUM_EPOCHS = 3



for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch + 1}/{NUM_EPOCHS} ---")

    for i in range(0, len(data), BATCH_SIZE):
        batch = data[i:i+BATCH_SIZE]

        batch_tokens = []
        for item in batch:
            full_text = f"{item['readme']} SUMMARY: {item['summary']}"
            tokens = tokenizer.encode(full_text, max_length=512, truncation=True)

            if len(tokens) < 512:
                tokens = tokens + [tokenizer.pad_token_id] * (512 - len(tokens))

            batch_tokens.append(tokens[:512])

        batch_tokens = torch.tensor(batch_tokens, dtype=torch.long).to(device)

        print(f"\n--- BATCH DEBUG---")
        print(f"Batch shape: {batch_tokens.shape}")
        print(f"Unique tokens: {torch.unique(batch_tokens).cpu().tolist()[:20]}")  # First 20 unique
        print(f"Pad token ID: {tokenizer.pad_token_id}")
        print(f"Pad token count: {(batch_tokens == tokenizer.pad_token_id).sum().item()}")
        print(f"Total tokens: {batch_tokens.numel()}")
        print(f"First 10 tokens: {batch_tokens[0, :10].cpu().tolist()}")
        print(f"Last 10 tokens: {batch_tokens[0, -10:].cpu().tolist()}")
                # Forward pass
        try:
            logits, loss = model(batch_tokens, targets=batch_tokens)
            print(f"  Forward pass OK, loss={loss.item():.4f}")
        except Exception as e:
            print(f"  Forward failed: {e}")
            break

        # Backward pass
        optimizer.zero_grad()

        try:
            loss.backward()
            print(f"  Backward pass OK")
        except Exception as e:
            print(f"Backward failed: {e}")
            break

        # Added Gradient check
        for name, param in model.named_parameters():
            if param.grad is not None:
                if torch.isnan(param.grad).any() or torch.isinf(param.grad).any():
                    print(f"NaN/Inf gradient in {name}")
                    break

        try:
            optimizer.step()
            print(f" Optimizer step OK")
        except Exception as e:
            print(f"Optimizer step failed: {e}")
            break

        if (i // BATCH_SIZE) % 50 == 0:
            print(f"Batch {i//BATCH_SIZE}/{len(data)//BATCH_SIZE}, Loss: {loss.item():.4f}")

In [None]:
import joblib

# Suppose your model is called `model`
joblib.dump(model, "v1.pkl")