In [8]:
#importing libraries and dependencies
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk
import torch
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset = load_dataset("gopalkalpande/bbc-news-summary")#flytech/python-codes-25k #d0rj/wikisum

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a pad token by default

# Formatting function
def format_prompt(example):
    article = example['Articles']
    summary = example['Summaries']

    full_text = article + tokenizer.eos_token + summary
    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=1024,
    )
    labels = tokenized["input_ids"][:]

    # Mask out the article portion from the loss computation
    article_ids = tokenizer(article, truncation=True, max_length=1024, add_special_tokens=False)["input_ids"]
    article_len = len(article_ids)

    if article_len < len(labels):
        labels[:article_len] = [-100] * article_len
    else:
        labels = [-100] * len(labels)

    tokenized["labels"] = labels
    return tokenized

# Apply formatting to the dataset
# tokenized_dataset = dataset.map(format_prompt, batched=False)


In [5]:
# tokenized_dataset.save_to_disk("/content/drive/MyDrive/tokenized_dataset_GPT")

Saving the dataset (0/1 shards):   0%|          | 0/2224 [00:00<?, ? examples/s]

In [10]:
#loading tokernizer
# from google.colab import drive
# drive.mount('/content/drive')
tokenized_dataset = load_from_disk("/content/drive/MyDrive/tokenized_dataset_GPT")

In [11]:
from sklearn.model_selection import train_test_split
split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']


In [12]:
model_name = "openai-community/gpt2-medium" # openai-community/gpt2
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

In [26]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Language modeling task
    inference_mode=False,          # Training mode
    r=4,                          # Rank of LoRA matrices (tunable)
    lora_alpha=32,                # Scaling factor (tunable)
    lora_dropout=0.1,             # Dropout rate (tunable)
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Data collator for language modeling, with padding and masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM doesn't use masked language modeling
)



In [15]:
import torch
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [None]:
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/lora_gpt2_medium",
    overwrite_output_dir=True,
    eval_strategy="steps",
    eval_steps=1000,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=3,
    gradient_accumulation_steps=1,
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    dataloader_num_workers=4,
    num_train_epochs=15,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=True,  # Use mixed precision if supported
    load_best_model_at_end=True,
    report_to="none",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Start training
trainer.train(resume_from_checkpoint=latest_checkpoint)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
2000,2.1348,2.101029
2500,2.1145,2.085284
3000,2.0948,2.077193
3500,2.0866,2.07375
4000,2.0832,2.070137
4500,2.0929,2.067148
5000,2.0932,2.065789
5500,2.0459,2.064115
6000,2.0737,2.063767
6500,2.0917,2.062529




In [None]:
metrics = trainer.evaluate()
print(metrics)

In [27]:
import os

def get_latest_checkpoint(path):
    if not os.path.exists(path):
        return None
    checkpoints = [os.path.join(path, d) for d in os.listdir(path) if d.startswith("checkpoint")]
    if not checkpoints:
        return None
    return max(checkpoints, key=os.path.getmtime)

latest_checkpoint = get_latest_checkpoint("/content/drive/MyDrive/lora_gpt2_medium")


In [None]:
##saving the model
save_path = "/content/drive/MyDrive/GPT-Summarize"
# Save model and tokenizer

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# Merge LoRA weights into base model
save_path = "/content/drive/MyDrive/GPT-Summarize"
model = model.merge_and_unload()

# Save the full merged model
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


In [29]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from transformers import DataCollatorForLanguageModeling

checkpoint_path = "/content/drive/MyDrive/lora_gpt2_medium/checkpoint-7000"
model_name = "openai-community/gpt2-medium"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Reapply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
)
model = get_peft_model(base_model, lora_config)

# Load checkpoint weights into LoRA model
model = PeftModel.from_pretrained(model, checkpoint_path)

# Save for future use
save_path = "/content/drive/MyDrive/GPT-Summarize"
# model.save_pretrained(save_path)
# tokenizer.save_pretrained(save_path)




In [32]:
merged_model = model.merge_and_unload()
merged_model.save_pretrained("GPT-medium-Summarize")
tokenizer.save_pretrained("GPT-medium-Summarize")


('GPT-medium-Summarize/tokenizer_config.json',
 'GPT-medium-Summarize/special_tokens_map.json',
 'GPT-medium-Summarize/vocab.json',
 'GPT-medium-Summarize/merges.txt',
 'GPT-medium-Summarize/added_tokens.json',
 'GPT-medium-Summarize/tokenizer.json')

In [64]:
merged_model.save_pretrained("GPT-medium-Summarize")

In [34]:
tokenizer = AutoTokenizer.from_pretrained('GPT-medium-Summarize')
model = AutoModelForCausalLM.from_pretrained('GPT-medium-Summarize')
model.eval()

Loading adapter weights from GPT-medium-Summarize led to missing keys in the model: transformer.h.0.attn.c_attn.lora_A.default.weight, transformer.h.0.attn.c_attn.lora_B.default.weight, transformer.h.1.attn.c_attn.lora_A.default.weight, transformer.h.1.attn.c_attn.lora_B.default.weight, transformer.h.2.attn.c_attn.lora_A.default.weight, transformer.h.2.attn.c_attn.lora_B.default.weight, transformer.h.3.attn.c_attn.lora_A.default.weight, transformer.h.3.attn.c_attn.lora_B.default.weight, transformer.h.4.attn.c_attn.lora_A.default.weight, transformer.h.4.attn.c_attn.lora_B.default.weight, transformer.h.5.attn.c_attn.lora_A.default.weight, transformer.h.5.attn.c_attn.lora_B.default.weight, transformer.h.6.attn.c_attn.lora_A.default.weight, transformer.h.6.attn.c_attn.lora_B.default.weight, transformer.h.7.attn.c_attn.lora_A.default.weight, transformer.h.7.attn.c_attn.lora_B.default.weight, transformer.h.8.attn.c_attn.lora_A.default.weight, transformer.h.8.attn.c_attn.lora_B.default.weight

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=3072, nx=1024)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1024, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=4, out_features=3072, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inpl

In [67]:


# Input text to summarize
input_text = "Global Technology Sector Faces Widespread Supply Chain Disruptions Amid Rising Geopolitical Tensions The global technology industry is currently grappling with unprecedented supply chain challenges as geopolitical tensions between key manufacturing regions continue to escalate. Major producers of semiconductors, electronics components, and raw materials have experienced significant delays and shortages, affecting production schedules across a broad range of tech companies. Industry leaders attribute these disruptions to a combination of factors, including trade restrictions, increased tariffs, and strained diplomatic relations between several countries critical to the manufacturing ecosystem. The semiconductor shortage, which began during the COVID-19 pandemic, has been exacerbated by these political developments, leading to prolonged wait times and higher costs for essential components. In response, many technology firms are accelerating efforts to diversify their supply chains. This includes sourcing materials from alternative countries, investing in new manufacturing plants domestically, and building strategic reserves of key components. Governments are also stepping in, offering incentives and subsidies aimed at boosting local production capabilities to reduce dependency on foreign suppliers. Analysts warn that unless these issues are addressed, the industry may face slowed innovation cycles and increased prices for consumers. The impact is expected to extend beyond consumer electronics, affecting automotive manufacturing, telecommunications infrastructure, and renewable energy technologies. Despite these challenges, some companies see an opportunity to strengthen resilience and foster innovation by rethinking traditional supply chain models. Industry experts suggest that digitalization, automation, and closer collaboration between stakeholders will be critical in navigating the evolving landscape."

# Tokenize
inputs = tokenizer(input_text, return_tensors="pt")

# Generate summary tokens
outputs = model.generate(
    **inputs
    # max_new_tokens=300,
    # min_length=50,
    # # length_penalty=0.01,
    # # num_beams=5,
    # early_stopping=True
    )

# Decode generated tokens (skip prompt tokens)
summary = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

print("Summary:", summary.strip())

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Summary: The global technology sector is currently grappling with unprecedented supply chain challenges as geopolitical tensions between key manufacturing


In [66]:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-medium")
model = PeftModel.from_pretrained(base_model, "deDgod/gpt2-medium-lora-news-summary")

adapter_config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

