In [2]:
from datasets import load_dataset

# Read every record of the JSONL file into a single "train" split.
dataset = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/Hackathon/balanced.jsonl",
    split="train",
)

# Stage 1: hold out 20% of the rows; the remaining 80% is the training set.
train_temp = dataset.train_test_split(test_size=0.2, seed=42)

# Stage 2: cut the held-out 20% in half -> validation (10%) and test (10%).
val_test = train_temp["test"].train_test_split(test_size=0.5, seed=42)

train_dataset, val_dataset, test_dataset = (
    train_temp["train"],
    val_test["train"],
    val_test["test"],
)

# Report the size of each split.
for split_label, split_rows in (
    ("Train:", train_dataset),
    ("Val:", val_dataset),
    ("Test:", test_dataset),
):
    print(split_label, len(split_rows))


Generating train split: 0 examples [00:00, ? examples/s]

Train: 3449
Val: 431
Test: 432


In [3]:
from transformers import T5Tokenizer

# Hugging Face checkpoint identifier; the same name is passed to the model loader in a later cell.
model_name = "google/flan-t5-base"
# Tokenizer loaded for `model_name`; `preprocess` below uses it for both inputs and labels.
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize one batch of records into model inputs and labels.

    Args:
        batch: Mapping with a ``"description"`` and a ``"summary"`` entry,
            each holding the values for the records in the batch.

    Returns:
        The tokenizer output for the prefixed descriptions (truncated at 512
        tokens), with the token ids of the summaries (truncated at 64 tokens)
        stored under ``"labels"``.
    """
    # Prepend the task prefix to every description.
    prompts = []
    for description in batch["description"]:
        prompts.append("summarize: " + description)

    encoded = tokenizer(prompts, truncation=True, max_length=512)
    targets = tokenizer(batch["summary"], truncation=True, max_length=64)

    encoded["labels"] = targets["input_ids"]
    return encoded

def _tokenize_split(split):
    """Apply `preprocess` to `split` in batches, dropping all of its original columns."""
    return split.map(preprocess, batched=True, remove_columns=split.column_names)


# Each split is replaced by its tokenized counterpart.
train_dataset = _tokenize_split(train_dataset)
val_dataset = _tokenize_split(val_dataset)
test_dataset = _tokenize_split(test_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/3449 [00:00<?, ? examples/s]

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

In [4]:
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Start from the pretrained checkpoint named by `model_name`.
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Hyperparameters gathered in one mapping, then expanded into TrainingArguments.
training_config = {
    # Where checkpoints are written.
    "output_dir": "/content/drive/MyDrive/Hackathon/jira-summary-model_V1",
    # Evaluate and checkpoint once per epoch; keep the best checkpoint at the end.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    # Optimisation schedule.
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "warmup_steps": 500,
    "num_train_epochs": 6,
    # Batch sizes; gradients are accumulated over 4 steps.
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    # Logging.
    "logging_steps": 50,
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

from transformers import EarlyStoppingCallback

# Early stopping with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.2194,1.89127
2,1.9547,1.797208
3,1.9656,1.740198
4,1.8298,1.722225
5,1.7578,1.702434
6,1.7013,1.701552


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1296, training_loss=1.9813872325567552, metrics={'train_runtime': 2547.0682, 'train_samples_per_second': 8.125, 'train_steps_per_second': 0.509, 'total_flos': 8326769773271040.0, 'train_loss': 1.9813872325567552, 'epoch': 6.0})

In [5]:
# Evaluate on the held-out test split and show the resulting metrics dict.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)

{'eval_loss': 1.7899904251098633, 'eval_runtime': 17.0655, 'eval_samples_per_second': 25.314, 'eval_steps_per_second': 6.329, 'epoch': 6.0}


In [6]:
# # Save model + tokenizer
# model.save_pretrained("/content/drive/MyDrive/Hackathon/jira-summary-model_V1")
# tokenizer.save_pretrained("/content/drive/MyDrive/Hackathon/jira-summary-model_V1")

# Checkpoint directory the trainer recorded as the best one.
best_checkpoint = trainer.state.best_model_checkpoint
print("Best checkpoint:", best_checkpoint)

# Reload that checkpoint's weights and attach them to the trainer.
trainer.model = T5ForConditionalGeneration.from_pretrained(best_checkpoint)

# Export the model and the tokenizer side by side into one directory.
export_dir = "/content/drive/MyDrive/Hackathon/jira-summary-model"
trainer.model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

Best checkpoint: /content/drive/MyDrive/Hackathon/jira-summary-model_V1/checkpoint-1296


('/content/drive/MyDrive/Hackathon/jira-summary-model/tokenizer_config.json',
 '/content/drive/MyDrive/Hackathon/jira-summary-model/special_tokens_map.json',
 '/content/drive/MyDrive/Hackathon/jira-summary-model/spiece.model',
 '/content/drive/MyDrive/Hackathon/jira-summary-model/added_tokens.json')

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import torch

# Reload the exported model and tokenizer from the directory written by the save cell.
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Hackathon/jira-summary-model")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/Hackathon/jira-summary-model")

# Use the GPU when one is present; otherwise stay on the CPU so the cell still
# runs on a CPU-only runtime instead of raising on a hard-coded "cuda".
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(inference_device)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [8]:
# Example issue description used to sanity-check the fine-tuned model.
text = """
Issue is after certificates are installed and listed under tab security ---> X.509 Certificates Install.

These installed certificates are not getting cleared after performing configuration default on device using CLI.

Steps:

1. Perform multiple certificate installation on 4100 device.
2. Verify if they are listed under Security--> X.509 Certificates install tab.
3. Perform configuration default using cli - set configuration default
4. Verify if the certificates are getting cleared. They are intact not cleared.

Please check documents attached.
"""

# Tokenize with the same prefix and the same 512-token truncation that
# `preprocess` applied during training, so long descriptions are handled
# the way the model saw them when it was fine-tuned.
inputs = tokenizer(
    "summarize: " + text,
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(model.device)

# Beam search with 4 beams, capped at 80 generated tokens.
output = model.generate(
    **inputs,
    max_length=80,
    num_beams=4,
    early_stopping=True
)

print(tokenizer.decode(output[0], skip_special_tokens=True))


Certificates are not getting cleared after configuration default on device using CLI


# Finding similar issues

In [9]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to compare summaries.
model_sim = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Load your Jira summaries from JSONL
import json

summaries = []
ids = []

# Read as UTF-8 explicitly so the result does not depend on the platform locale.
with open("/content/drive/MyDrive/Hackathon/existing_issues.jsonl", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise make json.loads raise.
        if not line:
            continue
        item = json.loads(line)
        summaries.append(item["summary"])
        ids.append(item["id"])

# Convert all summaries to embeddings; row i corresponds to ids[i] / summaries[i].
jira_embeddings = model_sim.encode(summaries, convert_to_tensor=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [26]:
import torch
from sentence_transformers.util import cos_sim
from tabulate import tabulate

# Read a free-text issue description from the user.
print("Entered description: ")
desc = input()

# Tokenize with the same prefix and 512-token truncation used by `preprocess`
# during training, so over-long descriptions are cut the same way.
inputs = tokenizer(
    "summarize: " + desc,
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(model.device)

# Beam search with 4 beams, capped at 80 generated tokens.
output = model.generate(
    **inputs,
    max_length=80,
    num_beams=4,
    early_stopping=True
)
new_summary = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated jira summary:\n", new_summary)

# Embed the generated summary and score it against every stored issue summary.
query_embedding = model_sim.encode([new_summary], convert_to_tensor=True)
scores = cos_sim(query_embedding, jira_embeddings)[0]

# torch.topk raises when k exceeds the number of scores, so never request
# more results than there are stored issues.
top_k = min(3, len(summaries))
top_results = torch.topk(scores, k=top_k)

# Convert to plain Python floats/ints before indexing the lists and formatting.
rows = [
    [ids[idx], summaries[idx], f"{score:.4f}"]
    for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist())
]

print("\nExisting similar JIRA issues")
print(tabulate(rows, headers=["Issue ID", "Summary", "Similarity"], tablefmt="github"))


Entered description: 
Issue is after certificates are installed and listed under tab security ---> X.509 Certificates Install.  These installed certificates are not getting cleared after performing configuration default on device using CLI.  Steps:  1. Perform multiple certificate installation on 4100 device. 2. Verify if they are listed under Security--> X.509 Certificates install tab. 3. Perform configuration default using cli - set configuration default 4. Verify if the certificates are getting cleared. They are intact not cleared.  Please check documents attached.

Generated jira summary:
 Certificates are not getting cleared after configuration default on device using CLI

Existing similar JIRA issues
| Issue ID    | Summary                                                                                                                            |   Similarity |
|-------------|---------------------------------------------------------------------------------------------------------