<a href="https://colab.research.google.com/github/Dorin-Irimia/LLM_Google_Colab/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Antrenarea unui LLM pentru a putea redacta teste pe baza unor cerinte.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Instalarea Pachetelor Necesare

In [2]:
# Instalează dependințele necesare
!pip install torch transformers peft bitsandbytes accelerate datasets pandas openpyxl


Collecting bitsandbytes
  Using cached bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Using cached datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3

## Îmbinarea Dataset-urilor (Requirements + Tests)

In [22]:
import pandas as pd
import json

# ✅ Încarcă fișierele Excel
req_file = "Requirements.xlsx"
test_file = "Test_cases.xlsx"

df_req = pd.read_excel(req_file)
df_tests = pd.read_excel(test_file)

# ✅ Convertim numele coloanelor pentru consistență (eliminăm spații suplimentare)
df_req.columns = df_req.columns.str.strip()
df_tests.columns = df_tests.columns.str.strip()

print("Coloanele din df_tests:")
print(df_tests.columns)


print("Coloanele din df_req:")
print(df_req.columns)


# ✅ Unim requirement-urile cu testele pe "Requirement ID"
df_merged = df_tests.merge(df_req, on="ID_REQ", how="left")

print("Coloanele din df_merged:")
print(df_merged.columns)


# ✅ Conversie la JSONL pentru fine-tuning
data = []
for _, row in df_merged.iterrows():
    prompt = f"ID_REQ: {row['Requirment_Description']}\n {row['Test_Precondition']}\nGenerate a system test for this requirement:"
    response = f"System Test: {row['Test_Description']} (covers requirement ID: {row['ID_REQ']})"
    data.append({"prompt": prompt, "response": response})

# ✅ Salvăm dataset-ul final
jsonl_path = "dataset.jsonl"
with open(jsonl_path, "w", encoding="utf-8") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print(f"✅ Dataset-ul pentru AI a fost creat: {jsonl_path}")


Coloanele din df_tests:
Index(['ID_REQ', 'ID_TEST', 'Test_Description', 'Test_Precondition',
       'Test_Steps', 'Expected_Result', 'Status', 'Functionality'],
      dtype='object')
Coloanele din df_req:
Index(['ID_REQ', 'Requirment_Description', 'Status', 'ID_TEST',
       'Functionality'],
      dtype='object')
Coloanele din df_merged:
Index(['ID_REQ', 'ID_TEST_x', 'Test_Description', 'Test_Precondition',
       'Test_Steps', 'Expected_Result', 'Status_x', 'Functionality_x',
       'Requirment_Description', 'Status_y', 'ID_TEST_y', 'Functionality_y'],
      dtype='object')
✅ Dataset-ul pentru AI a fost creat: dataset.jsonl


## Încărcarea Dataset-ului (Excel → JSONL)


In [3]:
import pandas as pd
import json

# Încarcă fișierul Excel
df = pd.read_excel("dataset.xlsx")

# Conversie la JSONL
data = []
for _, row in df.iterrows():
    prompt = f"Requirement: {row['Requirement']}\nGenerate a system test for this requirement:"
    response = f"System Test: {row['Test Case']} (covers requirement ID: {row['Requirement ID']})"
    data.append({"prompt": prompt, "response": response})

# Salvare dataset
with open("dataset.jsonl", "w", encoding="utf-8") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print("✅ Dataset salvat ca dataset.jsonl!")


✅ Dataset salvat ca dataset.jsonl!


# Incercare invatare direct din excel

In [44]:
# Importă bibliotecile necesare
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os  # Import os for environment variable

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Enable synchronous CUDA error reporting

# Încărcarea fișierelor Excel
requirements_df = pd.read_excel('Requirements.xlsx')
tests_df = pd.read_excel('Test_cases.xlsx')

# Verifică structura fișierelor încărcate
print(requirements_df.head())
print(tests_df.head())

# Preprocesarea datelor
requirements_df = requirements_df.rename(columns={'Requirment_Description': 'description'})

# Adaugă coloanele necesare din tests_df pentru a forma un string de etichete
tests_df['full_test'] = tests_df['Test_Description'] + " | " + tests_df['Test_Precondition'] + " | " + tests_df['Test_Steps'] + " | " + tests_df['Expected_Result']

# Encodarea etichetelor
label_encoder = LabelEncoder()

# --- ADJUSTMENT ---
# Fit the label encoder on all possible values before transforming
all_test_cases = tests_df['full_test'].tolist() + ['NO_TEST_CASE']
label_encoder.fit(all_test_cases)

# Align DataFrames based on 'ID_REQ' and then assign labels
merged_df = pd.merge(requirements_df, tests_df[['ID_REQ', 'full_test']], on='ID_REQ', how='left')

# Handle NaN values by assigning a default label ('NO_TEST_CASE')
merged_df['full_test'] = merged_df['full_test'].fillna('NO_TEST_CASE')  # Assign a placeholder for NaN

requirements_df = merged_df[['ID_REQ', 'description']]  # Update requirements_df with aligned data
requirements_df['label'] = label_encoder.transform(merged_df['full_test'])  # Transform using updated encoder
# --- END ADJUSTMENT ---

train_df, val_df = train_test_split(requirements_df, test_size=0.2, random_state=42)

# Încărcarea tokenizer-ului și modelului
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- ADJUSTMENT ---
# Set num_labels to the total number of unique labels
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
# --- END ADJUSTMENT ---

# Tokenizarea datelor
train_encodings = tokenizer(list(train_df['description']), truncation=True, padding=True)
val_encodings = tokenizer(list(val_df['description']), truncation=True, padding=True)

# Crearea dataset-ului
class RequirementsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = RequirementsDataset(train_encodings, train_df['label'].tolist())
val_dataset = RequirementsDataset(val_encodings, val_df['label'].tolist())

# Configurarea antrenamentului
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Antrenarea modelului
trainer.train()

# Funcție pentru generarea testelor
def generate_tests(requirement):
    inputs = tokenizer(requirement, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_test_index = outputs.logits.argmax(dim=1).item()  # Obține indexul testului generat
    return label_encoder.inverse_transform([predicted_test_index])[0]  # Returnează descrierea testului

# Generarea testelor pentru requirement-uri
new_requirements = requirements_df['description'].tolist()
generated_tests = [generate_tests(req) for req in new_requirements]

# Crearea unui DataFrame pentru rezultatele generate
output_df = pd.DataFrame({
    'Requirement ID': requirements_df['ID_REQ'],
    'Generated Test Description': generated_tests  # Descrierea generată a testului
})

# Salvarea rezultatelor într-un fișier Excel
output_df.to_excel('generated_tests.xlsx', index=False)

print("Testele generate au fost salvate în generated_tests.xlsx")

   ID_REQ                             Requirment_Description        Status  \
0       1  The LIGHT_STATUS shall be set to ON if the LIG...  covered_less   
1       2  The ENGINE_STATUS shall be set to RUNNING if t...  covered_less   
2       3  The WINDOW_POSITION shall be set to CLOSED if ...  covered_less   
3       4  The HVAC_MODE shall be set to COOLING if the T...  covered_less   
4       5  The BATTERY_CHARGING shall be ENABLED if the C...  covered_less   

   ID_TEST Functionality  
0      1.0        LIGHTS  
1      2.0        ENGINE  
2      3.0        WINDOW  
3      4.0   TEMPERATURE  
4      5.0       BATTERY  
   ID_REQ  ID_TEST                                   Test_Description  \
0       1        1          Verify LIGHT_STATUS Based on LIGHT_SWITCH   
1       2        2  Verify ENGINE_STATUS Based on IGNITION and FUE...   
2       3        3      Verify WINDOW_POSITION Based on WINDOW_SWITCH   
3       4        4              Verify HVAC_MODE Based on TEMPERATURE   
4   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  requirements_df['label'] = label_encoder.transform(merged_df['full_test'])  # Transform using updated encoder
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# New Section

In [6]:
# Google Colab script for fine-tuning BERT on requirements and test cases

# Install necessary packages
!pip install transformers datasets torch pandas openpyxl

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load Excel files
req_file = "Requirements.xlsx"
test_file = "Test_cases.xlsx"
req_df = pd.read_excel(req_file)
test_df = pd.read_excel(test_file)

# Merge requirements with test cases
data = req_df.merge(test_df, on="ID_REQ")

# Prepare data for fine-tuning
def prepare_data(row, label_map):
    return {
        "text": row["Requirment_Description"],
        "label": label_map[row["Test_Description"]]  # Use numeric labels
    }

# Create a label map for numeric labels
unique_labels = data["Test_Description"].unique()
label_map = {label: idx for idx, label in enumerate(unique_labels)}

dataset = [prepare_data(row, label_map) for _, row in data.iterrows()]
dataset = Dataset.from_pandas(pd.DataFrame(dataset))

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"  # Disable wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save model
model.save_pretrained("./bert_finetuned")
tokenizer.save_pretrained("./bert_finetuned")

print("Fine-tuning complete. Model saved.")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


Fine-tuning complete. Model saved.


In [8]:
# Google Colab script for fine-tuning BERT on requirements and test cases

# (Include aici codul anterior de antrenare a modelului)

# Function to generate test cases from requirements
def generate_test_cases(requirements, model, tokenizer):
    model.eval()  # Set the model to evaluation mode
    test_cases = []

    for req in requirements:
        # Tokenize the input requirement
        inputs = tokenizer(req, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Get the model's predictions
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_label = torch.argmax(logits, dim=1).item()

        # Convert the predicted label back to the test description
        test_description = [key for key, value in label_map.items() if value == predicted_label][0]
        test_cases.append(test_description)

    return test_cases

# Load the requirements to generate test cases for
new_req_df = pd.read_excel("Requirements.xlsx")  # Fișierul cu cerințe noi
requirements = new_req_df["Requirment_Description"].tolist()  # Asigurați-vă că denumirea coloanei este corectă

# Generate test cases
generated_test_cases = generate_test_cases(requirements, model, tokenizer)

# Create a DataFrame for the results
results_df = pd.DataFrame({
    "Requirement": requirements,
    "Generated Test Case": generated_test_cases
})

# Save the results to an Excel file
results_df.to_excel("Generated_Test_Cases.xlsx", index=False)

print("Test cases generated and saved to 'Generated_Test_Cases.xlsx'.")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

# New model from GPT


### 1️⃣ Instalează și importă librăriile

In [9]:
!pip install transformers datasets torch pandas openpyxl

import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset




### 2️⃣ Încarcă datele din Excel

In [17]:
req_file = "Requirements.xlsx"
test_file = "Test_cases.xlsx"
req_df = pd.read_excel(req_file)
test_df = pd.read_excel(test_file)

# Verifică dacă fișierele sunt corect citite
print(req_df.head())
print(test_df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'Requirements.xlsx'

### 3️⃣ Pregătește datele pentru antrenare

In [11]:
# Merge requirements with test cases
data = req_df.merge(test_df, on="ID_REQ")

# Creare label map pentru clasificare
unique_labels = data["Test_Description"].unique()
label_map = {label: idx for idx, label in enumerate(unique_labels)}

def prepare_data(row):
    return {
        "text": row["Requirment_Description"],
        "label": label_map[row["Test_Description"]]
    }

dataset = [prepare_data(row) for _, row in data.iterrows()]
dataset = Dataset.from_pandas(pd.DataFrame(dataset))

# Verifică primele date pregătite
print(dataset[0])


{'text': 'The LIGHT_STATUS shall be set to ON if the LIGHT_SWITCH is ON, else OFF.', 'label': 0}


### 4️⃣ Încarcă modelul și tokenizer-ul

In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Verifică dacă tokenizarea funcționează
print(tokenized_datasets[0])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

{'text': 'The LIGHT_STATUS shall be set to ON if the LIGHT_SWITCH is ON, else OFF.', 'label': 0, 'input_ids': [101, 1996, 2422, 1035, 3570, 4618, 2022, 2275, 2000, 2006, 2065, 1996, 2422, 1035, 6942, 2003, 2006, 1010, 2842, 2125, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### 5️⃣ Antrenează modelul

In [13]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"  # Dezactivează wandb
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=6, training_loss=2.4154300689697266, metrics={'train_runtime': 18.8382, 'train_samples_per_second': 1.593, 'train_steps_per_second': 0.319, 'total_flos': 7893898629120.0, 'train_loss': 2.4154300689697266, 'epoch': 3.0})

### 6️⃣ Salvează modelul antrenat

In [14]:
model.save_pretrained("./bert_finetuned")
tokenizer.save_pretrained("./bert_finetuned")

print("Model salvat cu succes!")


Model salvat cu succes!


### 7️⃣ Rulează inferența pe un fișier nou

In [16]:
# Încarcă modelul antrenat
model = BertForSequenceClassification.from_pretrained("./bert_finetuned")
tokenizer = BertTokenizer.from_pretrained("./bert_finetuned")

def generate_test_cases(requirements):
    model.eval()
    test_cases = []

    for req in requirements:
        inputs = tokenizer(req, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_label = torch.argmax(logits, dim=1).item()

        test_description = [key for key, value in label_map.items() if value == predicted_label][0]
        test_cases.append(test_description)

    return test_cases

# Încarcă fișierul cu cerințe noi
new_req_df = pd.read_excel("Requirements.xlsx")
requirements = new_req_df["Requirment_Description"].tolist()

# Generează testele
generated_test_cases = generate_test_cases(requirements)

# Salvează în Excel
results_df = pd.DataFrame({"Requirement": requirements, "Generated Test Case": generated_test_cases})
results_df.to_excel("Generated_Test_Cases.xlsx", index=False)

print("Fișierul 'Generated_Test_Cases.xlsx' a fost creat!")


Fișierul 'Generated_Test_Cases.xlsx' a fost creat!
