In [None]:
# Install 'transformers' for the BERT model, 'datasets' for data handling,
# and 'evaluate' for performance metrics.
!pip install transformers datasets evaluate -q

In [None]:
from google.colab import files
import io
import csv
import re
import pandas as pd

# Prompt for the dataset. The loading cell later reads the file content back
# out of `uploaded` using the name stored in `file_name`.
print("Please upload your 'FR_NFR_Dataset.xlsx - datasetFR_NFR_full.csv' file.")
uploaded = files.upload()

if not uploaded:
    print("\nNo file was uploaded. Please run the cell again to upload.")
else:
    # Only the first uploaded entry is used by the next cell.
    file_name = list(uploaded)[0]
    print(f"\nSuccessfully uploaded '{file_name}'")

In [None]:
import pandas as pd
from datasets import Dataset
import re

# Integer class ids used as training targets.
label2id = {"FR": 0, "NFR": 1}

# Leading bytes of Excel workbooks: .xlsx is a ZIP archive ("PK.."),
# legacy .xls is an OLE2 compound file.
_EXCEL_SIGNATURES = (b"PK\x03\x04", b"\xd0\xcf\x11\xe0")


def read_uploaded_table(raw_bytes):
    """Parse the uploaded file content into a DataFrame.

    The upload prompt names a '.csv' export while the data may really be an
    Excel workbook, so the format is decided from the content instead of the
    file name: content starting with an Excel signature is read with
    ``pd.read_excel``; anything else is read with ``pd.read_csv``.

    Args:
        raw_bytes: Raw bytes of the uploaded file.

    Returns:
        The parsed table as a pandas DataFrame.
    """
    buffer = io.BytesIO(raw_bytes)
    if raw_bytes.startswith(_EXCEL_SIGNATURES):
        return pd.read_excel(buffer)
    return pd.read_csv(buffer)


def clean_requirements_frame(raw_df):
    """Reduce a raw table to the 'text' / 'label' columns used for training.

    The first column is taken as the requirement text and the second as the
    requirement type; any further columns are ignored. The type is normalised
    by extracting 'FR' or 'NFR' (case-insensitive) and mapping it through
    ``label2id``. Rows with missing text or an unrecognised type are dropped.

    Args:
        raw_df: DataFrame with at least two columns.

    Returns:
        A new DataFrame with a string 'text' column, an integer 'label'
        column, and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``raw_df`` has fewer than two columns.
    """
    if raw_df.shape[1] < 2:
        raise ValueError(
            "Expected at least two columns (requirement text, requirement type)."
        )

    # Work on a copy of the first two columns so the caller's frame is untouched
    # and extra columns no longer break the renaming.
    frame = raw_df.iloc[:, :2].copy()
    frame.columns = ['text', 'label_str']

    label_tokens = (
        frame['label_str']
        .str.extract(r'(FR|NFR)', expand=False, flags=re.IGNORECASE)
        .str.upper()
    )
    frame['label'] = label_tokens.map(label2id)

    # Drop missing values BEFORE casting text to str; casting first would turn
    # missing cells into the literal string "nan" and keep them as examples.
    frame = frame.dropna(subset=['text', 'label'])

    return (
        frame.assign(
            text=frame['text'].astype(str),
            label=frame['label'].astype(int),
        )
        .loc[:, ['text', 'label']]
        .reset_index(drop=True)
    )


if 'file_name' in locals() and file_name in uploaded:
    try:
        df = clean_requirements_frame(read_uploaded_table(uploaded[file_name]))

        print(f"Successfully loaded and cleaned {len(df)} records from the uploaded file.")
        print("\n--- Data Preview ---")
        print(df.head())

    except Exception as e:
        # Kept broad on purpose: report the problem in the cell output so the
        # user can re-upload, rather than stopping with a traceback.
        print(f"An error occurred while processing the uploaded file: {e}")
        print("Please ensure you have uploaded the correct '.xlsx' or '.csv' file.")
else:
    print("Data not loaded. Please ensure you uploaded a file in Cell 2.")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

if 'df' in locals() and not df.empty:
    # Convert the pandas DataFrame to a Hugging Face Dataset and split it
    # 80% train / 20% test with a fixed seed so the split is reproducible.
    # preserve_index=False: rows dropped during cleaning leave a non-range
    # index, which would otherwise be exported as an extra
    # '__index_level_0__' column next to 'text' and 'label'.
    hg_dataset = Dataset.from_pandas(df, preserve_index=False).train_test_split(
        test_size=0.2, seed=42
    )
    print("--- Dataset Structure ---")
    print(hg_dataset)

    # Load the pre-trained tokenizer matching the model fine-tuned later.
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        """Tokenize the 'text' field of a batch of examples.

        Every sequence is padded to the maximum length and longer ones are
        truncated, so all examples end up with the same length.
        """
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Apply the tokenization to both splits, in batches.
    tokenized_datasets = hg_dataset.map(tokenize_function, batched=True)
    print("\nTokenization complete.")
    print("\n--- Sample Tokenized Record ---")
    print(tokenized_datasets['train'][0])
else:
    print("DataFrame 'df' not found or is empty. Please check the previous cell for errors.")

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import evaluate

if 'tokenized_datasets' not in locals():
    print("Tokenized dataset not found. Please run the previous cells successfully.")
else:
    # Pre-trained DistilBERT set up for two-label sequence classification,
    # with label names attached to the model configuration.
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        id2label={0: "FR", 1: "NFR"},
        label2id={"FR": 0, "NFR": 1},
        num_labels=2,
    )

    # Training hyperparameters, collected in one mapping for easy tuning.
    training_config = dict(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        # Evaluate and checkpoint once per epoch; the best checkpoint is
        # loaded back into the model when training ends.
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    training_args = TrainingArguments(**training_config)

    # Accuracy is the metric reported at each evaluation.
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Compute accuracy from a (logits, labels) evaluation pair.

        The predicted class of each example is the argmax over the last
        axis of the logits.
        """
        scores, gold = eval_pred
        predicted = np.argmax(scores, axis=-1)
        return metric.compute(predictions=predicted, references=gold)

    # Wire model, arguments, data splits and metric together.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
    )

    print("--- Starting Model Training ---")
    trainer.train()
    print("--- Training Complete ---")

In [None]:
from transformers import pipeline

if 'trainer' in locals():
    # Path of the best checkpoint recorded by the trainer during training.
    best_model_path = trainer.state.best_model_checkpoint

    if best_model_path is not None:
        print(f"Loading best model from: {best_model_path}")
        model_source = best_model_path
    else:
        # `trainer` exists even when training was interrupted or never run, in
        # which case no checkpoint is recorded. Passing model=None would make
        # pipeline() load the task's default model instead of ours, so fall
        # back to the model object held by the trainer.
        print("No best checkpoint recorded; using the model currently held by the trainer.")
        model_source = trainer.model

    classifier = pipeline("text-classification", model=model_source, tokenizer=tokenizer)

    def show_prediction(text):
        """Classify one requirement, print its top label and score, and return the raw result."""
        result = classifier(text)
        print(f"\nText: '{text}'")
        print(f"Prediction: {result[0]['label']} (Score: {result[0]['score']:.4f})")
        return result

    # Example 1: Should be Functional (FR)
    text_fr = "The system shall email a confirmation link to the user upon registration."
    result_fr = show_prediction(text_fr)

    # Example 2: Should be Non-Functional (NFR)
    text_nfr = "The user interface should be intuitive and require minimal training."
    result_nfr = show_prediction(text_nfr)
else:
    print("Trainer not found. Please complete the training in the previous cell.")