In [None]:
!pip install transformers torch pytorch-tabular scikit-learn pandas
!pip install pytorch-tabular


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# ----------------------------------------------------
# Load Dataset
# ----------------------------------------------------
df = pd.read_csv("desharnais.csv")
df = df.rename(columns={'PointsAjust': 'PointsAdjust'})

# Keep relevant features
features = [
    "TeamExp", "ManagerExp", "Length", "Transactions",
    "Entities", "PointsAdjust", "Language"
]
target = "Effort"
df = df[features + [target]]

# ----------------------------------------------------
# Log-transform continuous features
# ----------------------------------------------------
cols_to_log = ["Effort", "Length", "Transactions", "Entities", "PointsAdjust"]
df[cols_to_log] = np.log(df[cols_to_log])

# ----------------------------------------------------
# Normalize selected features
# ----------------------------------------------------
cols_to_norm = ["TeamExp", "ManagerExp", "Language"]
df[cols_to_norm] = StandardScaler().fit_transform(df[cols_to_norm])

# ----------------------------------------------------
# Create natural-language descriptions for transformers
# ----------------------------------------------------
def row_to_text(row):
    text = (
        f"Team experience {row['TeamExp']}, "
        f"Manager experience {row['ManagerExp']}, "
        f"Project length {row['Length']}, "
        f"Transactions {row['Transactions']}, "
        f"Entities {row['Entities']}, "
        f"Adjusted function points {row['PointsAdjust']}, "
        f"Language type {row['Language']}."
    )
    return text

df["text"] = df.apply(row_to_text, axis=1)

# ----------------------------------------------------
# Define metrics
# ----------------------------------------------------
def compute_metrics(y_true_log, y_pred_log):
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)

    mse = mean_squared_error(y_true_log, y_pred_log)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    mre = np.abs((y_true - y_pred) / y_true)
    mmre = np.mean(mre)
    pred_25 = np.mean(mre <= 0.25)

    return mse, rmse, mae, r2, mmre, pred_25

# ----------------------------------------------------
# PyTorch Dataset for text-based transformers
# ----------------------------------------------------
class TextRegDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tok = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        return {
            "input_ids": tok["input_ids"].squeeze(),
            "attention_mask": tok["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

from transformers import GPT2Model
class GPT2ForRegression(torch.nn.Module):
    def __init__(self, pretrained_model="gpt2"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model)
        self.regressor = torch.nn.Linear(self.gpt2.config.n_embd, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        pooled = last_hidden[:, 0, :]           # use first token for regression
        return self.regressor(pooled)


# ----------------------------------------------------
# Transformer training function (MSELoss)
# ----------------------------------------------------
def train_transformer(model, train_loader, epochs=10):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # If outputs is a tensor (like GPT-2 regression), use it directly
            if isinstance(outputs, torch.Tensor):
                logits = outputs.view(-1)
            else:  # for BERT/RoBERTa
                logits = outputs.logits.view(-1)

            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        #print(f"Epoch {epoch+1}, Train Loss = {total_loss:.4f}")

# ----------------------------------------------------
# Prepare for 5-Fold Cross-Validation
# ----------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Define models and tokenizers
gpt2_model = GPT2ForRegression(pretrained_model="gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # GPT-2 requires pad token

transformer_models = {
    "BERT-base": (BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1),
                  BertTokenizer.from_pretrained("bert-base-uncased")),
    "RoBERTa-base": (RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1),
                     RobertaTokenizer.from_pretrained("roberta-base")),
    "GPT-2": (gpt2_model, gpt2_tokenizer)
}


# GPT-2 padding
transformer_models["GPT-2"][1].pad_token = transformer_models["GPT-2"][1].eos_token
#transformer_models["GPT-2"][0].resize_token_embeddings(len(transformer_models["GPT-2"][1]))

# ----------------------------------------------------
# 5-Fold CV for text-based transformers
# ----------------------------------------------------
for name, (model, tokenizer) in transformer_models.items():
    print(f"\nRunning 5-Fold CV for {name}...")
    fold_metrics = []

    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index]
        test_df  = df.iloc[test_index]

        train_dataset = TextRegDataset(train_df["text"].tolist(), train_df["Effort"].tolist(), tokenizer)
        test_dataset  = TextRegDataset(test_df["text"].tolist(),  test_df["Effort"].tolist(), tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        test_loader  = DataLoader(test_dataset, batch_size=8)

        # Train model
        train_transformer(model, train_loader, epochs=10)

        # Evaluation
        model.eval()
        preds, trues = [], []
        device = "cuda" if torch.cuda.is_available() else "cpu"
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # If outputs is a tensor (like GPT-2 regression), use it directly
                if isinstance(outputs, torch.Tensor):
                    logits = outputs.view(-1)
                else:  # for BERT/RoBERTa
                    logits = outputs.logits.view(-1)

                if logits.dim() == 0:
                    logits = logits.unsqueeze(0)
                preds.extend(logits.cpu().numpy())
                trues.extend(labels.cpu().numpy())

        fold_metrics.append(compute_metrics(np.array(trues), np.array(preds)))

    avg_metrics = np.mean(fold_metrics, axis=0)
    results.append({
        "Model": name,
        "MSE": avg_metrics[0],
        "RMSE": avg_metrics[1],
        "MAE": avg_metrics[2],
        "R²": avg_metrics[3],
        "MMRE": avg_metrics[4],
        "Pred(25)": avg_metrics[5]
    })

# Compute average metrics
avg_metrics = np.mean(fold_metrics, axis=0)


# Display Results
results_df = pd.DataFrame(results)
print("\n===== FINAL COMPARISON TABLE =====\n")
print(results_df.sort_values("RMSE"))




Running 5-Fold CV for RoBERTa-base...

Running 5-Fold CV for GPT-2...

===== FINAL COMPARISON TABLE =====

          Model       MSE      RMSE       MAE        R²      MMRE  Pred(25)
0     BERT-base  0.794309  0.838325  0.706915 -0.166370  0.669519  0.199265
1  RoBERTa-base  0.751170  0.853889  0.671356 -0.177613  1.064406  0.245588
2         GPT-2  1.425389  1.116220  0.953546 -0.973379  1.157639  0.147794


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# -----------------------------
# Load and preprocess COCOMO-81
# -----------------------------
cocomo = pd.read_csv("COCOMO-81.csv")

# If column names differ, adjust these names accordingly:
# assumed columns: 'actual' (effort), 'loc', plus cost drivers like 'rely','data','cplx',... and 'dev_mode'
# encode dev_mode categories (embedded, semidetached, organic)
if "dev_mode" in cocomo.columns:
    cocomo["dev_mode"] = cocomo["dev_mode"].astype("category").cat.codes

# Log-transform skewed columns (target 'actual' and optionally 'loc')
cols_to_log_transform = [c for c in ["actual", "loc"] if c in cocomo.columns]
cocomo[cols_to_log_transform] = np.log(cocomo[cols_to_log_transform])

# Columns to normalize (use the same set you used before; keep only those present)
cols_to_normalize = [
    'rely','data','cplx','time','stor','virt','turn',
    'acap','aexp','pcap','vexp','lexp','modp','tool','sced',
    'dev_mode'
]
cols_to_normalize = [c for c in cols_to_normalize if c in cocomo.columns]

# Standard scale the normalized columns
if cols_to_normalize:
    cocomo[cols_to_normalize] = StandardScaler().fit_transform(cocomo[cols_to_normalize])

# Select final features for the text description (choose a compact set)
# We include 'loc' (log), normalized cost drivers, and dev_mode if available.
text_feature_cols = []
# include 'loc' if available (already log-transformed)
if "loc" in cocomo.columns:
    text_feature_cols.append("loc")
# include a few cost drivers (use whichever exist)
for c in ['rely','data','cplx','time','stor','acap','aexp','pcap','tool','sced','dev_mode']:
    if c in cocomo.columns:
        text_feature_cols.append(c)

# target column (log-scale)
target_col = "actual"
if target_col not in cocomo.columns:
    raise ValueError(f"Target column '{target_col}' not found in COCOMO file.")

# Keep only needed columns, drop rows with NaNs
keep_cols = text_feature_cols + [target_col]
cocomo = cocomo[keep_cols].dropna().reset_index(drop=True)

# -----------------------------
# Create natural-language text for each row
# -----------------------------
def row_to_text(row):
    parts = []
    for col in text_feature_cols:
        val = row[col]
        # format numeric nicely
        parts.append(f"{col} {float(np.round(val, 4))}")
    return ", ".join(parts) + "."

cocomo["text"] = cocomo.apply(row_to_text, axis=1)
# target (log-scale)
cocomo["target_log"] = cocomo[target_col]

# -----------------------------
# Metrics (consistent with your COCOMO script)
# Inputs to compute_metrics are log-scaled true and predicted values
# -----------------------------
def compute_metrics(y_true_log, y_pred_log):
    # numeric arrays
    y_true_log = np.asarray(y_true_log).reshape(-1)
    y_pred_log = np.asarray(y_pred_log).reshape(-1)

    # log-scale metrics
    mse = mean_squared_error(y_true_log, y_pred_log)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    # convert to real scale for MMRE and Pred(25)
    y_true_real = np.exp(y_true_log)
    y_pred_real = np.exp(y_pred_log)
    mre = np.abs((y_true_real - y_pred_real) / y_true_real)
    mmre = np.mean(mre)
    pred_25 = np.mean(mre <= 0.25)

    return mse, rmse, mae, r2, mmre, pred_25

# -----------------------------
# PyTorch Dataset for transformers
# -----------------------------
class TextRegDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tok = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {
            "input_ids": tok["input_ids"].squeeze(0),
            "attention_mask": tok["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }
        return item

# -----------------------------
# GPT-2 regression wrapper (simple)
# -----------------------------
from transformers import GPT2Model
class GPT2ForRegression(torch.nn.Module):
    def __init__(self, pretrained_model="gpt2"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model)
        hidden_size = self.gpt2.config.n_embd
        self.regressor = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        # use first token ([0]) pooling similar to your previous code
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden]
        pooled = last_hidden[:, 0, :]
        return self.regressor(pooled).view(-1)  # return 1-d tensor per batch element

# -----------------------------
# Training function for any transformer-like model
# -----------------------------
def train_transformer(model, train_loader, epochs=10, lr=2e-5, print_every=10):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for i, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # outputs may be:
            # - a 1-D tensor (GPT2ForRegression returns shape [batch])
            # - a SequenceClassifierOutput with .logits shape [batch, 1]
            if isinstance(outputs, torch.Tensor):
                preds = outputs.view(-1)
            else:
                # e.g., transformers' ForSequenceClassification
                preds = outputs.logits.view(-1)

            loss = loss_fn(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # print epoch summary
        #print(f"Epoch {epoch+1}/{epochs} - Train loss: {total_loss:.4f}")

# -----------------------------
# 5-Fold CV across transformer models
# -----------------------------
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import GPT2Tokenizer

# Initialize models & tokenizers
# NOTE: set num_labels=1 for regression using sequence classification heads
models_and_tokenizers = {}

# BERT
bert_tok = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
models_and_tokenizers["BERT-base"] = (bert_model, bert_tok)

# RoBERTa
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)
models_and_tokenizers["RoBERTa-base"] = (roberta_model, roberta_tok)

# GPT-2 custom regression
gpt2_tok = GPT2Tokenizer.from_pretrained("gpt2")
# GPT2 tokenizer needs pad token defined
if gpt2_tok.pad_token is None:
    gpt2_tok.pad_token = gpt2_tok.eos_token
gpt2_model = GPT2ForRegression(pretrained_model="gpt2")
# Optionally resize embeddings if you've added pad token (not necessary when pad token equals eos_token)
models_and_tokenizers["GPT-2"] = (gpt2_model, gpt2_tok)

# -----------------------------
# 5-Fold cross validation loop
# -----------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, (model, tokenizer) in models_and_tokenizers.items():
    print(f"\n=== Running 5-Fold CV for {name} ===")
    fold_metrics = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(cocomo)):
        train_df = cocomo.iloc[train_idx].reset_index(drop=True)
        test_df  = cocomo.iloc[test_idx].reset_index(drop=True)

        train_dataset = TextRegDataset(train_df["text"].tolist(), train_df["target_log"].tolist(), tokenizer)
        test_dataset  = TextRegDataset(test_df["text"].tolist(),  test_df["target_log"].tolist(), tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        test_loader  = DataLoader(test_dataset, batch_size=16)

        # Train
        train_transformer(model, train_loader, epochs=10, lr=2e-5)

        # Evaluate
        model.eval()
        device = "cuda" if torch.cuda.is_available() else "cpu"
        preds_log, trues_log = [], []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                if isinstance(outputs, torch.Tensor):
                    logits = outputs.view(-1)
                else:
                    logits = outputs.logits.view(-1)

                preds_log.extend(logits.cpu().numpy())
                trues_log.extend(labels.cpu().numpy())

        fold_metrics.append(compute_metrics(np.array(trues_log), np.array(preds_log)))
        print(f"Fold {fold+1} done for {name}.")

    # Average across folds
    avg_metrics = np.mean(fold_metrics, axis=0)
    results.append({
        "Model": name,
        "MSE": avg_metrics[0],
        "RMSE": avg_metrics[1],
        "MAE": avg_metrics[2],
        "R²": avg_metrics[3],
        "MMRE": avg_metrics[4],
        "Pred(25)": avg_metrics[5]
    })

# Present results
results_df = pd.DataFrame(results).sort_values("RMSE")
print("\n===== FINAL COMPARISON TABLE =====\n")
print(results_df)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Running 5-Fold CV for BERT-base ===
Fold 1 done for BERT-base.
Fold 2 done for BERT-base.
Fold 3 done for BERT-base.
Fold 4 done for BERT-base.
Fold 5 done for BERT-base.

=== Running 5-Fold CV for RoBERTa-base ===
Fold 1 done for RoBERTa-base.
Fold 2 done for RoBERTa-base.
Fold 3 done for RoBERTa-base.
Fold 4 done for RoBERTa-base.
Fold 5 done for RoBERTa-base.

=== Running 5-Fold CV for GPT-2 ===
Fold 1 done for GPT-2.
Fold 2 done for GPT-2.
Fold 3 done for GPT-2.
Fold 4 done for GPT-2.
Fold 5 done for GPT-2.

===== FINAL COMPARISON TABLE =====

          Model       MSE      RMSE       MAE        R²      MMRE  Pred(25)
1  RoBERTa-base  1.421528  1.059809  0.848401  0.522409  1.815898  0.192308
0     BERT-base  1.505543  1.145503  0.899501  0.483921  1.582184  0.207692
2         GPT-2  3.565083  1.856593  1.531490 -0.100161  3.879213  0.108974


In [None]:
# nasa_transformers_cv.py
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# -------------------------
# Load NASA93 dataset
# -------------------------
nasa = pd.read_csv("NASA_93_Sheet.csv")

# Selected numeric predictors (same as your ML code)
selected_features = [
    "prec","flex","resl","team","pmat","rely","data","cplx","ruse","docu",
    "time","stor","pvol","acap","pcap","pcon","apex","plex","ltex","tool",
    "site","sced","kloc"
]

missing = [f for f in selected_features if f not in nasa.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Target
target_col = "effort"
if target_col not in nasa.columns:
    raise ValueError(f"Target column '{target_col}' not found in CSV")

# Keep only the chosen columns + target
df = nasa[selected_features + [target_col]].copy()

# -------------------------
# Preprocessing
# -------------------------
# log-transform the target (works well for effort distributions)
df["effort_log"] = np.log(df[target_col].values)

# Scale predictors (fit once on full dataset here prior to CV)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[selected_features].values)
X_scaled_df = pd.DataFrame(X_scaled, columns=selected_features, index=df.index)

# Merge scaled predictors and log-target into df used later
df_scaled = pd.concat([X_scaled_df, df[["effort_log"]]], axis=1)

# -------------------------
# Create natural-language descriptions from numeric features
# -------------------------
# We'll round scaled values to 3 decimals to keep text length reasonable.
def row_to_text(row):
    parts = []
    for feat in selected_features:
        parts.append(f"{feat} {row[feat]:.3f}")
    return ", ".join(parts) + "."

df_scaled["text"] = df_scaled.apply(row_to_text, axis=1)
df_scaled["label"] = df_scaled["effort_log"].values  # label in log-space

# -------------------------
# Metrics helper (same logic as your ML code)
# Inputs: y_true_log, y_pred_log (both in log-space)
# -------------------------
def compute_metrics(y_true_log, y_pred_log):
    # Convert to real space for MMRE/Pred(25)
    y_true_real = np.exp(y_true_log)
    y_pred_real = np.exp(y_pred_log)

    mse = mean_squared_error(y_true_log, y_pred_log)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    mre = np.abs((y_true_real - y_pred_real) / y_true_real)
    mmre = np.mean(mre)
    pred_25 = np.mean(mre <= 0.25)

    return mse, rmse, mae, r2, mmre, pred_25

# -------------------------
# PyTorch Dataset for text-based transformers
# -------------------------
class TextRegDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tok = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": tok["input_ids"].squeeze(0),
            "attention_mask": tok["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

# -------------------------
# Small GPT-2 regression head (like your original)
# -------------------------
from transformers import GPT2Model
class GPT2ForRegression(torch.nn.Module):
    def __init__(self, pretrained_model="gpt2"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model)
        self.regressor = torch.nn.Linear(self.gpt2.config.n_embd, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        pooled = last_hidden[:, 0, :]           # use first token for regression
        return self.regressor(pooled).view(-1)  # return vector [batch]

# -------------------------
# Training loop (handles both HF sequence-classification models and our GPT2ForRegression)
# -------------------------
def train_transformer(model, train_loader, device, epochs=10, lr=2e-5, print_every=1):
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # our GPT2ForRegression returns a tensor [batch]; HF SeqClass returns object with .logits
            if isinstance(outputs, torch.Tensor):
                preds = outputs.view(-1)
            else:
                preds = outputs.logits.view(-1)

            loss = loss_fn(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if (epoch + 1) % print_every == 0:
            avg_loss = total_loss / len(train_loader)
            #print(f"Epoch {epoch+1}/{epochs} — train loss: {avg_loss:.4f}")

# -------------------------
# 5-Fold CV setup and model definitions
# -------------------------
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import GPT2Tokenizer

# instantiate GPT-2 custom model + tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # set pad token
gpt2_model = GPT2ForRegression(pretrained_model="gpt2")
# NOTE: if you add pad_token you'll often want to resize embeddings — not necessary for our custom GPT2Model head,
# but if you use GPT2LMHeadModel you should call resize_token_embeddings.

# BERT & RoBERTa models for regression using seq-classification with num_labels=1
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)

transformer_models = {
    "BERT-base": (bert_model, bert_tokenizer),
    "RoBERTa-base": (roberta_model, roberta_tokenizer),
    "GPT-2": (gpt2_model, gpt2_tokenizer)
}

# ensure GPT-2 tokenizer has pad token set (again)
transformer_models["GPT-2"][1].pad_token = transformer_models["GPT-2"][1].eos_token

# -------------------------
# 5-Fold Cross-Validation (text-based)
# -------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
device = "cuda" if torch.cuda.is_available() else "cpu"

results = []

for name, (model, tokenizer) in transformer_models.items():
    print(f"\n=== Running 5-Fold CV for {name} ===")
    fold_metrics = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(df_scaled), 1):
        print(f"Fold {fold} / 5")

        train_texts = df_scaled.iloc[train_idx]["text"].tolist()
        train_labels = df_scaled.iloc[train_idx]["label"].tolist()
        test_texts  = df_scaled.iloc[test_idx]["text"].tolist()
        test_labels = df_scaled.iloc[test_idx]["label"].tolist()

        train_ds = TextRegDataset(train_texts, train_labels, tokenizer, max_length=128)
        test_ds  = TextRegDataset(test_texts, test_labels, tokenizer, max_length=128)

        train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
        test_loader  = DataLoader(test_ds, batch_size=8)

        # Reset model weights per fold if you want fresh initialization.
        # For HF models we re-load pretrained to reset classifier head; for GPT2 custom we re-init from pretrained.
        # This keeps each fold independent.
        if name == "BERT-base":
            model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
            tokenizer = bert_tokenizer
        elif name == "RoBERTa-base":
            model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)
            tokenizer = roberta_tokenizer
        else:  # GPT-2
            model = GPT2ForRegression(pretrained_model="gpt2")
            tokenizer = gpt2_tokenizer
            tokenizer.pad_token = tokenizer.eos_token

        # Train
        train_transformer(model, train_loader, device=device, epochs=10, lr=2e-5)

        # Evaluate
        model.to(device)
        model.eval()
        preds = []
        trues = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                if isinstance(outputs, torch.Tensor):
                    logits = outputs.view(-1)
                else:
                    logits = outputs.logits.view(-1)

                # ensure shapes
                if logits.dim() == 0:
                    logits = logits.unsqueeze(0)

                preds.extend(logits.cpu().numpy())
                trues.extend(labels.cpu().numpy())

        fold_metrics.append(compute_metrics(np.array(trues), np.array(preds)))

    # average across folds
    avg_metrics = np.mean(fold_metrics, axis=0)
    results.append({
        "Model": name,
        "MSE": avg_metrics[0],
        "RMSE": avg_metrics[1],
        "MAE": avg_metrics[2],
        "R²": avg_metrics[3],
        "MMRE": avg_metrics[4],
        "Pred(25)": avg_metrics[5]
    })

# Final results DataFrame
results_df = pd.DataFrame(results).sort_values("RMSE").reset_index(drop=True)
print("\n===== FINAL 5-FOLD CV RESULTS =====\n")
print(results_df)


Fold 2 / 5
Fold 3 / 5
Fold 4 / 5
Fold 5 / 5

===== FINAL 5-FOLD CV RESULTS =====

          Model       MSE      RMSE       MAE        R²      MMRE  Pred(25)
0  RoBERTa-base  2.277996  1.485058  1.242570 -0.063013  2.915608  0.107018
1     BERT-base  4.266690  2.025075  1.697660 -1.034724  0.936477  0.128070
2         GPT-2  4.765156  2.132437  1.808653 -1.316325  0.994240  0.074854


In [None]:
# maxwell_transformer_cv.py
import os
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# ---------------------------
# Load Maxwell dataset
# ---------------------------
projects = pd.read_csv("maxwell.csv")

# Drop Syear if present (same as your ML code)
if 'Syear' in projects.columns:
    projects.drop(['Syear'], axis=1, inplace=True)

target = 'Effort'
feature_cols = [c for c in projects.columns if c != target]

# ---------------------------
# Log-transform skewed continuous variables (if present)
# ---------------------------
cols_to_log = ['Effort', 'Duration', 'Size', 'Time']
for col in cols_to_log:
    if col in projects.columns:
        # replace 0 with 1 to avoid -inf, then log
        projects[col] = np.log(projects[col].replace(0, 1).astype(float))

# ---------------------------
# Normalize categorical-like integer columns
# ---------------------------
categorical_like = [
    'App','Har','Dba','Ifc','Source','Telonuse','Nlan'
] + [f"T{i:02d}" for i in range(1,16) if f"T{i:02d}" in projects.columns]

# Only scale those that actually exist
categorical_like = [c for c in categorical_like if c in projects.columns]
if categorical_like:
    projects[categorical_like] = StandardScaler().fit_transform(projects[categorical_like])

# Final X, y (y is in log-space already if Effort existed and was transformed)
if target not in projects.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

X = projects.drop(columns=[target])
y = projects[target].values  # will be log(Effort) if Effort was in cols_to_log

# ---------------------------
# Create natural-language descriptions automatically
# ---------------------------
def row_to_text_auto(row, feature_names):
    parts = []
    for f in feature_names:
        # make short readable component
        parts.append(f"{f} {row[f]}")
    return ". ".join(parts) + "."

projects['text'] = projects.apply(lambda r: row_to_text_auto(r, X.columns), axis=1)

# ---------------------------
# Metrics (same structure as original: metrics computed in log-space,
# MMRE/Pred(25) computed in real space after exp)
# ---------------------------
def compute_metrics(y_true_log, y_pred_log):
    """
    Input: arrays in log-space (i.e., log(Effort))
    Returns: mse, rmse, mae, r2 (all on log-space), mmre, pred25 (on real scale)
    """
    # ensure numpy arrays
    y_true_log = np.array(y_true_log).astype(float)
    y_pred_log = np.array(y_pred_log).astype(float)

    # metrics on log-space
    mse = mean_squared_error(y_true_log, y_pred_log)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    # convert to real space for relative error metrics
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    # avoid division by zero (mask where y_true==0)
    eps = 1e-9
    mre = np.abs((y_true - y_pred) / (y_true + eps))
    mmre = np.mean(mre)
    pred_25 = np.mean(mre <= 0.25)

    return mse, rmse, mae, r2, mmre, pred_25

# ---------------------------
# PyTorch Dataset for text
# ---------------------------
class TextRegDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tok = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": tok["input_ids"].squeeze(0),
            "attention_mask": tok["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

# ---------------------------
# GPT-2 regression wrapper
# ---------------------------
from transformers import GPT2Model
class GPT2ForRegression(torch.nn.Module):
    def __init__(self, pretrained_model="gpt2"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model)
        self.regressor = torch.nn.Linear(self.gpt2.config.n_embd, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_size]
        pooled = last_hidden[:, 0, :]           # use first token for regression
        return self.regressor(pooled)           # [batch, 1]

# ---------------------------
# Transformer training function (MSELoss)
# ---------------------------
def train_transformer(model, train_loader, epochs=10, lr=2e-5, device=None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # handle outputs shape:
            # - GPT2ForRegression returns [batch, 1] tensor
            # - HuggingFace SequenceClassification models return ModelOutput with .logits [batch, 1]
            if isinstance(outputs, torch.Tensor):
                logits = outputs.view(-1)
            else:
                # e.g., transformers' ModelOutput
                logits = outputs.logits.view(-1)

            loss = loss_fn(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * input_ids.size(0)

        avg_loss = total_loss / (len(train_loader.dataset) + 1e-9)
        #print(f"Epoch {epoch+1}/{epochs} — Train Loss: {avg_loss:.6f}")

# ---------------------------
# Prepare 5-Fold CV
# ---------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
results = []

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# instantiate models/tokenizers
print("Loading models and tokenizers (this may take a while)...")
# BERT and RoBERTa as regression by using num_labels=1
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# GPT-2 custom regression
gpt2_model = GPT2ForRegression(pretrained_model="gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# ensure pad token exists for GPT-2
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

transformer_models = {
    "BERT-base": (bert_model, bert_tokenizer),
    "RoBERTa-base": (roberta_model, roberta_tokenizer),
    "GPT-2": (gpt2_model, gpt2_tokenizer)
}
# For GPT-2 ensure model's embeddings match tokenizer (optional)
# gpt2_model.gpt2.resize_token_embeddings(len(gpt2_tokenizer))

# ---------------------------
# 5-Fold CV loop
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

for name, (model, tokenizer) in transformer_models.items():
    print(f"\n=== Running 5-Fold CV for {name} ===")
    fold_metrics = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(projects), 1):
        print(f"Fold {fold} — preparing data")
        train_df = projects.iloc[train_idx].reset_index(drop=True)
        test_df = projects.iloc[test_idx].reset_index(drop=True)

        train_texts = train_df['text'].tolist()
        train_labels = train_df[target].tolist()  # log-space labels
        test_texts = test_df['text'].tolist()
        test_labels = test_df[target].tolist()

        train_dataset = TextRegDataset(train_texts, train_labels, tokenizer, max_length=128)
        test_dataset  = TextRegDataset(test_texts, test_labels, tokenizer, max_length=128)

        train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
        test_loader  = DataLoader(test_dataset, batch_size=8)

        # train (note: models are reused across folds, so re-load fresh weights to avoid leakage)
        # Reload model weights from pretrained each fold for fair CV
        if name == "BERT-base":
            model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        elif name == "RoBERTa-base":
            model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1)
            tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        else:  # GPT-2
            model = GPT2ForRegression(pretrained_model="gpt2")
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            tokenizer.pad_token = tokenizer.eos_token

        train_transformer(model, train_loader, epochs=10, lr=2e-5, device=device)

        # evaluation
        model.eval()
        preds, trues = [], []
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                if isinstance(outputs, torch.Tensor):
                    logits = outputs.view(-1)
                else:
                    logits = outputs.logits.view(-1)

                # handle zero-dim tensors
                if logits.dim() == 0:
                    logits = logits.unsqueeze(0)

                preds.extend(logits.cpu().numpy().tolist())
                trues.extend(labels.cpu().numpy().tolist())

        # compute metrics for this fold (inputs are log-space)
        fold_metric = compute_metrics(np.array(trues), np.array(preds))
        print(f"Fold {fold} metrics (MSE,RMSE,MAE,R2,MMRE,Pred25): {np.round(fold_metric,4)}")
        fold_metrics.append(fold_metric)

    # average across folds
    avg_metrics = np.mean(fold_metrics, axis=0)
    results.append({
        "Model": name,
        "MSE": avg_metrics[0],
        "RMSE": avg_metrics[1],
        "MAE": avg_metrics[2],
        "R²": avg_metrics[3],
        "MMRE": avg_metrics[4],
        "Pred(25)": avg_metrics[5]
    })

# results dataframe
results_df = pd.DataFrame(results)
print("\n===== 5-FOLD TRANSFORMER CV RESULTS =====\n")
print(results_df.sort_values("RMSE").reset_index(drop=True))


Fold 4 metrics (MSE,RMSE,MAE,R2,MMRE,Pred25): [ 5.4917  2.3434  2.1397 -5.0115  0.8122  0.    ]
Fold 5 — preparing data
Fold 5 metrics (MSE,RMSE,MAE,R2,MMRE,Pred25): [ 15.5607   3.9447   3.8269 -16.0006   0.9662   0.    ]

===== 5-FOLD TRANSFORMER CV RESULTS =====

          Model        MSE      RMSE       MAE         R²      MMRE  Pred(25)
0  RoBERTa-base   4.350357  2.054900  1.816770  -3.595136  0.736251  0.114103
1         GPT-2  13.240717  3.453824  3.301733 -13.662509  0.905722  0.000000
2     BERT-base  16.049604  3.971133  3.851986 -16.173669  0.963143  0.000000


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# ====================================================
# Load Kitchenham Dataset
# ====================================================
df = pd.read_csv("kitchenham.csv")

# Keep only the useful features (same as your ML code)
keep_cols = [
    "Project.type",
    "Actual.duration",
    "Actual.effort",
    "Adjusted.function.points",
    "First.estimate"
]
df = df[keep_cols].copy()

target = "Actual.effort"

# ====================================================
# Label encode categorical columns
# ====================================================
cat_cols = df.select_dtypes(include="object").columns
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# ====================================================
# Log-transform skewed numeric columns
# (same as previous code)
# ====================================================
cols_to_log = ["Actual.effort", "Actual.duration",
               "Adjusted.function.points", "First.estimate"]

for col in cols_to_log:
    df[col] = np.log(df[col].replace(0, 1))

# ====================================================
# Scale numeric features
# ====================================================
num_cols = [c for c in df.columns if c != target]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# ====================================================
# Create natural language description for each row
# ====================================================
def row_to_text(row):
    text = (
        f"Project type encoded as {row['Project.type']}. "
        f"Project duration {row['Actual.duration']}. "
        f"Adjusted function points {row['Adjusted.function.points']}. "
        f"First estimation value {row['First.estimate']}."
    )
    return text

df["text"] = df.apply(row_to_text, axis=1)

# ====================================================
# Metrics
# ====================================================
def compute_metrics(y_true_log, y_pred_log):
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)

    mse = mean_squared_error(y_true_log, y_pred_log)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2 = r2_score(y_true_log, y_pred_log)

    mre = np.abs((y_true - y_pred) / y_true)
    mmre = np.mean(mre)
    pred_25 = np.mean(mre <= 0.25)

    return mse, rmse, mae, r2, mmre, pred_25

# ====================================================
# PyTorch Dataset
# ====================================================
class TextRegDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tok = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        return {
            "input_ids": tok["input_ids"].squeeze(),
            "attention_mask": tok["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

# ====================================================
# GPT-2 Regression Head
# ====================================================
from transformers import GPT2Model

class GPT2ForRegression(torch.nn.Module):
    def __init__(self, pretrained_model="gpt2"):
        super().__init__()
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model)
        self.regressor = torch.nn.Linear(self.gpt2.config.n_embd, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        pooled = last_hidden[:, 0, :]
        return self.regressor(pooled)

# ====================================================
# Transformer training loop
# ====================================================
def train_transformer(model, train_loader, epochs=10):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # GPT-2 returns a tensor; BERT/RoBERTa return SequenceClassifierOutput
            if hasattr(outputs, "logits"):
                logits = outputs.logits.squeeze(-1)    # [batch]
            else:
                logits = outputs.squeeze(-1)           # [batch]

            loss = loss_fn(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# ====================================================
# Prepare 5-fold CV
# ====================================================
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []

# Transformer models
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    GPT2Tokenizer
)

transformer_models = {
    "BERT-base": (
        BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1),
        BertTokenizer.from_pretrained("bert-base-uncased")
    ),
    "RoBERTa-base": (
        RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=1),
        RobertaTokenizer.from_pretrained("roberta-base")
    ),
    "GPT-2": (
        GPT2ForRegression("gpt2"),
        GPT2Tokenizer.from_pretrained("gpt2")
    )
}

# GPT-2 padding
transformer_models["GPT-2"][1].pad_token = transformer_models["GPT-2"][1].eos_token


# ====================================================
# Run 5-fold CV
# ====================================================
for name, (model, tokenizer) in transformer_models.items():
    print(f"\nRunning 5-Fold CV for {name}...")
    fold_metrics = []

    for train_idx, test_idx in kf.split(df):
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]

        train_ds = TextRegDataset(train_df["text"].tolist(), train_df[target].tolist(), tokenizer)
        test_ds = TextRegDataset(test_df["text"].tolist(), test_df[target].tolist(), tokenizer)

        train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
        test_loader = DataLoader(test_ds, batch_size=8)

        train_transformer(model, train_loader, epochs=10)

        # Evaluation
        model.eval()
        preds, trues = [], []
        device = "cuda" if torch.cuda.is_available() else "cpu"

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # GPT-2 returns a tensor; BERT/RoBERTa return SequenceClassifierOutput
                if hasattr(outputs, "logits"):
                    logits = outputs.logits.squeeze(-1)    # [batch]
                else:
                    logits = outputs.squeeze(-1)           # [batch]

                preds.extend(logits.cpu().numpy())
                trues.extend(labels.cpu().numpy())

        fold_metrics.append(compute_metrics(np.array(trues), np.array(preds)))

    avg = np.mean(fold_metrics, axis=0)
    results.append({
        "Model": name,
        "MSE": avg[0],
        "RMSE": avg[1],
        "MAE": avg[2],
        "R²": avg[3],
        "MMRE": avg[4],
        "Pred(25)": avg[5]
    })

# ====================================================
# Final comparison table
# ====================================================
results_df = pd.DataFrame(results)
print("\n===== FINAL TRANSFORMER RESULTS (Kitchenham) =====\n")
print(results_df.sort_values("RMSE"))


Running 5-Fold CV for RoBERTa-base...

Running 5-Fold CV for GPT-2...

===== FINAL TRANSFORMER RESULTS (Kitchenham) =====

          Model       MSE      RMSE       MAE        R²      MMRE  Pred(25)
0     BERT-base  0.346235  0.484373  0.362807  0.698505  0.389827  0.565517
1  RoBERTa-base  0.447225  0.592818  0.476157  0.575531  0.663540  0.365517
2         GPT-2  1.397837  1.134409  0.914286 -0.416352  1.021157  0.220690
