In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV read later lives under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install necessary libraries
!pip install -q pandas scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the dataset
df = pd.read_csv('/content/drive/Shareddrives/Fireüî•üî•/Code_Comment_Seed_Data.csv')  # replace with your actual file path

# Drop rows missing any field the models need. Reassigning (instead of inplace=True)
# keeps the cell idempotent and gives us an independent frame to add columns to.
df = df.dropna(subset=['Comments', 'Surrounding Code Context', 'Class']).copy()

# Combine comment + code context into one input string
df['input'] = df['Comments'].astype(str) + ' ' + df['Surrounding Code Context'].astype(str)

# Map class to binary. Series.map() yields NaN for any value that is not a key of the
# mapping, so fail loudly here instead of handing NaN labels to the classifiers.
label_map = {'Useful': 1, 'Not Useful': 0}
df['label'] = df['Class'].map(label_map)
unmapped = df.loc[df['label'].isna(), 'Class'].unique()
if len(unmapped):
    raise ValueError(f"Unexpected values in 'Class' column: {list(unmapped)}")
df['label'] = df['label'].astype(int)

# Train-test split (fixed seed so the reported metrics are reproducible)
X_train, X_test, y_train, y_test = train_test_split(df['input'], df['label'], test_size=0.2, random_state=42)

# TF-IDF vectorization: vocabulary and IDF weights are learned from the training text
# only, then applied unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:
# Fit the baseline logistic-regression model on the TF-IDF features
# (fit() returns the estimator itself, so construction and training can be chained)
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score the held-out split
y_pred = clf.predict(X_test_vec)

# Per-class precision / recall / F1
baseline_report = classification_report(y_test, y_pred)
print("üîç Classification Report:")
print(baseline_report)

üîç Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.62      0.69       888
           1       0.79      0.89      0.84      1403

    accuracy                           0.79      2291
   macro avg       0.79      0.76      0.77      2291
weighted avg       0.79      0.79      0.78      2291



Hugging Face model

DistilBERT

In [None]:
# Remove the preinstalled Hugging Face packages, then install pinned versions of
# transformers / accelerate / datasets (huggingface_hub is left unpinned).
!pip uninstall -y transformers accelerate datasets huggingface_hub
!pip install transformers==4.44.2 accelerate==0.34.2 datasets==2.21.0 huggingface_hub -U

Found existing installation: transformers 4.56.1
Uninstalling transformers-4.56.1:
  Successfully uninstalled transformers-4.56.1
Found existing installation: accelerate 1.10.1
Uninstalling accelerate-1.10.1:
  Successfully uninstalled accelerate-1.10.1
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Found existing installation: huggingface-hub 0.34.4
Uninstalling huggingface-hub-0.34.4:
  Successfully uninstalled huggingface-hub-0.34.4
Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.7/43.7 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB

In [None]:
import numpy as np
from datasets import Dataset
import torch
from sklearn.metrics import f1_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer



# -----------------------------
# 1. Build the Hugging Face dataset and split it 80/20 with a fixed seed
# -----------------------------
hf_frame = Dataset.from_pandas(df)
dataset = hf_frame.train_test_split(test_size=0.2, seed=42)

# -----------------------------
# 2. Choose model
# -----------------------------
MODEL_NAME = "distilbert-base-uncased"   # swap to DistilBERT

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the batch's "input" strings, truncated/padded to exactly 128 tokens."""
    encode_opts = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(batch["input"], **encode_opts)

dataset = dataset.map(tokenize, batched=True)

# -----------------------------
# 3. Load model (two-class classification head)
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# -----------------------------
# 4. Training setup
# -----------------------------
training_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch so the best epoch (by "f1") can be restored
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
args = TrainingArguments(**training_config)

def compute_metrics(p):
    """Return the weighted-average F1 of the argmax predictions as {"f1": value}."""
    predicted = np.argmax(p.predictions, axis=1)
    weighted_f1 = f1_score(p.label_ids, predicted, average="weighted")
    return {"f1": weighted_f1}

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# 5. Train
# -----------------------------
trainer.train()

# -----------------------------
# 6. Evaluate + Classification Report
# -----------------------------
preds = trainer.predict(dataset["test"])
logits = torch.tensor(preds.predictions)
probs = torch.softmax(logits, dim=-1).numpy()   # class probabilities per example
y_true = preds.label_ids
y_pred = probs.argmax(axis=1)

print("üîç Classification Report (DistilBERT):")
print(classification_report(y_true, y_pred, digits=2))




Map:   0%|          | 0/9161 [00:00<?, ? examples/s]

Map:   0%|          | 0/2291 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.4789,0.413905,0.819566
2,0.3776,0.386774,0.822211
3,0.3308,0.415126,0.825116


üîç Classification Report (DistilBERT):
              precision    recall  f1-score   support

           0       0.81      0.71      0.76       863
           1       0.84      0.90      0.87      1428

    accuracy                           0.83      2291
   macro avg       0.82      0.80      0.81      2291
weighted avg       0.83      0.83      0.83      2291



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a Trainer evaluation pass.

    Args:
        pred: prediction object supplied by the Trainer; ``pred.predictions`` holds the
            per-class scores and ``pred.label_ids`` the reference labels.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # take argmax for classification
    acc = accuracy_score(labels, preds)
    # average='binary' scores only the positive class (label 1), so this F1 is not
    # directly comparable with the weighted F1 logged during training.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Recreate trainer with compute_metrics.
# Reuse the objects built by the DistilBERT training cell (`args`, `dataset`): the names
# this cell used before (`training_args`, `hf_train`, `hf_test`) are not defined by any
# earlier cell, so the cell could not run on a fresh kernel.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate again
eval_results = trainer.evaluate()
print("üîπ Hugging Face DistilBERT Evaluation:", eval_results)


üîπ Hugging Face DistilBERT Evaluation: {'eval_loss': 0.30146753787994385, 'eval_model_preparation_time': 0.0024, 'eval_accuracy': 0.8677433435181143, 'eval_precision': 0.8661784287616512, 'eval_recall': 0.9272986457590877, 'eval_f1': 0.895697074010327, 'eval_runtime': 17.5677, 'eval_samples_per_second': 130.41, 'eval_steps_per_second': 16.337}


In [None]:
# Discard every row that has a missing value in any column, then print the
# per-column null counts to confirm none remain.
df.dropna(axis=0, how='any', inplace=True)
print(df.isna().sum())

Comments                    0
Surrounding Code Context    0
Class                       0
input                       0
label                       0
dtype: int64


DistilBERT - trained on merged.csv

In [None]:
import numpy as np
from datasets import Dataset
import torch
from sklearn.metrics import f1_score, classification_report

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# if your df column is 'label'
# Labels must be integers for the classification head
df['label'] = df['label'].astype(int)

# -----------------------------
# 1. Build the Hugging Face dataset and split it 80/20 with a fixed seed
# -----------------------------
hf_frame = Dataset.from_pandas(df)
dataset = hf_frame.train_test_split(test_size=0.2, seed=42)

# -----------------------------
# 2. Choose model
# -----------------------------
MODEL_NAME = "distilbert-base-uncased"   # swap to DistilBERT

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the batch's "input" strings, truncated/padded to exactly 128 tokens."""
    return tokenizer(
        batch["input"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

dataset = dataset.map(tokenize, batched=True)

# -----------------------------
# 3. Load model (two-class classification head)
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# -----------------------------
# 4. Training setup
# -----------------------------
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    # restore the checkpoint with the best "f1" once training finishes
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
}
args = TrainingArguments(**training_config)

def compute_metrics(p):
    """Return the weighted-average F1 of the argmax predictions as {"f1": value}."""
    hard_labels = np.argmax(p.predictions, axis=1)
    return {"f1": f1_score(p.label_ids, hard_labels, average="weighted")}

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# 5. Train
# -----------------------------
trainer.train()

# -----------------------------
# 6. Evaluate + Classification Report
# -----------------------------
preds = trainer.predict(dataset["test"])
logits = torch.tensor(preds.predictions)
probs = torch.softmax(logits, dim=-1).numpy()   # class probabilities per example
y_true = preds.label_ids
y_pred = probs.argmax(axis=1)

print("üîç Classification Report (DistilBERT):")
print(classification_report(y_true, y_pred, digits=2))




Map:   0%|          | 0/12272 [00:00<?, ? examples/s]

Map:   0%|          | 0/3068 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3794,0.330701,0.856279
2,0.2653,0.310485,0.871144
3,0.2222,0.318457,0.870237


üîç Classification Report (DistilBERT):
              precision    recall  f1-score   support

           0       0.88      0.80      0.84      1257
           1       0.87      0.92      0.90      1811

    accuracy                           0.87      3068
   macro avg       0.87      0.86      0.87      3068
weighted avg       0.87      0.87      0.87      3068



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score a Trainer prediction object.

    Returns a dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three are
    computed with average='binary'.
    """
    y_ref = pred.label_ids
    y_hat = pred.predictions.argmax(-1)  # most likely class per example
    accuracy = accuracy_score(y_ref, y_hat)
    prec, rec, f_score, _support = precision_recall_fscore_support(y_ref, y_hat, average='binary')
    return {'accuracy': accuracy, 'precision': prec, 'recall': rec, 'f1': f_score}

# Wrap the current model in a new Trainer that reports the richer metric set
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run evaluation on the held-out split and show the raw metrics dict
eval_results = trainer.evaluate()
print("üîπ Hugging Face DistilBERT Evaluation:", eval_results)


üîπ Hugging Face DistilBERT Evaluation: {'eval_loss': 0.31048473715782166, 'eval_model_preparation_time': 0.0227, 'eval_accuracy': 0.8722294654498044, 'eval_precision': 0.8685714285714285, 'eval_recall': 0.9232468249585865, 'eval_f1': 0.8950749464668094, 'eval_runtime': 12.0105, 'eval_samples_per_second': 255.443, 'eval_steps_per_second': 15.986}


In [None]:
# Show how many rows carry each label value (class-balance check).
df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,7063
0,4389


Using stacked model

In [None]:
# Install necessary libraries
!pip install -q pandas scikit-learn

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform

# Load the dataset
df = pd.read_csv('/content/drive/Shareddrives/Fireüî•üî•/Code_Comment_Seed_Data.csv')

# Rows lacking any of these fields cannot be used
REQUIRED_COLUMNS = ['Comments', 'Surrounding Code Context', 'Class']
df.dropna(subset=REQUIRED_COLUMNS, inplace=True)

# Model input = comment text followed by its surrounding code context
comment_text = df['Comments'].astype(str)
context_text = df['Surrounding Code Context'].astype(str)
df['input'] = comment_text + ' ' + context_text

# Binary target: Useful -> 1, Not Useful -> 0
label_by_class = {'Useful': 1, 'Not Useful': 0}
df['label'] = df['Class'].map(label_by_class)

# 80/20 train-test split with a fixed seed
split = train_test_split(df['input'], df['label'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# TF-IDF vectorization (fitted on the training text only)
vectorizer = TfidfVectorizer(max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# -----------------------------
# Hyperparameter tuning for base models
# -----------------------------

def _tune(estimator, param_distributions):
    """Fit the randomized search shared by all base models and return the fitted search."""
    search = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=10,
        cv=3,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(X_train_vec, y_train)
    return search

# 1. Multinomial Naive Bayes
nb = MultinomialNB()
nb_params = {'alpha': uniform(0, 1)}
nb_search = _tune(nb, nb_params)
best_nb = nb_search.best_estimator_

# 2. Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt_params = {
    'max_depth': randint(2, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'criterion': ['gini', 'entropy'],
}
dt_search = _tune(dt, dt_params)
best_dt = dt_search.best_estimator_

# 3. Random Forest
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(2, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'criterion': ['gini', 'entropy'],
}
rf_search = _tune(rf, rf_params)
best_rf = rf_search.best_estimator_

# -----------------------------
# Stacking Classifier
# -----------------------------
estimators = [
    ('naive_bayes', best_nb),
    ('decision_tree', best_dt),
    ('random_forest', best_rf),
]

# Logistic regression combines the base models' cross-validated predictions
meta_learner = LogisticRegression(max_iter=1000)
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

# Train stacked model
stack_clf.fit(X_train_vec, y_train)

# Predictions on the held-out split
y_pred_stack = stack_clf.predict(X_test_vec)

# Evaluation
stack_report = classification_report(y_test, y_pred_stack)
print("üîç Stacked Model Classification Report:")
print(stack_report)


üîç Stacked Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.67      0.72       888
           1       0.81      0.88      0.84      1403

    accuracy                           0.80      2291
   macro avg       0.79      0.77      0.78      2291
weighted avg       0.79      0.80      0.79      2291



In [None]:
# Logistic regression with class_weight='balanced' (classes weighted inversely to their frequency).
# NOTE(review): the estimator is only constructed here; no visible cell fits it before
# `clf` is rebound to a RandomForestClassifier further down. Confirm whether a
# fit/evaluate step is missing.
clf = LogisticRegression(max_iter=1000, class_weight='balanced')


In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the TF-IDF training matrix and its labels (seeded for repeatability).
# NOTE(review): X_train_res / y_train_res are not consumed by any later visible cell;
# the RandomForest and XGBoost cells still fit on X_train_vec / y_train. Confirm intent.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_vec, y_train)


In [None]:
import re

def clean_code_context(text):
    """Strip line-number prefixes and non-ASCII characters from a code-context string.

    A line-number prefix is an optionally negative integer followed by a dot
    (e.g. "-1." or "2.") that stands as its own token: it starts a line or follows
    whitespace, and is itself followed by whitespace or the end of a line. Requiring
    those boundaries keeps numeric literals inside the code (such as "3.14") intact.

    Args:
        text: the raw code-context string.

    Returns:
        The string with line-number prefixes (and the whitespace right after them)
        removed and every run of non-ASCII characters deleted.
    """
    # Remove line numbers like "-1." or "2." without touching decimals such as "3.14"
    text = re.sub(r'(?:^|(?<=\s))-?\d+\.(?=\s|$)\s*', '', text, flags=re.MULTILINE)
    # Remove non-ASCII characters (anything outside \x00-\x7F)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Clean only the code context; the comment text is used as-is.
cleaned_context = df['Surrounding Code Context'].astype(str).map(clean_code_context)
df['clean_input'] = df['Comments'].astype(str) + ' ' + cleaned_context


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Class-weighted random forest on the TF-IDF features
rf_settings = {
    'n_estimators': 200,
    'max_depth': 30,
    'class_weight': 'balanced',
    'random_state': 42,
}
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_train_vec, y_train)


In [None]:
# Score the held-out split with the current `clf`
y_pred = clf.predict(X_test_vec)

# Per-class precision / recall / F1
current_report = classification_report(y_test, y_pred)
print("üîç Classification Report:")
print(current_report)


üîç Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.76      0.72       888
           1       0.84      0.77      0.80      1403

    accuracy                           0.77      2291
   macro avg       0.76      0.77      0.76      2291
weighted avg       0.78      0.77      0.77      2291



In [None]:
!pip install xgboost
from xgboost import XGBClassifier

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=4389/7063)
clf.fit(X_train_vec, y_train)




Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Score the held-out split with the current `clf`
y_pred = clf.predict(X_test_vec)

# Per-class precision / recall / F1
current_report = classification_report(y_test, y_pred)
print("üîç Classification Report:")
print(current_report)


üîç Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.77      0.73       888
           1       0.84      0.78      0.81      1403

    accuracy                           0.78      2291
   macro avg       0.77      0.78      0.77      2291
weighted avg       0.78      0.78      0.78      2291



In [None]:
# Force-reinstall transformers at 4.41.1.
# NOTE(review): the following cells uninstall it again and install 4.37.2, and other
# cells pin 4.44.2. Confirm which single version the notebook should run on.
!pip install --upgrade --force-reinstall transformers==4.41.1


In [None]:
# Remove the installed transformers and peft packages.
pip uninstall -y transformers peft


Found existing installation: transformers 4.41.1
Uninstalling transformers-4.41.1:
  Successfully uninstalled transformers-4.41.1
Found existing installation: peft 0.17.0
Uninstalling peft-0.17.0:
  Successfully uninstalled peft-0.17.0


In [None]:
# Pin transformers 4.37.2 together with peft 0.10.0.
# NOTE(review): recorded pip output in this notebook reports sentence-transformers
# requiring transformers>=4.41.0. Confirm this downgrade is intended.
pip install transformers==4.37.2 peft==0.10.0


Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m129.4/129.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.4/8.4 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
!pip install transformers datasets

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Prepare dataset
df_hf = df[['clean_input', 'label']].rename(columns={'clean_input': 'text', 'label': 'label'})
dataset = Dataset.from_pandas(df_hf)

# Split into train/test
dataset = dataset.train_test_split(test_size=0.2)

# Load tokenizer/model
checkpoint = "microsoft/codebert-base"  # or try "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Train
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
)

trainer.train()


Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m193.6/193.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.7.0
    Uninstalling fsspec-2025.7.0:
      Successfully uninstalled fsspec-2025.7.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.37.2 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.3.0


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9161 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Map:   0%|          | 0/2291 [00:00<?, ? examples/s]

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# NOTE(review): this rebinds `training_args` with only two settings (the batch sizes,
# epoch count and weight decay configured in the CodeBERT cell are not repeated), and
# the existing `trainer` is not rebuilt here. Confirm this cell is still needed.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",     # legacy spelling; newer transformers releases name this `eval_strategy`
)


In [None]:
import transformers
# Diagnostic: print the version of the transformers package the kernel imported.
print(transformers.__version__)


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the trained model over the held-out split
predictions = trainer.predict(dataset['test'])

# Reference labels, and predicted labels taken as the argmax over the raw logits
y_true = predictions.label_ids
raw_logits = predictions.predictions
y_pred = np.argmax(raw_logits, axis=1)

# Per-class precision / recall / F1
bert_report = classification_report(y_true, y_pred)
print("üîç Classification Report (BERT):")
print(bert_report)


In [None]:
# Reset the Hugging Face stack to the pinned versions used by the model cells below
# (transformers / accelerate / datasets pinned; huggingface_hub left unpinned).
# NOTE(review): identical to the install cell before the first DistilBERT run.
!pip uninstall -y transformers accelerate datasets huggingface_hub
!pip install transformers==4.44.2 accelerate==0.34.2 datasets==2.21.0 huggingface_hub -U


DeBERTa

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report, f1_score
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Hugging Face dataset
hf_frame = Dataset.from_pandas(df)
dataset = hf_frame.train_test_split(test_size=0.2, seed=42)   # fixed-seed 80/20 split

# -----------------------------
# 2. Choose model (swap here)
# -----------------------------
MODEL_NAME = "microsoft/deberta-v3-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the batch's "input" strings, truncated/padded to exactly 128 tokens."""
    encode_opts = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(batch["input"], **encode_opts)

dataset = dataset.map(tokenize, batched=True)

# -----------------------------
# 3. Load model (two-class classification head)
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# -----------------------------
# 4. Training setup
# -----------------------------
training_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch so the best epoch (by "f1") can be restored
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
args = TrainingArguments(**training_config)

# Metrics
def compute_metrics(p):
    """Return the weighted-average F1 of the argmax predictions as {"f1": value}."""
    predicted = np.argmax(p.predictions, axis=1)
    weighted_f1 = f1_score(p.label_ids, predicted, average="weighted")
    return {"f1": weighted_f1}

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# 5. Train
# -----------------------------
trainer.train()

# -----------------------------
# 6. Evaluate + Classification Report
# -----------------------------
preds = trainer.predict(dataset["test"])
logits = torch.tensor(preds.predictions)
probs = torch.softmax(logits, dim=-1).numpy()

# Hard labels via argmax (a tuned probability threshold could replace this later)
y_true = preds.label_ids
y_pred = probs.argmax(axis=1)

print("üîç Classification Report (DEBERTa):")
print(classification_report(y_true, y_pred))


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/9161 [00:00<?, ? examples/s]

Map:   0%|          | 0/2291 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvidarshanaa[0m ([33mssncollege[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,0.4907,0.431214,0.81401
2,0.3869,0.381605,0.834247
3,0.3379,0.409589,0.835068


üîç Classification Report (DEBERTa):
              precision    recall  f1-score   support

           0       0.82      0.73      0.77       863
           1       0.85      0.90      0.87      1428

    accuracy                           0.84      2291
   macro avg       0.83      0.82      0.82      2291
weighted avg       0.84      0.84      0.84      2291



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments

# -----------------------------
# Metrics function
# -----------------------------
def compute_metrics(pred):
    """Score a Trainer prediction object.

    Returns a dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three are
    computed with average='binary'.
    """
    y_ref = pred.label_ids
    y_hat = pred.predictions.argmax(-1)  # most likely class per example
    accuracy = accuracy_score(y_ref, y_hat)
    prec, rec, f_score, _support = precision_recall_fscore_support(y_ref, y_hat, average='binary')
    return {'accuracy': accuracy, 'precision': prec, 'recall': rec, 'f1': f_score}

# -----------------------------
# TrainingArguments (reuse or tweak)
# -----------------------------
eval_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",  # or "none" if you don't want logging
)
training_args = TrainingArguments(**eval_config)

# -----------------------------
# Trainer for DeBERTa
# -----------------------------
trainer = Trainer(
    model=model,                  # your DeBERTa model
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# Evaluate
# -----------------------------
eval_results = trainer.evaluate()
print("üîπ Hugging Face DeBERTa Evaluation:", eval_results)




üîπ Hugging Face DeBERTa Evaluation: {'eval_loss': 0.40958869457244873, 'eval_model_preparation_time': 0.0025, 'eval_accuracy': 0.8371890004364906, 'eval_precision': 0.8463558765594222, 'eval_recall': 0.9026610644257703, 'eval_f1': 0.8736021687563538, 'eval_runtime': 11.4463, 'eval_samples_per_second': 200.153, 'eval_steps_per_second': 12.581}


RoBERTa

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report, f1_score
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Hugging Face dataset
hf_frame = Dataset.from_pandas(df)
dataset = hf_frame.train_test_split(test_size=0.2, seed=42)   # fixed-seed 80/20 split

# -----------------------------
# 2. Choose model (swap here)
# -----------------------------
MODEL_NAME = "roberta-base"   # can change to "microsoft/deberta-v3-small" or "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the batch's "input" strings, truncated/padded to exactly 128 tokens."""
    return tokenizer(
        batch["input"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )

dataset = dataset.map(tokenize, batched=True)

# -----------------------------
# 3. Load model (two-class classification head)
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# -----------------------------
# 4. Training setup
# -----------------------------
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    # restore the checkpoint with the best "f1" once training finishes
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
}
args = TrainingArguments(**training_config)

# Metrics
def compute_metrics(p):
    """Return the weighted-average F1 of the argmax predictions as {"f1": value}."""
    hard_labels = np.argmax(p.predictions, axis=1)
    return {"f1": f1_score(p.label_ids, hard_labels, average="weighted")}

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# 5. Train
# -----------------------------
trainer.train()

# -----------------------------
# 6. Evaluate + Classification Report
# -----------------------------
preds = trainer.predict(dataset["test"])
logits = torch.tensor(preds.predictions)
probs = torch.softmax(logits, dim=-1).numpy()

# Hard labels via argmax (a tuned probability threshold could replace this later)
y_true = preds.label_ids
y_pred = probs.argmax(axis=1)

print("üîç Classification Report (RoBERTa):")
print(classification_report(y_true, y_pred))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/9161 [00:00<?, ? examples/s]

Map:   0%|          | 0/2291 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3638,0.337162,0.886396
2,0.2369,0.212616,0.914494
3,0.2035,0.234926,0.91764


üîç Classification Report (RoBERTa):
              precision    recall  f1-score   support

           0       0.93      0.85      0.89       863
           1       0.91      0.96      0.94      1428

    accuracy                           0.92      2291
   macro avg       0.92      0.91      0.91      2291
weighted avg       0.92      0.92      0.92      2291



In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments

# -----------------------------
# Metrics function
# -----------------------------
def compute_metrics(pred):
    """Score a Trainer prediction object.

    Returns a dict with 'accuracy', 'precision', 'recall' and 'f1'; the last three are
    computed with average='binary'.
    """
    y_ref = pred.label_ids
    y_hat = pred.predictions.argmax(-1)  # most likely class per example
    accuracy = accuracy_score(y_ref, y_hat)
    prec, rec, f_score, _support = precision_recall_fscore_support(y_ref, y_hat, average='binary')
    return {'accuracy': accuracy, 'precision': prec, 'recall': rec, 'f1': f_score}

# -----------------------------
# TrainingArguments (reuse your previous settings)
# -----------------------------
eval_config = dict(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="wandb",  # or "none" if you don't want logging
)
training_args = TrainingArguments(**eval_config)

# -----------------------------
# Trainer for RoBERTa
# -----------------------------
trainer = Trainer(
    model=model,                  # your RoBERTa model
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

# -----------------------------
# Evaluate
# -----------------------------
eval_results = trainer.evaluate()
print("üîπ Hugging Face RoBERTa Evaluation:", eval_results)




üîπ Hugging Face RoBERTa Evaluation: {'eval_loss': 0.23492573201656342, 'eval_model_preparation_time': 0.0047, 'eval_accuracy': 0.9183762549105194, 'eval_precision': 0.9144956579826319, 'eval_recall': 0.9586834733893558, 'eval_f1': 0.9360683760683761, 'eval_runtime': 18.5131, 'eval_samples_per_second': 123.75, 'eval_steps_per_second': 7.778}
