# **Modeling Notebook: Developer Role Classification**

# **1. Model Training and Evaluation**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay
import xgboost as xgb

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import time

# Pin the global NumPy and PyTorch random generators to one fixed seed
# so that repeated runs of the notebook are comparable.
np.random.seed(seed=42)
torch.manual_seed(seed=42)


# **Feature Selection and Data Splitting**


We prepare our dataset by dropping unnecessary or already processed columns. Commit messages are kept separate for the LLM. Afterwards, we split the dataset into train, validation, and test sets, maintaining class distributions with stratified splitting.

In [None]:
# Columns excluded from the tabular feature matrix.
features_to_drop = [
    'index',              # identifier, carries no signal
    'fileextensions',     # already turned into category features
    'timeofcommit',       # temporal features were extracted earlier
    'commitmessage',      # kept aside as text input for the LLM
]

# Target, raw commit text, and tabular features (target removed as well).
y = processed_df['role']
commit_messages = processed_df['commitmessage']
X = processed_df.drop(columns=features_to_drop + ['role'])

print("Final feature matrix shape (traditional models):", X.shape)
print("Target shape:", y.shape)

# Step 1: hold out 30% of the rows, stratified on the role label.
# Features, labels and messages are split together so rows stay aligned.
(X_train, X_temp,
 y_train, y_temp,
 msg_train, msg_temp) = train_test_split(
    X, y, commit_messages, test_size=0.3, random_state=42, stratify=y
)

# Step 2: cut the held-out part in half -> validation and test sets.
(X_val, X_test,
 y_val, y_test,
 msg_val, msg_test) = train_test_split(
    X_temp, y_temp, msg_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Training samples:", X_train.shape[0])
print("Validation samples:", X_val.shape[0])
print("Test samples:", X_test.shape[0])


# **Preprocessing Pipeline (Traditional Models)**


**We preprocess the features using a ColumnTransformer. Numeric features are scaled with RobustScaler to handle skewed distributions, and categorical features are one-hot encoded.**

In [None]:
# Column groups: one explicit categorical column, plus every int64/float64 column.
categorical_features = ['committype']
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)

# One-hot encode the categorical column (unknown categories are ignored at
# transform time) and robust-scale the numeric columns. Any column in neither
# group is passed through unchanged.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_scaler = RobustScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_features),
        ('num', numeric_scaler, numeric_features),
    ],
    remainder='passthrough',
)

# Sanity check: fit on the training split and inspect the output width.
X_train_transformed = preprocessor.fit_transform(X_train)
print("Transformed training data shape:", X_train_transformed.shape)


# **Traditional Machine Learning Models**

# **Logistic Regression**


We start with a baseline Logistic Regression model. Class imbalance is handled using `class_weight='balanced'`. Macro F1 is our main metric.

In [None]:
# Baseline: shared preprocessing followed by a class-weighted logistic regression.
# The `multi_class` argument is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and the default lbfgs solver already fits a
# multinomial (softmax) model when the target has more than two classes.
baseline_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight='balanced'
    ))
])

baseline_model.fit(X_train, y_train)
y_val_pred_baseline = baseline_model.predict(X_val)

# Per-class report plus the macro-averaged summary metrics on the validation set.
print(classification_report(y_val, y_val_pred_baseline, digits=3))
baseline_metrics = {
    'macro_f1': f1_score(y_val, y_val_pred_baseline, average='macro'),
    'accuracy': accuracy_score(y_val, y_val_pred_baseline),
    'precision': precision_score(y_val, y_val_pred_baseline, average='macro'),
    'recall': recall_score(y_val, y_val_pred_baseline, average='macro')
}

# Confusion matrix, with rows/columns ordered like the fitted model's classes.
cm = confusion_matrix(y_val, y_val_pred_baseline, labels=baseline_model.classes_)
ConfusionMatrixDisplay(cm, display_labels=baseline_model.classes_).plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Logistic Regression')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# **Random Forest**


**Random Forest is trained next, with class weighting to handle imbalance. This model can capture non-linear patterns better than logistic regression.**

In [None]:
# Shared preprocessing followed by a class-weighted random forest of 100 trees.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
random_forest_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

random_forest_model.fit(X_train, y_train)
y_val_pred_rf = random_forest_model.predict(X_val)

# Validation report and macro-averaged summary metrics.
print(classification_report(y_val, y_val_pred_rf, digits=3))
rf_metrics = dict(
    macro_f1=f1_score(y_val, y_val_pred_rf, average='macro'),
    accuracy=accuracy_score(y_val, y_val_pred_rf),
    precision=precision_score(y_val, y_val_pred_rf, average='macro'),
    recall=recall_score(y_val, y_val_pred_rf, average='macro'),
)

# Confusion matrix in the order of the fitted model's classes.
rf_class_labels = random_forest_model.classes_
cm_rf = confusion_matrix(y_val, y_val_pred_rf, labels=rf_class_labels)
rf_cm_display = ConfusionMatrixDisplay(cm_rf, display_labels=rf_class_labels)
rf_cm_display.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Random Forest')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# **XGBoost**


**XGBoost is included as it is powerful for tabular data. We encode the target labels to integers for multi-class classification.**

In [None]:
# Encode the string role labels as integers for XGBoost. The encoder is fit on
# the training labels only and reused to transform the validation labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

xgb_classifier = xgb.XGBClassifier(
    # NOTE(review): `use_label_encoder` is deprecated in recent xgboost
    # releases — confirm against the installed version before removing it.
    use_label_encoder=False,
    eval_metric='merror',
    random_state=42,
    objective='multi:softmax',
    n_estimators=100,
    learning_rate=0.1
)

xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier)
])

xgb_model.fit(X_train, y_train_encoded)
y_val_pred_xgb_encoded = xgb_model.predict(X_val)

# Metrics are computed on the integer-encoded labels; class names are restored
# for display through `label_encoder.classes_`.
print(classification_report(y_val_encoded, y_val_pred_xgb_encoded, digits=3, target_names=label_encoder.classes_))
xgb_metrics = {
    'macro_f1': f1_score(y_val_encoded, y_val_pred_xgb_encoded, average='macro'),
    'accuracy': accuracy_score(y_val_encoded, y_val_pred_xgb_encoded),
    'precision': precision_score(y_val_encoded, y_val_pred_xgb_encoded, average='macro'),
    'recall': recall_score(y_val_encoded, y_val_pred_xgb_encoded, average='macro')
}

# Pin the label order to 0..K-1 so the matrix always has one row/column per
# encoded class and lines up with `display_labels`.
cm_xgb = confusion_matrix(
    y_val_encoded,
    y_val_pred_xgb_encoded,
    labels=np.arange(len(label_encoder.classes_))
)
ConfusionMatrixDisplay(cm_xgb, display_labels=label_encoder.classes_).plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - XGBoost')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# **Large Language Model (LLM) Fine-tuning**

**Load Pretrained Model and Tokenizer**

We use `bert-base-uncased` and adapt it for multi-class classification based on the number of developer roles. A GPU is used if available.


In [None]:
# Load the pretrained tokenizer and a BERT classifier whose output layer has
# one unit per distinct role label found in `y`.
checkpoint_name = 'bert-base-uncased'
num_labels = y.nunique(dropna=False)

tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)

# Run on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = model.to(device)

print(f"Model: {model.config.model_type}, Labels: {num_labels}, Device: {device}")


In [None]:
class CommitMessageDataset(Dataset):
    """Map-style dataset that tokenizes commit messages on access.

    Args:
        commit_messages: Integer-indexable sequence of commit messages; each
            entry is converted with ``str()`` before tokenization.
        labels: Integer-indexable sequence of integer class ids, aligned
            position by position with ``commit_messages``.
        tokenizer: Callable tokenizer returning a mapping that contains
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors.
        max_len: Length every message is padded or truncated to.
    """

    def __init__(self, commit_messages, labels, tokenizer, max_len):
        self.commit_messages = commit_messages
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of commit messages."""
        return len(self.commit_messages)

    def __getitem__(self, item):
        """Return the tokenized message and label stored at position ``item``.

        Returns:
            dict with the raw text (``commit_message_text``), the flattened
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` tensors,
            and ``labels`` as a ``torch.long`` tensor.
        """
        commit_message = str(self.commit_messages[item])
        label = self.labels[item]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            commit_message,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        # `return_tensors='pt'` is requested above, so each field comes back
        # with a leading batch axis; flatten it away to get 1-D tensors.
        return {
            'commit_message_text': commit_message,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Class name -> integer id, following the order of the fitted label encoder's
# classes so that ids line up with `label_encoder.classes_`.
label_map = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
MAX_LEN = 128

# One dataset per split; `.values` hands plain positional arrays to the dataset.
train_dataset = CommitMessageDataset(
    msg_train.values, y_train.map(label_map).values, tokenizer, MAX_LEN
)
val_dataset = CommitMessageDataset(
    msg_val.values, y_val.map(label_map).values, tokenizer, MAX_LEN
)
test_dataset = CommitMessageDataset(
    msg_test.values, y_test.map(label_map).values, tokenizer, MAX_LEN
)

print("LLM datasets prepared:", len(train_dataset), len(val_dataset), len(test_dataset))


In [None]:
# Fine-tuning hyperparameters.
EPOCHS = 4
BATCH_SIZE = 16

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Adam with a linear learning-rate schedule (no warmup steps) spanning every
# optimisation step of the run: batches per epoch times number of epochs.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
total_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = torch.nn.CrossEntropyLoss()


In [None]:
# Validation metrics of the fine-tuned LLM, taken from the last entry of each
# list in `history`.
# NOTE(review): `history` is not created in this section — presumably the
# training loop fills it once per epoch; confirm against that cell.
last_val_true = history['val_y_true'][-1]
last_val_pred = history['val_y_pred'][-1]

final_llm_metrics = dict(
    macro_f1=history['val_macro_f1'][-1],
    accuracy=history['val_acc'][-1],
    precision=precision_score(last_val_true, last_val_pred, average='macro'),
    recall=recall_score(last_val_true, last_val_pred, average='macro'),
)

# One row per model, one column per metric, best macro F1 first.
all_models_metrics = {
    'Logistic Regression': baseline_metrics,
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics,
    'Fine-tuned LLM': final_llm_metrics,
}
comparison_df_all = pd.DataFrame(all_models_metrics).T

ranked_models = comparison_df_all.sort_values(by='macro_f1', ascending=False)
display(ranked_models)


# **Final Evaluation on Test Set (LLM)**

The fine-tuned LLM is evaluated on the unseen test set to report final metrics and the confusion matrix.

In [None]:
# Run the fine-tuned model over the held-out test loader.
# NOTE(review): `eval_model` is defined outside this section; from this call it
# returns (loss, accuracy, true labels, predicted labels) — confirm.
(test_loss_llm,
 test_acc_llm,
 test_y_true_llm,
 test_y_pred_llm) = eval_model(model, test_dataloader, loss_fn, device)

# Summary metrics recomputed from the returned labels (macro-averaged where applicable).
test_accuracy_llm = accuracy_score(test_y_true_llm, test_y_pred_llm)
test_macro_f1_llm = f1_score(test_y_true_llm, test_y_pred_llm, average='macro')
test_precision_llm = precision_score(test_y_true_llm, test_y_pred_llm, average='macro')
test_recall_llm = recall_score(test_y_true_llm, test_y_pred_llm, average='macro')

print(f'Test loss: {test_loss_llm:.4f}, Accuracy: {test_accuracy_llm:.4f}, Macro F1: {test_macro_f1_llm:.4f}')
print(classification_report(test_y_true_llm, test_y_pred_llm, target_names=label_encoder.classes_, digits=3))

# Confusion matrix on the test set, labelled with the original class names.
cm_llm_test = confusion_matrix(test_y_true_llm, test_y_pred_llm)
llm_cm_display = ConfusionMatrixDisplay(cm_llm_test, display_labels=label_encoder.classes_)
llm_cm_display.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - Fine-tuned LLM (Test Set)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
