In [None]:
import os
import random
import math
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, f1_score
from datasets import load_dataset
from collections import defaultdict
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPooling1D, concatenate
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Per-language settings, gathered in one place so the notebook can be switched
# between languages quickly. A commented-out duplicate of each value is left
# near where it is used further down to show the context of its use.
lang = 'pharo'  # IMPORTANT: Change this to see the notebook run for different languages
max_length = 30 # length every tokenized comment is padded/truncated to: 26 python, 30 pharo, 35 java
learning_rate = 0.0025 # 0.0025 for all
batch_size = 48 # 60 for java, python, 48 for pharo
epochs = 10 # 10 python and pharo, 8 java

def lr_schedule(epoch, lr):
    """Return the learning rate to use for the given epoch.

    The rate is left untouched for the first epochs; from then on every call
    multiplies the incoming rate by ``exp(-0.33)``.

    Args:
        epoch: Epoch index passed in by the scheduler callback.
        lr: Learning rate currently in use.

    Returns:
        The learning rate for this epoch, as a plain number (not a tensor).
    """
    if epoch < 5: # 5 for java, pharo, 2 for python
        return lr
    # math.exp keeps the result a Python float. tf.math.exp would hand back a
    # tensor, and LearningRateScheduler expects the schedule to return a float.
    return lr * math.exp(-0.33)
    
# Search for the phrase below and comment the line out for 'pharo' runs
# callbacks = [lr_scheduler]

In [None]:
# Languages covered by the dataset.
langs = ['java', 'python', 'pharo']
# Category names per language. Later cells pair these positionally with each
# sample's `labels` vector, so the order here is assumed to match the order
# used by the dataset - TODO confirm against the dataset card.
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
# Splits are looked up later as f'{lang}_train' / f'{lang}_test'.
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

In [None]:
import requests
import zipfile

# The GloVe archive is too big to upload directly to Github, so it is fetched
# on demand. The download takes about 3 minutes, so both the download and the
# extraction are skipped when their result is already on disk. This makes the
# cell safe to leave enabled on every run.
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"
glove_dir = "./glove"
glove_vectors_path = os.path.join(glove_dir, "glove.6B.100d.txt")

if not os.path.exists(glove_vectors_path):
    if not os.path.exists(glove_zip_path):
        # Download under a temporary name first so an interrupted download is
        # never mistaken for a complete archive on the next run.
        partial_zip_path = glove_zip_path + ".part"
        with requests.get(glove_url, stream=True) as response:
            # Fail loudly instead of saving an HTTP error page as the archive.
            response.raise_for_status()
            with open(partial_zip_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_zip_path, glove_zip_path)

    with zipfile.ZipFile(glove_zip_path, "r") as zip_path:
        zip_path.extractall(glove_dir)

In [None]:
# Using pretrained word embeddings courtesy of these folks below
# Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation

def load_glove(glove_path):
    """Load word vectors from a GloVe text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is the word
    and the remaining fields are its vector components.

    Args:
        glove_path: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D float32 numpy array.
    """
    embeddings_dict = {}
    with open(glove_path, encoding='utf8') as file:
        for dataline in file:
            fields = dataline.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            word, *components = fields
            embeddings_dict[word] = np.asarray(components, dtype='float32')
    return embeddings_dict

# Read the 100-dimensional GloVe vectors from the ./glove folder the archive was extracted to.
embeddings_dict = load_glove('glove/glove.6B.100d.txt')

In [None]:
# Seed the Python, NumPy and TensorFlow random number generators. The same
# seed is reused later as the random_state of the train/validation split.
seed = 7359
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)


In [None]:
def find_significant_words(data, labels, min_word_count=2, sig_words_threshold=0.15):
    """Build a vocabulary of words that are over- or under-used by some label.

    For every word and every label, the word's (Laplace-smoothed) relative
    frequency inside comments carrying that label is compared with its
    relative frequency summed over all the other labels. A word is kept when
    the absolute log of that ratio reaches ``sig_words_threshold`` for at
    least one label.

    Args:
        data: Mapping with a 'comments' entry (iterable of strings) and a
            'labels' entry (iterable of 0/1 sequences, positionally aligned
            with ``labels``).
        labels: Label names, in the same order as the per-comment 0/1 vectors.
        min_word_count: Minimum number of occurrences across all comments for
            a word to be considered. Keep at least at 2.
        sig_words_threshold: Minimum absolute log ratio for a word to count
            as significant.

    Returns:
        dict mapping each significant word to a unique integer index. Indices
        start at 1 so that 0 stays free for the padding value that
        ``pad_sequences`` inserts; otherwise the first word would be
        indistinguishable from padding.
    """
    label_counts_dict = {label: defaultdict(int) for label in labels}
    label_total_word_count = {label: 0 for label in labels}
    all_word_counts = defaultdict(int)

    # Getting counts for all the words in the labels
    for comment, label_list in zip(data['comments'], data['labels']):
        words = comment.split()
        for word in words:
            all_word_counts[word] += 1
        for label, is_present in zip(labels, label_list):
            if is_present == 1:
                label_total_word_count[label] += len(words)
                for word in words:
                    label_counts_dict[label][word] += 1

    full_vocab = all_word_counts.keys()
    for label in labels: # This section adds Laplace smoothing
        for word in full_vocab:
            label_counts_dict[label][word] += 1
        label_total_word_count[label] += len(full_vocab)

    # Total over every label. The "all other labels" total for one label is
    # this minus that label's own total, so it need not be re-summed per word.
    total_words_all_labels = sum(label_total_word_count[label] for label in labels)

    significant_words_dict = {}
    next_index = 1  # 0 is reserved for padding
    for word in full_vocab:
        if all_word_counts[word] < min_word_count: # Keep at least at 2
            continue # Don't want to add words that only have a 1 count due to Laplace smoothing

        word_count_all_labels = sum(label_counts_dict[label][word] for label in labels)
        for label in labels:
            word_count_in_label = label_counts_dict[label][word]
            total_words_in_label = label_total_word_count[label]
            word_count_not_in_label = word_count_all_labels - word_count_in_label
            total_words_not_in_label = total_words_all_labels - total_words_in_label

            # Log of the frequency ratio: is the word more (or less) prominent
            # in this label than in all the others taken together?
            word_prob_in_label = word_count_in_label / total_words_in_label
            word_prob_not_in_label = word_count_not_in_label / total_words_not_in_label
            log_odds = math.log(word_prob_in_label / word_prob_not_in_label)
            if abs(log_odds) >= sig_words_threshold:
                significant_words_dict[word] = next_index
                next_index += 1
                break # one qualifying label is enough; the word is already recorded

    print("Number of significant words: ", len(significant_words_dict))
    return significant_words_dict

def tokenize_with_significant_words(comments, significant_words_dict):
    tokenized_comments = []
    for comment in comments:
        words = comment.split()
        tokenized = []
        for word in words:
            if word in significant_words_dict:
                tokenized.append(significant_words_dict[word])
        tokenized_comments.append(tokenized)
    return tokenized_comments

In [None]:
# Raw comment sentences of the selected language.
train_split = ds[f'{lang}_train']
test_split = ds[f'{lang}_test']
train_comments = [row['comment_sentence'] for row in train_split]
test_comments = [row['comment_sentence'] for row in test_split]

train_data = dict(
    comments=train_comments,
    labels=[row['labels'] for row in train_split],
)

# Snippet of the now defunct Tokenizer we started with
# max_vocab_size = 3000
# max_length = 30
# tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
# tokenizer.fit_on_texts(train_comments)

# The vocabulary is built from the training split only and then applied to both splits.
significant_words_dict = find_significant_words(train_data, labels[lang], min_word_count=2, sig_words_threshold=0.15)
train_tokenized_comments = tokenize_with_significant_words(train_comments, significant_words_dict)
test_tokenized_comments = tokenize_with_significant_words(test_comments, significant_words_dict)
train_lengths = list(map(len, train_tokenized_comments))
test_lengths = list(map(len, test_tokenized_comments))


def _length_stats(lengths):
    """Return (mean, median, max, 1st quartile, 3rd quartile) of ``lengths``."""
    return (
        np.mean(lengths),
        np.median(lengths),
        np.max(lengths),
        np.percentile(lengths, 25),
        np.percentile(lengths, 75),
    )


def _print_length_stats(title, mean, median, longest, q1, q3):
    """Print the sequence-length summary of one split."""
    print(f"\n{title} Sequences:")
    print(f"Average Length = {mean:.2f}")
    print("1st Quartile   =", q1)
    print("Median Length  =", median)
    print("3rd Quartile   =", q3)
    print("Max Length     =", longest)


# Sequence-length statistics, used to pick a sensible max_length.
train_mean, train_median, train_max, train_q1, train_q3 = _length_stats(train_lengths)
test_mean, test_median, test_max, test_q1, test_q3 = _length_stats(test_lengths)

_print_length_stats("Train", train_mean, train_median, train_max, train_q1, train_q3)
_print_length_stats("Test", test_mean, test_median, test_max, test_q1, test_q3)

# max_length is defined at the top of the notebook so the per-language
# hyperparameters can be switched in one place; the original line is kept
# here to show the context of where it is used.
# max_length = 35 # 26 python, 30 pharo, 35 java
padding_options = dict(maxlen=max_length, padding='post', truncating='post')
train_with_padding = pad_sequences(train_tokenized_comments, **padding_options)
test_with_padding = pad_sequences(test_tokenized_comments, **padding_options)

train_labels = np.array([row['labels'] for row in train_split])
test_labels = np.array([row['labels'] for row in test_split])

In [None]:
def make_sig_word_weights(significant_words_dict, embeddings_dict, embedding_dim):
    """Build the initial embedding matrix for the significant-word vocabulary.

    The matrix has one more row than there are significant words. Every row
    starts as random normal noise (scale 0.6); rows whose word has a
    pretrained vector are then overwritten with that vector. The share of
    words without a pretrained vector is printed.

    Args:
        significant_words_dict: Mapping from word to its row index.
        embeddings_dict: Mapping from word to pretrained vector.
        embedding_dim: Length of each embedding vector.

    Returns:
        numpy array of shape (len(significant_words_dict) + 1, embedding_dim).
    """
    vocab_len = len(significant_words_dict) + 1
    sig_word_weights = np.random.normal(scale=0.6, size=(vocab_len, embedding_dim))
    words_not_in_glove = 0
    for word, index in significant_words_dict.items():
        embedding_vector = embeddings_dict.get(word)
        # print(embedding_vector) # To get an idea of what scale above should be
        if embedding_vector is not None:
            sig_word_weights[index] = embedding_vector
        else:
            words_not_in_glove += 1
    # An empty vocabulary would otherwise divide by zero here.
    if significant_words_dict:
        percent_not_in_glove = (words_not_in_glove / len(significant_words_dict)) * 100
    else:
        percent_not_in_glove = 0.0
    print(f"Percent of words not in Glove vocab: {percent_not_in_glove:.2f}%")
    return sig_word_weights

# Must match the dimensionality of the GloVe file that was loaded.
embedding_dim = 100  # Glove 100d
# Initial weights for the model's Embedding layer.
sig_word_weights = make_sig_word_weights(significant_words_dict, embeddings_dict, embedding_dim)


In [None]:
# # Attempts to use stratify with this for more than binary labels blows this function up
# from sklearn.model_selection import train_test_split

# train_with_padding, val_with_padding, train_labels, val_labels = train_test_split(
#     train_with_padding, train_labels, test_size=0.2, random_state=seed)

In [None]:
# Hold out 20% of the training data for validation with a multilabel-aware
# stratified splitter.
# NOTE: this cell overwrites train_with_padding / train_labels, so re-running
# it without re-running the data preparation cell splits the already-reduced
# training set again.
multisplit = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=seed)
# n_splits=1, so the splitter yields a single (train, validation) index pair.
train_index, val_index = next(multisplit.split(train_with_padding, train_labels))
# Take the validation rows first, while the full arrays are still available.
val_with_padding = train_with_padding[val_index]
val_labels = train_labels[val_index]
train_with_padding = train_with_padding[train_index]
train_labels = train_labels[train_index]

In [None]:
# Model dimensions for the selected language.
labels_list = labels[lang]
num_labels = len(labels_list)
vocab_len = len(significant_words_dict) + 1  # same row count as sig_word_weights
num_filters = 128
filter_sizes = [2, 3, 4]

# Embedding layer initialised from sig_word_weights and left trainable, so
# the vectors are fine-tuned during training.
inputs = Input(shape=(max_length,))
embedding = Embedding(
    input_dim = vocab_len,
    output_dim = embedding_dim,
    weights = [sig_word_weights],
    input_length = max_length,
    trainable = True,
)(inputs)

# One Conv1D + max-pooling branch per kernel size, all reading the same
# embedding output. padding='same' gives every branch the same sequence
# length, which the concatenation below relies on.
cnn_dif_filter_outputs = []
for size in filter_sizes:
    cnn_filter = Conv1D(filters=num_filters, kernel_size=size, activation='relu', padding='same')(embedding)
    max_pool = MaxPooling1D(pool_size=2)(cnn_filter)
    cnn_dif_filter_outputs.append(max_pool)
cnn_combined_output = concatenate(cnn_dif_filter_outputs)

# Bidirectional LSTM over the combined CNN features, pooled over time into a
# single vector, followed by a small dense head.
lstm_layer = Bidirectional(LSTM(units = 64, return_sequences=True))(cnn_combined_output)
global_pool = GlobalMaxPooling1D()(lstm_layer)
dense_layer = Dense(units=64, activation=None)(global_pool) # relu also good results, sigmoid subpar
dropout_layer = Dropout(rate= 0.5)(dense_layer) # .55 also typically works fine
# One sigmoid output per label: each label is predicted independently.
output_layer = Dense(num_labels, activation='sigmoid')(dropout_layer) # relu bad here
model = tf.keras.Model(inputs=inputs, outputs=output_layer)
# model.summary()

# learning_rate is defined at the top of the notebook so the per-language
# hyperparameters can be switched in one place; the original line is kept
# here to show the context of where it is used.
# learning_rate = 0.0025 # 0.0025 for java, python, 0.001 for pharo
optimizer = Adam(learning_rate=learning_rate)
# Binary cross-entropy pairs with the per-label sigmoid outputs above.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer, # originally just default 'adam' parameter
    metrics=[
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc')
    ]
)

In [None]:
# epochs and batch_size are defined at the top of the notebook so the
# per-language hyperparameters can be switched in one place; the original
# lines are kept here to show the context of where they are used.
# epochs = 10 # 10 python and pharo, 8 java
# batch_size = 60 # 60 for java, python, 48 for pharo

# Inverse-frequency sample weights: a sample's weight is the sum of the
# inverse relative frequencies of the labels it carries. They are computed
# for reference only; passing them to fit() did not help any model.
label_counts = np.sum(train_labels, axis=0)
total_counts = np.sum(label_counts)
label_percentages = label_counts / total_counts
label_inv_freqs = (1 / label_percentages)
sample_weights = np.sum(train_labels * label_inv_freqs, axis=1)

# Some code below used to get an idea of the spread of values we were working with.
# highest = 0
# for weight in sample_weights.tolist():
#     print(f"{weight:.2f}")
#     if weight > highest:
#         highest = weight
# print('HIGHEST VALUE:', highest) # Highest val: 19.29
# sample_weights = sample_weights / np.mean(sample_weights)
# Above: weights were performing better when not normalized

# plt.hist(sample_weights, bins=10)
# plt.title("Distribution of Sample Weights")
# plt.show()

# lr_schedule is defined at the top of the notebook.
lr_scheduler = LearningRateScheduler(lr_schedule)

# The learning rate scheduler is disabled for pharo runs. Choosing it from
# `lang` removes the need to comment the callback in and out by hand when
# switching languages.
callbacks = [] if lang == 'pharo' else [lr_scheduler]

history = model.fit(
    train_with_padding,
    train_labels,
    epochs = epochs,
    batch_size = batch_size,
    validation_data = (val_with_padding, val_labels),
    callbacks = callbacks,
    # sample_weight = sample_weights, # disabled for all models, just not helpful
    verbose = 1
)

In [None]:
# Transitioned away from this to a more dynamic threshold below
# test_probabilities = model.predict(test_with_padding)
# test_predictions = (test_probabilities > 0.5).astype(int)
# print(classification_report(test_labels, test_predictions, target_names=labels_list))

In [None]:
# Pick one global decision threshold on the validation set: the value that
# maximises the sum of the per-label F1 scores. It is then applied unchanged
# to the test predictions.
val_probabilities = model.predict(val_with_padding)
best_threshold = 0
best_total_f1 = 0

for threshold in (start / 400 for start in range(401)): # Search by 0.0025 steps
    val_predictions = (val_probabilities >= threshold).astype(int)
    # zero_division=0 scores a label with no positive predictions as 0, the
    # same value as the default, without emitting a warning per label and
    # threshold.
    total_f1 = sum(
        f1_score(val_labels[:, label_index], val_predictions[:, label_index], zero_division=0)
        for label_index in range(val_labels.shape[1])
    )

    # Strict comparison: on ties the lowest threshold wins.
    if total_f1 > best_total_f1:
        best_total_f1 = total_f1
        best_threshold = threshold

# Four decimals, because the search step is 0.0025; two decimals would round
# the reported value away from the threshold actually used.
print(f"Best Val Threshold: {best_threshold:.4f}")

test_probabilities = model.predict(test_with_padding)
test_predictions = (test_probabilities >= best_threshold).astype(int)
print(classification_report(test_labels, test_predictions, target_names=labels_list))


Current score below - no attention, weights added, with significant words

BASELINE BELOW:
Python
Pharo
Java

Default tokenizer

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)

Batch 64 -> 48
Added dynamic threshold for predicting test set
Add bidirectional lstm layer
Add 3, 4 and support for multiple filters for cnn

![image-3.png](attachment:image-3.png)
![image-2.png](attachment:image-2.png)
![image.png](attachment:image.png)

Added GloVe embeddings (experimented with the default weights for words not included in GloVe) and find_significant_words. These did not come together until I added MultilabelStratifiedShuffleSplit. Tried adding sample weights, but they made things worse. Java barely moved (0.02), and in other tests it went slightly negative. It was already the best-performing model, so I hoped hyperparameter tuning would give it that final boost.

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)

Added a learning rate scheduler, raised the learning rate from the default of 0.001 to 0.0025, and changed the batch size from 48 to 60. Tried sample weights again (they made every model worse). Lowered the significant-words threshold from 0.35 to 0.15. Changes to all models: max_length is matched to the highest max sequence length between the train and test sets, to utilize all of the information effectively without excessive noise: Python 26, Pharo stays at 30, Java 35. Dense layer activation changed from relu to None (i.e. linear). Java reduced to 8 epochs.

![image.png](attachment:image.png)
![image-4.png](attachment:image-4.png)
![image-2.png](attachment:image-2.png)