In [23]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import string
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report




In [24]:
# Load dataset
df = pd.read_csv("Eclipse_Platform.csv")

In [25]:
# preprocess_text function
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'stack trace\S+', '', text)  # Remove stack traces
    text = re.sub(r'\b[0-9a-fA-F]+\b', '', text)  # Remove hex codes
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in square brackets
    text = re.sub(r'<.*?>', '', text)  # Remove content in angle brackets
    text = text.replace('@', '')  # Remove @
    text = text.replace('-', '')  # Remove -
    text = text.replace('"', '')  # Remove "
    text = text.replace('`', '')  # Remove `
    text = text.replace('[', '')  # Remove [
    text = text.replace(']', '')  # Remove ]
    text = text.replace('<', '')  # Remove <
    text = text.replace('>', '')  # Remove >
    text = text.replace('%', '')  # Remove %
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\r\n', ' ', text)  # Replace Windows-style newlines with space
    text = re.sub(r'\n|\r', ' ', text)  # Replace Unix/Mac-style newlines with space
    # tokens = word_tokenize(text)  # Tokenization
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # text = ' '.join(tokens)  # Join tokens back to string
    return text

df["text"] = df["title"] + " " + df["description"]
df["text"] = df["text"].apply(preprocess_text)
df["component"] = df["component"].apply(preprocess_text)


In [26]:
df.tail(25)

Unnamed: 0,component,title,description,text
8432,team,FTP: Connection wizard has no title,Build F2\n\nWhen you are specifying your FTP c...,ftp connection wizard has no title build whe...
8433,ant,Ant Editor image does not update when problem ...,"If you have a problem that you set to ignore, ...",ant editor image does not update when problem ...
8434,user assistance,"2 additional points for the ""Accessibility fea...","As a follow-on to bug 400997 comment 9, there ...",additional points for the accessibility featu...
8435,ui,[Jobs] Property IProgressConstants2.SHOW_IN_TA...,Created attachment 254799 [details]\nA test pr...,property iprogressconstantsshowintaskbariconp...
8436,ant,Hidden functionality of the copy button for bu...,From the newsgroup:\n> I expected that button ...,hidden functionality of the copy button for bu...
8437,compare,Exceptions after one of the compare viewer inp...,Build I20030513\n\n1. Select two folders\n2. C...,exceptions after one of the compare viewer inp...
8438,swt,NullPointerException when invoking ContentAssist,eclipse.buildId=I20090313-0100\njava.version=1...,nullpointerexception when invoking contentassi...
8439,ant,NullPointerException opening Ant file with URL...,"Since upgrading to Mars, the following build f...",nullpointerexception opening ant file with url...
8440,ant,Code folding in the Ant Editor,Utilize the text framework support for code fo...,code folding in the ant editor utilize the tex...
8441,ant,Code completion does not present nested target...,antcall has supported a nested target element ...,code completion does not present nested target...


In [27]:
# 
component_counts = df['component'].value_counts()
print(component_counts)

component
swt                                             994
debug                                           994
ant                                             994
ui                                              994
team                                            929
releng                                          733
text                                            630
cvs                                             523
compare                                         356
ide                                             315
resources                                       273
user assistance                                 248
doc                                             180
search                                          144
runtime                                          93
update  deprecated  use eclipseequinoxp          57
Name: count, dtype: int64


In [28]:
df.shape[0]

8457

In [29]:
# Splitting the data
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
# Load the ROBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [31]:
def encode_data(tokenizer, texts, labels, max_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length', 
            return_attention_mask=True, 
            return_tensors='tf'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    labels = tf.convert_to_tensor(labels, dtype=tf.int32)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

In [32]:

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['component'])
test_labels = label_encoder.transform(test_data['component'])

In [34]:
# Encode data
max_length = 50  
train_encodings = encode_data(tokenizer, train_data['text'].to_numpy(), train_labels, max_length)
test_encodings = encode_data(tokenizer, test_data['text'].to_numpy(), test_labels, max_length)


In [35]:
# Convert labels to TensorFlow tensors
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)

In [36]:
# Load the ROBERTa model
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(df['component'].unique()))





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [37]:
# Prepare the training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(1000).batch(64)

In [38]:
# Prepare the validation dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(64)

In [39]:
# Optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [41]:
# Train the model
history = model.fit(train_dataset, validation_data=test_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
# Evaluate the model
eval_results = model.evaluate(test_dataset)
print(f'\nTest Loss: {eval_results[0]}, Test Accuracy: {eval_results[1]}')


Test Loss: 1.1483147144317627, Test Accuracy: 0.6637116074562073


In [46]:
from sklearn.metrics import accuracy_score, classification_report

# Prediction on test data
predictions = model.predict(test_dataset).logits
test_preds = np.argmax(predictions, axis=1)

# Convert predictions to original labels
test_preds_original = label_encoder.inverse_transform(test_preds)

# Accuracy and classification report
accuracy = accuracy_score(test_data['component'], test_preds_original)
classification_report_result = classification_report(test_data['component'], test_preds_original)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report_result)

Accuracy: 0.6637115839243499
Classification Report:
                                              precision    recall  f1-score   support

                                    ant            0.84      0.89      0.87       199
                                compare            0.73      0.68      0.70        66
                                    cvs            0.45      0.38      0.41       113
                                  debug            0.81      0.81      0.81       216
                                    doc            0.55      0.47      0.51        36
                                    ide            0.45      0.37      0.41        54
                                 releng            0.79      0.76      0.78       159
                              resources            0.38      0.49      0.43        47
                                runtime            0.00      0.00      0.00        23
                                 search            0.86      0.78      0.82        32
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
def predict_components(input_description):
    # Preprocess the input description
    processed_description = preprocess_text(input_description)
    
    # Tokenize the text
    encoded_input = tokenizer.encode_plus(
        processed_description, 
        add_special_tokens=True,
        max_length=50,  # Ensure this matches the max_length used during training
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf'
    )

    # Make prediction
    prediction = model.predict({
        'input_ids': encoded_input['input_ids'],
        'attention_mask': encoded_input['attention_mask']
    })

    # Get the top 5 predictions (indices of the sorted logits)
    top_5_indices = np.argsort(prediction.logits, axis=1)[0, -5:][::-1]

    # Convert predictions to component names using the label encoder
    top_5_components = label_encoder.inverse_transform(top_5_indices)
    
    # Get the corresponding scores (softmax to get probabilities)
    softmax_scores = tf.nn.softmax(prediction.logits, axis=1)
    top_5_scores = [softmax_scores.numpy()[0, idx] for idx in top_5_indices]

    return list(zip(top_5_components, top_5_scores))




In [45]:
import ipywidgets as widgets
from IPython.display import display

# text input field
text_input = widgets.Textarea(
    value='',
    placeholder='Type Description here',
    description='Description:',
    disabled=False,
    layout={'width': '500px', 'height': '100px'}
)

#  button
button = widgets.Button(
    description='Predict Components',
    disabled=False,
    button_style='info',
    tooltip='Click to predict top 5 components',
    icon='check' 
)

# output area
output = widgets.Output()

# button click event handler
def on_button_clicked(b):
    with output:
        output.clear_output()
        if text_input.value.strip() == '':
            print("Please enter a description.")
        else:
            recommendations = predict_components(text_input.value)
            print("Top 5 Component Recommendations and their scores:")
            for component, score in recommendations:
                print(f"{component}: {score:.4f}")


button.on_click(on_button_clicked)


display(text_input, button, output)


Textarea(value='', description='Description:', layout=Layout(height='100px', width='500px'), placeholder='Type…

Button(button_style='info', description='Predict Components', icon='check', style=ButtonStyle(), tooltip='Clic…

Output()