In [3]:
!pip install tensorflow




[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import string
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report




In [2]:
# Load dataset
df = pd.read_csv("Eclipse_Platform.csv")

In [3]:
# Updated preprocess_text function
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'stack trace\S+', '', text)  # Remove stack traces
    text = re.sub(r'\b[0-9a-fA-F]+\b', '', text)  # Remove hex codes
    text = re.sub(r'\[.*?\]', '', text)  # Remove content in square brackets
    text = re.sub(r'<.*?>', '', text)  # Remove content in angle brackets
    text = text.replace('@', '')  # Remove @
    text = text.replace('-', '')  # Remove -
    text = text.replace('"', '')  # Remove "
    text = text.replace('`', '')  # Remove `
    text = text.replace('[', '')  # Remove [
    text = text.replace(']', '')  # Remove ]
    text = text.replace('\n', '')  # Remove \n
    text = text.replace('<', '')  # Remove <
    text = text.replace('>', '')  # Remove >
    text = text.replace('%', '')  # Remove %
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\r\n', ' ', text)  # Replace Windows-style newlines with space
    # tokens = word_tokenize(text)  # Tokenization
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # text = ' '.join(tokens)  # Join tokens back to string
    return text

df["text"] = df["title"] + " " + df["description"]
df["text"] = df["text"].apply(preprocess_text)
df["component"] = df["component"].apply(preprocess_text)


In [4]:
df.head(25)

Unnamed: 0,component,title,description,text
0,team,VCM Core: getIgnored() used with one resource ...,Our use of ISharingManager::getIgnored(IResour...,vcm core getignored used with one resource is ...
1,team,[Proxy] AIOOBE when the proxies array returned...,java.lang.ArrayIndexOutOfBoundsException: Arra...,aioobe when the proxies array returned by the...
2,swt,Missing source bundle for org.eclipse.swt.tools,During work on bug 484004 I found that bundle ...,missing source bundle for orgeclipseswttools d...
3,team,[History View] Generic DND support required,Currently the new History View only supports t...,generic dnd support required currently the ne...
4,team,Export of Team Projects Sets damaged,Build ID: I20080617-2000\n\nSteps To Reproduc...,export of team projects sets damaged build id ...
5,resources,Debug output does not tell what job is executed,There are several places in core.resources whe...,debug output does not tell what job is execute...
6,debug,"No mnemonics on ""Launch with errors"" preference","The ""Always"" and ""Prompt"" buttons for the ""Lau...",no mnemonics on launch with errors preference ...
7,swt,OS X: native crash in MenuItem.setMenu(Menu),"When I run the attached sample in OS X 10.8.5,...",os x native crash in menuitemsetmenumenu when ...
8,cvs,"Tag dialog, ResourceMapping and Recurse",The Tag dialog (for Replace) has a recurse opt...,tag dialog resourcemapping and recurse the tag...
9,releng,Mac-cocoa Eclipse.app has invalid Info.plist a...,"Tested with I20160428-8000, I20160430-2000 bui...",maccocoa eclipseapp has invalid infoplist and ...


In [5]:
# Assuming df_resampled is your final DataFrame after preprocessing and balancing
component_counts = df['component'].value_counts()
print(component_counts)

component
swt                                          994
debug                                        994
ant                                          994
ui                                           994
team                                         929
releng                                       733
text                                         630
cvs                                          523
compare                                      356
ide                                          315
resources                                    273
user assistance                              248
doc                                          180
search                                       144
runtime                                       93
update  deprecated  use eclipseequinoxp       57
Name: count, dtype: int64


In [6]:
# Calculate the average number of issues per component
average_issues_per_component = int(np.floor(df['component'].value_counts().mean()))

downsampled_dfs = []

for component in df['component'].unique():
    component_df = df[df['component'] == component]
    
    # Check if the number of issues for the current component exceeds the average
    if len(component_df) > average_issues_per_component:
        
        downsampled_dfs.append(component_df.sample(average_issues_per_component, random_state=42))
    else:
        # If the component has less than or equal to the average, keep all its issues and add to the list
        downsampled_dfs.append(component_df)

# Concatenate all the DataFrames in the list to create the downsampled DataFrame
downsampled_df = pd.concat(downsampled_dfs).reset_index(drop=True)
df=downsampled_df

In [7]:
# Assuming df_resampled is your final DataFrame after preprocessing and balancing
component_counts = df['component'].value_counts()
print(component_counts)

component
team                                         528
swt                                          528
debug                                        528
releng                                       528
ant                                          528
ui                                           528
text                                         528
cvs                                          523
compare                                      356
ide                                          315
resources                                    273
user assistance                              248
doc                                          180
search                                       144
runtime                                       93
update  deprecated  use eclipseequinoxp       57
Name: count, dtype: int64


In [8]:
# Delete components with fewer than 50 issues
component_counts = df['component'].value_counts()
valid_components = component_counts[component_counts >= 528].index
df= df[df['component'].isin(valid_components)]

In [9]:
# Splitting the data
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Load the ROBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [11]:
def encode_data(tokenizer, texts, labels, max_length):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text, 
            add_special_tokens=True, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length', 
            return_attention_mask=True, 
            return_tensors='tf'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    labels = tf.convert_to_tensor(labels, dtype=tf.int32)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

In [12]:
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['component'])
test_labels = label_encoder.transform(test_data['component'])

In [13]:
# Encode data
max_length = 50  
train_encodings = encode_data(tokenizer, train_data['text'].to_numpy(), train_labels, max_length)
test_encodings = encode_data(tokenizer, test_data['text'].to_numpy(), test_labels, max_length)


In [14]:
# Convert labels to TensorFlow tensors
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)

In [15]:
# Load the ROBERTa model
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(df['component'].unique()))





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [16]:
# Prepare the training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
)).shuffle(1000).batch(128)

In [17]:
# Prepare the validation dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
)).batch(128)

In [18]:
# Optimizer and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [31]:
# Train the model
history = model.fit(train_dataset, validation_data=test_dataset, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [32]:
# Evaluate the model
eval_results = model.evaluate(test_dataset)
print(f'\nTest Loss: {eval_results[0]}, Test Accuracy: {eval_results[1]}')


Test Loss: 0.7537953853607178, Test Accuracy: 0.7486486434936523


In [33]:
from sklearn.metrics import accuracy_score, classification_report

# Prediction on test data
predictions = model.predict(test_dataset).logits
test_preds = np.argmax(predictions, axis=1)

# Convert predictions to original labels
test_preds_original = label_encoder.inverse_transform(test_preds)

# Accuracy and classification report
accuracy = accuracy_score(test_data['component'], test_preds_original)
classification_report_result = classification_report(test_data['component'], test_preds_original)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report_result)

Accuracy: 0.7486486486486487
Classification Report:
              precision    recall  f1-score   support

       ant         0.85      0.79      0.82       104
     debug         0.70      0.83      0.76       109
    releng         0.78      0.92      0.84       101
       swt         0.74      0.69      0.71       101
      team         0.88      0.86      0.87       130
      text         0.69      0.61      0.65       100
        ui         0.55      0.48      0.51        95

    accuracy                           0.75       740
   macro avg       0.74      0.74      0.74       740
weighted avg       0.75      0.75      0.75       740



In [34]:
def predict_components(input_description):
    # Preprocess the input description
    processed_description = preprocess_text(input_description)
    
    # Tokenize the text
    encoded_input = tokenizer.encode_plus(
        processed_description, 
        add_special_tokens=True,
        max_length=50,  # Ensure this matches the max_length used during training
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf'
    )

    # Make prediction
    prediction = model.predict({
        'input_ids': encoded_input['input_ids'],
        'attention_mask': encoded_input['attention_mask']
    })

    # Get the top 5 predictions (indices of the sorted logits)
    top_5_indices = np.argsort(prediction.logits, axis=1)[0, -5:][::-1]

    # Convert predictions to component names using the label encoder
    top_5_components = label_encoder.inverse_transform(top_5_indices)
    
    # Get the corresponding scores (softmax to get probabilities)
    softmax_scores = tf.nn.softmax(prediction.logits, axis=1)
    top_5_scores = [softmax_scores.numpy()[0, idx] for idx in top_5_indices]

    return list(zip(top_5_components, top_5_scores))




In [35]:
import ipywidgets as widgets
from IPython.display import display

# text input field
text_input = widgets.Textarea(
    value='',
    placeholder='Type Description here',
    description='Description:',
    disabled=False,
    layout={'width': '500px', 'height': '100px'}
)

#  button
button = widgets.Button(
    description='Predict Components',
    disabled=False,
    button_style='info',
    tooltip='Click to predict top 5 components',
    icon='check' 
)

# output area
output = widgets.Output()

# button click event handler
def on_button_clicked(b):
    with output:
        output.clear_output()
        if text_input.value.strip() == '':
            print("Please enter a description.")
        else:
            recommendations = predict_components(text_input.value)
            print("Top 5 Component Recommendations and their scores:")
            for component, score in recommendations:
                print(f"{component}: {score:.4f}")


button.on_click(on_button_clicked)


display(text_input, button, output)


Textarea(value='', description='Description:', layout=Layout(height='100px', width='500px'), placeholder='Type…

Button(button_style='info', description='Predict Components', icon='check', style=ButtonStyle(), tooltip='Clic…

Output()