In [15]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [16]:
# Location of the Eclipse bug-report CSV. Set the ECLIPSE_BUGS_CSV environment
# variable to point at a different copy; when it is unset, the original
# hard-coded path is used so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'ECLIPSE_BUGS_CSV',
    '/Users/alina/Documents/dissertation/eclipse_all_bugs.csv',
)
df = pd.read_csv(DATA_PATH)

In [26]:
# preprocess an independent (deep) copy so the raw frame `df` is left untouched
dff = df.copy(deep=True)

In [27]:
# collapse the two-level target (Product, Component) into a single
# space-separated label, then discard the two source columns
dff = (
    dff
    .assign(Product_component=lambda frame: frame['Product'] + ' ' + frame['Component'])
    .drop(columns=['Product', 'Component'])
)

In [28]:
# drop rows where the target label or the description text is missing
dff = dff.dropna(subset=['Product_component', 'Description'])

# most frequent Importance value, used to impute missing entries in that column
mode_value = dff['Importance'].mode()[0]

# Fill both columns with one DataFrame-level call and rebind the result:
#   - Importance: NaN -> most frequent category
#   - Title:      NaN -> the placeholder text 'Unknown'
# Calling fillna(..., inplace=True) on a column selected via dff[...] operates on
# an intermediate object; recent pandas warns about this chained assignment, and
# with Copy-on-Write enabled it leaves dff unchanged.
dff = dff.fillna({'Importance': mode_value, 'Title': 'Unknown'})

In [29]:
# model inputs are the two free-text columns plus Importance;
# the target is the flattened Product_component label
feature_columns = ['Title', 'Description', 'Importance']
X = dff[feature_columns]
y = dff['Product_component']

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# per-column preprocessing steps, each given as (name, transformer, column selection)
title_step = ('text_title', CountVectorizer(), 'Title')
description_step = ('text_desc', CountVectorizer(), 'Description')
# handle_unknown='ignore': Importance values not seen during fit do not raise at transform time
importance_step = ('cat_importance', OneHotEncoder(handle_unknown='ignore'), ['Importance'])

# columns not named in any step are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[title_step, description_step, importance_step],
    remainder='passthrough',
)

In [31]:
# fit the preprocessing on the training rows only, then apply the fitted
# transformer to the held-out rows
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# encode the target labels as numeric codes; a label not seen during fit is
# encoded as -1 instead of raising
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# the encoder expects 2-D input, so each label vector becomes a single column
y_train_column = y_train.values.reshape(-1, 1)
y_test_column = y_test.values.reshape(-1, 1)

y_train_encoded = encoder.fit_transform(y_train_column)
y_test_encoded = encoder.transform(y_test_column)

In [None]:
# fit the classifier on the preprocessed training data
logreg = LogisticRegression(n_jobs=2)

# y_train_encoded is the (n_samples, 1) column produced by the OrdinalEncoder.
# Flatten it to shape (n_samples,) so fit() receives the 1-D target it expects
# rather than a column vector (which scikit-learn converts with a warning).
logreg.fit(X_train_processed, y_train_encoded.ravel())

In [34]:
# evaluate on test data
y_pred = logreg.predict(X_test_processed)

# decode the numerical labels back to their original form
# (the encoder works on 2-D input, hence the reshape to a single column)
y_test_decoded = encoder.inverse_transform(y_test_encoded.reshape(-1, 1))
y_pred_decoded = encoder.inverse_transform(y_pred.reshape(-1, 1))

# convert the arrays to pandas DataFrames so missing entries can be filled
y_test_decoded = pd.DataFrame(y_test_decoded)
y_pred_decoded = pd.DataFrame(y_pred_decoded)

# labels the encoder could not map back (test labels unseen during fit were
# encoded as -1) come out as missing values; give them an explicit class name
default_value = 'Unknown'
y_test_decoded = y_test_decoded.fillna(default_value)
y_pred_decoded = y_pred_decoded.fillna(default_value)

# zero_division=0 reports 0.0 for classes that are never predicted -- the same
# value the default setting yields -- without emitting an UndefinedMetricWarning
# for every such class.
report = classification_report(y_test_decoded, y_pred_decoded, zero_division=0)
print('Classification Report:\n', report)


Classification Report:
                     precision    recall  f1-score   support

      CDT cdt-core       0.43      0.14      0.21        22
     CDT cdt-debug       1.00      0.05      0.09        22
       CDT cdt-doc       0.00      0.00      0.00         4
     CDT cdt-other       0.43      0.21      0.29        14
 Equinox Incubator       0.00      0.00      0.00         3
          JDT Core       0.70      0.63      0.67       226
         JDT Debug       0.63      0.66      0.64       253
           JDT Doc       0.00      0.00      0.00         2
          JDT Text       0.17      0.06      0.08        18
            JDT UI       0.66      0.76      0.71       629
         PDE Build       0.50      0.25      0.33         4
            PDE UI       0.63      0.58      0.60        89
      Platform Ant       0.80      0.75      0.77        16
      Platform CVS       0.00      0.00      0.00         1
  Platform Compare       0.61      0.49      0.54        41
    Platform De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
