# NLBSE Challenge 2024: Issue Report Classification

See more at https://nlbse2024.github.io/tools/

## Loading the dataset

In [7]:
import pandas as pd

df = pd.read_csv("./data/issues_train.csv")

df


Unnamed: 0,repo,created_at,label,title,body
0,facebook/react,2023-08-26 06:33:37,bug,"[DevTools Bug] Cannot add node ""1"" because a n...",### Website or app\n\nPrivate repo cannot give...
1,facebook/react,2023-07-28 05:16:12,bug,[DevTools Bug]: Devtools extension build faili...,### Website or app\n\nN/A\n\n### Repro steps\n...
2,facebook/react,2023-07-13 21:58:31,bug,[DevTools Bug]: Deprecated __REACT_DEVTOOLS_GL...,### Website or app\n\nhttps://github.com/open-...
3,facebook/react,2023-06-14 02:31:20,bug,"[DevTools Bug] Cannot remove node ""0"" because ...",### Website or app\n\nlocal\n\n### Repro steps...
4,facebook/react,2023-06-03 11:29:44,bug,"[DevTools Bug] Cannot remove node ""103"" becaus...",### Website or app\n\nlocalhost\n\n### Repro s...
...,...,...,...,...,...
1495,opencv/opencv,2022-01-24 10:48:13,feature,core: FP denormals support,relates #21046\r\n\r\n- support x86 SSE FTZ+DA...
1496,opencv/opencv,2022-01-20 12:40:55,feature,feature: submodule or a class scope for export...,All classes are registered in the scope that c...
1497,opencv/opencv,2022-01-15 02:39:22,feature,Reading BigTiff images,**Merge with extra: https://github.com/opencv/...
1498,opencv/opencv,2022-01-14 15:37:53,feature,Add general broadcasting layer,Performance details(broadcasting 1x1 to 16x204...


## TfidVectorizer model approach

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv') 

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

X_train = train_data[['body', 'title']]
y_train = train_data['label']

X_test = test_data[['body', 'title']]
y_test = test_data['label']

# Create a column transformer that applies TfidfVectorizer to 'body' and 'title' columns
preprocessor = ColumnTransformer(
    transformers=[
        ('body', TfidfVectorizer(stop_words='english'), 'body'),
        ('title', TfidfVectorizer(stop_words='english'), 'title')
    ],
    remainder='passthrough'
)

# Create a pipeline that first applies the column transformer and then trains a classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")




              precision    recall  f1-score   support

         bug       0.77      0.73      0.75       500
     feature       0.75      0.78      0.76       500
    question       0.68      0.68      0.68       498

    accuracy                           0.73      1498
   macro avg       0.73      0.73      0.73      1498
weighted avg       0.73      0.73      0.73      1498

Accuracy: 73.03%


## Word2Vec model approach

In [30]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    else:
        return np.mean(word2vec_model.wv[doc], axis=0)

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

# Create a concatenated column of 'title' and 'body'
train_data['text'] = train_data['title'] + " " + train_data['body']
test_data['text'] = test_data['title'] + " " + test_data['body']

# Tokenize the concatenated text
train_data['text_tokenized'] = train_data['text'].apply(lambda x: x.split())
test_data['text_tokenized'] = test_data['text'].apply(lambda x: x.split())

# Train a Word2Vec model
w2v_model = Word2Vec(pd.concat([train_data['text_tokenized'], test_data['text_tokenized']]), vector_size=100, window=5, min_count=2)

# Get the embeddings
train_data['text_vector'] = train_data['text_tokenized'].apply(lambda x: document_vector(w2v_model, x))
test_data['text_vector'] = test_data['text_tokenized'].apply(lambda x: document_vector(w2v_model, x))

# Prepare data for training and testing
X_train = np.array(list(train_data['text_vector']))
y_train = train_data['label']

X_test = np.array(list(test_data['text_vector']))
y_test = test_data['label']

# Train a LinearSVC classifier
classifier = LinearSVC()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")




              precision    recall  f1-score   support

         bug       0.72      0.63      0.68       500
     feature       0.65      0.77      0.71       500
    question       0.66      0.62      0.64       498

    accuracy                           0.67      1498
   macro avg       0.68      0.67      0.67      1498
weighted avg       0.68      0.67      0.67      1498

Accuracy: 67.42%


