# NLBSE Challenge 2024: Issue Report Classification

See more at https://nlbse2024.github.io/tools/

## Loading the dataset

In [7]:
import pandas as pd

df = pd.read_csv("./data/issues_train.csv")

df


Unnamed: 0,repo,created_at,label,title,body
0,facebook/react,2023-08-26 06:33:37,bug,"[DevTools Bug] Cannot add node ""1"" because a n...",### Website or app\n\nPrivate repo cannot give...
1,facebook/react,2023-07-28 05:16:12,bug,[DevTools Bug]: Devtools extension build faili...,### Website or app\n\nN/A\n\n### Repro steps\n...
2,facebook/react,2023-07-13 21:58:31,bug,[DevTools Bug]: Deprecated __REACT_DEVTOOLS_GL...,### Website or app\n\nhttps://github.com/open-...
3,facebook/react,2023-06-14 02:31:20,bug,"[DevTools Bug] Cannot remove node ""0"" because ...",### Website or app\n\nlocal\n\n### Repro steps...
4,facebook/react,2023-06-03 11:29:44,bug,"[DevTools Bug] Cannot remove node ""103"" becaus...",### Website or app\n\nlocalhost\n\n### Repro s...
...,...,...,...,...,...
1495,opencv/opencv,2022-01-24 10:48:13,feature,core: FP denormals support,relates #21046\r\n\r\n- support x86 SSE FTZ+DA...
1496,opencv/opencv,2022-01-20 12:40:55,feature,feature: submodule or a class scope for export...,All classes are registered in the scope that c...
1497,opencv/opencv,2022-01-15 02:39:22,feature,Reading BigTiff images,**Merge with extra: https://github.com/opencv/...
1498,opencv/opencv,2022-01-14 15:37:53,feature,Add general broadcasting layer,Performance details(broadcasting 1x1 to 16x204...


## TfidVectorizer model approach

In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv') 

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

X_train = train_data[['body', 'title']]
y_train = train_data['label']

X_test = test_data[['body', 'title']]
y_test = test_data['label']

# Create a column transformer that applies TfidfVectorizer to 'body' and 'title' columns
preprocessor = ColumnTransformer(
    transformers=[
        ('body', TfidfVectorizer(stop_words='english'), 'body'),
        ('title', TfidfVectorizer(stop_words='english'), 'title')
    ],
    remainder='passthrough'
)

# Create a pipeline that first applies the column transformer and then trains a classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")



              precision    recall  f1-score   support

         bug       0.77      0.73      0.75       500
     feature       0.75      0.78      0.76       500
    question       0.68      0.68      0.68       498

    accuracy                           0.73      1498
   macro avg       0.73      0.73      0.73      1498
weighted avg       0.73      0.73      0.73      1498

Accuracy: 73.03%
F1-Score (weighted): 73.02%


## Word2Vec model approach

In [5]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    else:
        return np.mean(word2vec_model.wv[doc], axis=0)

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

# Create a concatenated column of 'title' and 'body'
train_data['text'] = train_data['title'] + " " + train_data['body']
test_data['text'] = test_data['title'] + " " + test_data['body']

# Tokenize the concatenated text
train_data['text_tokenized'] = train_data['text'].apply(lambda x: x.split())
test_data['text_tokenized'] = test_data['text'].apply(lambda x: x.split())

# Train a Word2Vec model
w2v_model = Word2Vec(pd.concat([train_data['text_tokenized'], test_data['text_tokenized']]), vector_size=100, window=5, min_count=2)

# Get the embeddings
train_data['text_vector'] = train_data['text_tokenized'].apply(lambda x: document_vector(w2v_model, x))
test_data['text_vector'] = test_data['text_tokenized'].apply(lambda x: document_vector(w2v_model, x))

# Prepare data for training and testing
X_train = np.array(list(train_data['text_vector']))
y_train = train_data['label']

X_test = np.array(list(test_data['text_vector']))
y_test = test_data['label']

# Train a LinearSVC classifier
classifier = LinearSVC()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")




              precision    recall  f1-score   support

         bug       0.73      0.64      0.69       500
     feature       0.65      0.78      0.70       500
    question       0.65      0.60      0.63       498

    accuracy                           0.67      1498
   macro avg       0.68      0.67      0.67      1498
weighted avg       0.68      0.67      0.67      1498

Accuracy: 67.42%
F1-Score (weighted): 67.28%




## Glove Concatenated Column

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

def load_glove(glove_path):
    embeddings = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, 1):  # Enumerate will give us the line number
            values = line.split()
            word = values[0]
            try:
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
            except ValueError as e:
                print()
    return embeddings

def document_vector(glove_embeddings, doc):
    # Assuming all embeddings in the GloVe file have the same size
    embedding_size = next(iter(glove_embeddings.values())).shape[0]
    
    vectors = [glove_embeddings.get(word, np.zeros(embedding_size)) for word in doc]
    return np.mean(vectors, axis=0)

# Load GloVe embeddings
glove_embeddings = load_glove('data/glove.840B.300d/glove.840B.300d.txt')

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

# Create a concatenated column of 'title' and 'body'
train_data['text'] = train_data['title'] + " " + train_data['body']
test_data['text'] = test_data['title'] + " " + test_data['body']

# Tokenize the concatenated text
train_data['text_tokenized'] = train_data['text'].apply(lambda x: x.split())
test_data['text_tokenized'] = test_data['text'].apply(lambda x: x.split())

# Get the embeddings using GloVe
train_data['text_vector'] = train_data['text_tokenized'].apply(lambda x: document_vector(glove_embeddings, x))
test_data['text_vector'] = test_data['text_tokenized'].apply(lambda x: document_vector(glove_embeddings, x))

# Prepare data for training and testing
X_train = np.array(list(train_data['text_vector']))
y_train = train_data['label']

X_test = np.array(list(test_data['text_vector']))
y_test = test_data['label']

# Train a LinearSVC classifier
classifier = LinearSVC()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")


























              precision    recall  f1-score   support

         bug       0.74      0.69      0.71       500
     feature       0.69      0.75      0.72       500
    question       0.66      0.65      0.66       498

    accuracy                           0.70      1498
   macro avg       0.70      0.70      0.70      1498
weighted avg       0.70      0.70      0.70      1498

Accuracy: 69.56%
F1-Score (weighted): 69.53%


## Glove Separate Columns

In [12]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

def load_glove(glove_path):
    embeddings = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line_no, line in enumerate(f, 1):  # Enumerate will give us the line number
            values = line.split()
            word = values[0]
            try:
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word] = vector
            except ValueError as e:
                print()
    return embeddings

def document_vector(glove_embeddings, doc):
    # Assuming all embeddings in the GloVe file have the same size
    embedding_size = next(iter(glove_embeddings.values())).shape[0]
    
    vectors = [glove_embeddings.get(word, np.zeros(embedding_size)) for word in doc]
    return np.mean(vectors, axis=0)


class GloVeVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, glove_embeddings):
        self.glove_embeddings = glove_embeddings

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([document_vector(self.glove_embeddings, x.split()) for x in X])

# Load GloVe embeddings
glove_embeddings = load_glove('data/glove.840B.300d/glove.840B.300d.txt')

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

X_train = train_data[['body', 'title']]
y_train = train_data['label']

X_test = test_data[['body', 'title']]
y_test = test_data['label']

# Create a column transformer that applies GloVeVectorizer to 'body' and 'title' columns
preprocessor = ColumnTransformer(
    transformers=[
        ('body', GloVeVectorizer(glove_embeddings), 'body'),
        ('title', GloVeVectorizer(glove_embeddings), 'title')
    ],
    remainder='passthrough'
)

# Create a pipeline that first applies the column transformer and then trains a classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")


























              precision    recall  f1-score   support

         bug       0.70      0.71      0.71       500
     feature       0.68      0.70      0.69       500
    question       0.65      0.62      0.63       498

    accuracy                           0.68      1498
   macro avg       0.68      0.68      0.68      1498
weighted avg       0.68      0.68      0.68      1498

Accuracy: 67.76%
F1-Score (weighted): 67.71%


## Fast Text Concatenated

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

def load_fasttext(fasttext_path):
    embeddings = {}
    with open(fasttext_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

def document_vector(fasttext_embeddings, doc):
    # Use embeddings to get vectors for each word in the doc
    vectors = [fasttext_embeddings.get(word, np.zeros(300)) for word in doc]
    # Compute the mean vector for the entire doc
    return np.mean(vectors, axis=0)

# Load FastText embeddings
fasttext_embeddings = load_fasttext('data/wiki-news-300d-1M-subword.vec/wiki-news-300d-1M-subword.vec')

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

# Create a concatenated column of 'title' and 'body'
train_data['text'] = train_data['title'] + " " + train_data['body']
test_data['text'] = test_data['title'] + " " + test_data['body']

# Tokenize the concatenated text
train_data['text_tokenized'] = train_data['text'].apply(lambda x: x.split())
test_data['text_tokenized'] = test_data['text'].apply(lambda x: x.split())

# Get the embeddings using FastText
train_data['text_vector'] = train_data['text_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))
test_data['text_vector'] = test_data['text_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))

# Prepare data for training and testing
X_train = np.array(list(train_data['text_vector']))
y_train = train_data['label']

X_test = np.array(list(test_data['text_vector']))
y_test = test_data['label']

# Train a LinearSVC classifier
classifier = LinearSVC()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")


              precision    recall  f1-score   support

         bug       0.61      0.62      0.61       500
     feature       0.57      0.73      0.64       500
    question       0.65      0.44      0.52       498

    accuracy                           0.60      1498
   macro avg       0.61      0.60      0.59      1498
weighted avg       0.61      0.60      0.59      1498

Accuracy: 59.88%
F1-Score (weighted): 59.27%




## FastText Separate Columns

In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

def load_fasttext(fasttext_path):
    embeddings = {}
    with open(fasttext_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

def document_vector(fasttext_embLineags.get(word, np.zeros(300)) for word in doc]
    return np.mean(vectors, axis=0)

# Load FastText embeddings
fasttext_embeddings = load_fasttext('data/wiki-news-300d-1M-subword.vec/wiki-news-300d-1M-subword.vec')

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

# Tokenize the title and body
train_data['title_tokenized'] = train_data['title'].apply(lambda x: x.split())
train_data['body_tokenized'] = train_data['body'].apply(lambda x: x.split())

test_data['title_tokenized'] = test_data['title'].apply(lambda x: x.split())
test_data['body_tokenized'] = test_data['body'].apply(lambda x: x.split())

# Get the embeddings for title and body using FastText
train_data['title_vector'] = train_data['title_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))
train_data['body_vector'] = train_data['body_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))

test_data['title_vector'] = test_data['title_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))
test_data['body_vector'] = test_data['body_tokenized'].apply(lambda x: document_vector(fasttext_embeddings, x))

# Prepare data for training and testing by concatenating title and body vectors for each record
X_train = np.hstack((np.array(list(train_data['title_vector'])), np.array(list(train_data['body_vector']))))
y_train = train_data['label']

X_test = np.hstack((np.array(list(test_data['title_vector'])), np.array(list(test_data['body_vector']))))
y_test = test_data['label']

# Train a LinearSVC classifier
classifier = LinearSVC()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")




              precision    recall  f1-score   support

         bug       0.65      0.72      0.68       500
     feature       0.66      0.75      0.70       500
    question       0.68      0.51      0.59       498

    accuracy                           0.66      1498
   macro avg       0.66      0.66      0.66      1498
weighted avg       0.66      0.66      0.66      1498

Accuracy: 66.09%
F1-Score (weighted): 65.65%


## Doc2Vec

In [18]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score, f1_score

class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, epochs=100):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.epochs = epochs
        self.d2v_model = None

    def fit(self, X, y=None):
        tagged_data = [TaggedDocument(words=_d.split(), tags=[str(i)]) for i, _d in enumerate(X)]
        self.d2v_model = Doc2Vec(vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers, epochs=self.epochs)
        self.d2v_model.build_vocab(tagged_data)
        self.d2v_model.train(tagged_data, total_examples=self.d2v_model.corpus_count, epochs=self.d2v_model.epochs)
        return self

    def transform(self, X):
        return np.array([self.d2v_model.infer_vector(_d.split()) for _d in X])

# Load the training data
train_data = pd.read_csv('./data/issues_train.csv')
test_data = pd.read_csv('./data/issues_test.csv')

# Drop rows where 'title' or 'body' is NaN
train_data.dropna(subset=['title', 'body'], inplace=True)
test_data.dropna(subset=['title', 'body'], inplace=True)

X_train = train_data[['body', 'title']]
y_train = train_data['label']
X_test = test_data[['body', 'title']]
y_test = test_data['label']

# Create a column transformer that applies Doc2VecTransformer to 'body' and 'title' columns
preprocessor = ColumnTransformer(
    transformers=[
        ('body', Doc2VecTransformer(), 'body'),
        ('title', Doc2VecTransformer(), 'title')
    ],
    remainder='passthrough'
)

# Use a RandomForest classifier for the pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report
print(classification_report(y_test, y_pred))

# Print accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")
print(f"F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted')*100:.2f}%")


              precision    recall  f1-score   support

         bug       0.61      0.44      0.51       500
     feature       0.62      0.58      0.60       500
    question       0.48      0.65      0.55       498

    accuracy                           0.56      1498
   macro avg       0.57      0.56      0.55      1498
weighted avg       0.57      0.56      0.55      1498

Accuracy: 55.61%
F1-Score (weighted): 55.45%


In [5]:
import pandas as pd
import re
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin

class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.w2v = None
        self.mean_vector = None
    
    def fit(self, X, y=None):
        # Tokenize the text
        sentences = [row.split() for row in X]
        # Train the Word2Vec model
        self.w2v = Word2Vec(sentences, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=4)
        # Pre-compute the mean vectors for the training set
        self.mean_vector = np.array([
            np.mean([self.w2v.wv[word] for word in words if word in self.w2v.wv] or [np.zeros(self.vector_size)], axis=0)
            for words in sentences
        ])
        return self
    
    def transform(self, X):
        # Compute the mean vector for the input data
        return np.array([
            np.mean([self.w2v.wv[word] for word in words.split() if word in self.w2v.wv] or [np.zeros(self.vector_size)], axis=0)
            for words in X
        ])


# The clean_text function and data loading code remain the same

# Let's test with a range of models
models = {
    "Logistic Regression": LogisticRegression(max_iter=5000),
    "LinearSVC": LinearSVC(),
    "RandomForest": RandomForestClassifier(),
}

param_distributions = {
    "Logistic Regression": {
        'clf__C': [0.1, 1, 10, 100],
        'w2v__vector_size': [100, 200],
        'w2v__window': [5, 10],
        'w2v__min_count': [1, 2]
    },
    "LinearSVC": {
        'clf__C': [0.1, 1, 10, 100],
        'w2v__vector_size': [100, 200],
        'w2v__window': [5, 10],
        'w2v__min_count': [1, 2]
    },
    "RandomForest": {
        'clf__n_estimators': [50, 100, 150],
        'clf__max_depth': [None, 50, 100],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'w2v__vector_size': [100, 200],
        'w2v__window': [5, 10],
        'w2v__min_count': [1, 2]
    },
}

# Perform RandomizedSearchCV for all models
for model_name, model in models.items():
    pipeline = Pipeline([
        ('w2v', Word2VecVectorizer()),  # Replace Tfidf with Word2VecVectorizer
        ('clf', model)
    ])
    search = RandomizedSearchCV(pipeline, param_distributions=param_distributions[model_name], 
                                n_iter=50, verbose=2, n_jobs=-1)  # Adjust n_iter as needed
    
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)

    print(f"Performance of {model_name} after Randomized Search")
    print(classification_report(y_test, y_pred))




Fitting 5 folds for each of 32 candidates, totalling 160 fits
Performance of Logistic Regression after Randomized Search
              precision    recall  f1-score   support

         bug       0.74      0.62      0.68       500
     feature       0.65      0.79      0.72       500
    question       0.63      0.60      0.62       498

    accuracy                           0.67      1498
   macro avg       0.68      0.67      0.67      1498
weighted avg       0.68      0.67      0.67      1498

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Performance of LinearSVC after Randomized Search
              precision    recall  f1-score   support

         bug       0.59      0.78      0.67       500
     feature       0.66      0.73      0.69       500
    question       0.73      0.41      0.52       498

    accuracy                           0.64      1498
   macro avg       0.66      0.64      0.63      1498
weighted avg       0.66      0.64      0.63      1498

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Performance of RandomForest after Randomized Search
              precision    recall  f1-score   support

         bug       0.67      0.56      0.61       500
     feature       0.60      0.66      0.63       500
    question       0.52      0.55      0.53       498

    accuracy                           0.59      1498
   macro avg       0.59      0.59      0.59      1498
weighted avg       0.59      0.59      0.59      1498



In [9]:
import pandas as pd
import openai

# Replace 'your-api-key' with your actual OpenAI API key
openai.api_key = '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' ## ISAC'S KEY

def query_chatgpt(prompt, model="gpt-4", max_tokens=100):
    """
    Function to query ChatGPT-4 with a given prompt.
    
    :param prompt: Prompt string to send to ChatGPT-4
    :param model: The model to use, default is ChatGPT-4
    :param max_tokens: Maximum number of tokens to generate
    :return: Response from ChatGPT-4
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "system", "content": "You are the best transformer for language classification."}, {"role": "user", "content": prompt}],
            max_tokens=max_tokens
        )
        return response.choices[0].message['content']
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
train_data = pd.read_csv('./data/issues_train.csv')
count = 0

iterations = 20

for i in range(iterations):
    correctLabel = train_data.iloc[i]['label']
    description = train_data.iloc[i]['title'] + train_data.iloc[i]['body']
    print(f"Correct PR type: {correctLabel}")
    # Example usage
    prompt = f"I will provide you with the description of a pull request on github. This description is composed by title and body of the pull request. Based on that description, I want you to answer me with just 1 word, this word can be either feature, bug or question. You will tell me whether the description given to you is a feature, a bug or a question. The definitions of those words is as follows: feature - new functionalty being added to a codebase, bug - a problem in existing code, question - an inquiry (anything that is not a feature or a bug). See pull request description below \n {description}"
    response = query_chatgpt(prompt)
    print(f"Predicted PR type: {response}")
    if str(response) == correctLabel:
        count += 1
accuracy = count/iterations
print(f"Accuracy = {accuracy}")


Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: question
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Correct PR type: bug
Predicted PR type: bug
Accuracy = 0.95
