In [2]:
import string
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from multiprocessing.pool import ThreadPool as Pool 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
import lightgbm as lgb
import pickle
from flask import Flask, request 
from scipy.sparse import hstack
from sklearn import metrics
import numpy as np


In [3]:
def preprocess(text: string, stopword_set: set = set(stopwords.words()) , stemmer: PorterStemmer =PorterStemmer()): 
    cleaned_text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,.<=>?@[]^`{|}~' + u'\xa0'))
    cleaned_text = cleaned_text.lower()
    cleaned_text = cleaned_text.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), ''))
    cleaned_text = ' '.join(['_variable_with_underscore' if '_' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_dash' if '-' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_long_variable_name' if len(t) > 15 and t[0] != '#' else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_weburl' if t.startswith('http') and '/' in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number' if re.sub('[\\/;:_-]', '', t).isdigit() else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_variable_with_address' if re.match('.*0x[0-9a-f].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_name_with_number' if re.match('.*[a-f]*:[0-9]*', t) else t for t in  cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_one_character' if re.match('[a-f][0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_number_starts_with_three_characters' if re.match('[a-f]{3}[0-9].*', t) else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_version' if any(i.isdigit() for i in t) and t.startswith('v') else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_localpath' if ('\\' in t or '/' in t) and ':' not in t else t for t in cleaned_text.split()])
    cleaned_text = ' '.join(['_image_size' if t.endswith('px') else t for t in cleaned_text.split()])

    tokenized_text = word_tokenize(cleaned_text)
    sw_removed_text = [word for word in tokenized_text if word not in stopword_set]
    sw_removed_text = [word for word in tokenized_text if len(word) > 0]
    stemmed_text = ' '.join([stemmer.stem(w) for w in sw_removed_text])
    return stemmed_text

In [54]:
dataset = pd.read_json('resource/embold_train.json') 
dataset.loc[dataset['label'] > 0, 'label'] = -1 
dataset.loc[dataset['label'] == 0, 'label'] = 1 
dataset.loc[dataset['label'] == -1, 'label'] = 0
stopwords = set(stopwords.words('english')) 
ps = PorterStemmer()


In [55]:
dataset = dataset.loc[:50000]
dataset

Unnamed: 0,title,body,label
0,y-zoom piano roll,a y-zoom on the piano roll would be useful.,0
1,buggy behavior in selection,! screenshot from 2016-02-23 21 27 40 https:/...,1
2,auto update feature,"hi,\r \r great job so far, @saenzramiro ! : \r...",0
3,filter out noisy endpoints in logs,i think we should stop logging requests to:\r ...,0
4,enable pid on / pid off alarm actions for ardu...,expected behavior\r alarm actions pid on and p...,1
...,...,...,...
49996,kas 1.0: rts-1 doesn't see all the resources o...,it seems the \ direction\ of the rts-1 affect...,1
49997,add a picking system,i don't saw the picking system/engine into you...,0
49998,upgrade to ionic 3.0,ionic version 3 has been released based on ang...,0
49999,implement tabbed interface,because all interfaces must be tabbed.\r \r fo...,0


In [56]:
stopword_set = None
stemmer = None
def initialize_pool(stopword_set_arg, stemmer_arg):
    global stopword_set
    global stemmer
    stopword_set = stopword_set_arg
    stemmer = stemmer_arg

In [57]:
pool = Pool(8)

pool = Pool(8, initializer=initialize_pool, initargs=(stopwords, ps, )) 

In [88]:
cleaned_title = pool.map(preprocess, dataset.title)

In [59]:
cleaned_title

['_long_variable_nam piano roll',
 'buggi behavior in select',
 'auto updat featur',
 'filter out noisi endpoint in log',
 'enabl pid on _localpath pid off alarm action for _localpath',
 'script stop ad video',
 'add the translat of _long_variable_nam',
 'propos loadtransl to lazi load _long_variable_nam',
 'bot should post to list period instead of onli post on startup',
 'en la org _long_variable_nam peopl info _localpath produc crash',
 '_name_with_numb check result against train data',
 'null or in jsonexport',
 'custom averag in _long_variable_nam',
 'add consist cach via separ process',
 '_long_variable_nam support pagin',
 'filter float point',
 'i2cwrit error on debian _number can not read properti i2cwritesync of undefin',
 'handl search which yield no result better',
 'path of war _long_variable_nam manuev il',
 'implement iap in app purchas subscript on io',
 'block that can be place face a particular direct are sometim place wrong',
 'app break when chang month after the ac

In [87]:
cleaned_body = pool.map(preprocess, dataset.body)

In [61]:
cleaned_body

['a _long_variable_nam on the piano roll would be use',
 'screenshot from _long_variable_nam _number _number _number _long_variable_nam _localpath',
 '_localpath _localpath great job so far saenzramiro _name_with_numb _localpath _localpath an auto updat featur would be nice to _localpath or altern a menu button to check for the latest version manual',
 'i think we should stop log request _name_with_numb _long_variable_nam _long_variable_nam _localpath _long_variable_nam _long_variable_nam _localpath _long_variable_nam faviconico _localpath',
 'expect _localpath alarm action pid on and pid off should enabl _localpath disabl hardwar pid on _localpath platform rampsoak on _localpath off may also be _localpath _localpath actual _localpath pid state on _localpath doe not chang when alarm _localpath _localpath step to reproduc the _localpath alarm handler onli work if _long_variable_nam is set sw onli _long_variable_nam case is handl in pidon _localpath pidoff function so thi condit may be a

In [62]:
data_texts = pd.DataFrame([cleaned_title, cleaned_body], index=['title','body']).T 
y = dataset['label'] 

data_fit, data_blindtest, y_fit, y_blindtest = model_selection.train_test_split(data_texts, y, test_size=0.1) 

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1)) 
tfidf_vectorizer.fit(cleaned_title + cleaned_body) 

X_tfidf_fit = tfidf_vectorizer.transform(data_fit['title']) 
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'])

In [63]:
gbm_model = lgb.LGBMClassifier()

precision_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2,scoring='precision').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean() 
f1_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean() 

print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score)) 

CV: p:0.7819 r:0.7824 f:0.7841


In [64]:
data_fit_train, data_fit_test, y_fit_train, y_fit_test = model_selection.train_test_split(data_fit, y_fit, test_size=0.3) 

X_tfidf_fit_train = tfidf_vectorizer.transform(data_fit_train['title']) 
X_tfidf_fit_test = tfidf_vectorizer.transform(data_fit_test['title']) 
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title']) 

gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], eval_metric='AUC') 
precision_test_score = metrics.precision_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro') 
recall_test_score = metrics.recall_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro') 
f1_test_score = metrics.f1_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, average='macro') 

print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, f1_test_score)) 


[LightGBM] [Info] Number of positive: 14134, number of negative: 17366
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46796
[LightGBM] [Info] Number of data points in the train set: 31500, number of used features: 1249
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448698 -> initscore=-0.205931
[LightGBM] [Info] Start training from score -0.205931
test: p:0.7891 r:0.7936 f:0.7908


In [79]:
pickle.dump(tfidf_vectorizer, open('resource/github_bug_prediction_tfidf_vectorizer.pkl', 'wb')) 
pickle.dump(gbm_model, open('resource/github_bug_prediction_basic_model.pkl', 'wb'))

### Sk-learn LSA

In [14]:
from sklearn.decomposition import TruncatedSVD 

lsa = TruncatedSVD(n_components=500, n_iter=100, random_state=0) 
lsa.fit(X_tfidf_fit) 
X_lsa_fit = lsa.transform(X_tfidf_fit) 

gbm_model_with_lsa = lgb.LGBMClassifier() 

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean() 
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean() 
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_lsa_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean() 

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score)) 

X_fit_with_lsa = hstack([X_tfidf_fit, X_lsa_fit]).tocsr() 

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean() 
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean() 
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lsa, X_fit_with_lsa, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean() 

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

fit: p:0.7743 r:0.7711 f:0.7723
fit: p:0.7871 r:0.7838 f:0.7851


### Sk-learn LDA

In [15]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit['title'])
X_tfidf_blindtest = count_vectorizer.transform(data_blindtest['title'])
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)
gbm_model_with_lda = lgb.LGBMClassifier()


In [16]:
precision_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_lda_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

fit: p:0.7135 r:0.7059 f:0.7075


In [17]:
X_fit_with_lda = hstack([X_tfidf_fit, X_lda_fit]).tocsr()

precision_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_with_lda, X_fit_with_lda, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('fit: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))

fit: p:0.7927 r:0.7875 f:0.7893


# (1)Hyperparameter Optimization -- pg.50-52

In [66]:
import optuna

def objective(trial): 
    dtrain = lgb.Dataset(X_tfidf_fit_train, label=y_fit_train) 

    param = { 
    "objective": "binary", 
    "metric": "binary_logloss", 
    "verbosity": -1, 
    "boosting_type": "gbdt", 
    "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True), 
    "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True), 
    "num_leaves": trial.suggest_int("num_leaves", 2, 256), 
    "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0), 
    "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0), 
    "bagging_freq": trial.suggest_int("bagging_freq", 1, 7), 
    "min_child_samples": trial.suggest_int("min_child_samples", 5, 100), 
    } 

    gbm = lgb.train(param, dtrain) 
    preds = gbm.predict(X_tfidf_fit_test) 
    pred_labels = np.rint(preds) 
    accuracy = metrics.roc_auc_score(y_fit_test, pred_labels) 
    return accuracy 



In [67]:
study = optuna.create_study(direction="maximize") 
study.optimize(objective, n_trials=30) 

[I 2024-03-08 23:45:54,314] A new study created in memory with name: no-name-fffc121c-0a6c-43a4-a04d-2f85fb726a55
[I 2024-03-08 23:45:55,184] Trial 0 finished with value: 0.7760215503160969 and parameters: {'lambda_l1': 4.419001317559781e-07, 'lambda_l2': 0.030852868653625912, 'num_leaves': 20, 'feature_fraction': 0.626873063428099, 'bagging_fraction': 0.7009799782882757, 'bagging_freq': 2, 'min_child_samples': 33}. Best is trial 0 with value: 0.7760215503160969.
[I 2024-03-08 23:45:56,386] Trial 1 finished with value: 0.7796521012278781 and parameters: {'lambda_l1': 5.51354151043671e-06, 'lambda_l2': 9.319046868795827e-05, 'num_leaves': 46, 'feature_fraction': 0.5600680685365126, 'bagging_fraction': 0.9174833899778673, 'bagging_freq': 2, 'min_child_samples': 43}. Best is trial 1 with value: 0.7796521012278781.
[I 2024-03-08 23:45:57,125] Trial 2 finished with value: 0.7664958077925738 and parameters: {'lambda_l1': 1.4888852631301854e-06, 'lambda_l2': 2.1116989328371447e-06, 'num_leave

In [68]:
trial = study.best_trial

In [69]:
trial.params

{'lambda_l1': 0.06649123824483635,
 'lambda_l2': 0.04611630350257047,
 'num_leaves': 133,
 'feature_fraction': 0.9703471695518046,
 'bagging_fraction': 0.8318999823249558,
 'bagging_freq': 4,
 'min_child_samples': 5}

In [70]:
gbm_model = lgb.LGBMClassifier(**trial.params)

precision_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()
print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_cv_score, recall_cv_score, f1_cv_score))


CV: p:0.7880 r:0.7866 f:0.7872


In [78]:
# Model fitting and evaluation on test data
gbm_model.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], 
              eval_metric='AUC')
# Evaluation metrics on test data
precision_test_score = metrics.precision_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, 
                                               average='macro')
recall_test_score = metrics.recall_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, 
                                         average='macro')
f1_test_score = metrics.f1_score(gbm_model.predict(X_tfidf_blindtest), y_blindtest, 
                                 average='macro')

# Printing test results
print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, 
                                                   f1_test_score))

[LightGBM] [Info] Number of positive: 14134, number of negative: 17366
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53856
[LightGBM] [Info] Number of data points in the train set: 31500, number of used features: 2844
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448698 -> initscore=-0.205931
[LightGBM] [Info] Start training from score -0.205931
test: p:0.7872 r:0.7887 f:0.7879


# (2)In class activity -- pg.53

In [95]:
count_vectorizer = CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(cleaned_title + cleaned_body)
X_tf_fit = count_vectorizer.transform(data_fit['title'])
X_tfidf_blindtest = count_vectorizer.transform(data_blindtest['title'])
lda = LatentDirichletAllocation(n_components=500, random_state=0)
lda.fit(X_tf_fit)
X_lda_fit = lda.transform(X_tf_fit)
gbm_model_with_lda = lgb.LGBMClassifier()

In [96]:
lda_study = optuna.create_study(direction="maximize") 
lda_study.optimize(objective, n_trials=30) 

[I 2024-03-09 00:11:11,939] A new study created in memory with name: no-name-835c21ab-068a-4684-9620-40cc088b1ea6
[I 2024-03-09 00:11:13,512] Trial 0 finished with value: 0.7620806191272266 and parameters: {'lambda_l1': 0.0008844067079026687, 'lambda_l2': 0.010981836360873569, 'num_leaves': 250, 'feature_fraction': 0.6208106158961265, 'bagging_fraction': 0.6599400476352488, 'bagging_freq': 2, 'min_child_samples': 92}. Best is trial 0 with value: 0.7620806191272266.
[I 2024-03-09 00:11:15,477] Trial 1 finished with value: 0.7762253159367934 and parameters: {'lambda_l1': 0.06499899287888909, 'lambda_l2': 4.898700149315819, 'num_leaves': 115, 'feature_fraction': 0.9618625161633172, 'bagging_fraction': 0.7722324566116461, 'bagging_freq': 6, 'min_child_samples': 56}. Best is trial 1 with value: 0.7762253159367934.
[I 2024-03-09 00:11:16,991] Trial 2 finished with value: 0.7781484790837994 and parameters: {'lambda_l1': 7.147540117509886e-06, 'lambda_l2': 5.218959272901435, 'num_leaves': 106,

In [97]:
trial_lda = lda_study.best_trial
trial_lda.params

{'lambda_l1': 1.646898485738107e-08,
 'lambda_l2': 2.2114858208151143e-05,
 'num_leaves': 167,
 'feature_fraction': 0.9953610097383894,
 'bagging_fraction': 0.9799164942174642,
 'bagging_freq': 5,
 'min_child_samples': 5}

In [98]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1)) 
tfidf_vectorizer.fit(cleaned_title + cleaned_body) 

X_tfidf_fit = tfidf_vectorizer.transform(data_fit['title']) 
X_tfidf_blindtest = tfidf_vectorizer.transform(data_blindtest['title'])

In [99]:
gbm_model_lda = lgb.LGBMClassifier(**trial_lda.params)

precision_cv_score = model_selection.cross_val_score(gbm_model_lda, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='precision_macro').mean()
recall_cv_score = model_selection.cross_val_score(gbm_model_lda, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='recall_macro').mean()
f1_cv_score = model_selection.cross_val_score(gbm_model_lda, X_tfidf_fit, y_fit, cv=5, n_jobs=-2, scoring='f1_macro').mean()

print('CV: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, 
                                                   f1_test_score))

CV: p:0.7951 r:0.7969 f:0.7959


In [100]:
# Model fitting and evaluation on test data
gbm_model_lda.fit(X_tfidf_fit_train, y_fit_train, eval_set=[(X_tfidf_fit_test, y_fit_test)], 
              eval_metric='AUC')
# Evaluation metrics on test data
precision_test_score = metrics.precision_score(gbm_model_lda.predict(X_tfidf_blindtest), y_blindtest, 
                                               average='macro')
recall_test_score = metrics.recall_score(gbm_model_lda.predict(X_tfidf_blindtest), y_blindtest, 
                                         average='macro')
f1_test_score = metrics.f1_score(gbm_model_lda.predict(X_tfidf_blindtest), y_blindtest, 
                                 average='macro')

# Printing test results
print('test: p:{0:.4f} r:{1:.4f} f:{2:.4f}'.format(precision_test_score, recall_test_score, 
                                                   f1_test_score))

[LightGBM] [Info] Number of positive: 14134, number of negative: 17366
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53856
[LightGBM] [Info] Number of data points in the train set: 31500, number of used features: 2844
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448698 -> initscore=-0.205931
[LightGBM] [Info] Start training from score -0.205931
test: p:0.7891 r:0.7905 f:0.7897
