#
# 1703-06337.R, 28 Dec 19
# Data from:
# Mefta Sadat and Ayse Basar Bener and Andriy V. Miranskyy
# Rediscovery Datasets: Connecting Duplicate Reports
#
# Example from:
# Evidence-based Software Engineering: based on the publicly available data
# Derek M. Jones
#
# TAG fault-report_duplicate KDE_fault-report
# Load the shared configuration script, then the two packages this script
# uses: gnm (gnm, instances, Mult, Exp) and plyr (count).
# NOTE(review): ESEUR_dir and point_col are used below but never assigned in
# this file -- presumably ESEUR_config.r defines them; confirm.
source("ESEUR_config.r")

library("gnm")
library("plyr")
#
#
# plot_dups=function(df)
# {
# t=count(count(df$root_id)$freq)
# if (nrow(t) < 15)
#    return()
#
# lines(t$x, t$freq)
# }
# plot_layout(2, 1)
# Four colours used for the fitted curves drawn over the plot.
pal_col=rainbow(4)

# Read the KDE data; as.is=TRUE keeps character columns as strings
# rather than converting them to factors.
kde=read.csv(paste0(ESEUR_dir, "reliability/kde.csv.xz"), as.is=TRUE)

# Keep only the rows whose root_id is not NA.
dups=subset(kde, !is.na(root_id))

# plot(0.9, type="n", log="xy",
#	xlim=c(1, 20), ylim=c(1, 400),
#	xlab="Reports", ylab="Duplicates")
#
# d_ply(t, .(product), plot_dups)
#
#

# Count of counts: the inner count() gives the number of rows per root_id,
# the outer count() gives, for each such number (column x), how many
# root_ids have it (column freq).
dup_cnt=count(count(dups$root_id)$freq)
# plot(dup_cnt$freq, log="y", col=point_col,
#	xlab="Duplicates", ylab="Reports\n")
#
# fail_mod=gnm(freq ~ instances(Mult(1, Exp(x)), 2)-1,
#		data=dup_cnt[-1, ], verbose=TRUE, trace=TRUE,
#		start=c(30000.0, -0.7, 300.0, -0.1),
#		family=poisson(link="identity"))
# summary(fail_mod)
#
# exp_coef=as.numeric(coef(fail_mod))
#
# lines(exp_coef[1]*exp(exp_coef[2]*dup_cnt$x), col=pal_col[1])
# lines(exp_coef[3]*exp(exp_coef[4]*dup_cnt$x), col=pal_col[3])
#
# t=predict(fail_mod)
# lines(t, col=pal_col[2])
# Scatter plot of the count-of-counts table: x is the number of rows sharing
# a root_id, y (log scale) is the number of root_ids having that many rows.
# xaxs="i" makes the x-axis span exactly xlim, with no added padding.
# NOTE(review): the axis labels are kept exactly as found; the commented-out
# earlier plot used "Duplicates"/"Reports" -- confirm which pair is intended.
plot(dup_cnt$x, dup_cnt$freq, log="y", col=point_col,
	xaxs="i",
	xlim=c(1, 170),
	xlab="Fault report ID", ylab="Occurrences\n")
# Cannot get any convergence if the first count is included,
# so the first row of dup_cnt is dropped before fitting.
dup2_cnt=dup_cnt[-1, ]

# Fit freq as the sum of three exponential terms with no intercept (the -1),
# each term being a multiplier times exp(rate*x).  The identity link keeps
# the terms additive on the scale of freq.  The start values are given as
# (multiplier, rate) pairs, the same layout the lines() calls below assume.
fail_mod=gnm(freq ~ instances(Mult(1, Exp(x)), 3)-1,
		data=dup2_cnt, verbose=FALSE, trace=FALSE,
		start=c(230000.0, -1.0, 2100.0, -0.3, 21, -0.03),
		family=poisson(link="identity"))
summary(fail_mod)

# Drop the coefficient names so the values can be indexed by position.
exp_coef=as.numeric(coef(fail_mod))

# Draw each of the three exponential components separately...
lines(dup2_cnt$x, exp_coef[1]*exp(exp_coef[2]*dup2_cnt$x), col=pal_col[1])
lines(dup2_cnt$x, exp_coef[3]*exp(exp_coef[4]*dup2_cnt$x), col=pal_col[3])
lines(dup2_cnt$x, exp_coef[5]*exp(exp_coef[6]*dup2_cnt$x), col=pal_col[4])

# ...and then the prediction from the complete fitted model.
t=predict(fail_mod)
lines(dup2_cnt$x, t, col=pal_col[2])
# library("pracma")
#
# # A different fit, constant offset makes a difference
# me_mod=mexpfit(dup2_cnt$x, dup2_cnt$freq, p0=c(-0.9, -0.1, -0.01))
# print(me_mod)