<a id="index"></a>
# JSS '19 - Gkortzis et al. - Data Analysis

This notebook performs the following analyses reported in the study:

1. [Prepare dataset](#prepare)
2. [RQ1](#rq1)
    1. [Descriptive statistics](#rq1-descriptive)
    2. [Descriptive statistics (sums & median)](#rq1-sums)
    3. [Regression Analysis (Prepare dataset)](#rq1-regression)
    4. [Dataset Visualization](#rq1-visual)
    5. [Multivariate Regression Analysis](#rq1-regression-multivariate)
    6. [Boxplots](#rq1-boxplots)
<!--    7. [Boxplots](#rq1-boxplots) -->
<!--     8. [Grouping analysis(NEW)](#grouping) -->
3. [RQ2](#rq2)
    1. [Prepare Dataset](#rq2-pd)
    2. [Scatterplots](#rq2-scatter)
    3. [Boxplots](#rq2-boxplots2)
    4. [Regression Analysis [vuln-density, reuse-ratio]](#rq2-regression)
    5. [Regression Analysis [native-vuln-density, reuse-ratio]](#rq2-regression2)
    6. [Multivariate Regression Analysis [vuln-density, native-sloc, reuse-sloc]](#rq2-regression3)
    7. [Multivariate Regression Analysis [vuln-density, native-vuln-density, reuse-vuln-density]](#rq2-regression4)
4. [RQ3](#rq3)
    1. [Dataset Description](#rq3-dd)
    2. [Regression Analysis [#cves - #dependencies]](#rq3-regression)
    3. [Regression Analysis [#v - #dependencies]](#rq3-regression-potential)
    4. [Regression Analysis [#cves - #module_size]](#rq3-regression2)
    5. [Regression Analysis [#cve-density - #dependencies]](#rq3-regression3)
    6. [Count Vulnerable Projects](#rq3-count)
5. [RQ4](#rq4)
    1. [Prepare Dataset](#rq4-pd)
    2. [Count Vulnerabilities](#rq4-count)
    3. [Regression Analysis](#rq4-regression)
6. [[Discussion] How are potential vulnerabilities related to disclosed ones?](#discussion)
7. [JSS Revision 1 - New Analysis](#jss-rev1)

<a id="prepare"></a>
## Prepare dataset

In [None]:
import csv
import logging
import numpy as np
import pandas as pd
from scipy import stats
logging.basicConfig(level=logging.INFO)

def map_deps_to_projects(dependencies_usages):
    """Build a project -> dependencies lookup from a ';'-separated usages file.

    Every line is split on ';'. The first field is taken as the dependency
    identifier, the second field is skipped, and each remaining field is
    treated as a project that uses that dependency.

    Returns a dict mapping each project to the list of its dependencies, in
    the order in which they appear in the file.
    """
    logging.info("Creating projects dependencies' list..")

    deps_by_project = {}
    with open(dependencies_usages, 'r') as usages_file:
        for raw_line in usages_file:
            columns = raw_line.rstrip('\n').split(';')
            dep_name = columns[0]
            # columns[1] is not used here; projects start at the third field.
            for project_name in columns[2:]:
                deps_by_project.setdefault(project_name, []).append(dep_name)

    return deps_by_project


def count_vulnerabilities(projects_dependencies, owasp_vulnerabilities):
    """Count the distinct CVEs reachable from each project's dependencies.

    ``owasp_vulnerabilities`` is read as a ';'-separated file: field 0 is the
    dependency, field 2 its number of CVEs and, when that number is positive,
    field 4 a ','-separated list of CVE identifiers.

    ``projects_dependencies`` maps a project to its list of dependencies.

    Returns a dict mapping every project to the size of the union of the CVE
    sets of its dependencies; dependencies absent from the file contribute
    nothing.
    """
    logging.info("Creating projects cves list..")

    cves_by_dependency = {}
    with open(owasp_vulnerabilities, 'r') as report:
        for raw_line in report:
            columns = raw_line.rstrip('\n').split(';')
            if int(columns[2]) > 0:
                cves_by_dependency[columns[0]] = set(columns[4].split(','))

    cve_counts = {}
    for project, dependencies in projects_dependencies.items():
        known_sets = (cves_by_dependency[dep]
                      for dep in dependencies
                      if dep in cves_by_dependency)
        # A CVE shared by several dependencies is counted once per project.
        cve_counts[project] = len(set().union(*known_sets))

    return cve_counts


def load_dataset(csv_file):
    """Read the CSV at ``csv_file`` into a DataFrame using pandas' defaults."""
    dataset = pd.read_csv(csv_file)
    return dataset


def prepare_dataset(df):
    """Add the derived study variables to ``df`` and drop unusable projects.

    For each metric in (uv, dv, dev, dnev, dwv, dnwv) the function adds:

    * ``#<metric>_p1`` and ``#<metric>_p2``: the sum of the four
      ``#<metric>_<p>_r1`` .. ``_r4`` columns;
    * ``#<metric>``: the sum of the ``_p1`` and ``_p2`` columns.

    It also adds ``#uv_sloc`` and ``#dv_sloc`` (``#uv`` / ``#dv`` divided by
    ``#d_sloc + #u_sloc``), ``classes``, ``sloc`` and ``v``.

    The new columns are added to the frame that was passed in. The returned
    frame keeps only rows with ``#d_classes > 0`` and ``#u_sloc >= 1000`` and
    is an independent copy, so later column assignments on it do not raise
    pandas' SettingWithCopyWarning.
    """
    print("Creating main dataframe. Size {}".format(len(df)))

    metrics = ('uv', 'dv', 'dev', 'dnev', 'dwv', 'dnwv')

    # Per-part totals: sum the r1..r4 columns of every metric.
    for part in ('p1', 'p2'):
        for metric in metrics:
            base = '#{}_{}'.format(metric, part)
            df[base] = (df[base + '_r1'] + df[base + '_r2']
                        + df[base + '_r3'] + df[base + '_r4'])

    # Overall totals: p1 + p2.
    for metric in metrics:
        df['#' + metric] = df['#{}_p1'.format(metric)] + df['#{}_p2'.format(metric)]

    total_sloc = df['#d_sloc'] + df['#u_sloc']
    df['#uv_sloc'] = df['#uv'] / total_sloc
    df['#dv_sloc'] = df['#dv'] / total_sloc

    df['classes'] = df['#u_classes'] + df['#d_classes']
    df['sloc'] = df['#u_sloc'] + df['#d_sloc']
    df['v'] = df['#uv'] + df['#dv']

    # Remove projects with no external classes or a very small native code base.
    keep = (df['#d_classes'] > 0) & (df['#u_sloc'] >= 1000)
    df = df[keep].copy()

    print("Initial filtering reduced size to {}".format(len(df)))
    return df


def enhance_dataset(df, projects_dependencies, projects_vulnerabilities):
    """Attach per-project dependency and CVE counts to ``df``.

    For every row, ``#dependencies`` is the length of
    ``projects_dependencies[project]`` and ``#cves`` is
    ``projects_vulnerabilities[project]``, where ``project`` is the row's
    ``project`` column. Both columns are stored as floats, as before.

    Raises KeyError if a project is missing from either mapping.
    Modifies ``df`` in place and returns it.
    """
    logging.info("Enhancing dataframe with dependencies and cves..")

    projects = df['project'].tolist()
    # Whole-column assignment by position instead of a per-row df.at loop.
    df["#dependencies"] = np.array(
        [len(projects_dependencies[name]) for name in projects], dtype=float)
    df["#cves"] = np.array(
        [int(projects_vulnerabilities[name]) for name in projects], dtype=float)

    return df


def detect_enterprise_repos(df, enterprise_repos):
    """Add ``is_enterprise`` and ``contributors`` columns to ``df``.

    ``enterprise_repos`` is read as a ','-separated text file whose first
    line is a header. For each following line the first field is the lookup
    key and the fifth and sixth fields are used as the enterprise flag and
    the contributor count.

    Rows whose ``project`` is not in the file get ``is_enterprise = 0`` and
    ``contributors = 1``; rows with an empty ``project`` keep NaN in both
    columns. Modifies ``df`` in place and returns it.
    """
    logging.info("Detecting enterprise repos")
    df["is_enterprise"] = np.nan
    df["contributors"] = np.nan

    # Load the repository info, skipping the csv's heading line.
    with open(enterprise_repos) as handle:
        data_lines = handle.read().splitlines()[1:]

    info_by_repo = {}
    for raw in data_lines:
        key, *details = raw.split(',')
        info_by_repo[key] = details

    for idx, record in df.iterrows():
        name = record['project']
        if not name:
            print("Project {} not found".format(name))
            continue

        details = info_by_repo.get(name)
        if details is None:
            print("{} :: does not exist in the group ids list".format(name))
            enterprise_flag, contributor_count = 0, 1
        else:
            enterprise_flag, contributor_count = details[3], details[4]

        df.at[idx, 'is_enterprise'] = int(enterprise_flag)
        df.at[idx, 'contributors'] = int(contributor_count)

    return df
        

def filter_dataset(df, projects_as_dependencies):
    """Drop the rows of ``df`` whose ``project`` is listed in a text file.

    ``projects_as_dependencies`` is read line by line; each line (without its
    trailing newline) is a project name to exclude. The frame is filtered
    once with a single membership mask instead of once per listed project.

    Returns the filtered frame; the frame that was passed in is not modified.
    """
    logging.info("Filtering dataset")

    with open(projects_as_dependencies, 'r') as csv_file:
        excluded_projects = {line.rstrip('\n') for line in csv_file}

    df = df[~df['project'].isin(list(excluded_projects))]

    print("Selected data set after filtering :: {}".format(len(df)))

    return df

        
# Input files, relative to the notebook's working directory.
owasp_vulnerabilities = '../owasp_vulnerabilities_enhanced.csv'
# NOTE: the spelling "depependencies" is part of the file name; presumably it
# matches the file on disk, so it is left untouched.
dependencies_usages = '../depependencies_usages.csv'
projects_dataset = '../datasets/dataset_complete.csv'
# Columns selected for the descriptive statistics table (see df[study_vars]).
study_vars = ['classes','#u_classes','#d_classes',
              'sloc','#u_sloc','#d_sloc','#de_sloc','#dne_sloc','#dw_sloc','#dnw_sloc',
              'v', '#uv', '#dv', '#dev', '#dnev', '#dwv', '#dnwv',
              '#uv_classes', '#dv_classes', '#uv_sloc', '#dv_sloc',
              '#dependencies', '#cves']

# Per-project lookups: project -> list of dependencies, project -> number of distinct CVEs.
projects_dependencies = map_deps_to_projects(dependencies_usages)
projects_vulnerabilities = count_vulnerabilities(projects_dependencies, owasp_vulnerabilities)
projects_as_dependencies = '../projects_as_dependencies.csv'
enterprise_repos = "../projects_groupids_enterprise_info.csv"
# Build the main dataframe `df` used by the cells below: load, derive variables
# and filter, add dependency/CVE counts, add enterprise info, then drop the
# projects listed in projects_as_dependencies.
df = load_dataset(projects_dataset)
df = prepare_dataset(df)
df = enhance_dataset(df, projects_dependencies, projects_vulnerabilities)
df = detect_enterprise_repos(df, enterprise_repos) 
df = filter_dataset(df, projects_as_dependencies)


<a id="rq1"></a>
## RQ1
__RQ1: "What size and reuse factors are related with potential security vulnerabilities?"__. 

[Back to table of contents](#index)

<a id="rq1-descriptive"></a>
### RQ1 - Descriptive statistics
This is the table with the descriptive statistics for the whole dataset. 

[Back to table of contents](#index)

In [None]:
# Totals of the '#uv_classes_sloc' / '#dv_classes_sloc' columns; they are
# printed by the "Sums & median" cell as the vulnerable native / reused sloc.
VLn = sum(df['#uv_classes_sloc'])
VLr = sum(df['#dv_classes_sloc'])
# Add reuse ratio
# NOTE(review): the line above reads like a leftover TODO; no reuse ratio is added here.
# Restrict the frame to the study variables; df_filtered is reused for the medians.
df_filtered = df[study_vars]
# Display floats in fixed-point notation instead of scientific notation.
pd.set_option('float_format', '{:f}'.format)
df_filtered.describe()
# df_filtered.describe().to_csv("../datasets/temp_descriptive_stats.csv") # uncomment if you want to export the descriptive stats into a csv file


<a id="rq1-sums"></a>
### RQ1 - Descriptive statistics (Sums & median)
// TODO description 

[Back to table of contents](#index)

In [None]:
# Column totals over the whole (filtered) dataset.
C = sum(df['classes'])
Cn = sum(df['#u_classes'])
Cr = sum(df['#d_classes'])
L = sum(df['sloc'])
Ln = sum(df['#u_sloc'])
Lr = sum(df['#d_sloc'])
Lre = sum(df['#de_sloc'])
Lrne = sum(df['#dne_sloc'])
Lrw = sum(df['#dw_sloc'])
Lrnw = sum(df['#dnw_sloc'])
V = sum(df['v'])
Vn = sum(df['#uv'])
Vr = sum(df['#dv'])
Vre = sum(df['#dev'])
Vrne = sum(df['#dnev'])
Vrw = sum(df['#dwv'])
Vrnw = sum(df['#dnwv'])
VCn = sum(df['#uv_classes'])
VCr = sum(df['#dv_classes'])
D = sum(df['#dependencies'])

# (label, total) pairs printed as integers, in report order.
# VLn and VLr are computed in the "Descriptive statistics" cell.
integer_totals = [
    ('Classes =', C), ('Native classes =', Cn), ('Reused classes =', Cr),
    ('Sloc =', L), ('Native sloc =', Ln), ('Reused sloc =', Lr),
    ('Reused enterprise sloc =', Lre), ('Reused volunteer sloc =', Lrne),
    ('Reused well-known sloc =', Lrw), ('Reused less-known sloc =', Lrnw),
    ('Vulnerabilities (potential) =', V), ('Vulns native =', Vn), ('Vulns reused =', Vr),
    ('Vulns reused enterprise =', Vre), ('Vulns reused volunteer =', Vrne),
    ('Vulns reused well-known =', Vrw), ('Vulns reused less-known =', Vrnw),
    ('Vulnerable native classes =', VCn), ('Vulnerable reused classes =', VCr),
    ('Vulnerable native sloc =', VLn), ('Vulnerable reused sloc =', VLr),
]

print('----- Descriptive statistics [sum] -----')
for label, total in integer_totals:
    print('{:30}{:=10d}'.format(label, total))
# '#dependencies' is stored as float, hence the '.0f' format.
print('{:30}{:=10.0f}'.format('Dependencies =', D))
print()

print("---- Descriptive statistics [median] ---")
df_filtered.median()

<a id="rq1-descriptive-enterprise"></a>
### RQ1 - Descriptive statistics [For Enterprise projects]
The following represent the descriptive statistics for the Enterprise projects 

[Back to table of contents](#index)

In [None]:
# Projects flagged as enterprise (is_enterprise > 0) and their descriptive statistics.
enterprise = df[df['is_enterprise'] > 0]
enterprise.describe()
# enterprise.describe().to_csv("../datasets/temp_enterprise_descriptive_statistics.csv")


<a id="rq1-descriptive-volunteer"></a>
### RQ1 - Descriptive statistics [For Volunteer projects]
The following represent the descriptive statistics for the Volunteer projects 

[Back to table of contents](#index)

In [None]:
# Projects not flagged as enterprise (is_enterprise == 0) and their descriptive statistics.
non_enterprise = df[df['is_enterprise'] == 0]
non_enterprise.describe()
# non_enterprise.describe().to_csv("../datasets/temp_volunteer_descriptive_statistics.csv")

<a id="rq1-regression"></a>
### RQ1 - Regression Analysis (Prepare dataset)
// TODO description 

[Back to table of contents](#index)

In [None]:
#-----------------
# IMPORTS & CONFIG
#-----------------
import pandas
import numpy
import seaborn
import statsmodels.formula.api as sm
from scipy import stats
from matplotlib import pyplot
from IPython.display import display, HTML

%matplotlib inline

# Point size used by the scatter/regression plots in the cells below.
marker_size = 5

# Aliases without the leading '#': statsmodels formulas cannot parse column names containing '#'.
df['dependencies'] = df['#dependencies'] # copy of '#dependencies' usable in a formula
df['cves'] = df['#cves'] # copy of '#cves' usable in a formula
df['reuse_ratio'] = df['#d_sloc'] / (df['#d_sloc']+df['#u_sloc']) # this variable is also computed in the RQ2 "Prepare Dataset" cell
df['wk_ratio'] = df['#dw_sloc'] / (df['#dw_sloc']+df['#dnw_sloc']) # share of '#dw_sloc' in '#dw_sloc' + '#dnw_sloc'
df['dv'] = df['#dv']
df['dependency_size'] = df['#d_sloc'] / df['dependencies'] # the average size of the dependency modules of a project
df['cve_density'] = df['#cves'] / df['#d_sloc'] # CVEs per '#d_sloc' line

#
# Standardize the variables (z-score) so that the fitted coefficients are standardized betas
#
df['v_z'] = df['v'].pipe(stats.zscore)
df['sloc_z'] = df['sloc'].pipe(stats.zscore)
df['classes_z'] = df['classes'].pipe(stats.zscore)
df['dependencies_z'] = df['#dependencies'].pipe(stats.zscore)
df['cves_z'] = df['#cves'].pipe(stats.zscore)
df['reuse_ratio_z'] = df['reuse_ratio'].pipe(stats.zscore)
df['wk_ratio_z'] = df['wk_ratio'].pipe(stats.zscore)
df['dv_z'] = df['dv'].pipe(stats.zscore)
df['dependency_size_z'] = df['dependency_size'].pipe(stats.zscore)
df['cve_density_z'] = df['cve_density'].pipe(stats.zscore)
df['u_sloc_z'] = df['#u_sloc'].pipe(stats.zscore)
df['d_sloc_z'] = df['#d_sloc'].pipe(stats.zscore)
df['dw_sloc_z'] = df['#dw_sloc'].pipe(stats.zscore)
df['dnw_sloc_z'] = df['#dnw_sloc'].pipe(stats.zscore)


<a id="rq1-visual"></a>
### RQ1 - Dataset Visualization
The following four figures present the regression line of the number of vulnerabilities against the 4 factors: _'sloc'_, _'dependencies'_, _'reuse-ratio'_ and _'classes'_.

[Back to table of contents](#index)

In [None]:
# print plots with regression line

# One scatter plot with a fitted regression line per factor, always against 'v'.
# The commented-out calls are the corresponding plots on the z-scored columns.
seaborn.lmplot(x='sloc',y='v',data=df,fit_reg=True, scatter_kws={"s": marker_size})
# seaborn.lmplot(x='sloc_z',y='v_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='dependencies',y='v',data=df,fit_reg=True, scatter_kws={"s": marker_size})
# seaborn.lmplot(x='v_z',y='dependencies_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='reuse_ratio', y='v',data=df,fit_reg=True, scatter_kws={"s": marker_size})
# seaborn.lmplot(x='reuse_ratio_z',y='v_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='classes', y='v',data=df,fit_reg=True, scatter_kws={"s": marker_size})
# seaborn.lmplot(x='classes_z',y='v_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq1-regression-multivariate"></a>
### Multivariate Regression Analysis
Here, we calculate the standardized beta values and perform a multivariate regression analysis on the four factors: _'sloc'_, _'dependencies'_, _'reuse-ratio'_ and _'classes'_.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# Ordinary least squares on the z-scored columns: 'v' against sloc,
# dependencies, reuse ratio and classes.
ols_model = sm.ols(formula="v_z ~ sloc_z + dependencies_z + reuse_ratio_z + classes_z", data=df)
result = ols_model.fit()

print(result.summary())


<a id="rq1-regression-multivariate-well-known"></a>
### Multivariate Regression Analysis [well-known]
Here, we calculate the standardized beta values and perform a multivariate regression analysis of the number of potential vulnerabilities on three size factors: _'native sloc'_, _'reused well-known sloc'_ and _'reused less-known sloc'_.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# Ordinary least squares on the z-scored columns: 'v' against '#u_sloc',
# '#dw_sloc' and '#dnw_sloc'.
ols_model = sm.ols(formula="v_z ~ u_sloc_z + dw_sloc_z  + dnw_sloc_z", data=df)
result = ols_model.fit()

print(result.summary())


<a id="rq1-correlation-well_knonw"></a>
### Correlation between well-known Ratio and Vulnerabilities
Here, we calculate the Kendall's tau rank correlation between the number of vulnerabilities in a project and its ratio of well-known dependencies.

[Back to table of contents](#index)

In [None]:
# Correlation with Kendall Tau
# Kendall's tau between the z-scored 'v' and 'wk_ratio' columns; both values
# are rounded to two decimals for printing.
tau, p_value = stats.kendalltau(df['v_z'], df['wk_ratio_z'])
print(f'tau: {round(tau,2)}, p-value: {round(p_value,2)}')

<a id="rq1-boxplots"></a>
### RQ1 - Boxplots


[Back to table of contents](#index)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

# Font configuration shared by the figure's text and math text.
matplotlib.rcParams.update({
    'mathtext.fontset': 'custom',
    'mathtext.rm': 'Bitstream Vera Sans',
    'mathtext.it': 'Bitstream Vera Sans:italic',
    'mathtext.bf': 'Bitstream Vera Sans:bold',
    'font.family': 'STIXGeneral',
})

bp_vars = ['sloc', 'classes', 'dependencies', 'reuse_ratio'] 
labels = ['Design\nsize', 'Number of\nclasses', 'Number of\ndependencies', 'Reuse\nratio'] 
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(8, 4), tight_layout = {'pad': 1})

# One box per variable, each on its own axes, outliers hidden.
for axis, column, title in zip(axs, bp_vars, labels):
    axis.boxplot(df[column], showfliers=False)
    axis.set_xticks([])
    axis.set_title(title)

fig.subplots_adjust(hspace=0.1, wspace=0.5)

plt.savefig("../figs/boxplots_rq1.pdf")

plt.show()

<a id="rq2"></a>
## RQ2
__RQ2: "How are potential security vulnerabilities distributed between native and reused code?"__

[Back to table of contents](#index)

<a id="rq2-pd"></a>
### RQ2 - Prepare Dataset
Define new variables for the analysis of RQ2 and calculate their standardized beta values. 
[Back to table of contents](#index)

In [None]:
#
# Define and calculate new variables
#
# Ratios used in RQ2; each one divides a count by the matching sloc column(s).
df['reuse_ratio'] = df['#d_sloc'] / (df['#d_sloc']+df['#u_sloc'])
df['uv_ratio'] = df['#uv'] / df['#u_sloc'] # vulnerability density in native code
df['dv_ratio'] = df['#dv'] / df['#d_sloc'] # vulnerability density in reused code
df['#v_sloc'] = (df['#uv'] + df['#dv']) / (df['#d_sloc']+df['#u_sloc']) # vulnerability density

#
# Standardize the variables (z-score) so that the fitted coefficients are standardized betas
#
df['reuse_ratio_z'] = df['reuse_ratio'].pipe(stats.zscore)
df['uv_ratio_z'] = df['uv_ratio'].pipe(stats.zscore) # vulnerability density in native code
df['dv_ratio_z'] = df['dv_ratio'].pipe(stats.zscore) # vulnerability density in reused code
df['v_sloc_z'] = df['#v_sloc'].pipe(stats.zscore) # vulnerability density

<a id="rq2-scatter"></a>
### RQ2 - Scatterplots

[Back to table of contents](#index)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
matplotlib.rcParams.update({'font.size': 16})
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 6), tight_layout = {'pad': 1})
label_size = 24

# Left panel: native vulnerability density against reuse ratio.
# No `cmap` is passed: without a `c` argument matplotlib ignores the colormap
# and emits a warning, so the points use the default color either way.
axs[0].scatter(df['uv_ratio'], df['reuse_ratio'], s=5)
axs[0].set_xlim([-0.0001,0.02])
axs[0].set_xlabel("Native Vulnerability Density", fontsize=label_size)
axs[0].set_ylabel('Reuse Ratio', rotation=90, fontsize=label_size)
# Right panel: reused vulnerability density against reuse ratio; its y ticks
# are hidden, the y-axis label is only set on the left panel.
axs[1].scatter(df['dv_ratio'], df['reuse_ratio'], s=5)
axs[1].set_xlim([-0.0001,0.01])
axs[1].set_xlabel("Reused Vulnerability Density", fontsize=label_size)
axs[1].set_yticks([])

fig.subplots_adjust(wspace=0.1)
plt.savefig("../figs/scatter_plots.pdf")
plt.show()

<a id="rq2-boxplots2"></a>
### RQ2 - Boxplots

[Back to table of contents](#index)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

def draw_seperate_plots():
    """Draw the three RQ2 vulnerability densities as side-by-side boxplots.

    Reads the module-level ``df`` and plots ``uv_ratio``, ``dv_ratio`` and
    ``#v_sloc`` on separate axes with a common y range and outliers hidden.
    Saves the figure to ``../figs/boxplots2.pdf`` and shows it.
    """
    matplotlib.rcParams['mathtext.fontset'] = 'custom'
    matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
    matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
    matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
    matplotlib.rcParams['font.family'] = 'STIXGeneral'

    bp_vars = ['uv_ratio', 'dv_ratio', '#v_sloc']
    labels = ['Native\nvulnerabilities density',
              'Reused\nvulnerabilities density',
              'Overall\nvulnerabilities density']

    fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(8, 4), tight_layout = {'pad': 1})

    # Plot boxes
    for ax, column, title in zip(axs, bp_vars, labels):
        ax.boxplot(df[column], showfliers=False)
        ax.set_xticks([])
        # Single y range for all three panels (the earlier, narrower range
        # was always overridden by this one).
        ax.set_ylim([-0.0001,0.0100])
        ax.set_title(title)

    fig.subplots_adjust(hspace=0.1, wspace=0.5)

    plt.savefig("../figs/boxplots2.pdf")

    plt.show()
    

def draw_merged_plots():
    """Draw the three RQ2 vulnerability densities as boxplots on one axes.

    Reads the module-level ``df`` and plots ``uv_ratio``, ``dv_ratio`` and
    ``#v_sloc``, each multiplied by 1000, with outliers hidden.
    Saves the figure to ``../figs/boxplots_rq2_compact.pdf`` and shows it.
    """
    matplotlib.rcParams['mathtext.fontset'] = 'custom'
    matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
    matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
    matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
    matplotlib.rcParams['font.family'] = 'STIXGeneral'

    # Defined locally: the variables of the same name in draw_seperate_plots
    # are local to that function, so relying on globals here would silently
    # pick up the RQ1 boxplot variables (and four labels for three boxes).
    bp_vars = ['uv_ratio', 'dv_ratio', '#v_sloc']
    labels = ['Native\nvulnerabilities density',
              'Reused\nvulnerabilities density',
              'Overall\nvulnerabilities density']

    # Multiple box plots on one Axes, scaled by 1000.
    boxplots_df = [df[column] * 1000 for column in bp_vars]

    fig = plt.figure(1, figsize=(6, 4),tight_layout = {'pad': 1})

    # Create an axes instance
    ax = fig.add_subplot(111)
    ax.boxplot(boxplots_df, showfliers=False, widths=0.15)

    # Same y range as the separate plots, scaled by 1000 like the data.
    ax.set_ylim([-0.0001*1000,0.0100*1000])
    ax.yaxis.grid(False)
    ## Custom x-axis labels
    ax.set_xticklabels(labels)
    ax.set_axisbelow(True)

    plt.savefig("../figs/boxplots_rq2_compact.pdf")
    plt.show()
    
# Render both variants of the RQ2 boxplots (one axes per variable, then merged).
draw_seperate_plots()
draw_merged_plots()

<a id="rq2-regression"></a>
### RQ2 - Regression Analysis [vuln-density, reuse-ratio]
In the following analysis we investigate how reuse ratio in a project is related to its vulnerability density. 

The results show that there is no evidence that these two variables are somehow related.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# Copy of '#v_sloc' under a name without '#'.
df['v_sloc'] = df['#v_sloc']
# OLS on the z-scored columns: vulnerability density against reuse ratio.
ols_model = sm.ols(formula="v_sloc_z ~ reuse_ratio_z", data=df)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line on the unstandardized columns.
seaborn.lmplot(x='reuse_ratio', y='#v_sloc',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq2-regression2"></a>
### RQ2 - Regression Analysis [native-vuln-density, reuse-ratio]
In the following analysis we investigate how reuse ratio in a project is related to its vulnerability density in the native code. 

The results show that there is a weak correlation between the two variables. 
__Very unexpected results__: How can we interpret them? 

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: native vulnerability density against reuse ratio.
ols_model = sm.ols(formula="uv_ratio_z ~ reuse_ratio_z", data=df)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line, here on the z-scored columns.
seaborn.lmplot(x='reuse_ratio_z', y='uv_ratio_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq2-regression3"></a>
### RQ2 - Multivariate Regression Analysis [vuln-density, native-sloc, reuse-sloc]
In the following analysis we investigate how native and reused code contribute to the vulnerability density of the project. 
<!--
The results show that there is a weak correlation between the two variables. 
__Very unexpected results__: How can we interprete? 
-->

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: native vulnerability density ('uv_ratio')
# against native and reused sloc.
# NOTE(review): the section heading announces the overall vulnerability
# density, which would be 'v_sloc_z'; confirm that 'uv_ratio_z' is the
# intended response variable.
ols_model = sm.ols(formula="uv_ratio_z ~ u_sloc_z + d_sloc_z", data=df)
result = ols_model.fit()

print(result.summary())

<a id="rq2-regression4"></a>
### RQ2 - Multivariate Regression Analysis [vuln-density, native-vuln-density, reuse-vuln-density]
In the following analysis we investigate how native and reused code contribute to the vulnerability density of the project. 

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: overall vulnerability density against the
# native and the reused vulnerability densities.
ols_model = sm.ols(formula="v_sloc_z ~ uv_ratio_z + dv_ratio_z", data=df)
result = ols_model.fit()

print(result.summary())

<a id="rq3"></a>
## RQ3
__RQ3: "To what extent do open source projects suffer from vulnerabilities introduced through dependencies?"__.
For this RQ we collect information from the OWASP dependency-check tool in order to find the disclosed vulnerabilities (CVEs) of the dependencies that projects use.

[Back to table of contents](#index)


<a id="rq3-dd"></a>
### RQ3 - Dataset Description
Visualize how projects are distributed according to the number of their disclosed vulnerabilities. 

[Back to table of contents](#index)

In [None]:
import seaborn as sns

# Violin plot of the per-project '#cves' column, saved as a PDF.
sns.set(font_scale=1.1)
ax = sns.violinplot(y=df['#cves'])
fig = ax.get_figure()
ax.set_xlabel("Disclosed Vulnerabilities in Projects")
ax.set_ylabel("Observed values")

fig.savefig('../figs/rq3_violin.pdf')

<a id="rq3-regression"></a>
### RQ3 - Regression Analysis [#cves - #dependencies]
Perform a regression analysis to investigate how the number of the disclosed vulnerabilities of a project is related to the number of its dependencies.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: number of CVEs against number of dependencies.
ols_model = sm.ols(formula="cves_z ~ dependencies_z", data=df)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line on the unstandardized columns.
seaborn.lmplot(x='dependencies',y='cves',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq3-regression-potential"></a>
### RQ3 - Regression Analysis [#v - #dependencies]

Perform a regression analysis to investigate how the number of the potential vulnerabilities of a project is related to the number of its dependencies.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: 'v' against number of dependencies.
ols_model = sm.ols(formula="v_z ~ dependencies_z", data=df)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line on the unstandardized columns.
seaborn.lmplot(x='dependencies',y='v',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq3-regression2"></a>
### RQ3 - Regression Analysis [#cves - #module_size]
Perform a regression analysis to investigate how the number of the disclosed vulnerabilities of a project is related to the size of its dependencies.

The results show that the size of a module (dependency) is not related to the number of its disclosed vulnerabilities. 

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: number of CVEs against average dependency size.
ols_model = sm.ols(formula="cves_z ~ dependency_size_z", data=df)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line on the unstandardized columns.
seaborn.lmplot(x='dependency_size',y='cves',data=df,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq3-regression3"></a>
### RQ3 - Regression Analysis [#cve-density - #dependencies]
Perform a regression analysis to investigate how the cve density of a project is related to the number of its dependencies.

[Back to table of contents](#index)

In [None]:
# NOTE: this rebinds df_filtered, which earlier cells use for df[study_vars].
# NOTE(review): 'cve_density_z' and 'dependencies_z' were z-scored on the full
# frame, i.e. before the outlier is removed here — confirm this is intended.
df_filtered = df[df['cve_density'] < 0.2] # filter a great outlier

# OLS on the z-scored columns: CVE density against number of dependencies.
ols_model = sm.ols(formula="cve_density_z ~ dependencies_z", data=df_filtered)
result = ols_model.fit()

print(result.summary())

# Scatter plot with regression line on the unstandardized columns.
seaborn.lmplot(x='dependencies', y='cve_density',data=df_filtered,fit_reg=True, scatter_kws={"s": marker_size})

<a id="rq3-regression-multi"></a>
### RQ3 - Multivariate Regression Analysis
Perform a multivariate regression analysis to investigate how the number of CVEs of a project is related to the following variables: the number of its dependencies, the size of the dependencies, and the amount of reused code.

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized 
# OLS on the z-scored columns: number of CVEs against number of dependencies,
# average dependency size and reused sloc.
ols_model = sm.ols(formula="cves_z ~ dependencies_z + dependency_size_z + d_sloc_z", data=df)
result = ols_model.fit()

print(result.summary())

In [None]:
# print plots with regression line

# First three plots: each predictor of the multivariate model against 'cves_z'.
seaborn.lmplot(x='dependencies_z',y='cves_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='dependency_size_z',y='cves_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='d_sloc_z', y='cves_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
# Last two plots: predictors against each other ('d_sloc_z' on the y axis).
seaborn.lmplot(x='dependency_size_z',y='d_sloc_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})
seaborn.lmplot(x='dependencies_z',y='d_sloc_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})



<a id="rq3-count"></a>
### RQ3 - Vulnerable projects
The following script identifies projects that contain at least one vulnerable dependency.

[Back to table of contents](#index)

In [None]:
# Projects with at least one CVE among their dependencies, reported as a count
# and as a percentage of all projects in df.
vul_projects = df[df['#cves'] > 0]
print("Vulnerable projects {} out of {} [{:2.2%}]".format(len(vul_projects.index), len(df.index), len(vul_projects.index)/len(df.index)))

<a id="rq4"></a>
## RQ4
__RQ4: "How is the use frequency of a dependency related to its disclosed vulnerabilities"__.
For this RQ we: 
1. Generate the dataset and present the descriptive statistics,
2. Count the vulnerable dependencies
3. Perform a univariate regression analysis between the number of vulnerabilities and its use frequency.

[Back to table of contents](#index)


<a id="rq4-pd"></a>
### RQ4 - Prepare dataset
The following code generates the dataset used for answering RQ4 and presents its descriptive statistics. 
[Back to table of contents](#index)

In [None]:
import csv
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)

def get_dependencies(dependencies_usages):
    """Read the first two fields of every row of a ';'-separated file.

    Returns a list of ``[field0, field1]`` pairs (both kept as strings), one
    per row and in file order. ``create_dataset`` treats the pair as a
    dependency and its number of usages.
    """
    pairs = []
    with open(dependencies_usages, 'r') as source:
        for record in csv.reader(source, delimiter=';'):
            name, usage_count = record[0], record[1]
            logging.debug("{}::{}".format(name, usage_count))
            pairs.append([name, usage_count])

    return pairs


def get_vulnerabilities(owasp_vulnerabilities):
    """Map the first field of every row of a ';'-separated file to its second.

    Returns a dict of strings; when a key appears on several rows the last
    row wins. ``create_dataset`` treats the mapping as dependency -> number
    of disclosed vulnerabilities.
    """
    counts_by_dependency = {}
    with open(owasp_vulnerabilities, 'r') as source:
        for record in csv.reader(source, delimiter=';'):
            name, count = record[0], record[1]
            logging.debug("{}::{}".format(name, count))
            counts_by_dependency[name] = count

    return counts_by_dependency

def get_potential_vulnerabilities(depependencies_spotbugs):
    """Map the first field of every row of a ';'-separated file to its second.

    Returns a dict of strings; when a key appears on several rows the last
    row wins. ``create_dataset`` treats the mapping as dependency -> number
    of potential vulnerabilities.
    """
    potential_by_dependency = {}
    with open(depependencies_spotbugs, 'r') as source:
        for record in csv.reader(source, delimiter=';'):
            name, count = record[0], record[1]
            logging.debug("{}::{}".format(name, count))
            potential_by_dependency[name] = count

    return potential_by_dependency
    
    
def create_dataset(dependencies_usages, owasp_vulnerabilities, depependencies_spotbugs):
    """Combine usages, disclosed and potential vulnerabilities per dependency.

    Loads the three input files with ``get_dependencies``,
    ``get_vulnerabilities`` and ``get_potential_vulnerabilities``.

    Returns a list with one ``[dependency, usages, vulns, potential_vulns]``
    entry (string, int, int, int) per row of the usages file. A dependency
    missing from one of the two reports is logged as a warning and gets 0 for
    the corresponding count.
    """
    dependencies = get_dependencies(dependencies_usages)
    logging.info("Dependencies with usages :: {}".format(len(dependencies)))
    disclosed = get_vulnerabilities(owasp_vulnerabilities)
    logging.info("Dependencies with vulnerabilities :: {}".format(len(disclosed)))
    potential = get_potential_vulnerabilities(depependencies_spotbugs)
    logging.info("Dependencies with potential vulnerabilities :: {}".format(len(potential)))

    rows = []
    logging.info("Creating dataset...")
    for entry in dependencies:
        logging.debug("Parsing usage dependency :: {}".format(entry))
        name = entry[0]
        usage_count = int(entry[1])

        if name in disclosed:
            disclosed_count = int(disclosed[name])
        else:
            logging.warning("Dependency not in owasp reports :: {}".format(name))
            disclosed_count = 0

        if name in potential:
            potential_count = int(potential[name])
        else:
            logging.warning("Dependency not in spotbugs reports :: {}".format(name))
            potential_count = 0

        rows.append([name, usage_count, disclosed_count, potential_count])

    return rows

# NOTE: this rebinds owasp_vulnerabilities to a different file than the one
# used in the "Prepare dataset" cell at the top of the notebook.
owasp_vulnerabilities = '../owasp_vulnerabilities.csv'
dependencies_usages = '../depependencies_usages.csv'
depependencies_spotbugs = "../depependencies_spotbugs.csv"

# One row per dependency: name, usages, disclosed and potential vulnerabilities.
data = create_dataset(dependencies_usages, owasp_vulnerabilities, depependencies_spotbugs)
logging.info("Created dataset with {} entries".format(len(data)))
# print(data[1:10])
df_vulnerable = pd.DataFrame(data, columns = ['Dependency', 'Usages', 'Vulnerabilities', 'Potential_vulns'])
# df_vulnerable[1:10]
df_vulnerable.describe()

<a id="rq4-count"></a>
### RQ4 - Count Vulnerable dependencies
In this step we analyze the dependencies used in the projects and report those that are vulnerable with at least one disclosed vulnerability. 

[Back to table of contents](#index)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Keep only the dependencies with at least one disclosed vulnerability.
df_vulnerable_filtered = df_vulnerable[df_vulnerable['Vulnerabilities'] > 0]
# Exclude the extreme (outlier) value: 40 or more disclosed vulnerabilities.
# .copy() turns the filtered slice into an independent frame, so the z-score
# columns that the regression cells add to it later do not trigger pandas'
# SettingWithCopyWarning.
df_vulnerable_filtered = df_vulnerable_filtered[df_vulnerable_filtered['Vulnerabilities'] < 40].copy()

# NOTE(review): the count below is taken after the outlier filter, so a
# dependency with >= 40 disclosed vulnerabilities is not counted as
# vulnerable here -- confirm this is intended.
print("Found {} vulnerable dependencies out of {} total [{:2.2%}]".format(len(df_vulnerable_filtered.index), len(df_vulnerable.index), len(df_vulnerable_filtered.index)/len(df_vulnerable.index)))

# Distribution of disclosed vulnerabilities among the vulnerable dependencies.
sns.set(font_scale=1.1)
ax = sns.violinplot(y=df_vulnerable_filtered['Vulnerabilities'])
fig = ax.get_figure()
ax.set_ylabel("Disclosed Vulnerabilities")
fig.savefig('../figs/rq4_violin.pdf')


<a id="rq4-regression"></a>
### RQ4 - Regression Analysis [CVEs - Usages]


[Back to table of contents](#index)

In [None]:
# todo zero values
# Standardize (z-score) both variables so the OLS coefficient is a
# standardized beta. assign() builds a new frame instead of writing columns
# into a filtered slice of df_vulnerable, which avoids pandas'
# SettingWithCopyWarning.
df_vulnerable_filtered = df_vulnerable_filtered.assign(
    Vulnerabilities_z=df_vulnerable_filtered['Vulnerabilities'].pipe(stats.zscore),
    Usages_z=df_vulnerable_filtered['Usages'].pipe(stats.zscore),
)

# OLS with beta standardized: disclosed vulnerabilities explained by usages.
ols_model = sm.ols(formula="Vulnerabilities_z ~ Usages_z", data=df_vulnerable_filtered)
result = ols_model.fit()

print(result.summary())

# Scatter plot with the regression line. The predictor (Usages_z) goes on the
# x axis and the response (Vulnerabilities_z) on the y axis, matching the
# formula above and the [Potential Vulns - Usages] plot of this RQ.
seaborn.lmplot(x='Usages_z',y='Vulnerabilities_z',data=df_vulnerable_filtered,fit_reg=True, scatter_kws={"s": marker_size})


<a id="rq4-regression2"></a>
### RQ4 - Regression Analysis [Potential Vulns - Usages]


[Back to table of contents](#index)

In [None]:
# Standardize the potential-vulnerability counts. assign() builds a new frame
# instead of writing a column into a filtered slice of df_vulnerable, which
# avoids pandas' SettingWithCopyWarning.
df_vulnerable_filtered = df_vulnerable_filtered.assign(
    Potential_vulns_z=df_vulnerable_filtered['Potential_vulns'].pipe(stats.zscore),
)

# OLS with beta standardized: potential vulnerabilities explained by usages.
# Usages_z must already exist on df_vulnerable_filtered (it is created by the
# [CVEs - Usages] regression cell).
ols_model = sm.ols(formula="Potential_vulns_z ~ Usages_z", data=df_vulnerable_filtered)
result = ols_model.fit()

print(result.summary())

# Scatter plot with the regression line, drawn on the raw (non-standardized)
# values.
seaborn.lmplot(x='Usages',y='Potential_vulns',data=df_vulnerable_filtered,fit_reg=True, scatter_kws={"s": marker_size})


<a id="discussion"></a>
## [Discussion] How are potential vulnerabilities related to disclosed ones?

[Back to table of contents](#index)

In [None]:
# OLS with beta standardized: fitted on the standardized (_z) columns of df,
# with cves_z as the response and dv_z as the single predictor.
ols_model = sm.ols(formula="cves_z ~ dv_z", data=df)
result = ols_model.fit()

print(result.summary())

# print plots with regression line
# Note: the plot puts cves_z on the x axis and dv_z on the y axis, i.e. the
# axes are swapped relative to the formula above; the axis labels set below
# follow the plot. marker_size is defined elsewhere in the notebook.
fig = seaborn.lmplot(x='cves_z',y='dv_z',data=df,fit_reg=True, scatter_kws={"s": marker_size})

# plt.xlabel/ylabel act on pyplot's current axes (expected to be the axes
# lmplot just drew), so they must stay directly after the lmplot call.
plt.xlabel('Disclosed vulnerabilities')
plt.ylabel('Potential vulnerabilities')
fig.savefig('../figs/vulnerabilities_z.pdf')

In [None]:
# print plots with regression line
# Same plot as the standardized one, but on the raw columns of df: cves on the
# x axis and dv on the y axis.
fig = seaborn.lmplot(x='cves',y='dv',data=df,fit_reg=True, scatter_kws={"s": marker_size})

# Labels are applied to pyplot's current axes, so keep them right after the
# lmplot call and before saving.
plt.xlabel('Disclosed vulnerabilities')
plt.ylabel('Potential vulnerabilities')
fig.savefig('../figs/vulnerabilities.pdf')

<a id="jss-rev1"></a>
# JSS Revision 1 - New analysis

<a id="eyes-on-dependencies"></a>
## Eyes on dependencies

[Back to table of contents](#index)

In [None]:
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=1)

def create_dependencies_to_contributors_dataframe(projects_info,dependencies_usages,dependencies_info,dependencies_cves, dependencies_spotbugs):
    """Build one row per dependency with its flags, usage and vulnerability counts.

    All arguments are paths to delimited text files. The column positions
    listed here are the ones this function reads:

    projects_info -- comma-separated, first line skipped as header;
        column 0 is the project name, column 5 is used as the project's
        number of contributors.
    dependencies_usages -- semicolon-separated, first line skipped as header;
        column 0 is the dependency name, columns 2 onwards are the projects
        that use it.
    dependencies_info -- semicolon-separated, first line skipped as header;
        column 0 is the dependency name, columns 2 and 6 become the
        'enterprise' and 'well_known' values.
    dependencies_cves -- semicolon-separated, every line read;
        column 0 is the dependency name, column 2 the number of CVEs.
    dependencies_spotbugs -- semicolon-separated, every line read;
        column 0 is the dependency name, column 1 the number of potential
        vulnerabilities.

    Returns a pandas DataFrame with the columns 'dependency', 'enterprise',
    'well_known', 'used_projects', 'contributors_in_used_projects', 'cves'
    and 'spotbugs_vuls'. Only dependencies listed in dependencies_info that
    also appear in the usages, cves and spotbugs files are included; a
    dependency missing from any of them is dropped, so a missing value can
    never shift another value into the wrong column. If a dependency appears
    several times in the usages, cves or spotbugs file, its first row is used.

    Raises KeyError if a relevant usages row names a project that is not in
    projects_info, ValueError if a numeric field is not an integer, and
    IndexError if a relevant row has too few fields.
    """
    def _read_rows(path, delimiter, has_header):
        # Split every non-blank line of the file into its fields.
        with open(path) as f:
            lines = f.read().splitlines()
        if has_header:
            lines = lines[1:]
        return [line.split(delimiter) for line in lines if line.strip()]

    # project name -> number of contributors
    contributors = {row[0]: int(row[5]) for row in _read_rows(projects_info, ',', True)}

    # dependency name -> (enterprise, well_known)
    flags = {row[0]: (int(row[2]), int(row[6])) for row in _read_rows(dependencies_info, ';', True)}

    # dependency name -> (number of using projects, total contributors of those projects)
    usage = {}
    for row in _read_rows(dependencies_usages, ';', True):
        dep_name, used_in = row[0], row[2:]
        if dep_name in flags and dep_name not in usage:
            usage[dep_name] = (len(used_in), sum(contributors[project] for project in used_in))

    # dependency name -> number of CVEs
    cves = {}
    for row in _read_rows(dependencies_cves, ';', False):
        dep_name = row[0]
        if dep_name in flags and dep_name not in cves:
            cves[dep_name] = int(row[2])

    # dependency name -> number of potential vulnerabilities (spotbugs)
    potential_vulns = {}
    for row in _read_rows(dependencies_spotbugs, ';', False):
        dep_name = row[0]
        if dep_name in flags and dep_name not in potential_vulns:
            potential_vulns[dep_name] = int(row[1])

    df_dict = {'dependency': [],'enterprise': [],'well_known': [],'used_projects': [], 'contributors_in_used_projects': [],'cves': [],'spotbugs_vuls': []}
    for dep_name, (enterprise, well_known) in flags.items():
        # Skip entries with missing fields instead of relying on list length.
        if dep_name not in usage or dep_name not in cves or dep_name not in potential_vulns:
            continue
        used_projects, total_contributors = usage[dep_name]
        df_dict['dependency'].append(dep_name)
        df_dict['enterprise'].append(enterprise)
        df_dict['well_known'].append(well_known)
        df_dict['used_projects'].append(used_projects)
        df_dict['contributors_in_used_projects'].append(total_contributors)
        df_dict['cves'].append(cves[dep_name])
        df_dict['spotbugs_vuls'].append(potential_vulns[dep_name])

    return pd.DataFrame.from_dict(df_dict)
    

# Input files for the "eyes on dependencies" analysis (relative paths).
dependencies_info = "../dependencies_groupids_enterprise_info.csv"
dependencies_usages = "../depependencies_usages.csv"
projects_info = "../projects_groupids_enterprise_info.csv"
dependencies_spotbugs = "../depependencies_spotbugs.csv"
dependencies_cves = "../owasp_vulnerabilities.csv"

df_deps = create_dependencies_to_contributors_dataframe(projects_info,dependencies_usages,dependencies_info, dependencies_cves, dependencies_spotbugs)
# NOTE: a notebook renders only the last expression of a cell, so these two
# summaries are computed but not displayed from here.
df_deps.describe()
# numeric_only=True: the 'dependency' column holds names (strings), and
# pandas >= 2.0 raises TypeError when asked for the median of such a column.
df_deps.median(numeric_only=True)

# Percentage of dependencies whose 'enterprise' value is positive.
df_deps_enterprise = df_deps[df_deps['enterprise'] >0]
print((len(df_deps_enterprise)/len(df_deps))*100)

In [None]:
from scipy import stats 

# Kendall's tau between each vulnerability count ('cves', 'spotbugs_vuls') and
# (a) the summed contributors of the projects using a dependency and
# (b) the number of projects using it. Reported for all dependencies, for each
# value of the 'enterprise' and 'well_known' flags, and for dependencies that
# have both flags set.

# Collect (header line, subset) pairs in reporting order.
kendall_groups = [(f'overall (N={len(df_deps)})', df_deps)]
for c in ['enterprise', 'well_known']:
    for b in [1,0]:
        df_test = df_deps[df_deps[c] == b]
        kendall_groups.append((f'{c} == {b} (N={len(df_test)})', df_test))
df_test = df_deps[(df_deps['enterprise'] == 1) & (df_deps['well_known'] == 1)]
kendall_groups.append((f'enterprise == 1 & well_known == 1 (N={len(df_test)})', df_test))

# One report block per subset.
for kendall_header, df_test in kendall_groups:
    print(kendall_header)
    for v in ['cves', 'spotbugs_vuls']:
        tau, p_value = stats.kendalltau(df_test[v], df_test['contributors_in_used_projects'])
        print(f'  [{v} x contrib.] tau: {round(tau,2)}, p-value: {round(p_value,2)}')
        tau, p_value = stats.kendalltau(df_test[v], df_test['used_projects'])
        print(f'  [{v} x n_projs ] tau: {round(tau,2)}, p-value: {round(p_value,2)}')

In [None]:
# OLS on the raw (non-standardized) df_deps columns: summed contributors of
# the projects using a dependency as the response, its number of CVEs as the
# single predictor.
ols_model = sm.ols(formula="contributors_in_used_projects ~ cves", data=df_deps)
result = ols_model.fit()

print(result.summary())

<a id="enterprise-vs-volunteer"></a>
## Redo RQs for enterprise projects vs. volunteer projects [revision 1 comment 1]

[Back to table of contents](#index)

In [None]:
# Partition the projects by their 'is_enterprise' flag.
# The subsets carry over every column of df unchanged, including the
# standardized (_z) ones: nothing is re-standardized per subset here.
df_e = df.loc[df['is_enterprise'] == 1]
df_ne = df.loc[df['is_enterprise'] == 0]

In [None]:
#
# RQ1 -- enterprise (df_e) vs. non-enterprise (df_ne) projects
#

# OLS on the standardized (_z) columns. Each model is fitted on the
# enterprise subset first, then on the non-enterprise subset.
for rq1_formula in (
    "v_z ~ sloc_z + dependencies_z + reuse_ratio_z + classes_z",
    "v_z ~ u_sloc_z + dw_sloc_z  + dnw_sloc_z",
):
    for rq1_subset in (df_e, df_ne):
        ols_model = sm.ols(formula=rq1_formula, data=rq1_subset)
        result = ols_model.fit()
        print(result.summary())

# Kendall's tau between v_z and wk_ratio_z, per subset.
tau, p_value = stats.kendalltau(df_e['v_z'], df_e['wk_ratio_z'])
print(f'Enterprise:     tau: {round(tau,2)}, p-value: {round(p_value,2)}')

tau, p_value = stats.kendalltau(df_ne['v_z'], df_ne['wk_ratio_z'])
print(f'Non-enterprise: tau: {round(tau,2)}, p-value: {round(p_value,2)}')

In [None]:
#
# RQ2 -- all projects, then enterprise, then non-enterprise
#

print('=====================================================')
print('RQ2 - Regression Analysis [vuln-density, reuse-ratio]')
print('=====================================================\n')

# Same single-predictor OLS fitted on each dataset in turn.
for rq2_data in (df, df_e, df_ne):
    ols_model = sm.ols(formula="v_sloc_z ~ reuse_ratio_z", data=rq2_data)
    result = ols_model.fit()
    print(result.summary())

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

# Font configuration for the figure.
matplotlib.rcParams['mathtext.fontset'] = 'custom'
matplotlib.rcParams['mathtext.rm'] = 'Bitstream Vera Sans'
matplotlib.rcParams['mathtext.it'] = 'Bitstream Vera Sans:italic'
matplotlib.rcParams['mathtext.bf'] = 'Bitstream Vera Sans:bold'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

bp_vars = ['uv_ratio', 'dv_ratio', '#v_sloc'] #'reuse_ratio'
# One tick label per box. Each variable contributes two boxes (enterprise,
# then non-enterprise), so six labels are needed; the label sits on the first
# box of each pair. With only five labels, set_xticklabels raises ValueError
# on current Matplotlib because the count does not match the six fixed ticks.
labels = ['Native\nvulnerabilities density', '', 'Reused\nvulnerabilities density', '', 'Overall\nvulnerabilities density', ''] #'Reuse ratio',

# Multiple box plots on one Axes: for every variable, the enterprise series
# followed by the non-enterprise series, both scaled by 1000.
boxplots_df = []
for v in bp_vars:
    boxplots_df.append(df_e[v]*1000)
    boxplots_df.append(df_ne[v]*1000)

fig = plt.figure(1, figsize=(6, 4),tight_layout = {'pad': 1})

# Create an axes instance
ax = fig.add_subplot(111)
ax.boxplot(boxplots_df, showfliers=False, widths=0.15)

# ax.set_xticks([])
# The y limits use the same x1000 scaling as the plotted values.
ax.set_ylim([-0.0001*1000,0.0100*1000])
ax.yaxis.grid(False)
## Custom x-axis labels
ax.set_xticklabels(labels)
ax.set_axisbelow(True)

# plt.savefig("../figs/boxplots_rq2_compact.pdf")
plt.show()

In [None]:
# Independent two-sample t-test (scipy defaults) of each density variable,
# enterprise vs. non-enterprise projects.
for test_var in ['uv_ratio', 'dv_ratio', '#v_sloc']:
    enterprise_values = df_e[test_var]
    non_enterprise_values = df_ne[test_var]
    t = stats.ttest_ind(enterprise_values, non_enterprise_values)
    print(f'Comparison of {test_var}')
    print(f'\tStatistic={t.statistic:.2f} (p={t.pvalue:.2f})')

In [None]:
#
# RQ3 -- every analysis is run on all projects (df), then on the enterprise
# subset (df_e), then on the non-enterprise subset (df_ne)
#

print('=====================================================')
print('RQ3 - Regression Analysis [#cves - #dependencies]')
print('=====================================================\n')
for rq3_data in (df, df_e, df_ne):
    ols_model = sm.ols(formula="cves_z ~ dependencies_z", data=rq3_data)
    result = ols_model.fit()
    print(result.summary())


print('=====================================================')
print('RQ3 - Regression Analysis [#v - #dependencies]')
print('=====================================================\n')
for rq3_data in (df, df_e, df_ne):
    ols_model = sm.ols(formula="v_z ~ dependencies_z", data=rq3_data)
    result = ols_model.fit()
    print(result.summary())


print('=====================================================')
print('RQ3 - Regression Analysis [#cves - #module_size]')
print('=====================================================\n')
for rq3_data in (df, df_e, df_ne):
    ols_model = sm.ols(formula="cves_z ~ dependency_size_z", data=rq3_data)
    result = ols_model.fit()
    print(result.summary())


print('=====================================================')
print('RQ3 - Regression Analysis [#cve-density - #dependencies]')
print('=====================================================\n')
# Filter a great outlier: keep only rows whose cve_density is below 0.2.
df_filtered = df[df['cve_density'] < 0.2]
df_e_filtered = df_e[df_e['cve_density'] < 0.2]
df_ne_filtered = df_ne[df_ne['cve_density'] < 0.2]

for rq3_data in (df_filtered, df_e_filtered, df_ne_filtered):
    ols_model = sm.ols(formula="cve_density_z ~ dependencies_z", data=rq3_data)
    result = ols_model.fit()
    print(result.summary())


print('=====================================================')
print('RQ3 - Multivariate Regression Analysis')
print('=====================================================\n')
for rq3_data in (df, df_e, df_ne):
    ols_model = sm.ols(formula="cves_z ~ dependencies_z + dependency_size_z + d_sloc_z", data=rq3_data)
    result = ols_model.fit()
    print(result.summary())


print('=====================================================')
print('RQ3 - Vulnerable projects')
print('=====================================================\n')
# A project counts as vulnerable when its '#cves' value is positive.
vul_projects   = df[df['#cves'] > 0]
vul_projects_e = df_e[df_e['#cves'] > 0]
vul_projects_ne = df_ne[df_ne['#cves'] > 0]

rq3_counts = (
    ("Vulnerable projects {} out of {} [{:2.2%}]", vul_projects, df),
    ("Enterprise: Vulnerable projects {} out of {} [{:2.2%}]", vul_projects_e, df_e),
    ("Non-enterprise: Vulnerable projects {} out of {} [{:2.2%}]", vul_projects_ne, df_ne),
)
for rq3_template, rq3_vulnerable, rq3_all in rq3_counts:
    n_vulnerable = len(rq3_vulnerable.index)
    n_all = len(rq3_all.index)
    print(rq3_template.format(n_vulnerable, n_all, n_vulnerable/n_all))