In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.stattools import adfuller
from scipy import stats

%matplotlib inline
%config Completer.use_jedi = False

mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

sns.set_context("paper")

sns.set(style="whitegrid", color_codes=True, font_scale=1.3)
color_blind = ["#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
divert_color = ['#a6611a','#dfc27d','#f5f5f5','#80cdc1','#018571']

In [2]:
# Confusion matrix comes from pandas.crosstab
def cramers_corrected_stat(confusion_matrix):
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

In [3]:
df_sr_commits = pd.read_feather('data/feather_files/SecurityRelevantCommits.feather')
df_sr_commits.drop_duplicates(inplace=True)
display(df_sr_commits.head())

Unnamed: 0,PostId,PostVersionId,IsRelevant
0,39166048,5221081,-1
3,39166048,5221085,-1
4,13120495,53622690,-1
7,13120495,53622692,-1
9,13120495,53622693,-1


In [4]:
df_PostVersion = pd.read_feather('data/feather_files/PostVersion.feather', columns=['Id', 'CreationDate', 'PostId', 'PostHistoryId', 'PostHistoryTypeId'])
display(df_PostVersion.head())

Unnamed: 0,Id,CreationDate,PostId,PostHistoryId,PostHistoryTypeId
0,1,2012-11-30 13:41:44,13646426,32556579,2
1,2,2014-09-23 07:29:34,25989369,74075424,2
2,3,2016-06-11 03:54:15,37759745,120182375,2
3,4,2020-04-24 17:56:14,61414546,220036678,2
4,5,2016-06-11 11:08:04,37759745,120194969,5


In [5]:
df_CommonMarkEdits = pd.read_feather('data/feather_files/PostHistoryId_CommonMark.feather')

In [6]:
df_pcs = pd.merge(df_sr_commits, df_PostVersion, left_on=['PostVersionId', 'PostId'], right_on=['Id', 'PostId'], how='left')
assert(df_pcs['PostVersionId'].equals(df_pcs['Id']))
df_pcs.drop(columns=['Id'], inplace=True)
df_pcs = df_pcs[df_pcs['PostHistoryTypeId'] == 5]
df_pcs = df_pcs[~df_pcs['PostHistoryId'].isin(df_CommonMarkEdits['Id'])]
df_pcs.drop_duplicates(inplace=True)
df_pcs.set_index('CreationDate', inplace=True)
df_pcs.sort_values(['CreationDate', 'PostVersionId'], inplace=True)
display(df_pcs)

Unnamed: 0_level_0,PostId,PostVersionId,IsRelevant,PostHistoryId,PostHistoryTypeId
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-08-01 00:20:21,12,39797219,-1,14804,5
2008-08-01 00:23:59,12,39797222,-1,14805,5
2008-08-01 05:02:47,16,271,-1,31726,5
2008-08-01 05:20:00,16,272,-1,31731,5
2008-08-01 05:35:17,19,397,-1,36218,5
...,...,...,...,...,...
2022-06-05 08:05:51,72502353,87238422,-1,271605164,5
2022-06-05 08:09:26,61221343,49184500,-1,271605260,5
2022-06-05 08:09:41,72503522,35689852,0,271605273,5
2022-06-05 08:10:34,72505062,35696041,-1,271605290,5


In [7]:
df_PostVersionData = pd.read_feather('data/feather_files/PostVersionData.feather')
display(df_PostVersionData.head())

Unnamed: 0,Id,PostId,PostVersionId,PostHistoryId,RootPostBlockVersionId,PostBlockVersionId,Label
0,1,18,39797300,16,138300137,138300137,1
1,2,18,39797300,16,138300142,138300142,1
2,3,30,39797390,22,138300395,138300395,1
3,4,34,635,28,2034,2034,1
4,5,52,39797475,43,138300601,138300601,1


In [8]:
df_CodeBlockVersion = pd.read_feather('data/feather_files/CodeBlockVersion.feather')
display(df_CodeBlockVersion.head())

Unnamed: 0,Id,PostBlockVersionId,RootPostBlockVersionId,Language
0,1,21,21,PHP
1,2,61,2,CoffeeScript
2,3,65,4,CoffeeScript
3,4,69,11,C#
4,5,73,37,JavaScript


In [9]:
# Merge the PostVersionData and CodeBlockVersion dataframes such that we have the programming language for each PostBlockVersion (i.e., code snippet)
df_merge = pd.merge(df_PostVersionData[['PostBlockVersionId', 'RootPostBlockVersionId', 'PostId', 'PostVersionId', 'PostHistoryId']], df_CodeBlockVersion[['RootPostBlockVersionId', 'Language']], on=['RootPostBlockVersionId'], how='left')
print(f"Nr. of NaN values in Language: {df_merge['Language'].isna().sum()}")
display(df_merge)

Nr. of NaN values in Language: 30907977


Unnamed: 0,PostBlockVersionId,RootPostBlockVersionId,PostId,PostVersionId,PostHistoryId,Language
0,138300137,138300137,18,39797300,16,SQL
1,138300142,138300142,18,39797300,16,PHP
2,138300395,138300395,30,39797390,22,Markdown
3,2034,2034,34,635,28,JavaScript
4,138300601,138300601,52,39797475,43,TypeScript
...,...,...,...,...,...,...
78483129,118616529,118616529,72160753,34347425,269723453,SQL
78483130,118616531,118616531,72160753,34347425,269723453,SQL
78483131,118616534,118616534,72160753,34347425,269723453,SQL
78483132,118616570,118616570,72160755,34347438,269723461,TypeScript


In [10]:
# Remove all rows where the Language is NaN
df_merge_rough = df_merge[~df_merge['Language'].isna()]
# Remove duplicates for PostVersionId and Language
df_merge_rough = df_merge_rough.drop_duplicates(subset=['PostVersionId', 'Language'])
display(df_merge_rough)

Unnamed: 0,PostBlockVersionId,RootPostBlockVersionId,PostId,PostVersionId,PostHistoryId,Language
0,138300137,138300137,18,39797300,16,SQL
1,138300142,138300142,18,39797300,16,PHP
2,138300395,138300395,30,39797390,22,Markdown
3,2034,2034,34,635,28,JavaScript
4,138300601,138300601,52,39797475,43,TypeScript
...,...,...,...,...,...,...
78483126,118616464,118616461,72160749,34347410,269723447,C#
78483127,267231693,267231682,72160262,86307266,269723449,TypeScript
78483128,118616527,118616527,72160753,34347425,269723453,Ruby
78483129,118616529,118616529,72160753,34347425,269723453,SQL


In [11]:
# Count the number of Language values per PostVersionId
df_merge_rough.groupby('PostVersionId')['Language'].count().reset_index().sort_values('Language', ascending=False)

# Note: Guesslang has a high error rate, we need to find a better way to detect the programming language; ChatGPT works better but is not scalable enough and too expensive for this amount of data

Unnamed: 0,PostVersionId,Language
11733574,24982427,10
20725265,44362685,10
26699101,61194465,10
26503961,60661053,9
30841215,72301692,9
...,...,...
13651251,28805941,1
13651250,28805934,1
13651248,28805931,1
13651247,28805930,1


In [12]:
# Check how many programming languages are used per PostId
df_merge_rough.groupby('PostId')['Language'].nunique().describe()

count    2.692128e+07
mean     1.206909e+00
std      4.901867e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.000000e+01
Name: Language, dtype: float64

In [13]:
df_merge_rough.groupby('RootPostBlockVersionId')['Language'].nunique().reset_index().sort_values('Language', ascending=False)

Unnamed: 0,RootPostBlockVersionId,Language
0,2,1
22515725,181444580,1
22515691,181444296,1
22515690,181444283,1
22515689,181444276,1
...,...,...
11257834,89088308,1
11257833,89088306,1
11257832,89088300,1
11257831,89088278,1


In [14]:
# Check how many programming languages per code snippet; should be only one
df_merge_rough.groupby('RootPostBlockVersionId')['Language'].nunique().describe()

count    33773522.0
mean            1.0
std             0.0
min             1.0
25%             1.0
50%             1.0
75%             1.0
max             1.0
Name: Language, dtype: float64

In [15]:
min_index = df_pcs.index.min()
max_index = df_pcs.index.max()
print(f"Min date: {min_index}, Max date: {max_index}")

Min date: 2008-08-01 00:20:21, Max date: 2022-06-05 08:11:06


In [16]:
df_pcs = df_pcs[df_pcs.index > "2008-09-01"] # August is the first month with very few commits
df_pcs = df_pcs[df_pcs.index < "2022-06-01"]

# Confirm
min_index = df_pcs.index.min()
max_index = df_pcs.index.max()
print(f"Min date: {min_index}, Max date: {max_index}")

Min date: 2008-09-01 00:58:28, Max date: 2022-05-31 23:59:37


In [17]:
print(f"Nr. of commits: {len(df_pcs):,}")

Nr. of commits: 13,725,307


In [18]:
# This will create multiple rows for each PostId if multiple programming languages are used in the same PostId
# df_pcs_lang = pd.merge(df_pcs.reset_index(), df_merge_rough[['PostId', 'Language']], on=['PostId'], how='left')
df_pcs_lang = pd.merge(df_pcs.reset_index(), df_merge_rough[['PostVersionId', 'Language']], on=['PostVersionId'], how='left')
df_pcs_lang.set_index('CreationDate', inplace=True)
print(f"Nr. of NaN values in Language (e.g., code block too small, no code contained): {df_pcs_lang['Language'].isna().sum()} ({df_pcs_lang['Language'].isna().sum()/len(df_pcs_lang)*100:.2f}%)")
display(df_pcs_lang)

Nr. of NaN values in Language (e.g., code block too small, no code contained): 1270437 (8.58%)


Unnamed: 0_level_0,PostId,PostVersionId,IsRelevant,PostHistoryId,PostHistoryTypeId,Language
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-09-01 00:58:28,37318,39918246,0,349837,5,C#
2008-09-01 02:04:10,36296,41564,0,51699,5,Rust
2008-09-01 02:35:37,37385,39918459,-1,52811,5,C#
2008-09-01 03:32:33,36742,41930,0,51763,5,
2008-09-01 05:25:40,37006,39917063,-1,51864,5,
...,...,...,...,...,...,...
2022-05-31 23:58:16,72420112,35346814,-1,271324960,5,Markdown
2022-05-31 23:58:45,72420101,35346725,-1,271324966,5,TypeScript
2022-05-31 23:58:54,72420065,35346587,-1,271324968,5,Markdown
2022-05-31 23:59:15,72425311,87025437,0,271324981,5,Shell


In [19]:
def closest_smaller_key(d, k):
    # Initialize variables to keep track of the closest key and the smallest difference
    closest_key = None
    smallest_diff = float('inf')

    # Iterate over the items in the sub-dictionary
    for key, value in d.items():
        if value < k:
            diff = k - value
            if diff < smallest_diff:
                smallest_diff = diff
                closest_key = key

    return closest_key

def closest_larger_key(d, k):
    # Initialize variables to keep track of the closest key and the smallest difference
    closest_key = None
    smallest_diff = float('inf')

    # Iterate over the items in the sub-dictionary
    for key, value in d.items():
        if value > k:
            diff = value - k
            if diff < smallest_diff:
                smallest_diff = diff
                closest_key = key

    return closest_key

In [20]:
def analyze_psc(df_psc, lang=None):
    if lang is None:
        lang = "All"
    else:
        df_psc= df_psc[df_psc['Language'] == lang]

    # if 'Language' in df_psc.columns:
    #     df_psc.drop(columns=['Language'], inplace=True)

    cols = {}


    # Overall number of IsRelevant commits
    count_numeric = df_psc.groupby('IsRelevant')['PostVersionId'].count()

    # Ratio of all IsRelevant values
    count_normalized = df_psc['IsRelevant'].value_counts(normalize=True)

    print(f"Commits for {lang} with empty commit message: {count_numeric[-1]:,}, with non-security relevant commit message: {count_numeric[0]:,}, with security relevant commit message: {count_numeric[1]:,}")
    confusion_matrix = {"empty": count_numeric[-1], "non_security_relevant": count_numeric[0], "security_relevant": count_numeric[1]}

    df_updates = df_psc.groupby('IsRelevant').resample(rule='ME')[['PostId']].count().reset_index('IsRelevant')

    plt.figure(figsize=(12,3))
    ax_updates = sns.lineplot(x="CreationDate", y="PostId", hue="IsRelevant", data=df_updates)
    ax_updates.set(yscale='log')
    handles, previous_labels = ax_updates.get_legend_handles_labels()
    ax_updates.legend(handles=handles, labels=["No commit message", "Not Security-Relevant", "Security-Relevant"], ncols=3, title=None)
    ax_updates.set(ylabel='Nr. of post edits / month', xlabel=None)
    ax_updates.autoscale(enable=True, axis='x', tight=True)
    plt.savefig(f'plots/commits_{lang}.pdf', bbox_inches='tight')

    df_updates_psc = df_updates.pivot_table(values='PostId', index=df_updates.index, columns='IsRelevant')
    df_updates_psc['PSC_NonEmpty'] = df_updates_psc[1] / (df_updates_psc[0] + df_updates_psc[1])
    df_updates_psc['PSC_All'] = df_updates_psc[1] / (df_updates_psc[0] + df_updates_psc[1] + df_updates_psc[-1])

    # Calculate the average with a 95% confidence interval
    mean_all = df_updates_psc['PSC_All'].mean()
    mean_nonempty = df_updates_psc['PSC_NonEmpty'].mean()
    std_all = df_updates_psc['PSC_All'].std()
    std_nonempty = df_updates_psc['PSC_NonEmpty'].std()
    ci_all = 1.96 * std_all / np.sqrt(len(df_updates_psc))
    ci_nonempty = 1.96 * std_nonempty / np.sqrt(len(df_updates_psc))

    cols['mean_all'] = mean_all
    cols['mean_nonempty'] = mean_nonempty
    cols['std_all'] = std_all
    cols['std_nonempty'] = std_nonempty
    cols['ci_all'] = ci_all
    cols['ci_nonempty'] = ci_nonempty

    # Calculate the Kwiatkowski–Phillips–Schmidt–Shin (KPSS) test for the null hypothesis that the data is level or trend stationary.

    kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')
    kpss_nonempty, p_nonempty, _, dict_nonempty = kpss(df_updates_psc['PSC_NonEmpty'], regression='c')

    cols['kpss_all'] = kpss_all
    cols['kpss_p_all'] = p_all
    cols['kpss_nonempty'] = kpss_nonempty
    cols['kpss_p_nonempty'] = p_nonempty

    cols['kpss_crit_all'] = closest_smaller_key(dict_all, kpss_all)
    cols['kpss_crit_nonempty'] = closest_smaller_key(dict_nonempty, kpss_nonempty)

    # Calculate the Augmented Dickey-Fuller (ADF) test for the null hypothesis that the data is non-stationary.

    adf_stat_all, adf_p_value_all, _, _, adf_critical_values_dict_all, _ = adfuller(df_updates_psc['PSC_All'], regression='c')
    adf_stat_nonempty, adf_p_value_nonempty, _, _, adf_critical_values_dict_nonempty, _ = adfuller(df_updates_psc['PSC_NonEmpty'], regression='c')

    cols['adf_stat_all'] = adf_stat_all
    cols['adf_p_value_all'] = adf_p_value_all
    cols['adf_stat_nonempty'] = adf_stat_nonempty
    cols['adf_p_value_nonempty'] = adf_p_value_nonempty

    cols['adf_crit_all'] = closest_larger_key(adf_critical_values_dict_all, adf_stat_all)
    cols['adf_crit_nonempty'] = closest_larger_key(adf_critical_values_dict_nonempty, adf_stat_nonempty)

    x = np.arange(len(df_updates_psc.index))

    y_all = df_updates_psc['PSC_All'].values
    slope_all, intercept_all, r_value_all, p_value_all, std_err_all = stats.linregress(x,y_all)
    print(slope_all, intercept_all, r_value_all, p_value_all, std_err_all)
    # To get coefficient of determination (r_squared)
    print("r-squared_all:", r_value_all**2)

    cols['r_squared_all'] = r_value_all**2
    cols['p_value_all'] = p_value_all

    y = df_updates_psc['PSC_NonEmpty'].values
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    print(slope, intercept, r_value, p_value, std_err)
    # To get coefficient of determination (r_squared)
    print("r-squared:", r_value**2)

    cols['r_squared'] = r_value**2
    cols['p_value'] = p_value

    plt.figure(figsize=(12,3))
    ax = sns.lineplot(x="CreationDate", y="PSC_All", data=df_updates_psc)
    ax.set(ylabel='PCS$_{All}$', xlabel=None)
    ax.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: "{:.0%}".format(x)))
    ax.autoscale(enable=True, axis='x', tight=True)
    # print(axs[0].get_xlim())
    x1, x2 = ax.get_xlim()
    ax.axline((x1, intercept_all), (x2,intercept_all+(slope_all/30)*(x2-x1)), color='red', ls='--') # Slope was per month, hence, we divide by 30 to get per day, which is the x-axis scale
    plt.savefig(f'plots/psc_all_{lang}.pdf', bbox_inches='tight')

    plt.figure(figsize=(12,3))
    ax = sns.lineplot(x="CreationDate", y="PSC_NonEmpty", data=df_updates_psc)
    ax.set(ylabel='PCS$_{NonEmpty}$', xlabel=None)
    ax.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: "{:.0%}".format(x)))
    ax.autoscale(enable=True, axis='x', tight=True)
    # print(axs[1].get_xlim())
    x1, x2 = ax.get_xlim()
    ax.axline((x1, intercept), (x2,intercept+(slope/30)*(x2-x1)), color='red', ls='--') # Slope was per month, hence, we divide by 30 to get per day, which is the x-axis scale
    plt.savefig(f'plots/psc_nonempty_{lang}.pdf', bbox_inches='tight')

    return {lang: cols}, {lang: confusion_matrix}

In [21]:
languages = [None, 'C', 'C++', 'Java', 'Python', 'JavaScript']
d_stats = dict()
d_cm = dict()
for lang in languages:
    print(f"Analyzing {lang}")
    row, cm = analyze_psc(df_pcs_lang, lang)
    d_stats.update(row)
    d_cm.update(cm)
print(d_stats)
print(d_cm)

Analyzing None
Commits for All with empty commit message: 10,120,802, with non-security relevant commit message: 4,096,685, with security relevant commit message: 590,850
-5.2633622159909545e-05 0.04567544673785275 -0.35831252864788293 2.2957493528968124e-06 1.074160296237833e-05
r-squared_all: 0.12838786818603992
1.1230253169864526e-05 0.12770469971091936 0.04905691200343148 0.5314895683689445 1.7909040782952747e-05
r-squared: 0.0024065806153124197
Analyzing C
Commits for C with empty commit message: 177,750, with non-security relevant commit message: 70,960, with security relevant commit message: 16,572


look-up table. The actual p-value is greater than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_nonempty, p_nonempty, _, dict_nonempty = kpss(df_updates_psc['PSC_NonEmpty'], regression='c')


-5.854427886809585e-05 0.06811344725415461 -0.17666317764563713 0.02321293197218184 2.5548150846066307e-05
r-squared_all: 0.031209878335853942
0.0001649068471157694 0.17347063154095488 0.26328656307579085 0.000633727284109509 4.732781174383867e-05
r-squared: 0.0693198142962624
Analyzing C++
Commits for C++ with empty commit message: 192,438, with non-security relevant commit message: 67,659, with security relevant commit message: 13,470


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')
look-up table. The actual p-value is greater than the p-value returned.

  kpss_nonempty, p_nonempty, _, dict_nonempty = kpss(df_updates_psc['PSC_NonEmpty'], regression='c')


-0.00010536624375701665 0.05932902281615353 -0.45498961035276503 8.273480448779594e-10 1.6152455486808238e-05
r-squared_all: 0.20701554552896095
5.255066339037557e-06 0.16526762100676584 0.011802963918210924 0.8803981901508311 3.4870892922400655e-05
r-squared: 0.00013930995725458897
Analyzing Java
Commits for Java with empty commit message: 734,503, with non-security relevant commit message: 351,794, with security relevant commit message: 51,886


look-up table. The actual p-value is greater than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


-3.66458419119131e-05 0.05029635109066819 -0.16378721266860424 0.03554385651003088 1.7288053670278695e-05
r-squared_all: 0.026826251033750595
3.876790336321611e-05 0.130538524519572 0.09155792052494394 0.24216187779703033 3.3025895751481964e-05
r-squared: 0.00838285281085195
Analyzing Python
Commits for Python with empty commit message: 824,752, with non-security relevant commit message: 297,813, with security relevant commit message: 55,415


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')
look-up table. The actual p-value is greater than the p-value returned.

  kpss_nonempty, p_nonempty, _, dict_nonempty = kpss(df_updates_psc['PSC_NonEmpty'], regression='c')


-0.00012881439816594812 0.06174121174483649 -0.6184622257536536 8.647380545331377e-19 1.2819688785352332e-05
r-squared_all: 0.3824955246841632
-2.748210433213815e-05 0.16076904912699622 -0.08045100825627564 0.304316072588049 2.6669487272849764e-05
r-squared: 0.006472364729451332
Analyzing JavaScript
Commits for JavaScript with empty commit message: 1,126,021, with non-security relevant commit message: 455,609, with security relevant commit message: 66,121


look-up table. The actual p-value is greater than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


-2.575491928817391e-05 0.04355629956657933 -0.12781147749235183 0.10183879134752091 1.565380697786769e-05
r-squared_all: 0.01633577377877796
-5.150405427630552e-05 0.13522777223111146 -0.15632082249128143 0.04495708245105083 2.54893418462356e-05
r-squared: 0.02443619954435072
{'All': {'mean_all': 0.04135948972074017, 'mean_nonempty': 0.12862558047084824, 'std_all': 0.007017891632762834, 'std_nonempty': 0.01093691238250648, 'ci_all': 0.001070830560363787, 'ci_nonempty': 0.0016688174494650027, 'kpss_all': 0.41662645626838124, 'kpss_p_all': 0.06998859643604258, 'kpss_nonempty': 0.3861258843753328, 'kpss_p_nonempty': 0.08313539466580483, 'kpss_crit_all': '10%', 'kpss_crit_nonempty': '10%', 'adf_stat_all': -4.847816507786274, 'adf_p_value_all': 4.401781802824113e-05, 'adf_stat_nonempty': -2.004119739246722, 'adf_p_value_nonempty': 0.28476966372188384, 'adf_crit_all': '1%', 'adf_crit_nonempty': None, 'r_squared_all': 0.12838786818603992, 'p_value_all': 2.2957493528968124e-06, 'r_squared': 0.

In [22]:
df_stats = pd.DataFrame.from_dict(d_stats, orient='index')
display(df_stats)

Unnamed: 0,mean_all,mean_nonempty,std_all,std_nonempty,ci_all,ci_nonempty,kpss_all,kpss_p_all,kpss_nonempty,kpss_p_nonempty,...,adf_stat_all,adf_p_value_all,adf_stat_nonempty,adf_p_value_nonempty,adf_crit_all,adf_crit_nonempty,r_squared_all,p_value_all,r_squared,p_value
All,0.041359,0.128626,0.007018,0.010937,0.001071,0.001669,0.416626,0.069989,0.386126,0.083135,...,-4.847817,4.401782e-05,-2.00412,0.2847697,1%,,0.128388,2.295749e-06,0.002407,0.53149
C,0.063313,0.186993,0.015832,0.029924,0.002416,0.004566,0.191712,0.1,0.779651,0.01,...,-5.91755,2.551597e-07,-4.507706,0.0001904303,1%,1%,0.03121,0.02321293,0.06932,0.000634
C++,0.050689,0.165699,0.011064,0.021271,0.001688,0.003246,0.876476,0.01,0.074439,0.1,...,-4.408641,0.0002865529,-11.059928,4.83815e-20,1%,1%,0.207016,8.27348e-10,0.000139,0.880398
Java,0.047291,0.133717,0.010689,0.020229,0.001631,0.003087,0.211482,0.1,0.356003,0.096119,...,-6.151669,7.536307e-08,-3.525426,0.007357623,1%,1%,0.026826,0.03554386,0.008383,0.242162
Python,0.051178,0.158516,0.009951,0.01632,0.001518,0.00249,1.138851,0.01,0.306401,0.1,...,-4.12961,0.0008649507,-5.173131,9.973265e-06,1%,1%,0.382496,8.647380999999999e-19,0.006472,0.304316
JavaScript,0.041444,0.131004,0.009627,0.015741,0.001469,0.002402,0.170648,0.1,0.48831,0.0443,...,-5.367413,3.964e-06,-2.121559,0.2359344,1%,,0.016336,0.1018388,0.024436,0.044957


In [23]:
def mark_significance(value, p_value):
    """
    Maps a p-value to a number with three decimals precision and marks the significance with * symbols.

    Parameters:
    p_value (float): The p-value to be mapped and marked.

    Returns:
    str: The formatted p-value with significance markers.
    """
    if not isinstance(p_value, (int, float)) or p_value < 0 or p_value > 1:
        raise ValueError("p_value should be a number between 0 and 1")

    # Determine the significance level
    if p_value < 0.001:
        significance = "***"
    elif p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    else:
        significance = ""

    return f"{value}{significance}"

In [24]:
df_stats_latex = df_stats[['mean_all', 'ci_all', 'r_squared_all', 'p_value_all', 'mean_nonempty', 'ci_nonempty', 'r_squared', 'p_value']].copy()
# Map mean values to percentage
df_stats_latex['mean_all'] = df_stats_latex['mean_all'] * 100
df_stats_latex['mean_nonempty'] = df_stats_latex['mean_nonempty'] *100
# Map the ci values to percentage
df_stats_latex['ci_all'] = df_stats_latex['ci_all'] *100
df_stats_latex['ci_nonempty'] = df_stats_latex['ci_nonempty'] *100
# Round all columns except the p-values
df_stats_latex = df_stats_latex.round({'mean_all': 3, 'ci_all': 3, 'mean_nonempty': 3, 'ci_nonempty': 3})
# format the r_squared values as string with three decimals
df_stats_latex['r_squared_all'] = df_stats_latex['r_squared_all'].map("{:.3f}".format)
df_stats_latex['r_squared'] = df_stats_latex['r_squared'].map("{:.3f}".format)
# Map the p-values to the correct format
df_stats_latex['r_squared_all'] = df_stats_latex.apply(lambda row: mark_significance(row['r_squared_all'], row['p_value_all']), axis=1)
# combine the mean_all and ci_all into one column as strings joined by a plus sign
df_stats_latex['mean_all'] = df_stats_latex['mean_all'].astype(str) + "(" + df_stats_latex['ci_all'].astype(str) +")"
df_stats_latex.drop(columns=['ci_all'], inplace=True)
df_stats_latex.drop(columns=['p_value_all'], inplace=True)
# combine the mean_nonempty and ci_nonempty into one column as strings joined by a plus sign
df_stats_latex['mean_nonempty'] = df_stats_latex['mean_nonempty'].astype(str) + "(" + df_stats_latex['ci_nonempty'].astype(str)+")"
df_stats_latex.drop(columns=['ci_nonempty'], inplace=True)
df_stats_latex['r_squared'] = df_stats_latex.apply(lambda row: mark_significance(row['r_squared'], row['p_value']), axis=1)
df_stats_latex.drop(columns=['p_value'], inplace=True)
# First column text in \textbf{}
df_stats_latex.index = [f"\\textbf{{{x}}}" for x in df_stats_latex.index]
# Rename columns
df_stats_latex.columns = ['Mean PSC', 'R$^2$', 'Mean PSC', 'R$^2$']
display(df_stats_latex)

Unnamed: 0,Mean PSC,R$^2$,Mean PSC.1,R$^2$.1
\textbf{All},4.136(0.107),0.128***,12.863(0.167),0.002
\textbf{C},6.331(0.242),0.031*,18.699(0.457),0.069***
\textbf{C++},5.069(0.169),0.207***,16.57(0.325),0.000
\textbf{Java},4.729(0.163),0.027*,13.372(0.309),0.008
\textbf{Python},5.118(0.152),0.382***,15.852(0.249),0.006
\textbf{JavaScript},4.144(0.147),0.016,13.1(0.24),0.024*


In [25]:
print(df_stats_latex.to_latex(escape=False, float_format="%.3f"))

\begin{tabular}{lllll}
\toprule
 & Mean PSC & R$^2$ & Mean PSC & R$^2$ \\
\midrule
\textbf{All} & 4.136(0.107) & 0.128*** & 12.863(0.167) & 0.002 \\
\textbf{C} & 6.331(0.242) & 0.031* & 18.699(0.457) & 0.069*** \\
\textbf{C++} & 5.069(0.169) & 0.207*** & 16.57(0.325) & 0.000 \\
\textbf{Java} & 4.729(0.163) & 0.027* & 13.372(0.309) & 0.008 \\
\textbf{Python} & 5.118(0.152) & 0.382*** & 15.852(0.249) & 0.006 \\
\textbf{JavaScript} & 4.144(0.147) & 0.016 & 13.1(0.24) & 0.024* \\
\bottomrule
\end{tabular}



In [26]:
# The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test can be used to test for stationarity in a univariate time series.
# The null hypothesis is that the series is level or trend stationary.
# The function returns the test statistic, p-value, number of lags used, and critical values.
# The p-value is interpolated from a table of critical values, and a boundary point is returned if the test statistic is outside the table of critical values.

# ADF test for stationarity
# The Augmented Dickey-Fuller test can be used to test for a unit root in a univariate time series.
# The null hypothesis is that the time series is non-stationary.
# If the p-value is less than the threshold, then the null hypothesis is rejected and the series is stationary.
# The function returns the test statistic, p-value, number of lags used, number of observations, critical values, and the maximized information criterion value.
# The p-value is interpolated from a table of critical values, and a boundary point is returned if the test statistic is outside the table of critical values.

# https://www.statisticshowto.com/kpss-test/#:~:text=The%20KPSS%20test%20authors%20derived,the%20series%20is%20non%2Dstationary
# A major disadvantage for the KPSS test is that it has a high rate of Type I errors (it tends to reject the null hypothesis too often). If attempts are made to control these errors (by having larger p-values), then that negatively impacts the test’s power.
# One way to deal with the potential for high Type I errors is to combine the KPSS with an ADF test. If the result from both tests suggests that the time series in stationary, then it probably is.

# https://www.statsmodels.org/dev/examples/notebooks/generated/stationarity_detrending_adf_kpss.html
# It is always better to apply both KPSS and ADF, so that it can be ensured that the series is truly stationary. Possible outcomes of applying these stationary tests are as follows:
#
# Case 1: Both tests conclude that the series is not stationary - The series is not stationary
# Case 2: Both tests conclude that the series is stationary - The series is stationary
# Case 3: KPSS indicates stationarity and ADF indicates non-stationarity - The series is trend stationary. Trend needs to be removed to make series strict stationary. The detrended series is checked for stationarity.
# Case 4: KPSS indicates non-stationarity and ADF indicates stationarity - The series is difference stationary. Differencing is to be used to make series stationary. The differenced series is checked for stationarity.

def check_stationary( kpss_p, adf_p):
    if kpss_p > 0.05 and adf_p < 0.05:
        return "Stationary"
    elif kpss_p < 0.05 and adf_p > 0.05:
        return "Non-Stationary"
    elif kpss_p < 0.05 and adf_p < 0.05:
        return "Difference Stationary"
    elif kpss_p > 0.05 and adf_p > 0.05:
        return "Trend Stationary"
    else:
        return "Unknown"

In [27]:
df_kpss_adf = df_stats[['kpss_all', 'kpss_p_all', 'kpss_crit_all', 'adf_stat_all', 'adf_p_value_all', 'adf_crit_all', 'kpss_nonempty', 'kpss_p_nonempty', 'kpss_crit_nonempty', 'adf_stat_nonempty', 'adf_p_value_nonempty', 'adf_crit_nonempty']]
display(df_kpss_adf)

Unnamed: 0,kpss_all,kpss_p_all,kpss_crit_all,adf_stat_all,adf_p_value_all,adf_crit_all,kpss_nonempty,kpss_p_nonempty,kpss_crit_nonempty,adf_stat_nonempty,adf_p_value_nonempty,adf_crit_nonempty
All,0.416626,0.069989,10%,-4.847817,4.401782e-05,1%,0.386126,0.083135,10%,-2.00412,0.2847697,
C,0.191712,0.1,,-5.91755,2.551597e-07,1%,0.779651,0.01,1%,-4.507706,0.0001904303,1%
C++,0.876476,0.01,1%,-4.408641,0.0002865529,1%,0.074439,0.1,,-11.059928,4.83815e-20,1%
Java,0.211482,0.1,,-6.151669,7.536307e-08,1%,0.356003,0.096119,10%,-3.525426,0.007357623,1%
Python,1.138851,0.01,1%,-4.12961,0.0008649507,1%,0.306401,0.1,,-5.173131,9.973265e-06,1%
JavaScript,0.170648,0.1,,-5.367413,3.964e-06,1%,0.48831,0.0443,5%,-2.121559,0.2359344,


In [28]:
df_kpss_adf_latex = df_kpss_adf[['kpss_all', 'kpss_p_all', 'adf_stat_all', 'adf_p_value_all', 'kpss_nonempty', 'kpss_p_nonempty', 'adf_stat_nonempty', 'adf_p_value_nonempty']].copy()

# Combine KPSS and ADF into one column for the conclusion
df_kpss_adf_latex['stationary_all'] = df_kpss_adf_latex.apply(lambda row: check_stationary(row['kpss_p_all'], row['adf_p_value_all']), axis=1)
df_kpss_adf_latex['station_nonempty'] = df_kpss_adf_latex.apply(lambda row: check_stationary(row['kpss_p_nonempty'], row['adf_p_value_nonempty']), axis=1)

df_kpss_adf_latex['kpss_all'] = df_kpss_adf_latex['kpss_all'].map("{:.3f}".format)
df_kpss_adf_latex['kpss_nonempty'] = df_kpss_adf_latex['kpss_nonempty'].map("{:.3f}".format)
df_kpss_adf_latex['adf_stat_all'] = df_kpss_adf_latex['adf_stat_all'].map("{:.3f}".format)
df_kpss_adf_latex['adf_stat_nonempty'] = df_kpss_adf_latex['adf_stat_nonempty'].map("{:.3f}".format)

# Map the p-values to the correct format
df_kpss_adf_latex['kpss_all'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['kpss_all'], row['kpss_p_all']), axis=1)
df_kpss_adf_latex.drop(columns=['kpss_p_all'], inplace=True)
df_kpss_adf_latex['adf_stat_all'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['adf_stat_all'], row['adf_p_value_all']), axis=1)
df_kpss_adf_latex.drop(columns=['adf_p_value_all'], inplace=True)
df_kpss_adf_latex['kpss_nonempty'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['kpss_nonempty'], row['kpss_p_nonempty']), axis=1)
df_kpss_adf_latex.drop(columns=['kpss_p_nonempty'], inplace=True)
df_kpss_adf_latex['adf_stat_nonempty'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['adf_stat_nonempty'], row['adf_p_value_nonempty']), axis=1)
df_kpss_adf_latex.drop(columns=['adf_p_value_nonempty'], inplace=True)

# Reorder columns
df_kpss_adf_latex = df_kpss_adf_latex[['kpss_all', 'adf_stat_all', 'stationary_all', 'kpss_nonempty', 'adf_stat_nonempty', 'station_nonempty']]

# Rename columns
df_kpss_adf_latex.columns = ['KPSS$_{All}$', 'ADF$_{All}$', 'Stationary$_{All}$', 'KPSS$_{NonEmpty}$', 'ADF$_{NonEmpty}$', 'Stationary$_{NonEmpty}$']
df_kpss_adf_latex.index = [f"\\textbf{{{x}}}" for x in df_kpss_adf_latex.index]
print(df_kpss_adf_latex.to_latex(escape=False))

\begin{tabular}{lllllll}
\toprule
 & KPSS$_{All}$ & ADF$_{All}$ & Stationary$_{All}$ & KPSS$_{NonEmpty}$ & ADF$_{NonEmpty}$ & Stationary$_{NonEmpty}$ \\
\midrule
\textbf{All} & 0.417 & -4.848*** & Stationary & 0.386 & -2.004 & Trend Stationary \\
\textbf{C} & 0.192 & -5.918*** & Stationary & 0.780* & -4.508*** & Difference Stationary \\
\textbf{C++} & 0.876* & -4.409*** & Difference Stationary & 0.074 & -11.060*** & Stationary \\
\textbf{Java} & 0.211 & -6.152*** & Stationary & 0.356 & -3.525** & Stationary \\
\textbf{Python} & 1.139* & -4.130*** & Difference Stationary & 0.306 & -5.173*** & Stationary \\
\textbf{JavaScript} & 0.171 & -5.367*** & Stationary & 0.488* & -2.122 & Non-Stationary \\
\bottomrule
\end{tabular}



In [29]:
df_cm = pd.DataFrame.from_dict(d_cm, orient='index')
display(df_cm)

Unnamed: 0,empty,non_security_relevant,security_relevant
All,10120802,4096685,590850
C,177750,70960,16572
C++,192438,67659,13470
Java,734503,351794,51886
Python,824752,297813,55415
JavaScript,1126021,455609,66121


In [30]:
# Format with thousand separator
df_cm_latex = df_cm.copy()
df_cm_latex['empty'] = df_cm_latex['empty'].map("{:,}".format)
df_cm_latex['non_security_relevant'] = df_cm_latex['non_security_relevant'].map("{:,}".format)
df_cm_latex['security_relevant'] = df_cm_latex['security_relevant'].map("{:,}".format)
# Rename columns
df_cm_latex.columns = ['Empty Commit Message', 'Not Security-Relevant', 'Security-Relevant']

print(df_cm_latex.to_latex())

\begin{tabular}{llll}
\toprule
 & Empty Commit Message & Not Security-Relevant & Security-Relevant \\
\midrule
All & 10,120,802 & 4,096,685 & 590,850 \\
C & 177,750 & 70,960 & 16,572 \\
C++ & 192,438 & 67,659 & 13,470 \\
Java & 734,503 & 351,794 & 51,886 \\
Python & 824,752 & 297,813 & 55,415 \\
JavaScript & 1,126,021 & 455,609 & 66,121 \\
\bottomrule
\end{tabular}



In [31]:
# Chi squared test for independence of variables in a contingency table.
# This function computes the chi-square statistic and p-value for the hypothesis test of independence of the observed frequencies in the contingency table observed.
# The expected frequencies are computed based on the marginal sums under the assumption of independence.
# The number of degrees of freedom is (expressed using numpy functions and attributes) (rows - 1) * (columns - 1).
# If the calculated p-value is below the threshold, then the null hypothesis is rejected.
# The null hypothesis is that the two categorical variables are independent.
chi2, p, dof, expected = stats.chi2_contingency(df_cm)
total_sample_size = df_cm.values.sum()
# Print results
print("\nChi-Square Test Results:")
print(f"Chi2: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(expected)

# Interpret the p-value
if p < 0.05:
    print("There is a statistically significant difference between the distributions.")
    print(f"Chi-square({dof}, N={total_sample_size}) = {chi2}, p = {p}")
else:
    print("There is no statistically significant difference between the distributions.")


Chi-Square Test Results:
Chi2: 16288.464418589745
P-value: 0.0
Degrees of Freedom: 10
Expected Frequencies:
[[10103960.27826701  4095272.66262616   609104.05910683]
 [  181006.06371527    73364.22195732    10911.71432741]
 [  186659.04898333    75655.45384986    11252.49716681]
 [  776600.08827452   314766.58891311    46816.32281237]
 [  803754.20471542   325772.5220003     48453.27328428]
 [ 1124286.31604445   455688.55065325    67776.1333023 ]]
There is a statistically significant difference between the distributions.
Chi-square(10, N=19311100) = 16288.464418589745, p = 0.0


In [32]:
# Calculate the Cramér's V statistic for categorical-categorical association.
# This is a symmetric measure of association between two categorical variables, giving a value between 0 and +1 (inclusive).
# It is based on Pearson's chi-squared statistic and is a generalization of the phi coefficient.
# The value is in the range of [0,1], where 0 means no association and 1 is full association.
# The statistic is computed as sqrt(chisq / (n*(min(C, R) - 1)), where chisq is the chi-squared statistic and n is the number of observations.
# The function returns the Cramér's V statistic.

n = df_cm.sum().sum()
phi2 = chi2 / n
r, k = df_cm.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
cramer_v = np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

print(f"Cramer's V: {cramer_v}")

Cramer's V: 0.020529966256082176


In [None]:
# Normalize the values in dm_cm by dividing by the sum of the row
df_cm_normalized = df_cm.div(df_cm.sum(axis=1), axis=0)
plt.figure(figsize=(8,4))
# plt.xticks(fontsize='medium')
# plt.yticks(fontsize='medium')
ax = sns.heatmap(df_cm_normalized, annot=True, cmap="Blues")
ax.set_xticklabels(['Empty Commit Message', 'Not Security-Relevant', 'Security-Relevant'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.savefig('plots/confusion_matrix_all.pdf', bbox_inches='tight')

In [34]:
# Remove the empty commit messages
df_cm_nonempty = df_cm.drop(columns=['empty'])

chi2, p, dof, ex = stats.chi2_contingency(df_cm_nonempty)
total_sample_size = df_cm_nonempty.values.sum()
# Print results
print("\nChi-Square Test Results:")
print(f"Chi2: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(expected)

# Interpret the p-value
if p < 0.05:
    print("There is a statistically significant difference between the distributions.")
    print(f"Chi-square({dof}, N={total_sample_size}) = {chi2}, p = {p}")
else:
    print("There is no statistically significant difference between the distributions.")

n = df_cm_nonempty.sum().sum()
phi2 = chi2 / n
r, k = df_cm.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
cramer_v = np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

print(f"Cramer's V: {cramer_v}")

df_cm_nonempty_normalized = df_cm_nonempty.div(df_cm_nonempty.sum(axis=1), axis=0)

plt.figure(figsize=(6,3))
# plt.xticks(fontsize='medium')
# plt.yticks(fontsize='medium')
ax = sns.heatmap(df_cm_nonempty_normalized, annot=True, cmap="Blues")
ax.set_xticklabels(['Not Security-Relevant', 'Security-Relevant'])
plt.savefig('plots/confusion_matrix_nonempty.pdf', bbox_inches='tight')


Chi-Square Test Results:
Chi2: 6624.336291923613
P-value: 0.0
Degrees of Freedom: 5
Expected Frequencies:
[[10103960.27826701  4095272.66262616   609104.05910683]
 [  181006.06371527    73364.22195732    10911.71432741]
 [  186659.04898333    75655.45384986    11252.49716681]
 [  776600.08827452   314766.58891311    46816.32281237]
 [  803754.20471542   325772.5220003     48453.27328428]
 [ 1124286.31604445   455688.55065325    67776.1333023 ]]
There is a statistically significant difference between the distributions.
Chi-square(5, N=6134834) = 6624.336291923613, p = 0.0
Cramer's V: 0.023218106607479753


### Calculate the fraction of Posts that received at least one security-relevant commit

In [35]:
# We exploit that isRelevant is stored as integer 1, hence, when the sum is larger 0 we had at least one relevant commit message
# Remove all empty commit messages
df_nonempty_commits = df_pcs[df_pcs['IsRelevant'] != -1]
# Only need the PostId, PostVersionId, and IsRelevant columns
df_nonempty_commits = df_nonempty_commits[['PostId', 'PostVersionId', 'IsRelevant']]
df_nonempty_commits['CumSecUpdates_PostId'] = df_nonempty_commits.groupby('PostId')['IsRelevant'].cumsum()
# astype(bool): If value is 0 then is False, if >0 then is True
df_nonempty_commits['ReceivedSecUpdate_PostId'] = df_nonempty_commits['CumSecUpdates_PostId'].astype(bool)
df_nonempty_commits

Unnamed: 0_level_0,PostId,PostVersionId,IsRelevant,CumSecUpdates_PostId,ReceivedSecUpdate_PostId
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-01 00:58:28,37318,39918246,0,0,False
2008-09-01 02:04:10,36296,41564,0,0,False
2008-09-01 03:32:33,36742,41930,0,0,False
2008-09-01 07:35:26,37545,39919232,0,0,False
2008-09-01 08:36:28,37619,39919515,0,0,False
...,...,...,...,...,...
2022-05-31 23:55:58,72439964,35429952,0,0,False
2022-05-31 23:56:07,72445311,35451615,0,0,False
2022-05-31 23:56:07,72445311,35451616,0,0,False
2022-05-31 23:59:15,72425311,87025437,0,0,False


In [36]:
df_nonempty_commits.CumSecUpdates_PostId.max()

27

In [37]:
df_nonempty_commits.loc[df_nonempty_commits['PostId'] == 42473747]

Unnamed: 0_level_0,PostId,PostVersionId,IsRelevant,CumSecUpdates_PostId,ReceivedSecUpdate_PostId
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-02-26 20:45:20,42473747,79443479,1,1,True
2017-02-26 21:00:17,42473747,79443482,1,2,True
2017-02-26 21:11:12,42473747,79443485,1,3,True
2017-02-26 22:53:21,42473747,79443488,0,3,True
2017-02-26 23:11:36,42473747,79443493,1,4,True
2017-02-26 23:20:44,42473747,79443500,1,5,True
2017-02-26 23:24:24,42473747,79443503,1,6,True
2017-02-26 23:31:23,42473747,79443506,0,6,True
2017-02-26 23:36:32,42473747,79443511,1,7,True
2017-02-26 23:42:16,42473747,79443516,1,8,True


In [38]:
# Count unique PostIds
print(f"{df_nonempty_commits.loc[df_nonempty_commits['ReceivedSecUpdate_PostId'] == True, 'PostId'].nunique():,}")

514,666


In [39]:
# Average number of ReceviedSecUpdate_PostId per PostId when they received at least one security-relevant commit
df_nonempty_commits.loc[df_nonempty_commits['ReceivedSecUpdate_PostId']==True].groupby('PostId')['CumSecUpdates_PostId'].max().describe()

count    514666.000000
mean          1.068388
std           0.291319
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          27.000000
Name: CumSecUpdates_PostId, dtype: float64

# Comments

In [41]:
# Load data
df_sr_comments = pd.read_feather("data/feather_files/SecurityRelevantComments.feather")
display(df_sr_comments.head())

Unnamed: 0,Id,CommentId,PostId,CommentDate,IsRelevant
0,1,24342921,4,2008-09-21 04:09:42,0
1,2,117303571,4,2021-02-24 13:07:37,0
2,3,119523765,4,2021-05-20 13:17:54,0
3,4,126100323,4,2022-03-03 16:26:30,1
4,5,316584,12,2009-02-01 19:33:54,0


In [42]:
# Fraction of comments where IsRelevant is True
display(df_sr_comments["IsRelevant"].value_counts())
display(df_sr_comments["IsRelevant"].value_counts(normalize=True))
print(f"{len(df_sr_comments):,}")

IsRelevant
0    45095570
1     3991533
Name: count, dtype: int64

IsRelevant
0    0.918685
1    0.081315
Name: proportion, dtype: float64

49,087,103


In [43]:
# Number of unique PostIds
print(f"{df_sr_comments['PostId'].nunique():,}")

14,909,452


In [44]:
# Number of unique PostIds with IsRelevant == True
print(f"{df_sr_comments.loc[df_sr_comments['IsRelevant'] == True, 'PostId'].nunique():,}")

3,128,208


In [45]:
# Intersection of PostIds with security-relevant comments and security-relevant commits
df_sr_commits_PostIds = set(df_sr_commits.loc[df_sr_commits['IsRelevant'] == True, 'PostId'].unique())
df_sr_comments_PostIds = set(df_sr_comments.loc[df_sr_comments['IsRelevant'] == True, 'PostId'].unique())
intersection_PostIds = df_sr_commits_PostIds.intersection(df_sr_comments_PostIds)
print(f"{len(intersection_PostIds):,}")

98,139


In [46]:
# Break down by programming language
df_sr_comments_lang = pd.merge(df_sr_comments, df_merge_rough[['PostId', 'Language']], on='PostId', how='left')
df_sr_comments_lang.drop_duplicates(inplace=True)
# CommentDate to datetime
df_sr_comments_lang['CommentDate'] = pd.to_datetime(df_sr_comments_lang['CommentDate'])
# Set CreationDate as index
df_sr_comments_lang.set_index('CommentDate', inplace=True)
# Sort by index
df_sr_comments_lang.sort_index(inplace=True)
display(df_sr_comments_lang.head())

Unnamed: 0_level_0,Id,CommentId,PostId,IsRelevant,Language
CommentDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-08-02 01:33:38,128,8365771,109,0,C#
2008-08-04 01:05:30,718,8764903,845,0,CSS
2008-08-05 04:44:27,1656,13874226,1843,0,C#
2008-08-06 18:45:56,3355,18306732,3831,0,C#
2008-08-06 21:46:17,3436,16080588,3976,0,Batchfile


In [47]:
df_sr_comments_lang = df_sr_comments_lang[df_sr_comments_lang.index > "2008-09-01"] # August is the first month with very few commits
df_sr_comments_lang = df_sr_comments_lang[df_sr_comments_lang.index < "2022-06-01"]

In [48]:
def analyze_comment_psc(df_psc, lang=None):
    if lang is None:
        lang = "All"
    else:
        df_psc= df_psc[df_psc['Language'] == lang]

    # if 'Language' in df_psc.columns:
    #     df_psc.drop(columns=['Language'], inplace=True)

    cols = {}


    # Overall number of IsRelevant comments
    count_numeric = df_psc.groupby('IsRelevant')['PostId'].count()

    # Ratio of all IsRelevant values
    count_normalized = df_psc['IsRelevant'].value_counts(normalize=True)

    print(f"Comment for {lang} that are non-security relevant {count_numeric[0]:,} and that are security relevant {count_numeric[1]:,}")
    confusion_matrix = {"non_security_relevant": count_numeric[0], "security_relevant": count_numeric[1]}

    df_updates = df_psc.groupby('IsRelevant').resample(rule='ME')[['CommentId']].count().reset_index('IsRelevant')

    plt.figure(figsize=(12,3))
    ax_updates = sns.lineplot(x="CommentDate", y="CommentId", hue="IsRelevant", data=df_updates)
    ax_updates.set(yscale='log')
    handles, previous_labels = ax_updates.get_legend_handles_labels()
    ax_updates.legend(handles=handles, labels=["Not Security-Relevant", "Security-Relevant"], ncols=2, title=None)
    ax_updates.set(ylabel='Nr. of comments / month', xlabel=None)
    ax_updates.autoscale(enable=True, axis='x', tight=True)
    plt.savefig(f'plots/comments_{lang}.pdf', bbox_inches='tight')

    df_updates_psc = df_updates.pivot_table(values='CommentId', index=df_updates.index, columns='IsRelevant')
    df_updates_psc['PSC_All'] = df_updates_psc[1] / (df_updates_psc[0] + df_updates_psc[1])

    # Calculate the average with a 95% confidence interval
    mean_all = df_updates_psc['PSC_All'].mean()
    std_all = df_updates_psc['PSC_All'].std()
    ci_all = 1.96 * std_all / np.sqrt(len(df_updates_psc))

    cols['mean_all'] = mean_all
    cols['std_all'] = std_all
    cols['ci_all'] = ci_all

    # Calculate the Kwiatkowski–Phillips–Schmidt–Shin (KPSS) test for the null hypothesis that the data is level or trend stationary.

    kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')

    cols['kpss_all'] = kpss_all
    cols['kpss_p_all'] = p_all

    cols['kpss_crit_all'] = closest_smaller_key(dict_all, kpss_all)

    # Calculate the Augmented Dickey-Fuller (ADF) test for the null hypothesis that the data is non-stationary.

    adf_stat_all, adf_p_value_all, _, _, adf_critical_values_dict_all, _ = adfuller(df_updates_psc['PSC_All'], regression='c')

    cols['adf_stat_all'] = adf_stat_all
    cols['adf_p_value_all'] = adf_p_value_all

    cols['adf_crit_all'] = closest_larger_key(adf_critical_values_dict_all, adf_stat_all)

    x = np.arange(len(df_updates_psc.index))

    y_all = df_updates_psc['PSC_All'].values
    slope_all, intercept_all, r_value_all, p_value_all, std_err_all = stats.linregress(x,y_all)
    print(slope_all, intercept_all, r_value_all, p_value_all, std_err_all)
    # To get coefficient of determination (r_squared)
    print("r-squared_all:", r_value_all**2)

    cols['r_squared_all'] = r_value_all**2
    cols['p_value_all'] = p_value_all

    plt.figure(figsize=(12,3))
    ax = sns.lineplot(x="CommentDate", y="PSC_All", data=df_updates_psc)
    ax.set(ylabel='PSC Comments', xlabel=None)
    ax.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: "{:.0%}".format(x)))
    ax.autoscale(enable=True, axis='x', tight=True)
    # print(axs[0].get_xlim())
    x1, x2 = ax.get_xlim()
    ax.axline((x1, intercept_all), (x2,intercept_all+(slope_all/30)*(x2-x1)), color='red', ls='--') # Slope was per month, hence, we divide by 30 to get per day, which is the x-axis scale
    plt.savefig(f'plots/psc_comments_all_{lang}.pdf', bbox_inches='tight')

    return {lang: cols}, {lang: confusion_matrix}

In [49]:
languages = [None, 'C', 'C++', 'Java', 'Python', 'JavaScript']
d_stats_comments = dict()
d_cm_comments = dict()
for lang in languages:
    print(f"Analyzing {lang}")
    row, cm = analyze_comment_psc(df_sr_comments_lang, lang)
    d_stats_comments.update(row)
    d_cm_comments.update(cm)
print(d_stats_comments)
print(d_cm_comments)

Analyzing None
Comment for All that are non-security relevant 56,809,680 and that are security relevant 5,024,047


  plt.figure(figsize=(12,3))
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.00018966770524184775 0.062258830625652614 0.9669069217292979 1.2929207619255424e-98 3.919903772922189e-06
r-squared_all: 0.9349089952880266
Analyzing C
Comment for C that are non-security relevant 1,201,196 and that are security relevant 175,761


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.00036710539822074485 0.09124194017925889 0.9346544321987911 4.1921943673316123e-75 1.093844703169467e-05
r-squared_all: 0.8735789076288445
Analyzing C++
Comment for C++ that are non-security relevant 1,279,502 and that are security relevant 168,262


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.0002906828529514653 0.08741477669052815 0.8940988481613656 9.568801253941698e-59 1.1404906738425698e-05
r-squared_all: 0.7994127502834808
Analyzing Java
Comment for Java that are non-security relevant 4,585,317 and that are security relevant 468,694


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.0002667078597660057 0.06771624492714949 0.9188537743776521 1.0109539482585663e-67 8.971203557504719e-06
r-squared_all: 0.8442922586880572
Analyzing Python
Comment for Python that are non-security relevant 3,996,595 and that are security relevant 334,873


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.00020290602608410066 0.05427468615803731 0.9198214185982444 3.9583476049644855e-68 6.7788687483004936e-06
r-squared_all: 0.8460714421120867
Analyzing JavaScript
Comment for JavaScript that are non-security relevant 6,914,110 and that are security relevant 542,021


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_all, p_all, _, dict_all = kpss(df_updates_psc['PSC_All'], regression='c')


0.00021969097124892824 0.051699618014006216 0.9576016929021174 5.2263016311941395e-90 5.176901782951817e-06
r-squared_all: 0.9170010022490012
{'All': {'mean_all': 0.07781158245548413, 'std_all': 0.009371607983798012, 'ci_all': 0.0014299742364145603, 'kpss_all': 1.8507349240645485, 'kpss_p_all': 0.01, 'kpss_crit_all': '1%', 'adf_stat_all': -3.778169200628035, 'adf_p_value_all': 0.003138738683789129, 'adf_crit_all': '1%', 'r_squared_all': 0.9349089952880266, 'p_value_all': 1.2929207619255424e-98}, 'C': {'mean_all': 0.12134458283335997, 'std_all': 0.018764849868253216, 'ci_all': 0.0028632494987177854, 'kpss_all': 1.9044563366371758, 'kpss_p_all': 0.01, 'kpss_crit_all': '1%', 'adf_stat_all': -1.3413549568678056, 'adf_p_value_all': 0.6100465376334677, 'adf_crit_all': None, 'r_squared_all': 0.8735789076288445, 'p_value_all': 4.1921943673316123e-75}, 'C++': {'mean_all': 0.1112507706325483, 'std_all': 0.015532425230677355, 'ci_all': 0.0023700274219006236, 'kpss_all': 1.831416487784432, 'kpss_p

In [50]:
df_stats_comments = pd.DataFrame.from_dict(d_stats_comments, orient='index')
display(df_stats_comments)

Unnamed: 0,mean_all,std_all,ci_all,kpss_all,kpss_p_all,kpss_crit_all,adf_stat_all,adf_p_value_all,adf_crit_all,r_squared_all,p_value_all
All,0.077812,0.009372,0.00143,1.850735,0.01,1%,-3.778169,0.003139,1%,0.934909,1.292921e-98
C,0.121345,0.018765,0.002863,1.904456,0.01,1%,-1.341355,0.610047,,0.873579,4.192194e-75
C++,0.111251,0.015532,0.00237,1.831416,0.01,1%,-1.467128,0.549705,,0.799413,9.568801e-59
Java,0.089586,0.013867,0.002116,1.844817,0.01,1%,-2.176757,0.214789,,0.844292,1.010954e-67
Python,0.070913,0.010539,0.001608,1.858243,0.01,1%,-2.781202,0.061009,10%,0.846071,3.9583480000000005e-68
JavaScript,0.069714,0.010961,0.001672,1.882102,0.01,1%,-0.323305,0.922138,,0.917001,5.226302e-90


In [51]:
df_stats_latex = df_stats_comments[['mean_all', 'ci_all', 'r_squared_all', 'p_value_all']].copy()
# Map mean values to percentage
df_stats_latex['mean_all'] = df_stats_latex['mean_all'] * 100
# Map the ci values to percentage
df_stats_latex['ci_all'] = df_stats_latex['ci_all'] *100
# Round all columns except the p-values
df_stats_latex = df_stats_latex.round({'mean_all': 3, 'ci_all': 3})
# format the r_squared values as string with three decimals
df_stats_latex['r_squared_all'] = df_stats_latex['r_squared_all'].map("{:.3f}".format)
# Map the p-values to the correct format
df_stats_latex['r_squared_all'] = df_stats_latex.apply(lambda row: mark_significance(row['r_squared_all'], row['p_value_all']), axis=1)
# combine the mean_all and ci_all into one column as strings joined by a plus sign
df_stats_latex['mean_all'] = df_stats_latex['mean_all'].astype(str) + "(" + df_stats_latex['ci_all'].astype(str) +")"
df_stats_latex.drop(columns=['ci_all'], inplace=True)
df_stats_latex.drop(columns=['p_value_all'], inplace=True)
# First column text in \textbf{}
df_stats_latex.index = [f"\\textbf{{{x}}}" for x in df_stats_latex.index]
# Rename columns
df_stats_latex.columns = ['Mean PSC', 'R$^2$']
print(df_stats_latex.to_latex(escape=False))

\begin{tabular}{lll}
\toprule
 & Mean PSC & R$^2$ \\
\midrule
\textbf{All} & 7.781(0.143) & 0.935*** \\
\textbf{C} & 12.134(0.286) & 0.874*** \\
\textbf{C++} & 11.125(0.237) & 0.799*** \\
\textbf{Java} & 8.959(0.212) & 0.844*** \\
\textbf{Python} & 7.091(0.161) & 0.846*** \\
\textbf{JavaScript} & 6.971(0.167) & 0.917*** \\
\bottomrule
\end{tabular}



In [52]:
df_kpss_adf = df_stats_comments[['kpss_all', 'kpss_p_all', 'kpss_crit_all', 'adf_stat_all', 'adf_p_value_all', 'adf_crit_all']]
display(df_kpss_adf)

Unnamed: 0,kpss_all,kpss_p_all,kpss_crit_all,adf_stat_all,adf_p_value_all,adf_crit_all
All,1.850735,0.01,1%,-3.778169,0.003139,1%
C,1.904456,0.01,1%,-1.341355,0.610047,
C++,1.831416,0.01,1%,-1.467128,0.549705,
Java,1.844817,0.01,1%,-2.176757,0.214789,
Python,1.858243,0.01,1%,-2.781202,0.061009,10%
JavaScript,1.882102,0.01,1%,-0.323305,0.922138,


In [53]:
df_kpss_adf_latex = df_kpss_adf[['kpss_all', 'kpss_p_all', 'adf_stat_all', 'adf_p_value_all']].copy()

# Combine KPSS and ADF into one column for the conclusion
df_kpss_adf_latex['stationary_all'] = df_kpss_adf_latex.apply(lambda row: check_stationary(row['kpss_p_all'], row['adf_p_value_all']), axis=1)

df_kpss_adf_latex['kpss_all'] = df_kpss_adf_latex['kpss_all'].map("{:.3f}".format)
df_kpss_adf_latex['adf_stat_all'] = df_kpss_adf_latex['adf_stat_all'].map("{:.3f}".format)

# Map the p-values to the correct format
df_kpss_adf_latex['kpss_all'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['kpss_all'], row['kpss_p_all']), axis=1)
df_kpss_adf_latex.drop(columns=['kpss_p_all'], inplace=True)
df_kpss_adf_latex['adf_stat_all'] = df_kpss_adf_latex.apply(lambda row: mark_significance(row['adf_stat_all'], row['adf_p_value_all']), axis=1)
df_kpss_adf_latex.drop(columns=['adf_p_value_all'], inplace=True)

# Reorder columns
df_kpss_adf_latex = df_kpss_adf_latex[['kpss_all', 'adf_stat_all', 'stationary_all']]

# Rename columns
df_kpss_adf_latex.columns = ['KPSS$_{All}$', 'ADF$_{All}$', 'Stationary$_{All}$']
df_kpss_adf_latex.index = [f"\\textbf{{{x}}}" for x in df_kpss_adf_latex.index]
print(df_kpss_adf_latex.to_latex(escape=False))

\begin{tabular}{llll}
\toprule
 & KPSS$_{All}$ & ADF$_{All}$ & Stationary$_{All}$ \\
\midrule
\textbf{All} & 1.851* & -3.778** & Difference Stationary \\
\textbf{C} & 1.904* & -1.341 & Non-Stationary \\
\textbf{C++} & 1.831* & -1.467 & Non-Stationary \\
\textbf{Java} & 1.845* & -2.177 & Non-Stationary \\
\textbf{Python} & 1.858* & -2.781 & Non-Stationary \\
\textbf{JavaScript} & 1.882* & -0.323 & Non-Stationary \\
\bottomrule
\end{tabular}

