In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config Completer.use_jedi = False

# Font type 42 makes matplotlib embed TrueType fonts in PDF/PS output
# instead of the default Type 3 fonts.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

# A single theme call is sufficient: sns.set() re-applies context, style and
# palette on every call, so an earlier sns.set(...) / sns.set_context("paper")
# pair would be overwritten by this call and has no effect on the final state.
# NOTE(review): if the "paper" context was actually intended, pass
# context="paper" here instead of relying on the default context.
sns.set(style="whitegrid", color_codes=True, font_scale=1.3)

# Hex colour list; the seven colours after the grey are repeated after black.
# NOTE(review): not referenced by any cell visible in this notebook.
color_blind = ["#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]

# Load the data

In [2]:
# Load the PostBlockVersion table from its Feather file.
df_PostBlockVersion = pd.read_feather('data/feather_files/PostBlockVersion.feather')

In [3]:
# Load the CodeBlockVersion table (carries the `Language` column used below).
df_CodeBlockVersion = pd.read_feather('data/feather_files/CodeBlockVersion.feather')

In [4]:
# Load the PostVersion table (carries `CreationDate` per PostHistoryId).
df_PostVersion = pd.read_feather('data/feather_files/PostVersion.feather')

In [5]:
# Load the PostHistoryId_CommonMark table; its `Id` column is used below to
# exclude rows by PostHistoryId.
df_CommonMarkEdits = pd.read_feather('data/feather_files/PostHistoryId_CommonMark.feather')

In [6]:
# Load the PostVersionData table (links PostVersionId to post block versions).
df_PostVersionData = pd.read_feather('data/feather_files/PostVersionData.feather')

## Select and merge the relevant columns

In [7]:
# Keep only non-text blocks: rows whose PostBlockTypeId is 1 (text) are removed.
df_PostBlockVersion = df_PostBlockVersion.loc[
    df_PostBlockVersion['PostBlockTypeId'].ne(1)
]

In [8]:
# Keep only blocks that span at least 5 lines.
df_PostBlockVersion = df_PostBlockVersion.loc[
    df_PostBlockVersion['LineCount'].ge(5)
]

In [9]:
# Sanity check: list the PostBlockTypeId values that remain after filtering.
df_PostBlockVersion.PostBlockTypeId.value_counts()

PostBlockTypeId
2    65333782
Name: count, dtype: int64

In [10]:
# Inspect the PostVersion table.
display(df_PostVersion)

Unnamed: 0,Id,PostId,PostTypeId,PostHistoryId,PostHistoryTypeId,CreationDate,PredPostHistoryId,SuccPostHistoryId,MostRecentVersion
0,1,13646426,1,32556579,2,2012-11-30 13:41:44,0,0,1
1,2,25989369,1,74075424,2,2014-09-23 07:29:34,0,0,1
2,3,37759745,1,120182375,2,2016-06-11 03:54:15,0,120194969,0
3,4,61414546,1,220036678,2,2020-04-24 17:56:14,0,220073195,0
4,5,37759745,1,120194969,5,2016-06-11 11:08:04,120182375,0,1
...,...,...,...,...,...,...,...,...,...
89524315,89524316,45760362,2,154260519,2,2017-08-18 15:39:00,0,0,1
89524316,89524317,45760364,2,154260527,2,2017-08-18 15:39:04,0,0,1
89524317,89524318,45760365,2,154260528,2,2017-08-18 15:39:04,0,154260951,0
89524318,89524319,45760365,2,154260951,5,2017-08-18 15:44:40,154260528,154261261,0


In [11]:
# Inspect the filtered PostBlockVersion table.
display(df_PostBlockVersion)

Unnamed: 0,Id,PostBlockTypeId,PostId,PostHistoryId,PredPostBlockVersionId,PredPostHistoryId,RootPostBlockVersionId,RootPostHistoryId,LineCount,MostRecentVersion
1,2,2,49570324,170160072,,,2,170160072,27,0
3,4,2,49570324,170160072,,,4,170160072,77,0
6,7,2,49570324,170160524,2.0,170160072.0,2,170160072,27,0
8,9,2,49570324,170160524,4.0,170160072.0,4,170160072,77,0
10,11,2,49570324,170160524,,,11,170160524,9,0
...,...,...,...,...,...,...,...,...,...,...
276591294,276591295,2,45760365,154260951,276591291.0,154260528.0,276591291,154260528,5,0
276591296,276591297,2,45760365,154260951,276591293.0,154260528.0,276591293,154260528,6,0
276591300,276591301,2,45760365,154261261,276591295.0,154260951.0,276591291,154260528,5,1
276591302,276591303,2,45760365,154261261,276591297.0,154260951.0,276591293,154260528,6,1


In [12]:
# Inspect the CodeBlockVersion table.
display(df_CodeBlockVersion)

Unnamed: 0,Id,PostBlockVersionId,RootPostBlockVersionId,Language
0,1,21,21,PHP
1,2,61,2,CoffeeScript
2,3,65,4,CoffeeScript
3,4,69,11,C#
4,5,73,37,JavaScript
...,...,...,...,...
37515185,37515186,276591277,276591277,Java
37515186,37515187,276591280,276591280,C#
37515187,37515188,276591301,276591291,Batchfile
37515188,37515189,276591303,276591293,Markdown


In [13]:

# Select the first version of every post block. A row qualifies when it has no
# predecessor (PredPostBlockVersionId is NaN or 0) or when it is its own root
# (RootPostBlockVersionId equals Id).
df_PostBlockVersion_Initial = df_PostBlockVersion.loc[
    df_PostBlockVersion['PredPostBlockVersionId'].isna()
    | df_PostBlockVersion['PredPostBlockVersionId'].eq(0)
    | df_PostBlockVersion['RootPostBlockVersionId'].eq(df_PostBlockVersion['Id'])
]
print(f"Number of initial code blocks: {len(df_PostBlockVersion_Initial):,}")

# Attach the language recorded for each root block. The left join keeps blocks
# without a CodeBlockVersion match, which then show up with a null Language.
df_PostBlockVersion_Initial = df_PostBlockVersion_Initial.merge(
    df_CodeBlockVersion,
    how='left',
    on='RootPostBlockVersionId',
)
nr_no_lang = df_PostBlockVersion_Initial['Language'].isna().sum()
print(f"Number of rows with no language: {nr_no_lang:,} ({nr_no_lang/len(df_PostBlockVersion_Initial)*100:.2f}%)")

Number of initial code blocks: 37,815,382
Number of rows with no language: 861,720 (2.28%)


In [14]:
# Keep only the initial code blocks for which a language is available.
df_PostBlockVersion_Initial = df_PostBlockVersion_Initial.loc[
    df_PostBlockVersion_Initial['Language'].notna()
]
print(f"Final number of initial code blocks: {len(df_PostBlockVersion_Initial):,}")

Final number of initial code blocks: 36,953,662


In [15]:
# Inspect the initial code blocks with their merged language
# (Id_x / Id_y are the suffixed Id columns of the two merged tables).
display(df_PostBlockVersion_Initial)

Unnamed: 0,Id_x,PostBlockTypeId,PostId,PostHistoryId,PredPostBlockVersionId,PredPostHistoryId,RootPostBlockVersionId,RootPostHistoryId,LineCount,MostRecentVersion,Id_y,PostBlockVersionId,Language
0,2,2,49570324,170160072,,,2,170160072,27,0,2.0,61.0,CoffeeScript
1,4,2,49570324,170160072,,,4,170160072,77,0,3.0,65.0,CoffeeScript
2,11,2,49570324,170160524,,,11,170160524,9,0,4.0,69.0,C#
3,21,2,13646426,32556579,,,21,32556579,37,1,1.0,21.0,PHP
4,37,2,49570324,170176560,,,37,170176560,23,0,5.0,73.0,JavaScript
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37815377,276591270,2,45760345,154260461,,,276591270,154260461,5,1,37515185.0,276591270.0,SQL
37815378,276591277,2,45760353,154260488,,,276591277,154260488,13,1,37515186.0,276591277.0,Java
37815379,276591280,2,45760355,154260494,,,276591280,154260494,11,1,37515187.0,276591280.0,C#
37815380,276591291,2,45760365,154260528,,,276591291,154260528,5,0,37515188.0,276591301.0,Batchfile


In [16]:
# Restrict the initial code blocks to the five languages under study.
# A single isin() replaces five separately built boolean masks, so no generic
# mask names (c, cpp, java, python, javascript) are left in the notebook namespace.
df_PostBlockVersion_Initial_Selected = df_PostBlockVersion_Initial[
    df_PostBlockVersion_Initial['Language'].isin(['C', 'C++', 'Java', 'Python', 'JavaScript'])
]

In [17]:
# Report how many initial code blocks remain after the language selection.
print(f"Number of initial code blocks with selected languages: {len(df_PostBlockVersion_Initial_Selected):,}")

Number of initial code blocks with selected languages: 11,782,183


In [18]:
# Inspect the language-filtered initial code blocks.
display(df_PostBlockVersion_Initial_Selected)

Unnamed: 0,Id_x,PostBlockTypeId,PostId,PostHistoryId,PredPostBlockVersionId,PredPostHistoryId,RootPostBlockVersionId,RootPostHistoryId,LineCount,MostRecentVersion,Id_y,PostBlockVersionId,Language
4,37,2,49570324,170176560,,,37,170176560,23,0,5.0,73.0,JavaScript
5,80,2,61414546,220036678,,,80,220036678,33,0,6.0,84.0,Python
6,123,2,13646428,32556583,,,123,32556583,12,1,7.0,123.0,Java
8,136,2,37759749,120182386,,,136,120182386,6,1,9.0,136.0,C
9,145,2,13646431,32556595,,,145,32556595,11,0,13.0,177.0,JavaScript
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37815359,276591174,2,45760294,154260267,,,276591174,154260267,61,1,37515168.0,276591174.0,JavaScript
37815365,276591207,2,45760306,154260295,,,276591207,154260295,5,1,37515174.0,276591207.0,Python
37815367,276591214,2,45760316,154260337,,,276591214,154260337,6,1,37515176.0,276591214.0,JavaScript
37815376,276591256,2,45760340,154260447,,,276591256,154260447,14,0,37515184.0,276591262.0,Java


In [19]:
# Exclude every block whose PostHistoryId is listed in df_CommonMarkEdits.Id.
df_PostBlockVersion_Initial_Selected = df_PostBlockVersion_Initial_Selected.loc[
    ~df_PostBlockVersion_Initial_Selected['PostHistoryId'].isin(df_CommonMarkEdits['Id'])
]

In [20]:
# Look up the creation date of each block's post version via PostHistoryId.
# The left join keeps every selected block even when no PostVersion row matches.
df_merge_posthistory = df_PostBlockVersion_Initial_Selected.merge(
    df_PostVersion[['PostHistoryId', 'CreationDate']],
    how='left',
    on='PostHistoryId',
)

In [21]:
# Keep only the columns needed for the per-language time series.
df_merge_posthistory = df_merge_posthistory[['PostBlockVersionId', 'Language', 'CreationDate']]

In [22]:
# Index by CreationDate so the rows can be grouped by calendar month below.
# NOTE: set_index moves the column into the index, so re-running this cell on
# its own raises a KeyError; re-run from the merge cell instead.
df_merge_posthistory = df_merge_posthistory.set_index('CreationDate')
display(df_merge_posthistory)

Unnamed: 0_level_0,PostBlockVersionId,Language
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-30 12:20:20,73.0,JavaScript
2020-04-24 17:56:14,84.0,Python
2012-11-30 13:41:46,123.0,Java
2016-06-11 03:54:36,136.0,C
2012-11-30 13:41:52,177.0,JavaScript
...,...,...
2017-08-18 15:34:52,276591174.0,JavaScript
2017-08-18 15:35:35,276591207.0,Python
2017-08-18 15:36:02,276591214.0,JavaScript
2017-08-18 15:37:27,276591262.0,Java


In [23]:
# Number of initial code snippets per selected language.
language_counts = df_merge_posthistory['Language'].value_counts()

In [24]:
# Convert the per-language counts to a DataFrame (single `count` column,
# indexed by Language) and show it.
language_counts_df = language_counts.to_frame()
language_counts_df

Unnamed: 0_level_0,count
Language,Unnamed: 1_level_1
JavaScript,4742633
Java,3089804
Python,2840153
C++,590396
C,519148


In [25]:
# Count new snippets per calendar month (month-end bins on the CreationDate
# index) and per language.
grouped = (
    df_merge_posthistory[['Language', 'PostBlockVersionId']]
    .groupby([pd.Grouper(freq='ME'), 'Language'])
)
grouped_count = grouped.count()
# Pivot the languages into columns, then discard the last monthly row.
# NOTE(review): presumably dropped because the final month is only partially
# covered by the dump; confirm.
grouped_df = grouped_count.unstack().iloc[:-1]
grouped_df

Unnamed: 0_level_0,PostBlockVersionId,PostBlockVersionId,PostBlockVersionId,PostBlockVersionId,PostBlockVersionId
Language,C,C++,Java,JavaScript,Python
CreationDate,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2008-08-31,62,47,159,165,102
2008-09-30,268,298,803,753,599
2008-10-31,282,291,764,981,607
2008-11-30,240,295,643,854,563
2008-12-31,221,273,664,849,548
...,...,...,...,...,...
2022-01-31,2743,3231,9240,22948,30584
2022-02-28,2358,2930,8461,21604,29575
2022-03-31,2712,3252,9286,22671,31707
2022-04-30,2511,3254,9106,21852,30483


In [27]:
# Monthly number of new code snippets per language, with dashed vertical
# markers labelled by the language(s) named in the per-marker comments.
title = ""
ax = grouped_df.plot(title=title, figsize=(12, 5))
ax.set(xlabel='', ylabel='New code snippets per month')
languages = grouped_df.columns.get_level_values('Language').to_list()
ax.legend(languages)
# Show the y axis in thousands, e.g. 20000 -> "20K".
ax.get_yaxis().set_major_formatter(mpl.ticker.FuncFormatter(lambda x, p: '{:,.0f}'.format(x/1000) + 'K'))

# Data-space y coordinate at which every marker label is anchored.
LABEL_Y = 56500

# One entry per marker, in drawing order:
# (line date, label text, label date, font size, white label background).
# NOTE(review): the trailing comments name the study each marker refers to;
# presumably the dates are those studies' data snapshots. Confirm.
study_markers = [
    ('2018-03-30', "Java",      '2018-03-30', 12, True),   # Fischer: Java
    ('2016-03-30', "Java",      '2016-03-30', 12, True),   # Fischer: Java  NOTE(review): same comment as the 2018-03-30 marker; confirm the source
    ('2016-11-30', "C/C++",     '2016-11-30', 12, True),   # DICOS: C/C++
    ('2018-11-30', "JS/C/C++",  '2018-12-15', 12, True),   # Zhang: C/C++ (label placed at 2018-12-15, not on the line)
    ('2018-08-31', "C++",       '2018-08-31', 12, False),  # Verdi: C++
    ('2021-12-31', "C/C++",     '2021-12-31', 12, False),  # Selvaraj : C/C++
    ('2015-10-31', "Java",      '2015-10-31', 12, False),  # Acar : Java
    ('2015-12-31', "Java",      '2015-12-31', 10, False),  # Ragkhitwets: Java
    ('2016-09-30', "Java",      '2016-09-30', 10, False),  # Zhang et al.: Java
    ('2021-07-31', "Java",      '2021-07-31', 10, False),  # Rhaman et al.: Java
    ('2022-02-28', "JS/Python", '2022-02-28', 10, False),  # Schmidt: JS,P
    ('2019-02-28', "Java",      '2019-02-28', 10, False),  # Ren, Mahajan: Java,
    ('2020-12-31', "Python",    '2020-12-31', 10, False),  # Chakraborty: Python
]

for line_date, label, label_date, fontsize, white_background in study_markers:
    ax.axvline(x=line_date, color='black', linestyle='--')
    # Only pass backgroundcolor when a white box is wanted; passing None would
    # not be equivalent to omitting the argument.
    label_style = {'backgroundcolor': 'white'} if white_background else {}
    ax.annotate(
        label,
        xy=(label_date, LABEL_Y),
        ha='center',
        va='bottom',
        fontsize=fontsize,
        color='black',
        rotation=90,
        annotation_clip=False,
        **label_style,
    )

plt.savefig('plots/prog_languages.pdf', bbox_inches='tight')

# Edits per snippet

In [28]:
# Inspect the PostVersionData table.
display(df_PostVersionData)

Unnamed: 0,Id,PostId,PostVersionId,PostHistoryId,RootPostBlockVersionId,PostBlockVersionId,Label
0,1,18,39797300,16,138300137,138300137,1
1,2,18,39797300,16,138300142,138300142,1
2,3,30,39797390,22,138300395,138300395,1
3,4,34,635,28,2034,2034,1
4,5,52,39797475,43,138300601,138300601,1
...,...,...,...,...,...,...,...
78483129,78483130,72160753,34347425,269723453,118616529,118616529,1
78483130,78483131,72160753,34347425,269723453,118616531,118616531,1
78483131,78483132,72160753,34347425,269723453,118616534,118616534,1
78483132,78483133,72160755,34347438,269723461,118616570,118616570,1


In [29]:
# Inspect PostBlockVersion again; it is still the filtered frame from above
# (non-text blocks with LineCount >= 5).
display(df_PostBlockVersion)

Unnamed: 0,Id,PostBlockTypeId,PostId,PostHistoryId,PredPostBlockVersionId,PredPostHistoryId,RootPostBlockVersionId,RootPostHistoryId,LineCount,MostRecentVersion
1,2,2,49570324,170160072,,,2,170160072,27,0
3,4,2,49570324,170160072,,,4,170160072,77,0
6,7,2,49570324,170160524,2.0,170160072.0,2,170160072,27,0
8,9,2,49570324,170160524,4.0,170160072.0,4,170160072,77,0
10,11,2,49570324,170160524,,,11,170160524,9,0
...,...,...,...,...,...,...,...,...,...,...
276591294,276591295,2,45760365,154260951,276591291.0,154260528.0,276591291,154260528,5,0
276591296,276591297,2,45760365,154260951,276591293.0,154260528.0,276591293,154260528,6,0
276591300,276591301,2,45760365,154261261,276591295.0,154260951.0,276591291,154260528,5,1
276591302,276591303,2,45760365,154261261,276591297.0,154260951.0,276591293,154260528,6,1


In [30]:
# Summary statistics of the number of PostBlockVersion rows per snippet, where
# a snippet is identified by its RootPostBlockVersionId. Only rows with
# PostBlockTypeId == 2 are counted.
desc_PostBlockVersion = (
    df_PostBlockVersion
    .loc[df_PostBlockVersion['PostBlockTypeId'].eq(2)]
    .groupby('RootPostBlockVersionId')['Id']
    .count()
    .describe()
)
display(desc_PostBlockVersion)

count    3.840173e+07
mean     1.701324e+00
std      1.196432e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      7.540000e+02
Name: Id, dtype: float64

In [31]:
# Half-width of the normal-approximation confidence interval (z = 1.96) around
# the mean number of rows per snippet, computed from the describe() summary.
mean, std, n = (desc_PostBlockVersion[stat] for stat in ('mean', 'std', 'count'))
z = 1.96
ci = z * (std / np.sqrt(n))
display(mean, ci)
# Largest number of rows recorded for a single snippet.
display(desc_PostBlockVersion['max'])

1.7013240133360665

0.0003784154270043955

754.0

In [32]:
# 99th percentile of the number of PostBlockVersion rows per snippet
# (same per-root count as in the describe() cell above).
percentile = (
    df_PostBlockVersion
    .loc[df_PostBlockVersion['PostBlockTypeId'].eq(2)]
    .groupby('RootPostBlockVersionId')['Id']
    .count()
    .quantile(0.99)
)
display(percentile)

6.0

In [33]:
# Post versions with PostTypeId == 2 and PostHistoryTypeId == 5.
# NOTE(review): presumably "answers" and "body edits" in the Stack Exchange
# schema; confirm against the dataset documentation.
df_pv = df_PostVersion.loc[
    df_PostVersion['PostTypeId'].eq(2)
    & df_PostVersion['PostHistoryTypeId'].eq(5)
]
display(df_pv)

Unnamed: 0,Id,PostId,PostTypeId,PostHistoryId,PostHistoryTypeId,CreationDate,PredPostHistoryId,SuccPostHistoryId,MostRecentVersion
39797098,39797099,45760375,2,154261940,5,2017-08-18 16:00:25,154260576,0,1
39797103,39797104,33254309,2,102311465,5,2015-10-21 09:06:25,102308174,0,1
39797104,39797105,10036752,2,23004466,5,2012-04-06 14:36:03,22984034,146742687,0
39797105,39797106,10036752,2,146742687,5,2017-05-23 11:58:22,23004466,0,1
39797109,39797110,58732850,2,208775033,5,2019-11-07 18:47:56,208683620,208776473,0
...,...,...,...,...,...,...,...,...,...
89524304,89524305,45760340,2,154261323,5,2017-08-18 15:50:26,154260447,154261811,0
89524305,89524306,45760340,2,154261811,5,2017-08-18 15:57:55,154261323,0,1
89524314,89524315,45760356,2,154260922,5,2017-08-18 15:44:13,154260501,0,1
89524318,89524319,45760365,2,154260951,5,2017-08-18 15:44:40,154260528,154261261,0


In [35]:
# Link each selected post version to the post block versions recorded for it
# in PostVersionData (PostVersion.Id == PostVersionData.PostVersionId).
post_version_cols = ['Id', 'PostId', 'PostHistoryId', 'PostHistoryTypeId', 'CreationDate']
block_link_cols = ['PostVersionId', 'RootPostBlockVersionId', 'PostBlockVersionId']
df_pv_merge = df_pv[post_version_cols].merge(
    df_PostVersionData[block_link_cols],
    how='inner',
    left_on='Id',
    right_on='PostVersionId',
)
display(df_pv_merge)

Unnamed: 0,Id,PostId,PostHistoryId,PostHistoryTypeId,CreationDate,PostVersionId,RootPostBlockVersionId,PostBlockVersionId
0,39797099,45760375,154261940,5,2017-08-18 16:00:25,39797099,138299582,138299582
1,39797104,33254309,102311465,5,2015-10-21 09:06:25,39797104,138299591,138299595
2,39797114,45760382,154260648,5,2017-08-18 15:40:53,39797114,138299611,138299611
3,39797133,10036759,22984176,5,2012-04-05 22:21:13,39797133,138299652,138299654
4,39797135,10036759,22986445,5,2012-04-06 00:10:27,39797135,138299659,138299659
...,...,...,...,...,...,...,...,...
10589092,89524300,45760324,154295683,5,2017-08-19 00:31:26,89524300,276591242,276591242
10589093,89524300,45760324,154295683,5,2017-08-19 00:31:26,89524300,276591244,276591244
10589094,89524305,45760340,154261323,5,2017-08-18 15:50:26,89524305,276591256,276591259
10589095,89524319,45760365,154260951,5,2017-08-18 15:44:40,89524319,276591299,276591299


In [36]:
# Attach the language of each root block; the inner join drops rows whose
# RootPostBlockVersionId has no CodeBlockVersion entry.
df_pv_merge_lang = df_pv_merge.merge(
    df_CodeBlockVersion[['RootPostBlockVersionId', 'Language']],
    how='inner',
    on='RootPostBlockVersionId',
)
display(df_pv_merge_lang)

Unnamed: 0,Id,PostId,PostHistoryId,PostHistoryTypeId,CreationDate,PostVersionId,RootPostBlockVersionId,PostBlockVersionId,Language
0,39797104,33254309,102311465,5,2015-10-21 09:06:25,39797104,138299591,138299595,PowerShell
1,39797133,10036759,22984176,5,2012-04-05 22:21:13,39797133,138299652,138299654,JavaScript
2,39797135,10036759,22986445,5,2012-04-06 00:10:27,39797135,138299659,138299659,JavaScript
3,39797137,58732860,208684098,5,2019-11-06 15:00:11,39797137,138299662,138299666,JavaScript
4,39797139,58732860,208728637,5,2019-11-07 07:20:27,39797139,138299662,138299671,JavaScript
...,...,...,...,...,...,...,...,...,...
6665501,89524285,45760295,154313424,5,2017-08-19 11:02:47,89524285,276591178,276591187,SQL
6665502,89524290,45760303,154264737,5,2017-08-18 16:44:36,89524290,276591204,276591204,SQL
6665503,89524305,45760340,154261323,5,2017-08-18 15:50:26,89524305,276591256,276591259,Java
6665504,89524319,45760365,154260951,5,2017-08-18 15:44:40,89524319,276591299,276591299,JavaScript


In [37]:
# Restrict the edited blocks to the five languages under study.
# A single isin() replaces five separately built boolean masks, so no generic
# mask names (c, cpp, java, python, javascript) are left in the notebook namespace.
df_pv_merge_lang_selected = df_pv_merge_lang[
    df_pv_merge_lang['Language'].isin(['C', 'C++', 'Java', 'Python', 'JavaScript'])
]
display(df_pv_merge_lang_selected)

Unnamed: 0,Id,PostId,PostHistoryId,PostHistoryTypeId,CreationDate,PostVersionId,RootPostBlockVersionId,PostBlockVersionId,Language
1,39797133,10036759,22984176,5,2012-04-05 22:21:13,39797133,138299652,138299654,JavaScript
2,39797135,10036759,22986445,5,2012-04-06 00:10:27,39797135,138299659,138299659,JavaScript
3,39797137,58732860,208684098,5,2019-11-06 15:00:11,39797137,138299662,138299666,JavaScript
4,39797139,58732860,208728637,5,2019-11-07 07:20:27,39797139,138299662,138299671,JavaScript
5,39797141,33254332,102310469,5,2015-10-21 08:52:24,39797141,138299680,138299680,Python
...,...,...,...,...,...,...,...,...,...
6665499,89524259,45760240,154262647,5,2017-08-18 16:12:03,89524259,276591102,276591104,Python
6665500,89524260,45760240,154281654,5,2017-08-18 22:10:01,89524260,276591102,276591106,Python
6665503,89524305,45760340,154261323,5,2017-08-18 15:50:26,89524305,276591256,276591259,Java
6665504,89524319,45760365,154260951,5,2017-08-18 15:44:40,89524319,276591299,276591299,JavaScript


In [38]:
# Number of rows per (language, snippet) pair; the count ends up in the `Id`
# column after the group keys are turned back into regular columns.
rows_per_snippet = (
    df_pv_merge_lang_selected
    .groupby(['Language', 'RootPostBlockVersionId'])['Id']
    .count()
)
desc_lang_edits = rows_per_snippet.reset_index()
display(desc_lang_edits)

Unnamed: 0,Language,RootPostBlockVersionId,Id
0,C,138302503,1
1,C,138303315,1
2,C,138303543,1
3,C,138303687,2
4,C,138303943,1
...,...,...,...
1688170,Python,276586769,1
1688171,Python,276587359,1
1688172,Python,276588988,2
1688173,Python,276591078,2


In [39]:
# For every language, compute summary statistics (count, mean, std, min,
# quartiles, max) of the per-snippet counts held in the `Id` column.
desc_lang_edits_mean = desc_lang_edits.groupby('Language')['Id'].describe()
display(desc_lang_edits_mean)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,82456.0,1.474168,1.00403,1.0,1.0,1.0,2.0,36.0
C++,90264.0,1.480003,1.01099,1.0,1.0,1.0,2.0,34.0
Java,359257.0,1.306878,0.725893,1.0,1.0,1.0,1.0,27.0
JavaScript,700592.0,1.310672,0.743709,1.0,1.0,1.0,1.0,53.0
Python,455606.0,1.410434,0.915394,1.0,1.0,1.0,2.0,58.0


In [40]:
# Per-language half-width of the normal-approximation confidence interval
# (z = 1.96) around the mean per-snippet count.
mean, std, n = (desc_lang_edits_mean[stat] for stat in ('mean', 'std', 'count'))
z = 1.96
ci = z * (std / np.sqrt(n))
display(ci)
# Store the half-width next to the summary statistics for the table built below.
desc_lang_edits_mean['ci'] = ci

Language
C             0.006853
C++           0.006595
Java          0.002374
JavaScript    0.001742
Python        0.002658
dtype: float64

In [41]:
# Per-language 99th percentile of the per-snippet counts, returned as a frame
# with the columns `Language` and `99%`.
percentile = (
    desc_lang_edits
    .groupby('Language')['Id']
    .quantile(0.99)
    .rename('99%')
    .reset_index()
)
display(percentile)


Unnamed: 0,Language,99%
0,C,5.0
1,C++,5.0
2,Java,4.0
3,JavaScript,4.0
4,Python,5.0


In [42]:
# Build the per-language summary table in one chain, without in-place drops or
# column assignment on a column-subset frame (which can trigger pandas'
# chained-assignment warning).
# `desc_lang_edits_mean` is indexed by Language and must already carry the `ci`
# column; `percentile` has `Language` as a regular column.
df_per_lang = (
    pd.merge(desc_lang_edits_mean, percentile, how='inner', on='Language')
    # Render mean and CI half-width together as "mean(ci)" with 3 decimals.
    .assign(
        mean=lambda d: d['mean'].map('{:,.3f}'.format) + '(' + d['ci'].map('{:,.3f}'.format) + ')'
    )
    # Keep and order the reported columns; 25%, 50%, std and ci are left out.
    .loc[:, ['Language', 'count', 'mean', 'min', '75%', '99%', 'max']]
)
display(df_per_lang)

Unnamed: 0,Language,count,mean,min,75%,99%,max
0,C,82456.0,1.474(0.007),1.0,2.0,5.0,36.0
1,C++,90264.0,1.480(0.007),1.0,2.0,5.0,34.0
2,Java,359257.0,1.307(0.002),1.0,1.0,4.0,27.0
3,JavaScript,700592.0,1.311(0.002),1.0,1.0,4.0,53.0
4,Python,455606.0,1.410(0.003),1.0,2.0,5.0,58.0


In [43]:
# Export the summary table as LaTeX. escape=True is required because the column
# names "75%" and "99%" contain "%", which starts a comment in LaTeX; left
# unescaped it would swallow the rest of the header row, including the "\\".
print(df_per_lang.to_latex(index=False, float_format="%.0f", escape=True))

\begin{tabular}{lrlrrrr}
\toprule
Language & count & mean & min & 75% & 99% & max \\
\midrule
C & 82456 & 1.474(0.007) & 1 & 2 & 5 & 36 \\
C++ & 90264 & 1.480(0.007) & 1 & 2 & 5 & 34 \\
Java & 359257 & 1.307(0.002) & 1 & 1 & 4 & 27 \\
JavaScript & 700592 & 1.311(0.002) & 1 & 1 & 4 & 53 \\
Python & 455606 & 1.410(0.003) & 1 & 2 & 5 & 58 \\
\bottomrule
\end{tabular}

