In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
%config Completer.use_jedi = False

# Embed fonts as Type 42 (TrueType) in PDF/PS output instead of the default Type 3.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

# Apply the seaborn theme exactly once, with the "paper" context passed in.
# Previously sns.set(...) was called a second time *after* sns.set_context("paper");
# sns.set() re-applies its default context ("notebook"), so the paper context was
# silently discarded.
sns.set(context="paper", style="whitegrid", color_codes=True)

# Colour-blind-friendly palette (the 7 chromatic colours are repeated after black
# so that up to 16 series can be drawn).
color_blind = ["#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
# Five-step diverging palette (brown -> neutral -> teal).
divert_color = ['#a6611a','#dfc27d','#f5f5f5','#80cdc1','#018571']

from itertools import combinations
from collections import Counter

In [2]:
# Load the Fischer et al. question sample (one row per entry: QuestionId, Tags, CreationDate).
# parse_dates=True only attempts to parse the *index*, which left CreationDate as
# plain strings; name the column explicitly so it is loaded as datetime64.
df_fischer = pd.read_csv('data/fischer_questions.csv', parse_dates=['CreationDate'])

In [3]:
# Preview the loaded sample (pandas truncates the rendering to head/tail rows).
df_fischer

Unnamed: 0,QuestionId,Tags,CreationDate
0,10012918,<java><passwords><encryption>,2012-04-04 14:14:00
1,10012918,<java><passwords><encryption>,2012-04-04 14:14:00
2,1009996,<java><cryptography>,2009-06-17 23:18:23
3,10136585,<java><php><rijndael>,2012-04-13 07:05:12
4,10136585,<java><php><rijndael>,2012-04-13 07:05:12
...,...,...,...
9363,31939111,<android><ssl>,2015-08-11 10:19:25
9364,36548757,<java><security><ssl>,2016-04-11 12:30:13
9365,37937535,<android><google-play><android-security><trust...,2016-06-21 07:01:10
9366,40914688,<java><x509><pki><java-security><ocsp>,2016-12-01 15:37:21


In [4]:
# Tag table for all questions, read from the feather export named
# "Tags_September_2023" (columns shown below: QuestionId, Tags, CreationDate).
tags_feather_path = 'data/feather_files/Tags_September_2023.feather'
df_all_tags = pd.read_feather(tags_feather_path)

In [5]:
# Turn the raw "<tag1><tag2>..." strings into Python lists: ["tag1", "tag2", ...].
# NOTE: this cell is not idempotent - running it a second time applies the .str
# accessor to lists instead of strings.
df_all_tags['Tags'] = (
    df_all_tags['Tags']
    .str.replace('><', ' ')   # separator between two tags -> single space
    .str.replace('<', '')     # drop the leading bracket
    .str.replace('>', '')     # drop the trailing bracket
    .str.split(' ')
)

In [6]:
# Preview the parsed tag table (Tags now holds lists of tag names).
display(df_all_tags)

Unnamed: 0,QuestionId,Tags,CreationDate
0,4,"[c#, floating-point, type-conversion, double, ...",2008-07-31 21:42:52
1,6,"[html, css, internet-explorer-7]",2008-07-31 22:08:08
2,9,"[c#, .net, datetime]",2008-07-31 23:40:59
3,11,"[c#, datetime, time, datediff, relative-time-s...",2008-07-31 23:55:37
4,13,"[html, browser, timezone, user-agent, timezone...",2008-08-01 00:42:38
...,...,...,...
23876738,77031693,"[next.js, supabase, supabase-database]",2023-09-03 09:32:29
23876739,77031694,"[javascript, arrays, html-table]",2023-09-03 09:32:56
23876740,77031699,"[flutter, kotlin, dart, bluetooth-lowenergy, p...",2023-09-03 09:33:37
23876741,77031700,"[python, memory]",2023-09-03 09:33:40


In [7]:
# Total number of rows (questions) in the tag table.
print(df_all_tags.shape[0])

23876743


In [8]:
# Count single tags (c_tags) and unordered tag pairs (c_tuples) over the Fischer sample.
# NOTE(review): the sample shows the same QuestionId on consecutive rows; every row
# is counted here, so such questions contribute more than once - confirm this is intended.
c_tuples = Counter()
c_tags = Counter()
for raw_tags in df_fischer['Tags']:
    # "<java><aes>" -> ["aes", "java"] (sorted so each pair has one canonical order)
    names = sorted(raw_tags.replace('<', '').replace('>', ' ').strip().split(' '))
    # Move the platform tags to the front: 'android' first, then 'java' in front of it,
    # so pairs involving a platform always read (platform, other) and
    # java/android pairs read ('java', 'android').
    for platform in ('android', 'java'):
        if platform in names:
            names.remove(platform)
            names.insert(0, platform)
    c_tags.update(names)
    c_tuples.update(combinations(names, r=2))
display(c_tuples)
display(c_tags)

Counter({('java', 'encryption'): 3325,
         ('java', 'aes'): 1684,
         ('aes', 'encryption'): 1509,
         ('java', 'android'): 1418,
         ('android', 'encryption'): 1229,
         ('java', 'cryptography'): 1135,
         ('cryptography', 'encryption'): 820,
         ('java', 'ssl'): 812,
         ('android', 'aes'): 528,
         ('android', 'ssl'): 471,
         ('aes', 'cryptography'): 435,
         ('java', 'php'): 355,
         ('java', 'c#'): 332,
         ('java', 'security'): 326,
         ('java', 'rsa'): 323,
         ('java', 'https'): 291,
         ('android', 'cryptography'): 271,
         ('encryption', 'php'): 262,
         ('https', 'ssl'): 245,
         ('c#', 'encryption'): 244,
         ('encryption', 'rsa'): 244,
         ('android', 'https'): 230,
         ('java', 'bouncycastle'): 219,
         ('java', 'hash'): 219,
         ('java', 'md5'): 211,
         ('encryption', 'security'): 164,
         ('java', 'javascript'): 162,
         ('android', 'p

Counter({'java': 6984,
         'encryption': 4064,
         'android': 3152,
         'aes': 2055,
         'cryptography': 1287,
         'ssl': 1188,
         'https': 454,
         'php': 450,
         'c#': 415,
         'rsa': 391,
         'security': 380,
         'hash': 273,
         'md5': 256,
         'bouncycastle': 255,
         'ssl-certificate': 216,
         'ios': 206,
         'javascript': 201,
         'base64': 183,
         'des': 158,
         'openssl': 150,
         'facebook': 150,
         'sockets': 147,
         'objective-c': 137,
         'web-services': 131,
         'jce': 128,
         'python': 127,
         'certificate': 120,
         'exception': 114,
         'sha256': 112,
         'node.js': 110,
         'sha1': 105,
         'cryptojs': 103,
         'string': 102,
         'keystore': 101,
         'arrays': 96,
         'encoding': 94,
         '3des': 92,
         'encryption-symmetric': 91,
         '.net': 91,
         'httpclient': 87,

In [9]:
# Tags that are excluded from the pair counts and from the candidate tag ranking
# (languages, frameworks, generic programming topics, ...).
filter_tags = [
    'c#', '.net', 'facebook', 'sockets', 'base64', 'javascript',
    'php', 'web-services', 'string', 'exception', 'node.js', 'ios',
    'python', 'encoding', 'spring', 'objective-c', 'file', 'padding',
    'mysql', 'apache-httpclient-4.x', 'ruby', 'soap', 'rest', 'bytearray',
    'algorithm', 'eclipse', 'android-asynctask', 'android-volley', 'arrays', 'json',
    'performance', 'okhttp', 'hex', 'byte', 'android-studio', 'facebook-graph-api',
    'http', 'jsp', 'xml', 'servlets', 'tomcat', 'hibernate',
    'post', 'multithreading', 'apache', 'okhttp3', 'c', 'scala',
    'jakarta-ee', 'amazon-web-services', 'groovy', 'websocket', 'iphone', 'amazon-s3',
]


def _keep_pair(pair):
    """Keep a tag pair iff it contains exactly one of 'android'/'java' and no filtered tag."""
    exactly_one_platform = ('android' in pair) != ('java' in pair)
    return exactly_one_platform and not any(tag in pair for tag in filter_tags)


# Overwrites c_tuples with the filtered pairs (original insertion order is kept).
c_tuples = Counter({pair: n for pair, n in c_tuples.items() if _keep_pair(pair)})
display(c_tuples)

Counter({('java', 'encryption'): 3325,
         ('java', 'aes'): 1684,
         ('android', 'encryption'): 1229,
         ('java', 'cryptography'): 1135,
         ('java', 'ssl'): 812,
         ('android', 'aes'): 528,
         ('android', 'ssl'): 471,
         ('java', 'security'): 326,
         ('java', 'rsa'): 323,
         ('java', 'https'): 291,
         ('android', 'cryptography'): 271,
         ('android', 'https'): 230,
         ('java', 'bouncycastle'): 219,
         ('java', 'hash'): 219,
         ('java', 'md5'): 211,
         ('java', 'des'): 133,
         ('java', 'ssl-certificate'): 128,
         ('java', 'jce'): 126,
         ('java', 'openssl'): 124,
         ('android', 'rsa'): 114,
         ('android', 'ssl-certificate'): 98,
         ('android', 'security'): 92,
         ('java', 'sha256'): 91,
         ('java', 'sha1'): 84,
         ('java', 'cryptojs'): 82,
         ('java', '3des'): 81,
         ('java', 'certificate'): 81,
         ('java', 'keystore'): 75,
     

In [10]:
# Render the exclusion list as one comma-separated string (handy for copy/paste into text).
display(", ".join(filter_tags))

'c#, .net, facebook, sockets, base64, javascript, php, web-services, string, exception, node.js, ios, python, encoding, spring, objective-c, file, padding, mysql, apache-httpclient-4.x, ruby, soap, rest, bytearray, algorithm, eclipse, android-asynctask, android-volley, arrays, json, performance, okhttp, hex, byte, android-studio, facebook-graph-api, http, jsp, xml, servlets, tomcat, hibernate, post, multithreading, apache, okhttp3, c, scala, jakarta-ee, amazon-web-services, groovy, websocket, iphone, amazon-s3'

In [11]:
# Tag frequencies with the exclusion list and the two platform tags removed;
# what remains is treated as the candidate crypto-tag ranking.
excluded_tags = set(filter_tags) | {'android', 'java'}
c_tags_top_crypto = Counter({tag: n for tag, n in c_tags.items() if tag not in excluded_tags})
display(c_tags_top_crypto.most_common(50))

[('encryption', 4064),
 ('aes', 2055),
 ('cryptography', 1287),
 ('ssl', 1188),
 ('https', 454),
 ('rsa', 391),
 ('security', 380),
 ('hash', 273),
 ('md5', 256),
 ('bouncycastle', 255),
 ('ssl-certificate', 216),
 ('des', 158),
 ('openssl', 150),
 ('jce', 128),
 ('certificate', 120),
 ('sha256', 112),
 ('sha1', 105),
 ('cryptojs', 103),
 ('keystore', 101),
 ('3des', 92),
 ('encryption-symmetric', 91),
 ('httpclient', 87),
 ('authentication', 79),
 ('blowfish', 78),
 ('public-key-encryption', 77),
 ('sha', 73),
 ('httpsurlconnection', 71),
 ('badpaddingexception', 68),
 ('sslhandshakeexception', 68),
 ('hmac', 67),
 ('tripledes', 61),
 ('x509certificate', 61),
 ('cbc-mode', 59),
 ('aes-gcm', 57),
 ('message-digest', 54),
 ('rijndael', 52),
 ('passwords', 51),
 ('digital-signature', 48),
 ('httpurlconnection', 47),
 ('tls1.2', 45),
 ('javax.crypto', 44),
 ('password-encryption', 41),
 ('c++', 39),
 ('key', 37),
 ('initialization-vector', 37),
 ('secret-key', 37),
 ('image', 35),
 ('vb.n

In [12]:
# Names of the 50 most frequent remaining tags, most frequent first.
# NOTE(review): this list is "whatever survived filter_tags", so it also contains
# general-purpose tags such as 'c++', 'image' and 'vb.net'; downstream these flag
# questions like [java, c++, windows, unix] as crypto-related - confirm this is acceptable.
l_tags_top_crypto = [tag for tag, _count in c_tags_top_crypto.most_common(50)]
display(l_tags_top_crypto)

['encryption',
 'aes',
 'cryptography',
 'ssl',
 'https',
 'rsa',
 'security',
 'hash',
 'md5',
 'bouncycastle',
 'ssl-certificate',
 'des',
 'openssl',
 'jce',
 'certificate',
 'sha256',
 'sha1',
 'cryptojs',
 'keystore',
 '3des',
 'encryption-symmetric',
 'httpclient',
 'authentication',
 'blowfish',
 'public-key-encryption',
 'sha',
 'httpsurlconnection',
 'badpaddingexception',
 'sslhandshakeexception',
 'hmac',
 'tripledes',
 'x509certificate',
 'cbc-mode',
 'aes-gcm',
 'message-digest',
 'rijndael',
 'passwords',
 'digital-signature',
 'httpurlconnection',
 'tls1.2',
 'javax.crypto',
 'password-encryption',
 'c++',
 'key',
 'initialization-vector',
 'secret-key',
 'image',
 'vb.net',
 'sslsocketfactory',
 'self-signed']

In [13]:
# Top tags overall, without the platform tags. most_common(102) is presumably
# "100 + the two platform tags": the result below has exactly 100 rows, which only
# holds while both 'android' and 'java' are among the 102 most frequent tags.
platform_tags = ('android', 'java')
c_tags_top_all = Counter({tag: n for tag, n in c_tags.most_common(102) if tag not in platform_tags})

df_tags_top = pd.DataFrame.from_dict(dict(c_tags_top_all), orient='index').reset_index()
df_tags_top.columns = ['tag', 'count']
# Mark the rows whose tag belongs to the top-50 crypto list.
df_tags_top['crypto'] = df_tags_top['tag'].isin(l_tags_top_crypto)
print(df_tags_top['crypto'].value_counts())
display(df_tags_top)

crypto
True     50
False    50
Name: count, dtype: int64


Unnamed: 0,tag,count,crypto
0,encryption,4064,True
1,aes,2055,True
2,cryptography,1287,True
3,ssl,1188,True
4,https,454,True
...,...,...,...
95,self-signed,35,True
96,android-asynctask,34,False
97,websocket,34,False
98,retrofit,34,False


In [14]:
# Horizontal bar chart of the top tags (log-scaled counts), coloured by crypto membership.
fig, ax_tags = plt.subplots(figsize=(3, 15))
sns.barplot(
    y='tag',
    x='count',
    data=df_tags_top,
    hue='crypto',
    palette=sns.color_palette("Set1")[0:2],
    ax=ax_tags,
)
ax_tags.set_xscale('log')
ax_tags.set(xlabel="Count (log)", ylabel="")
sns.despine(left=True, bottom=True)
plt.yticks(fontsize=7)  # 100 labels on one axis need a small font
ax_tags.legend(ncol=2, loc="lower right", frameon=True, title='Crypto Tag')
plt.savefig('plots/tags_top100.pdf', bbox_inches='tight')

In [15]:
# Keep only the (platform, tag) pairs in which at least one member is a top-50 crypto tag.
top_crypto_lookup = set(l_tags_top_crypto)
c_tuples_top_crypto = Counter(
    {pair: n for pair, n in c_tuples.items() if not top_crypto_lookup.isdisjoint(pair)}
)
display(c_tuples_top_crypto)

Counter({('java', 'encryption'): 3325,
         ('java', 'aes'): 1684,
         ('android', 'encryption'): 1229,
         ('java', 'cryptography'): 1135,
         ('java', 'ssl'): 812,
         ('android', 'aes'): 528,
         ('android', 'ssl'): 471,
         ('java', 'security'): 326,
         ('java', 'rsa'): 323,
         ('java', 'https'): 291,
         ('android', 'cryptography'): 271,
         ('android', 'https'): 230,
         ('java', 'bouncycastle'): 219,
         ('java', 'hash'): 219,
         ('java', 'md5'): 211,
         ('java', 'des'): 133,
         ('java', 'ssl-certificate'): 128,
         ('java', 'jce'): 126,
         ('java', 'openssl'): 124,
         ('android', 'rsa'): 114,
         ('android', 'ssl-certificate'): 98,
         ('android', 'security'): 92,
         ('java', 'sha256'): 91,
         ('java', 'sha1'): 84,
         ('java', 'cryptojs'): 82,
         ('java', '3des'): 81,
         ('java', 'certificate'): 81,
         ('java', 'keystore'): 75,
     

In [16]:
# Heatmap of co-occurrence counts: rows = crypto tag, columns = platform tag.
# One record per (tag1, tag2) pair from the Counter ...
pair_rows = pd.DataFrame({
    'tag1': [pair[0] for pair in c_tuples_top_crypto],
    'tag2': [pair[1] for pair in c_tuples_top_crypto],
    'count': list(c_tuples_top_crypto.values()),
})
# ... pivoted to a tag2 x tag1 matrix; pairs that never occur become 0.
df_tags_top_crypto = pair_rows.pivot(index='tag2', columns='tag1', values='count').fillna(0)
display(df_tags_top_crypto)

fig, ax_tags = plt.subplots(figsize=(2, 11))
sns.heatmap(df_tags_top_crypto, annot=True, fmt='g', cmap="Blues", cbar=False, ax=ax_tags)
ax_tags.set(xlabel="", ylabel="")
ax_tags.xaxis.tick_top()  # platform labels above the matrix
plt.savefig('plots/fischer_heatmap_top50.pdf', bbox_inches='tight')

tag1,android,java
tag2,Unnamed: 1_level_1,Unnamed: 2_level_1
3des,15,81
aes,528,1684
aes-gcm,12,48
authentication,29,44
badpaddingexception,23,53
blowfish,6,68
bouncycastle,47,219
c++,11,30
cbc-mode,11,46
certificate,47,81


In [17]:
# Immutable lookup sets used for the per-question membership test below.
frozen_tags = frozenset(('android', 'java'))              # platform tags
frozen_l_tags_top_crypto = frozenset(l_tags_top_crypto)   # top-50 crypto tags

In [18]:
%%timeit

# Slower version
# df_all_tags['Crypto'] = df_all_tags['Tags'].map(lambda x: bool(frozen_l_tags_top_crypto & set(x)) and bool(frozen_tags & set(x)))

def isCryptoSet(x):
    return bool(frozen_l_tags_top_crypto & set(x)) and bool(frozen_tags & set(x))

vecIsCryptoSet = np.vectorize(isCryptoSet)
df_all_tags['Crypto'] = vecIsCryptoSet(df_all_tags['Tags'])

13.7 s ± 588 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# How many questions were flagged (True) versus not flagged (False).
df_all_tags['Crypto'].value_counts()

Crypto
False    23792905
True        83838
Name: count, dtype: int64

In [20]:
# Inspect the flagged questions ('Crypto' is a boolean column, so it is the mask itself).
display(df_all_tags.loc[df_all_tags['Crypto']])

Unnamed: 0,QuestionId,Tags,CreationDate,Crypto
312,3049,"[java, c++, windows, unix]",2008-08-06 03:19:19,True
790,8318,"[java, security, jboss]",2008-08-11 22:53:50,True
1401,14617,"[java, ftp, sftp, security]",2008-08-18 13:43:48,True
1724,17944,"[java, c#, c++, math]",2008-08-20 13:27:40,True
1852,19347,"[c#, java, c++, visual-studio]",2008-08-21 04:46:02,True
...,...,...,...,...
23875210,77028536,"[java, c++, recursion, static]",2023-09-02 13:45:39,True
23875352,77028842,"[android, image, android-studio, android-asset...",2023-09-02 15:16:20,True
23875466,77029070,"[android, authentication, authorization, maui]",2023-09-02 16:23:42,True
23875562,77029281,"[java, google-play-console, public-key-encrypt...",2023-09-02 17:17:12,True


In [21]:
# Flagged questions only, indexed by creation timestamp.
df_all_tags_crypto = (
    df_all_tags.loc[df_all_tags['Crypto'], ['CreationDate', 'Crypto']]
    .set_index('CreationDate')
)
# Make sure the index is a DatetimeIndex so it can be sliced and resampled.
df_all_tags_crypto.index = pd.to_datetime(df_all_tags_crypto.index)
# Restrict to the window strictly between 2008-09-01 00:00 and 2023-09-01 00:00.
after_start = df_all_tags_crypto.index > "2008-09-01"
before_end = df_all_tags_crypto.index < "2023-09-01"
df_all_tags_crypto = df_all_tags_crypto[after_start & before_end]
display(df_all_tags_crypto)

Unnamed: 0_level_0,Crypto
CreationDate,Unnamed: 1_level_1
2008-09-04 15:21:44,True
2008-09-05 01:50:33,True
2008-09-08 06:13:19,True
2008-09-08 11:04:13,True
2008-09-09 05:52:23,True
...,...
2023-08-31 12:47:18,True
2023-08-31 14:53:25,True
2023-08-31 19:01:27,True
2023-08-31 19:44:15,True


In [22]:
# Group by month: resample into month-end ('ME') bins and sum the boolean flag.
# Every remaining row is True, so the sum is the number of flagged posts per month.
# NOTE: the name is reused - from here on df_all_tags_crypto holds monthly counts,
# labelled with the last day of each month.
df_all_tags_crypto = df_all_tags_crypto.resample('ME').sum()
display(df_all_tags_crypto)

Unnamed: 0_level_0,Crypto
CreationDate,Unnamed: 1_level_1
2008-09-30,54
2008-10-31,40
2008-11-30,37
2008-12-31,33
2009-01-31,54
...,...
2023-04-30,155
2023-05-31,194
2023-06-30,160
2023-07-31,151


In [23]:
# KPSS test on the monthly counts; regression='c' tests stationarity around a constant.
# The result is the tuple (test statistic, p-value, lags used, critical values).
from statsmodels.tsa.stattools import kpss
kpss(df_all_tags_crypto['Crypto'], regression='c')

(0.48768237486517385,
 0.04444090656189779,
 9,
 {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})

In [24]:
from scipy import stats

# Ordinary least-squares trend of the monthly counts against the month number 0..n-1,
# so the slope is expressed in posts per month.
y = df_all_tags_crypto['Crypto'].to_numpy()
x = np.arange(y.size)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(slope, intercept, r_value, p_value, std_err)
# Coefficient of determination (r squared)
print("r-squared:", r_value**2)

-0.7855057254853547 535.8694290976059 -0.1575727080600621 0.0346395406186459 0.3689765012275969
r-squared: 0.02482915832538156


In [25]:
# Time series of flagged posts per month, with the two dataset dates marked.
fig, ax_all_tags_crypto = plt.subplots(figsize=(10, 4))
sns.lineplot(data=df_all_tags_crypto, x='CreationDate', y='Crypto', ax=ax_all_tags_crypto)
ax_all_tags_crypto.set(xlabel="", ylabel="New posts with crypto-related tags / month")
plt.autoscale(enable=True, axis='x', tight=True)
# Disabled trend line (kept for reference):
# x1, x2 = ax_all_tags_crypto.get_xlim()
# ax_all_tags_crypto.axline((x1, intercept), (x2,intercept+(slope/30)*(x2-x1)), color='red', ls='--') # Slope was per month, hence, we divide by 30 to get per day, which is the x-axis scale
sns.despine(left=True, bottom=True)

# Dotted vertical marker plus label for each dataset date (original note: "Fischer: Java").
dataset_markers = (
    (pd.Timestamp('2018-03-30'), "DS$_{2018}$"),
    (pd.Timestamp('2016-03-30'), "DS$_{2016}$"),
)
for marker_date, marker_label in dataset_markers:
    ax_all_tags_crypto.axvline(x=marker_date, color='black', linestyle=':')
    ax_all_tags_crypto.annotate(
        marker_label,
        xy=(marker_date, 50),
        ha='center',
        va='center',
        fontsize=12,
        color='black',
        backgroundcolor='white',
        annotation_clip=False,
    )

plt.savefig('plots/crypto_tag_timeseries.pdf', bbox_inches='tight')

In [26]:
# Posts with crypto-related tags before Fischer's DS2016.
# NOTE(review): the frame holds monthly buckets labelled with the month end, so the
# March 2016 bucket (label 2016-03-31) is excluded as a whole - the cut-off is
# effectively end of February 2016, not 2016-03-30.
df_all_tags_crypto.loc[df_all_tags_crypto.index < "2016-03-30", 'Crypto'].sum()

48888

In [27]:
# Posts with crypto-related tags after Fischer's DS2016.
# NOTE(review): the frame holds monthly buckets labelled with the month end, so the
# whole March 2016 bucket (label 2016-03-31) is counted here, including posts
# created on 1-30 March 2016.
df_all_tags_crypto.loc[df_all_tags_crypto.index > "2016-03-30", 'Crypto'].sum()

34914

In [28]:
# Growth of the cumulative post count relative to the count before DS2016:
# total / before - 1 (the "before" bucket boundary is month-granular, see index labels).
posts_total = df_all_tags_crypto['Crypto'].sum()
posts_before_ds2016 = df_all_tags_crypto.loc[df_all_tags_crypto.index < "2016-03-30", 'Crypto'].sum()
posts_total / posts_before_ds2016 - 1

0.7141629847815414

In [29]:
# Monthly counts up to the DS2016 date (month-end labels before 2016-03-30),
# followed by a KPSS test (constant-only regression) on that sub-series.
before_ds2016 = df_all_tags_crypto.index < "2016-03-30"
df_ds2016 = df_all_tags_crypto.loc[before_ds2016]
display(df_ds2016)
kpss(df_ds2016['Crypto'], regression='c')

Unnamed: 0_level_0,Crypto
CreationDate,Unnamed: 1_level_1
2008-09-30,54
2008-10-31,40
2008-11-30,37
2008-12-31,33
2009-01-31,54
...,...
2015-10-31,824
2015-11-30,789
2015-12-31,779
2016-01-31,769


look-up table. The actual p-value is smaller than the p-value returned.

  kpss(df_ds2016['Crypto'], regression='c')


(1.453918315105839,
 0.01,
 5,
 {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})

In [30]:
# Monthly counts up to the DS2018 date (month-end labels before 2018-03-30),
# followed by a KPSS test (constant-only regression) on that sub-series.
before_ds2018 = df_all_tags_crypto.index < "2018-03-30"
df_ds2018 = df_all_tags_crypto.loc[before_ds2018]
display(df_ds2018)
kpss(df_ds2018['Crypto'], regression='c')

Unnamed: 0_level_0,Crypto
CreationDate,Unnamed: 1_level_1
2008-09-30,54
2008-10-31,40
2008-11-30,37
2008-12-31,33
2009-01-31,54
...,...
2017-10-31,611
2017-11-30,572
2017-12-31,468
2018-01-31,537


look-up table. The actual p-value is smaller than the p-value returned.

  kpss(df_ds2018['Crypto'], regression='c')


(1.151115714464594,
 0.01,
 6,
 {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})

In [31]:
# Frequency of every tag across all questions.
c_all_tags = Counter()
for question_tags in df_all_tags['Tags']:
    c_all_tags.update(question_tags)
display(c_all_tags)

Counter({'javascript': 2510216,
         'python': 2160775,
         'java': 1905586,
         'c#': 1601044,
         'php': 1461453,
         'android': 1409229,
         'html': 1179886,
         'jquery': 1034419,
         'c++': 798607,
         'css': 797541,
         'ios': 683117,
         'sql': 664893,
         'mysql': 661065,
         'r': 496409,
         'node.js': 466595,
         'reactjs': 465315,
         'arrays': 414660,
         'c': 398933,
         'asp.net': 373696,
         'json': 357859,
         'python-3.x': 340880,
         'ruby-on-rails': 337299,
         '.net': 333751,
         'sql-server': 331828,
         'swift': 329848,
         'django': 308976,
         'angular': 299176,
         'objective-c': 292443,
         'pandas': 282739,
         'excel': 281792,
         'angularjs': 262777,
         'regex': 258880,
         'ruby': 228384,
         'linux': 225973,
         'ajax': 221872,
         'typescript': 221483,
         'iphone': 221320,
   

In [32]:
# Tag frequency table, most frequent first, with a popularity rank.
df_tags = (
    pd.DataFrame.from_dict(c_all_tags, orient='index')
    .reset_index()
    .set_axis(['tag', 'count'], axis=1)
    .sort_values('count', ascending=False)
)
# Default ranking method is 'average': tags with equal counts share a (possibly fractional) rank.
df_tags['rank'] = df_tags['count'].rank(ascending=False)
display(df_tags)

Unnamed: 0,tag,count,rank
208,javascript,2510216,1.0
123,python,2160775,2.0
78,java,1905586,3.0
0,c#,1601044,4.0
45,php,1461453,5.0
...,...,...,...
63530,bsb-framework,1,64170.0
60792,html5-pattern,1,64170.0
63525,magic.link,1,64170.0
51451,analytics-engine-python-sdk,1,64170.0


In [33]:
# Overall popularity rank of each crypto tag. The tags are taken from the row index
# of the heatmap matrix (df_tags_top_crypto), not from l_tags_top_crypto directly.
rank_lookup = df_tags.set_index('tag')['rank']
df_tags_top_crypto_rank = pd.DataFrame({'Tag': df_tags_top_crypto.index.tolist()})
# astype(int) truncates fractional (tied) ranks and raises if a tag has no rank.
df_tags_top_crypto_rank['Rank'] = df_tags_top_crypto_rank['Tag'].map(rank_lookup).astype(int)
df_tags_top_crypto_rank = df_tags_top_crypto_rank.sort_values('Rank')
display(df_tags_top_crypto_rank)

Unnamed: 0,Tag,Rank
7,c++,9
48,vb.net,60
22,image,70
3,authentication,110
37,security,145
42,ssl,171
14,encryption,241
19,https,361
16,hash,380
30,openssl,551


In [34]:
# Count (platform, other tag) co-occurrences over all questions that carry
# 'android' and/or 'java'. The platform tags themselves never appear as the
# "other" tag, so ('android', 'java') / ('java', 'android') are not counted.
c_all_tuples = Counter()
platform_order = ('android', 'java')  # android pairs are emitted before java pairs
for question_tags in df_all_tags['Tags']:
    present = [platform for platform in platform_order if platform in question_tags]
    if not present:
        continue
    # Build a NEW list of the remaining tags. The previous version called
    # tag_list.remove(...) on the list object stored inside df_all_tags['Tags'],
    # which permanently stripped 'android'/'java' from the DataFrame: re-running
    # this cell produced an empty Counter and re-running earlier cells
    # (e.g. the Crypto flag or the tag counts) gave different results.
    other_tags = [tag for tag in question_tags if tag not in platform_order]
    for platform in present:
        c_all_tuples.update((platform, tag) for tag in other_tags)
display(c_all_tuples)

Counter({('java', 'spring'): 128083,
         ('java', 'swing'): 79498,
         ('java', 'spring-boot'): 74193,
         ('android', 'android-studio'): 67465,
         ('java', 'eclipse'): 59401,
         ('java', 'hibernate'): 58764,
         ('android', 'android-layout'): 57068,
         ('android', 'kotlin'): 55218,
         ('java', 'arrays'): 52237,
         ('java', 'maven'): 47781,
         ('android', 'android-fragments'): 44034,
         ('java', 'multithreading'): 39271,
         ('java', 'json'): 39210,
         ('android', 'firebase'): 36870,
         ('java', 'xml'): 36507,
         ('android', 'listview'): 34768,
         ('java', 'string'): 32596,
         ('java', 'spring-mvc'): 32112,
         ('java', 'jpa'): 31324,
         ('android', 'xml'): 31314,
         ('java', 'mysql'): 30590,
         ('android', 'android-intent'): 29824,
         ('android', 'sqlite'): 29186,
         ('android', 'android-activity'): 27768,
         ('android', 'android-recyclerview'): 276

In [35]:
# Table of all (platform, tag) pairs: count, both tag names, crypto flag and rank.
df_all_tuples = pd.DataFrame({
    'count': list(c_all_tuples.values()),
    'tag1': [pair[0] for pair in c_all_tuples],   # platform tag
    'tag2': [pair[1] for pair in c_all_tuples],   # co-occurring tag
})
# A pair is "crypto" when its second tag is in the top-50 crypto list.
df_all_tuples['crypto'] = df_all_tuples['tag2'].isin(l_tags_top_crypto)
# Rank 1 = most frequent pair; ties share an average rank.
df_all_tuples['rank'] = df_all_tuples['count'].rank(ascending=False)
df_all_tuples = df_all_tuples.sort_values('rank')
display(df_all_tuples)

Unnamed: 0,count,tag1,tag2,crypto,rank
73,128083,java,spring,False,1.0
16,79498,java,swing,False,2.0
17292,74193,java,spring-boot,False,3.0
9605,67465,android,android-studio,False,4.0
44,59401,java,eclipse,False,5.0
...,...,...,...,...,...
32956,1,java,access-protection,False,42940.5
17518,1,android,plugin-pattern,False,42940.5
40097,1,android,browsermob-proxy,False,42940.5
17509,1,android,ecma262,False,42940.5


In [36]:
# LaTeX table of the ten most frequent (platform, tag) pairs overall.
df_all_tuples_10 = (
    df_all_tuples.head(10)
    .drop(columns='crypto')
    .reset_index(drop=True)
)
df_all_tuples_10.columns = ['Count', 'Tag1', 'Tag2', 'Rank']
df_all_tuples_10['Rank'] = df_all_tuples_10['Rank'].astype(int)
# Thousands separators for readability (turns Count into strings).
df_all_tuples_10['Count'] = df_all_tuples_10['Count'].map("{:,}".format)
print(df_all_tuples_10.to_latex(index=False))

\begin{tabular}{lllr}
\toprule
Count & Tag1 & Tag2 & Rank \\
\midrule
128,083 & java & spring & 1 \\
79,498 & java & swing & 2 \\
74,193 & java & spring-boot & 3 \\
67,465 & android & android-studio & 4 \\
59,401 & java & eclipse & 5 \\
58,764 & java & hibernate & 6 \\
57,068 & android & android-layout & 7 \\
55,218 & android & kotlin & 8 \\
52,237 & java & arrays & 9 \\
47,781 & java & maven & 10 \\
\bottomrule
\end{tabular}



In [37]:
# Pairs whose second tag is a top-50 crypto tag ('crypto' is boolean, so it is the mask).
df_all_tuples[df_all_tuples['crypto']]

Unnamed: 0,count,tag1,tag2,crypto,rank
2379,10932,android,image,True,85.0
175,9098,java,image,True,111.0
22,7947,java,c++,True,130.0
598,7554,java,ssl,True,143.0
163,6936,java,encryption,True,159.0
...,...,...,...,...,...
7524,11,android,blowfish,True,17678.5
18563,10,android,rijndael,True,18416.0
15726,8,android,tripledes,True,20214.5
25468,6,android,cbc-mode,True,22748.0


In [38]:
# LaTeX table of the 20 most frequent crypto-related (platform, tag) pairs.
# df_all_tuples is already sorted by rank, so head(20) takes the top 20.
df_all_tuples_latex = (
    df_all_tuples[df_all_tuples['crypto']]
    .drop(columns='crypto')
    .head(20)
    .reset_index(drop=True)
)
df_all_tuples_latex.columns = ['Count', 'Tag1', 'Tag2', 'Rank']
df_all_tuples_latex['Rank'] = df_all_tuples_latex['Rank'].astype(int)
print(df_all_tuples_latex.to_latex(index=False))

\begin{tabular}{rllr}
\toprule
Count & Tag1 & Tag2 & Rank \\
\midrule
10932 & android & image & 85 \\
9098 & java & image & 111 \\
7947 & java & c++ & 130 \\
7554 & java & ssl & 143 \\
6936 & java & encryption & 159 \\
6180 & java & security & 189 \\
6120 & android & c++ & 191 \\
4477 & java & authentication & 263 \\
3382 & android & authentication & 330 \\
2767 & java & cryptography & 399 \\
2600 & android & security & 426 \\
2514 & android & encryption & 446 \\
2388 & java & https & 472 \\
2333 & android & ssl & 488 \\
1958 & java & httpurlconnection & 592 \\
1911 & java & hash & 609 \\
1854 & java & httpclient & 629 \\
1833 & java & aes & 634 \\
1788 & android & httpurlconnection & 651 \\
1755 & java & bouncycastle & 666 \\
\bottomrule
\end{tabular}

