In [32]:
import numpy as np
import pandas as pd

In [33]:
df_tags = pd.read_csv('./Dataset/tags.txt', sep='\t', memory_map=True)

In [34]:
df_tags.head()

Unnamed: 0,Id,Tags
0,2288942,user-interface
1,4433442,canvas
2,6381190,shortcut
3,27466868,jboss7.x
4,8375328,math


In [35]:
df_tags.Tags = df_tags.Tags.astype(str)
df_tags.Id.value_counts()

15437010    4
1064729     4
7315033     4
2370817     4
12864781    4
           ..
21897196    1
24041461    1
17105519    1
958460      1
12584959    1
Name: Id, Length: 762142, dtype: int64

# make map of tags -> list of occurence

In [36]:
def make_tags_list(x):
    return x.Tags.values
list_que_tags = df_tags.groupby("Id").apply(lambda x: make_tags_list(x))

In [37]:
list_que_tags.head()
list_que_tags.shape

(762142,)

In [38]:
def make_tag_list_tags(x):
    return [list_que_tags[q_id] for q_id in x.Id.values]
tag_tags = df_tags.groupby("Tags").apply(lambda x: make_tag_list_tags(x))

In [46]:
one_list = np.concatenate(tag_tags['intellij-idea'])
onelist = np.unique(one_list, return_counts=True)
count_sort_ind = np.argsort(-onelist[1])
onelist = onelist[0][count_sort_ind]
onelist[onelist != 'intellij-idea'][:5]

array(['maven', 'android', 'eclipse', 'spring', 'tomcat'], dtype=object)

In [48]:
one_list = np.concatenate(tag_tags['jax-rs'])
onelist = np.unique(one_list, return_counts=True)
count_sort_ind = np.argsort(-onelist[1])
onelist = onelist[0][count_sort_ind]
onelist[onelist != 'jax-rs'][:5]

array(['rest', 'jersey', 'web-services', 'json', 'resteasy'], dtype=object)

In [49]:
one_list = np.concatenate(tag_tags['user-interface'])
onelist = np.unique(one_list, return_counts=True)
count_sort_ind = np.argsort(-onelist[1])
onelist = onelist[0][count_sort_ind]
onelist[onelist != 'user-interface'][:5]

array(['swing', 'android', 'jframe', 'netbeans', 'jpanel'], dtype=object)

# tag1 and tag2 distance

In [9]:
con_vec1 = np.concatenate(tag_tags['regex'])
con_vec2 = np.concatenate(tag_tags['static'])
similarity_set = np.append(con_vec1, con_vec2)
similarity_set = set(similarity_set)

In [10]:
vec2 = np.zeros(len(similarity_set))
vec1 = np.zeros(len(similarity_set))
for index,tag in enumerate(similarity_set):
    count = np.count_nonzero(con_vec1 == tag)
    vec1[index] += count

    count = np.count_nonzero(con_vec2 == tag)
    vec2[index] += count

In [11]:
dist = np.linalg.norm(vec1-vec2, 2)
dist / (len(con_vec1)+len(con_vec2))

0.4346567961089117

In [12]:
def get_dist(tag1, tag2):
    con_vec1 = np.concatenate(tag_tags[tag1])
    con_vec2 = np.concatenate(tag_tags[tag2])
    similarity_set = np.append(con_vec1, con_vec2)
    similarity_set = set(similarity_set)
    vec2 = np.zeros(len(similarity_set))
    vec1 = np.zeros(len(similarity_set))
    for index,tag in enumerate(similarity_set):
        count = np.count_nonzero(con_vec1 == tag)
        vec1[index] += count

        count = np.count_nonzero(con_vec2 == tag)
        vec2[index] += count
    dist = np.linalg.norm(vec1-vec2, 2)
    return dist / (len(con_vec1)+len(con_vec2))

In [30]:
print(get_dist('regex','static'))
print(get_dist('session','spring'))
print(get_dist('nullpointerexception','dependency-injection'))

0.4346567961089117
0.33204579536047457
0.2896672656814435


# list of top 10 distanced tags

In [52]:
i, j = np.triu_indices(len(tag_tags), 1)
dist_list = np.asanyarray([])
new_df = pd.DataFrame(np.stack([tag_tags.index[i], tag_tags.index[j]]).T, columns=['tag1','tag2'])

In [70]:
new_df['dist'] = new_df[:1000].apply(lambda x: get_dist(x.tag1,x.tag2), axis=1)

In [13]:
# sort tags by their occurence
new_df = pd.DataFrame({'tag':tag_tags.index,'ques':tag_tags.values})
new_df.head()

Unnamed: 0,tag,ques
0,.a,"[[c++, ubuntu, linux, .a]]"
1,.app,"[[dropbox, .app, api, download], [.app, osx, j..."
2,.bash-profile,"[[bash, .bash-profile], [maven, .bash-profile,..."
3,.class-file,"[[.class-file, jvm, bytecode, java-bytecode-as..."
4,.doc,"[[.doc, apache-poi, ms-word], [.doc, apache-po..."


In [14]:
def sort_tags_by_cnt(x):
    con_vec = np.concatenate(x.ques)
    con_vec = con_vec[con_vec != x.tag]
    con_vec = np.unique(con_vec, return_counts=True)
    try:
        x['tag1'] = con_vec[0][0]
        # x['tag2'] = con_vec[0][1]
        x['cnt1'] = con_vec[1][0]
        # x['cnt2'] = con_vec[1][1]
    except:
        pass
    return x

In [15]:
new_df = new_df.apply(lambda x: sort_tags_by_cnt(x), axis=1)

In [16]:
new_df.dropna(inplace=True)
new_df.sort_values(['cnt1'], ascending=False,inplace=True)

In [17]:
new_df.head()

Unnamed: 0,cnt1,ques,tag,tag1
2161,551.0,"[[c#, multithreading, synchronization, concurr...",c#,.net
154,156.0,"[[achartengine, android, charts], [achartengin...",achartengine,android
803,151.0,"[[android-volley, drupal-7, android, drupal], ...",android-volley,android
12024,144.0,"[[ormlite, rest, robospice, android], [ormlite...",ormlite,android
681,135.0,"[[android-mapview, overlay, android, itemizedo...",android-mapview,android


In [21]:
new_df = new_df[:20]

In [None]:
# select all prev tag and tag1 for top distance calculation
new_df['dist'] = new_df.apply(lambda x: get_dist(x.tag,x.tag1), axis=1)

In [31]:
new_df.sort_values(['dist']).drop(columns=['ques']).head(10)

Unnamed: 0,cnt1,tag,tag1,dist
2161,551.0,c#,.net,0.230577
898,99.0,apache-camel,activemq,0.253327
18021,79.0,wcf,.net,0.261574
16772,99.0,time-complexity,algorithm,0.359599
2113,96.0,bundle,android,0.387185
12024,144.0,ormlite,android,0.387297
154,156.0,achartengine,android,0.38751
803,151.0,android-volley,android,0.38751
681,135.0,android-mapview,android,0.387633
15685,89.0,sqlite3,android,0.387713
