In [7]:
%matplotlib
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import SparsePCA
from sklearn.decomposition import MiniBatchSparsePCA
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import FeatureAgglomeration
from sklearn import metrics
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from re import match
from sklearn import random_projection
import pandas as pd
import os  # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from scipy.sparse import coo_matrix

Using matplotlib backend: GTK3Agg


In [2]:
file_name='/home/chaomai/Documents/IdeaProjects/feature_location/data/AllDatajEdit4.3/Corpus-jEdit4.3/Corpus-jEdit4.3CorpusTransformedStemmed.OUT'
java_keywords_file_name = '/home/chaomai/Documents/IdeaProjects/feature_location/data/java_keywords.txt'
functions_file_name='/home/chaomai/Documents/IdeaProjects/feature_location/data/AllDatajEdit4.3/Corpus-jEdit4.3/Corpus-jEdit4.3.mapping'
query_file_name='/home/chaomai/Documents/IdeaProjects/feature_location/data/AllDatajEdit4.3/Queries-jEdit4.3ShortLongDescriptionCorpusTransformedStemmed.OUT'

In [3]:
file=open(file_name)
dataset=file.readlines()
print(len(dataset))

java_keywords_file=open(java_keywords_file_name)
keywords=java_keywords_file.readlines()
java_keywords=[word_tokenize(k)[0] for k in keywords]

functions_file=open(functions_file_name)
functions=functions_file.readlines()
jedit_functions=[word_tokenize(k)[0] for k in functions]

print(java_keywords[:10])
print(jedit_functions[:10])

query_file=open(query_file_name)
query=[q for q in query_file.readlines() if len(q)>1]

6413
['abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class', 'const']
['org.gjt.sp.jedit.gui.AbbrevEditor.AbbrevEditor', 'org.gjt.sp.jedit.gui.AbbrevEditor.getAbbrev', 'org.gjt.sp.jedit.gui.AbbrevEditor.setAbbrev', 'org.gjt.sp.jedit.gui.AbbrevEditor.getExpansion', 'org.gjt.sp.jedit.gui.AbbrevEditor.setExpansion', 'org.gjt.sp.jedit.gui.AbbrevEditor.getAbbrevField', 'org.gjt.sp.jedit.gui.AbbrevEditor.getBeforeCaretTextArea', 'org.gjt.sp.jedit.gui.AbbrevEditor.getAfterCaretTextArea', 'org.gjt.sp.jedit.Abbrevs.getExpandOnInput', 'org.gjt.sp.jedit.Abbrevs.setExpandOnInput']


In [198]:
def tokenize_stop_stem(text):
    tokens = word_tokenize(text)
    filtered_tokens = []
    stop_words = stopwords.words('english')
    for token in tokens:
        if match('[a-zA-Z]*$', token) is not None and len(token) > 2 \
                and token not in stop_words \
                and token not in java_keywords:
            filtered_tokens.append(token)

    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [199]:
hasher = HashingVectorizer(n_features=5000, non_negative=True,
                           norm=None, binary=False,
                           tokenizer=tokenize_stop_stem, ngram_range=(1, 1))
hasher

HashingVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=5000, ngram_range=(1, 1),
         non_negative=True, norm=None, preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=<function tokenize_stop_stem at 0x7f364c566378>)

In [200]:
vectorizer = make_pipeline(hasher, TfidfTransformer())
vectorizer

Pipeline(steps=[('hashingvectorizer', HashingVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=5000, ngram_range=(1, 1),
         non_negative=True, norm=None, preprocessor=None, stop...'tfidftransformer', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True))])

In [201]:
X = vectorizer.fit_transform(dataset)
X.shape

(6413, 5000)

In [202]:
svd = TruncatedSVD(n_components=700)
lsa = make_pipeline(svd, Normalizer(copy=False))
X = lsa.fit_transform(X)
print(X.shape)
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

(6413, 700)
Explained variance of the SVD step: 95%


In [10]:
svd = TruncatedSVD(n_components=700)
lsa = make_pipeline(svd, Normalizer(copy=False))
X1=coo_matrix(X)
X1 = lsa.fit_transform(X1.tocsr())
print(X1.shape)
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

(6413, 700)
Explained variance of the SVD step: 94%


In [None]:
pca = SparsePCA(n_components=700, max_iter=300, n_jobs=4)
pca_pipe = make_pipeline(pca, Normalizer(copy=False))
X = pca_pipe.fit_transform(X.toarray())
print(X.shape)
explained_variance = pca.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [None]:
minibatchpca = MiniBatchSparsePCA(n_components=700, n_jobs=4)
minibatchpca_pipe = make_pipeline(minibatchpca, Normalizer(copy=False))
X = minibatchpca_pipe.fit_transform(X.toarray())
print(X.shape)
explained_variance = minibatchpca.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [None]:
fa = FeatureAgglomeration(n_clusters=700)
fa_pipe = make_pipeline(fa, Normalizer(copy=False))
X = fa_pipe.fit_transform(X.toarray())

In [None]:
km = KMeans(n_clusters=100, init='k-means++', max_iter=100, n_init=1, verbose=False, n_jobs=-1)
print(km)
X1=coo_matrix(X)
km.fit(X1.tocsr())
print(km.labels_.shape)
print("Silhouette Coefficient: %0.3f" % silhouette_score(X, km.labels_, sample_size=1000))

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=100, n_init=1,
    n_jobs=-1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)


In [203]:
km = MiniBatchKMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1, verbose=False)
print(km)
km.fit(X)
print(km.labels_.shape)
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, km.labels_, sample_size=2000))
# a measure of how tightly grouped all the data in the cluster are

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
(6413,)
Silhouette Coefficient: 0.049


In [204]:
Q = vectorizer.fit_transform(query)
type(Q)
Q1=Q * svd.components_.T

In [205]:
km.predict(Q1)

array([ 9,  9,  9, 19,  2,  9,  9,  9, 17,  9,  9, 12,  9,  9,  9,  9,  0,
        2,  9,  9, 13,  9,  9,  4,  9,  9,  9,  9, 11,  9,  9,  9,  9,  9,
        9,  9,  4,  9,  9,  9,  9, 15,  9,  9,  2,  2,  9,  9,  2, 18,  9,
        9,  4,  9,  9,  9,  9,  9,  5,  9, 13,  9,  9,  4, 10,  9, 10,  9,
        9,  9,  9,  9,  0, 11, 11,  9, 17,  9, 12,  2,  9, 12,  9,  9,  2,
        9,  9,  4,  9, 17,  9,  9,  9,  2,  9,  9,  2,  9,  9,  4,  9,  9,
        9,  9,  9,  6, 17,  9,  9,  9,  9,  6,  9,  6,  6,  6,  0,  2,  9,
       10, 10,  9,  9,  4, 10,  9,  9,  9,  9,  9,  9, 18,  9,  9,  9,  2,
       12, 10, 10,  9,  2,  9,  9, 10,  9,  9,  3,  9,  9,  9], dtype=int32)

In [206]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(20):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % ind, end='')
    print()

Top terms per cluster:
Cluster 0: 32 31 0 29 7 24 35 44 26 22
Cluster 1: 6 21 22 0 29 34 20 16 30 4
Cluster 2: 0 11 7 13 14 12 3 23 10 19
Cluster 3: 0 4 14 15 19 13 8 25 24 29
Cluster 4: 5 0 3 15 4 11 18 22 9 37
Cluster 5: 13 6 2 0 43 9 7 8 5 41
Cluster 6: 3 0 4 1 16 6 12 11 13 28
Cluster 7: 28 30 0 29 35 31 65 10 51 86
Cluster 8: 2 0 14 9 4 11 20 10 21 35
Cluster 9: 0 10 6 64 26 33 17 63 68 61
Cluster 10: 1 0 7 4 5 10 15 12 21 28
Cluster 11: 0 1 14 16 9 8 17 32 15 34
Cluster 12: 16 0 5 20 7 28 24 10 15 30
Cluster 13: 6 0 33 36 38 37 14 46 50 35
Cluster 14: 0 24 26 40 57 1 17 11 112 95
Cluster 15: 17 0 3 15 7 21 20 18 41 27
Cluster 16: 54 55 49 62 0 57 42 38 44 46
Cluster 17: 8 10 0 3 17 14 20 18 12 16
Cluster 18: 31 0 38 25 17 44 27 34 46 53
Cluster 19: 24 0 27 32 10 34 13 6 50 30


In [8]:
def tokenize_stop_stem(text):
    tokens = word_tokenize(text)
    filtered_tokens = []
    stop_words = stopwords.words('english')
    for token in tokens:
        if match('[a-zA-Z]*', token) is not None and len(token) > 2 \
                and token not in stop_words \
                and token not in java_keywords:
            filtered_tokens.append(token)

    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    tokens = [word.lower() for word in word_tokenize(text)]
    filtered_tokens = []
    stop_words = stopwords.words('english')
    for token in tokens:
        if match('[a-zA-Z]*', token) is not None and len(token) > 2 \
                and token not in stop_words \
                and token not in java_keywords:
            filtered_tokens.append(token)

    return filtered_tokens

In [9]:
totalvocab_stop_stemmed = []
totalvocab_tokenized = []
for i in range(len(dataset)):
    allwords_stemmed = tokenize_stop_stem(dataset[i])
    totalvocab_stop_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(dataset[i])
    totalvocab_tokenized.extend(allwords_tokenized)
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stop_stemmed)
print(len(totalvocab_stop_stemmed))
print(len(totalvocab_tokenized))

303038
303038


In [None]:
dist = 1 - cosine_similarity(X)
MDS()
# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
xs, ys = pos[:, 0], pos[:, 1]

In [209]:
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e',
                  5: '#1b9e77', 6: '#d95f02', 7: '#7570b3', 8: '#e7298a', 9: '#66a61e',
                  10: '#1b9e77', 11: '#d95f02', 12: '#7570b3', 13: '#e7298a', 14: '#66a61e',
                  15: '#1b9e77', 16: '#d95f02', 17: '#7570b3', 18: '#e7298a', 19: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4',
                 5: '0', 6: '1', 7: '2', 8: '3', 9: '4',
                 10: '0', 11: '1', 12: '2', 13: '3', 14: '4',
                 15: '0', 16: '1', 17: '2', 18: '3', 19: '4',
                }

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=km.labels_.tolist(), title=functions)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)