In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy.sparse import load_npz,csr_matrix
from sklearn.metrics.pairwise import cosine_distances,cosine_similarity
import pickle
import matplotlib.pyplot as plt
import itertools

In [None]:
# Load the precomputed artefacts used by every section of this notebook.

# TF-IDF matrices, coerced to CSR so that rows can be sliced out below.
# presumably one row per repository: "source" built from source code and
# "text" from description/text files (see the plot titles further down) -- TODO confirm
tfidf_source = csr_matrix(load_npz("../input/githubrecsys1/tfidf_source.npz"))
tfidf_text = csr_matrix(load_npz("../input/githubrecsys1/tfidf_text.npz"))
# Sparse matrix whose rows are indexed by users and columns by repositories below.
UB_matrix = load_npz("../input/githubrecsys1/UB_matrix.npz")
 
# Positions of the truthy entries of the saved validity arrays.
ind_usr = np.where(np.load("../input/githubrecsys1/valid_users.npy"))[0]
ind_repo = np.where(np.load("../input/githubrecsys1/valid_repos.npy"))[0]

# Keep only the valid users (rows) and the valid repositories (columns).
UB_matrix = UB_matrix[ind_usr[:,np.newaxis],ind_repo]
# Object array: the entries selected by repository index are later iterated to
# collect user indices.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only
# load this file from a trusted source.
repo_hash = np.load("../input/githubrecsys1/repo_hash.npy",allow_pickle = True)
# Sparse per-user feature matrix; combined with w and b when users are scored.
user_features = load_npz("../input/githubrecsys1/Xevent.npz")

# User names, one per line, restricted to the valid users.
with open("../input/githubrecsys1/users.txt",'r') as f:
    usr = [x.strip() for x in f]
usr = np.array(usr)[ind_usr]

# Repository names/paths, one per line, restricted to the valid repositories.
with open("../input/githubrecsys1/repos.txt",'r') as f:
    txt = [x.strip() for x in f]
txt = np.array(txt)[ind_repo]

# Weights and bias applied to user_features (user_features * w + b) when scoring.
w = np.load("../input/githubrecsys1/w.npy")
b = np.load("../input/githubrecsys1/b.npy")

# The query repository is row 25 of each TF-IDF matrix.
# NOTE(review): 25 is a hard-coded example index -- confirm it is intentional.
query_text = tfidf_text[25]
query_source = tfidf_source[25]

# NearestNeighbors Brute with Cosine/Pearson Similarity

In [1]:
#____________________________________________________________________________
#
#                           Hyperparameters
#
#____________________________________________________________________________
K = 200          # number of neighbours retrieved by each model
Pearson = False  # when True, subtract the per-column mean before fitting
Lmbda = 0.35     # weight of the text similarity; (1 - Lmbda) weights the source similarity
assert 0 <= Lmbda <= 1, "Lambda should be in the range [0,1]"
#____________________________________________________________________________

if Pearson:
    # Subtracting the column means produces a dense result, which is wrapped
    # back into CSR so that the rest of the notebook can keep slicing rows.
    # NOTE(review): query_text / query_source were sliced from the matrices
    # before this centring, so they stay uncentred -- confirm whether the
    # queries should be re-sliced here.
    tfidf_text = csr_matrix(tfidf_text - tfidf_text.mean(axis = 0))
    tfidf_source = csr_matrix(tfidf_source - tfidf_source.mean(axis = 0))

# Use the built-in "cosine" metric with brute-force search. A callable metric
# must accept two 1-D vectors and return a single distance, which the
# pairwise-matrix function `cosine_distances` does not; the named metric is
# also evaluated in vectorised form on sparse input.
model1 = NearestNeighbors(n_neighbors = K, metric = "cosine", algorithm = "brute")
model1.fit(tfidf_text)
model2 = NearestNeighbors(n_neighbors = K, metric = "cosine", algorithm = "brute")
model2.fit(tfidf_source)

# Persist both fitted models; the context managers close the files even if
# pickling fails.
with open("KNN_text.pkl", "wb") as fh:
    pickle.dump(model1, fh)
with open("KNN_source.pkl", "wb") as fh:
    pickle.dump(model2, fh)

NameError: ignored

In [None]:
%%time
# Indices of the K nearest repositories to the query in each TF-IDF space.
out_txt = model1.kneighbors(query_text, return_distance=False)
out_src = model2.kneighbors(query_source, return_distance=False)

# Candidate repositories: sorted union of the two neighbour lists.
r_inds = sorted(set(out_txt.ravel()) | set(out_src.ravel()))
# Candidate users: every index listed in repo_hash for a candidate repository.
u_inds = sorted({user for users in repo_hash[r_inds] for user in users})

# Per-user factor: affine map of the user features squashed into (0, 2).
z = user_features[u_inds] * w + b
z = 1 + z / (1 + np.abs(z))

# Blend the text and source similarities of each candidate to the query.
text_sim = cosine_similarity(csr_matrix(tfidf_text)[r_inds], query_text)
source_sim = cosine_similarity(csr_matrix(tfidf_source)[r_inds], query_source)
sim = text_sim * Lmbda + (1 - Lmbda) * source_sim

# Project repository similarity onto users, then apply the per-user factor.
# NOTE(review): `* z` is row-wise only if z has the same shape as the
# (users x 1) product -- confirm the shape of w.
usr_score = UB_matrix[np.array(u_inds)[:, np.newaxis], r_inds] * sim * z
# (score, user name) pairs, best first.
usr_score = [(row[0], name) for row, name in zip(usr_score, usr[u_inds])]
usr_score.sort(reverse=True)

In [None]:
%%timeit
# Same pipeline as the scoring cell, minus the per-user factor z, so that
# only the neighbour search and similarity projection are timed.
out_txt = model1.kneighbors(query_text, return_distance=False)
out_src = model2.kneighbors(query_source, return_distance=False)

# Sorted union of both neighbour lists, and the users attached to them.
r_inds = sorted(set(out_txt.ravel()) | set(out_src.ravel()))
u_inds = sorted({user for users in repo_hash[r_inds] for user in users})

# Lmbda-weighted blend of text and source similarity to the query.
text_sim = cosine_similarity(csr_matrix(tfidf_text)[r_inds], query_text)
source_sim = cosine_similarity(csr_matrix(tfidf_source)[r_inds], query_source)
sim = text_sim * Lmbda + (1 - Lmbda) * source_sim

# (score, user name) pairs, best first.
usr_score = UB_matrix[np.array(u_inds)[:, np.newaxis], r_inds] * sim
usr_score = [(row[0], name) for row, name in zip(usr_score, usr[u_inds])]
usr_score.sort(reverse=True)

In [None]:
# Bar chart of the first LIM entries of usr_score, which holds
# (score, user name) pairs.
LIM = 50
top_scored = usr_score[:LIM]

x = [entry[1] for entry in top_scored]  # user names
y = [entry[0] for entry in top_scored]  # scores

plt.figure(figsize=(15, 15))
plt.bar(x, y)
_ = plt.xticks(rotation=90)

In [None]:
def _rank_by_similarity(neighbours, features, query):
    """Order neighbour indices by decreasing cosine similarity to the query.

    Parameters
    ----------
    neighbours : array-like of int
        Row indices into ``features``; any shape, flattened before use.
    features : sparse matrix
        Feature matrix the neighbours were retrieved from.
    query : sparse matrix
        Single-row query vector.

    Returns
    -------
    (list, list)
        Neighbour indices and their similarities, both sorted best first.
    """
    neighbours = np.asarray(neighbours).ravel()
    # Similarity is computed on the neighbours themselves, so every index is
    # paired with its own score.
    sims = cosine_similarity(csr_matrix(features)[neighbours], query).ravel()
    ranked = sorted(zip(sims, neighbours), reverse=True)
    return [idx for _, idx in ranked], [score for score, _ in ranked]


# Re-running this cell is safe: the helper accepts both the raw kneighbors
# output and the already-sorted lists produced by a previous run.
out_txt, sim_txt = _rank_by_similarity(out_txt, tfidf_text, query_text)
out_src, sim_src = _rank_by_similarity(out_src, tfidf_source, query_source)

In [None]:
# Two stacked bar charts: the LIM most similar repositories according to the
# text model (top) and the source model (bottom).
LIM = 75

fig = plt.figure(figsize=(15, 35))
fig.suptitle("Based on the Query Repository, the", fontsize=14)

panels = (
    (211, out_txt, sim_txt, "Most Similar repositories by Description/Text files content"),
    (212, out_src, sim_src, "Most Similar repositories by Source Code content"),
)
for position, neighbours, similarities, heading in panels:
    plt.subplot(position)
    # Bars are labelled with the repository names looked up from txt.
    plt.bar(txt[neighbours][:LIM], similarities[:LIM], alpha=0.7)
    plt.xticks(rotation=90)
    plt.ylabel("Similarity ---------->", fontsize=15)
    plt.xlabel("Repository Path", fontsize=15)
    plt.title(heading, fontsize=20)

fig.tight_layout(pad=5)

# NearestNeighbors Ball-Tree on SVD-reduced dimensions of normalized vectors of tfidf_text and tfidf_source

In [None]:
from sklearn.decomposition import TruncatedSVD
import scipy.spatial.distance as ssd

In [None]:
def _l2_normalize_rows(matrix):
    """Return a CSR copy of ``matrix`` with every row scaled to unit L2 norm.

    Rows whose norm is zero are left untouched (scale factor 0) instead of
    being divided by zero, so no inf/NaN values are introduced.
    """
    matrix = csr_matrix(matrix)
    norms = np.sqrt(np.asarray(matrix.multiply(matrix).sum(axis=1)).ravel())
    scale = np.divide(1.0, norms, out=np.zeros_like(norms), where=norms > 0)
    # Convert back to CSR explicitly so that later cells can slice rows
    # whatever sparse format the broadcasted multiply returns.
    return csr_matrix(matrix.multiply(scale[:, np.newaxis]))


tfidf_source = _l2_normalize_rows(tfidf_source)
tfidf_text = _l2_normalize_rows(tfidf_text)

In [None]:
# One 1024-component truncated SVD (ARPACK solver) per feature space:
# dec1 reduces the text TF-IDF matrix, dec2 the source TF-IDF matrix.
dec1 = TruncatedSVD(n_components=1024, algorithm='arpack')
dec2 = TruncatedSVD(n_components=1024, algorithm='arpack')

dec1.fit(tfidf_text)
dec2.fit(tfidf_source)

TruncatedSVD(algorithm='arpack', n_components=1024)

In [None]:
def _unit_rows(vectors):
    """Scale every row of a dense 2-D array to unit L2 norm (all-zero rows stay zero)."""
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return vectors / norms


# A ball tree needs a true metric, and cosine distance does not satisfy the
# triangle inequality. The indexed vectors are therefore scaled to unit length
# and searched with Euclidean distance. For unit-length x,
# ||x - q||^2 = 1 + ||q||^2 - 2 * ||q|| * cos(x, q), so the neighbour ranking
# equals the cosine ranking for any non-zero query, normalised or not.
# Distances returned by these models are Euclidean, not cosine.
model1 = NearestNeighbors(n_neighbors = K, metric = "euclidean", algorithm = "ball_tree")
model1.fit(_unit_rows(dec1.transform(tfidf_text)))
model2 = NearestNeighbors(n_neighbors = K, metric = "euclidean", algorithm = "ball_tree")
model2.fit(_unit_rows(dec2.transform(tfidf_source)))

# Persist the reducers and the fitted models; the context managers close the
# files even if pickling fails.
for artefact, path in (
    (dec1, "dec_text.pkl"),
    (dec2, "dec_source.pkl"),
    (model1, "KNN_cmp_text.pkl"),
    (model2, "KNN_cmp_source.pkl"),
):
    with open(path, "wb") as fh:
        pickle.dump(artefact, fh)

In [None]:
%%time
# Neighbours of the query in the SVD-reduced text and source spaces.
s1 = model1.kneighbors(dec1.transform(query_text),return_distance = False)
s2 = model2.kneighbors(dec2.transform(query_source),return_distance = False)

# Candidate repositories (sorted union of both neighbour lists) and the users
# listed for them in repo_hash.
r_inds = sorted(list(set(s1.ravel()).union(set(s2.ravel()))))
u_inds = sorted(list(set(itertools.chain.from_iterable(repo_hash[r_inds]))))

# Per-user factor. It must be computed after u_inds has been rebuilt for this
# query so that its rows line up with the rows taken from UB_matrix below.
z = user_features[u_inds] * w + b
z = z/(1 + np.abs(z)) + 1

# Lmbda-weighted blend of text and source similarity to the query.
sim = cosine_similarity(csr_matrix(tfidf_text)[r_inds],query_text) * Lmbda + \
        (1 - Lmbda) * cosine_similarity(csr_matrix(tfidf_source)[r_inds],query_source)
# (score, user name) pairs, best first.
usr_score = UB_matrix[np.array(u_inds)[:,np.newaxis],r_inds] * sim * z
usr_score = sorted([(x[0],y) for x,y in zip(usr_score,usr[u_inds])],reverse = True)

CPU times: user 635 ms, sys: 16.8 ms, total: 652 ms
Wall time: 651 ms


### IOU score of K-nearest indices found by K-Nearest Neighbors trained on reduced dimensions

In [None]:
# Intersection-over-union between the neighbour sets from the earlier search
# (out_txt / out_src) and the ones returned on the reduced vectors (s1 / s2).
text_ref, text_new = set(out_txt), set(s1.ravel())
source_ref, source_new = set(out_src), set(s2.ravel())

text_iou = len(text_ref & text_new) / len(text_ref | text_new)
source_iou = len(source_ref & source_new) / len(source_ref | source_new)

print("Text", text_iou, "", "Source", source_iou, sep="\n")

Text
0.7857142857142857

Source
0.8518518518518519


# NearestNeighbors LSHForest (Annoy index) on SVD-reduced dimensions of normalized vectors of tfidf_text and tfidf_source

In [None]:
from annoy import AnnoyIndex

In [None]:
################################ Hyperparameters ###################################################

n_trees = 256  # number of trees passed to AnnoyIndex.build for each index

#__________________________________________________________________________________________________

# One angular-distance Annoy index per feature space, over 1024-dimensional vectors.
model1 = AnnoyIndex(1024, "angular")
model2 = AnnoyIndex(1024, "angular")

# Insert every SVD-reduced row under its row number.
for index, reducer, features in ((model1, dec1, tfidf_text), (model2, dec2, tfidf_source)):
    for item_id, vector in enumerate(reducer.transform(features)):
        index.add_item(item_id, vector)

model1.build(n_trees)
model2.build(n_trees)

# Write both indices to disk.
model1.save("LSH_Text.ann")
model2.save("LSH_Source.ann")

True

In [None]:
%%time
# K approximate neighbours of the query from each Annoy index.
s1 = np.array(model1.get_nns_by_vector(dec1.transform(query_text)[0],K))
s2 = np.array(model2.get_nns_by_vector(dec2.transform(query_source)[0],K))

# Candidate repositories (sorted union of both neighbour lists) and the users
# listed for them in repo_hash.
r_inds = sorted(list(set(s1.ravel()).union(set(s2.ravel()))))
u_inds = sorted(list(set(itertools.chain.from_iterable(repo_hash[r_inds]))))

# Per-user factor. It must be computed after u_inds has been rebuilt for this
# query so that its rows line up with the rows taken from UB_matrix below.
z = user_features[u_inds] * w + b
z = z/(1 + np.abs(z)) + 1

# Lmbda-weighted blend of text and source similarity to the query.
sim = cosine_similarity(csr_matrix(tfidf_text)[r_inds],query_text) * Lmbda + \
        (1 - Lmbda) * cosine_similarity(csr_matrix(tfidf_source)[r_inds],query_source)
# (score, user name) pairs, best first.
usr_score = UB_matrix[np.array(u_inds)[:,np.newaxis],r_inds] * sim * z
usr_score = sorted([(x[0],y) for x,y in zip(usr_score,usr[u_inds])],reverse = True)

CPU times: user 146 ms, sys: 13.9 ms, total: 160 ms
Wall time: 159 ms


### IOU score of K-nearest indices found by the Annoy index built on reduced dimensions

In [None]:
# Intersection-over-union between the neighbour sets from the earlier search
# (out_txt / out_src) and the ones returned by the Annoy indices (s1 / s2).
text_ref, text_ann = set(out_txt), set(s1.ravel())
source_ref, source_ann = set(out_src), set(s2.ravel())

text_iou = len(text_ref & text_ann) / len(text_ref | text_ann)
source_iou = len(source_ref & source_ann) / len(source_ref | source_ann)

print("Text", text_iou, "", "Source", source_iou, sep="\n")

Text
0.7857142857142857

Source
0.6194331983805668


# Exponential-recency weight update

In [None]:
def update_weights(i, alpha=0.01):
    """Blend the features of user ``i`` into the global weights ``w`` and bias ``b``.

    The current response ``s = sum(x * w) + b`` of the user's feature row ``x``
    is turned into a gain ``g = 1 / (1 + s**2)``; ``w`` and ``b`` are then
    moved towards ``x * g`` and ``g`` with an exponential moving average.

    Parameters
    ----------
    i : int
        Row of ``user_features`` to learn from.
    alpha : float, optional
        Step size of the moving average, in ``[0, 1]``. Defaults to 0.01,
        the value that was previously hard-coded.

    Raises
    ------
    ValueError
        If ``alpha`` is outside ``[0, 1]``.

    Notes
    -----
    Rebinds the module-level ``w`` and ``b``; returns nothing.
    """
    global w, b
    if not 0 <= alpha <= 1:
        raise ValueError("alpha should be in the range [0,1]")

    features = user_features[i].toarray()[0]   # dense 1-D feature row of user i
    response = np.sum(features * w) + b
    gain = 1 / (1 + response ** 2)             # in (0, 1]; largest when response is 0

    w = (1 - alpha) * w + alpha * features * gain
    b = (1 - alpha) * b + alpha * gain