In [1]:
!pip install -r requirements.txt



In [2]:
#-----------------------------------------------------------------------------------------
"""
Load the Libraries

"""


import numpy as np
import functools
from scipy.sparse import csr_matrix
import pickle
import rbo
from collections import Counter

filename = "project_author_files_dict.pkl"

mongo_db = np.load("mongo_top_200.npy")

In [3]:
def row_normalize(M):
    U = M
    for i in range(U.shape[0]):
        rowsum = U[i].sum()
        if rowsum > 0:
            U[i] = U[i] / rowsum
    return U

def custom_sort(a, b):
    if a[0]>b[0]:
        return -1
    elif a[0]==b[0] and a[1]<b[1]:
        return -1
    return 1

def repeated_items(lst,val):
    counts = Counter(lst)
    top_values = counts.most_common(val)
    return top_values

In [4]:
#-----------------------------------------------------------------------------------------

# Load the data from the pickle file
with open(filename, "rb") as f:
    project_author_file_dict = pickle.load(f)
print("Dictionary loaded from pickle file:")
# print(project_author_file_dict)

dict_items = list(project_author_file_dict.items())
# Take the first 20 elements from the list
first_20_elements = dict_items[:20]
# Convert the sliced list back into a dictionary
first_20_dict = dict(first_20_elements)

Dictionary loaded from pickle file:


In [5]:


def create_df_matrix(project_author_file_dict):

  unique_developers = set()
  all_files = []

  for project, authors_files in project_author_file_dict.items():
    for author, files in authors_files.items():
      unique_developers.add(author)
      all_files.extend(files)  # Extend the list with unique files from current project

  # Remove duplicates from all_files to ensure uniqueness
  unique_files = list(set(all_files))

  developer_index = {developer: idx for idx, developer in enumerate(unique_developers)}
  file_index = {file: idx for idx, file in enumerate(unique_files)}

  rows = []
  cols = []
  for project, authors_files in project_author_file_dict.items():
    for author, files in authors_files.items():
      dev_idx = developer_index[author]
      for file in files:
        file_idx = file_index[file]
        rows.append(dev_idx)
        cols.append(file_idx)

  df_matrix = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(unique_developers), len(unique_files)))
  return df_matrix, developer_index, file_index, all_files

In [6]:

# Call the function
df_matrix, developer_index, file_index ,all_files= create_df_matrix(first_20_dict)

# Print or inspect the outputs
print("DF Matrix:")
# print(df_matrix.toarray())  # Convert to dense array for easier inspection
print("\nDeveloper Index:")
# print(developer_index)
print("\nFile Index:")
# print(file_index)


#mongo_db = np.load("/content/drive/MyDrive/Thesis_Codes/mongo_top_200.npy")

DF Matrix:

Developer Index:

File Index:


In [7]:
#-----------------------------------------------------------------------------------------
"""
Copairs oriented Code

"""

def HRank_ASym_CoRanking_Modified(A, B, UAP, UP, MPUPC, thres ,alpha=0.20, beta=0.8, gamma=0.2):

  # Restart vector
  E_restart1 = np.full(len(A), 1/len(A))
  E_restart2 = np.full(len(B), 1/len(B))

  # Initial rank
  Vis_Prob1 = E_restart1.reshape(1, -1)
  Vis_Prob2 = E_restart2.reshape(1, -1)

  # Calculate MCP
  MCP = np.dot(UAP, np.dot(UP, MPUPC))

  # HRank iterations
  prev_iter1 = np.full(len(A), 1)
  prev_iter2 = np.full(len(B), 1)

  cn = 0
  while True:
    Vis_Prob1_copy = Vis_Prob1.copy()
    Vis_Prob2_copy = Vis_Prob2.copy()

    Vis_Prob1 = np.power(alpha * np.dot(Vis_Prob1_copy, MCP) + (1 - alpha) * E_restart1, gamma)
    Vis_Prob1 = Vis_Prob1 / np.sum(Vis_Prob1)  # Normalize
    Vis_Prob2 = np.power(beta * np.dot(Vis_Prob2_copy, MCP.T) + (1 - beta) * E_restart2, gamma)
    Vis_Prob2 = Vis_Prob2 / np.sum(Vis_Prob2)  # Normalize

    # Check for convergence
    diff = max(np.max(np.absolute(np.nan_to_num(prev_iter1 - Vis_Prob1))), np.max(np.absolute(np.nan_to_num(prev_iter2 - Vis_Prob2))))
    if diff < 0.0001:
      break

    prev_iter1 = Vis_Prob1
    prev_iter2 = Vis_Prob2
    cn += 1

  combined_scores = np.log(Vis_Prob1 * Vis_Prob2.T + 1e-10)  # Add a small value to avoid log(0)
  combined_scores = combined_scores - np.min(combined_scores)  # Normalize

  # Sort and extract co-ranked pairs
  rank = sorted(zip(combined_scores.flatten(), zip(A, B)), key=functools.cmp_to_key(custom_sort))

  # Retrieve co-ranked pairs
  coranked_pairs = [(dev, rev) for (_, (dev, rev)) in rank]
  filtered_pairs = [pair for pair in coranked_pairs if pair[0] != pair[1]]

  return coranked_pairs, filtered_pairs


developers = list(developer_index.keys())

my_list = all_files
top_files = repeated_items(my_list,len(developers))

reviewers = top_files

M_p1 = np.random.rand(len(reviewers), len(developers))
M_p2 = np.random.rand(len(developers), len(reviewers))

# Call the HRank_ASym function with the extracted information
coranked_pairs, filtered_pairs = HRank_ASym_CoRanking_Modified(developers, reviewers, M_p1, M_p2 ,M_p2,thres=200)

# Retrieve names and IDs of developers from the developer_index dictionary
developers_info = [(developer_index[developer], developer) for developer, reviewer in coranked_pairs]

# Print or use the results as needed
print("Co-ranked pairs:", coranked_pairs)

Co-ranked pairs: [(ObjectId('636529d6e9a99c9121bca2a9'), (ObjectId('5b45f6e62123a77a2e256979'), 17)), (ObjectId('636529d6e9a99c9121bcb624'), (ObjectId('5b0fc400065f39020b8de36b'), 13)), (ObjectId('636529d6e9a99c9121bca028'), (ObjectId('5b4854a23463875e6a15d7cf'), 12)), (ObjectId('636529d6e9a99c9121bc360f'), (ObjectId('5b45f6e42123a77a30256945'), 16)), (ObjectId('636529d6e9a99c9121bc911c'), (ObjectId('5b23df48a1501a2548d0cd6a'), 12)), (ObjectId('636529d6e9a99c9121bca8e4'), (ObjectId('5b110f4ae5cffa4687f4b9cc'), 15)), (ObjectId('636529d6e9a99c9121bcae9b'), (ObjectId('5b45f6e62123a77a2725699a'), 14)), (ObjectId('636529d6e9a99c9121bba1c7'), (ObjectId('5b0fc403065f3902048de4f6'), 17)), (ObjectId('636529d6e9a99c9121bca9fc'), (ObjectId('5b27cbdaeee553456c86752f'), 19)), (ObjectId('636529d6e9a99c9121bbe3c8'), (ObjectId('5b0fc402065f3902028de4b2'), 14)), (ObjectId('636529d5e9a99c9121bb2ace'), (ObjectId('5b0fc40c065f3902098deb08'), 11)), (ObjectId('636529d6e9a99c9121bc9800'), (ObjectId('5b23df3c

In [8]:
corank_sorted_data = sorted(coranked_pairs, key=lambda x: x[1][1],reverse=True)


top_files_copair = []
for i in corank_sorted_data:
    top_files_copair.append(str(i[1][0]))


mongo_db_list= []
for i in mongo_db:
    mongo_db_list.append(str(i))


S = top_files_copair[:200]
T = mongo_db_list

print(f"Similarity of RBO value ---> {rbo.RankingSimilarity(S, T).rbo()}")

Similarity of RBO value ---> 0.9776243639460495
