In [1]:
#Installing all the requirements needed
!pip install -r requirements.txt



In [2]:
#Importing the necessary packages and defining dependencies
import numpy as np
import functools
from scipy.sparse import csr_matrix
import pickle
import pymongo
from bson import ObjectId, decode_all

filename = "project_author_files_dict.pkl"



In [3]:


# Specify the filename of the pickle file

def row_normalize(M):
    U = M
    for i in range(U.shape[0]):
        rowsum = U[i].sum()
        if rowsum > 0:
            U[i] = U[i] / rowsum
    return U

def custom_sort(a, b):
    if a[0]>b[0]:
        return -1
    elif a[0]==b[0] and a[1]<b[1]:
        return -1
    return 1

In [4]:

# filename = "project_author_files_dict.pkl"
# Load the data from the pickle file
with open(filename, "rb") as f:
    project_author_file_dict = pickle.load(f)
print("Dictionary loaded from pickle file:")

Dictionary loaded from pickle file:


In [5]:
dict_items = list(project_author_file_dict.items())
# Take the first 20 elements from the list
first_20_elements = dict_items[:20]
# Convert the sliced list back into a dictionary
first_20_dict = dict(first_20_elements)


In [6]:

def create_df_matrix(project_author_file_dict):
    unique_developers = set()
    unique_files = set()
    for project, authors_files in project_author_file_dict.items():
        for author, files in authors_files.items():
            unique_developers.add(author)
            unique_files.update(files)

    developer_index = {developer: idx for idx, developer in enumerate(unique_developers)}
    file_index = {file: idx for idx, file in enumerate(unique_files)}
    rows = []
    cols = []
    for project, authors_files in project_author_file_dict.items():
        for author, files in authors_files.items():
            dev_idx = developer_index[author]
            for file in files:
                file_idx = file_index[file]
                rows.append(dev_idx)
                cols.append(file_idx)
    df_matrix = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(len(unique_developers), len(unique_files)))
    return df_matrix, developer_index, file_index


# Call the function
df_matrix, developer_index, file_index = create_df_matrix(first_20_dict)

In [7]:
# Print or inspect the outputs
print("DF Matrix:")
print(df_matrix.toarray())  # Convert to dense array for easier inspection
print("\nDeveloper Index:")
print(developer_index)
print("\nFile Index:")
print(file_index)

DF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

Developer Index:
{ObjectId('636529d6e9a99c9121bbc579'): 0, ObjectId('636529d6e9a99c9121bc92c1'): 1, ObjectId('636529d6e9a99c9121bcab78'): 2, ObjectId('636529d6e9a99c9121bc9d21'): 3, ObjectId('636529d6e9a99c9121bc97ff'): 4, ObjectId('636529d6e9a99c9121bc9f1a'): 5, ObjectId('636529d6e9a99c9121bcb71f'): 6, ObjectId('636529d6e9a99c9121bcb721'): 7, ObjectId('636529d6e9a99c9121bc9270'): 8, ObjectId('636529d6e9a99c9121bc90f2'): 9, ObjectId('636529d6e9a99c9121bc9f49'): 10, ObjectId('636529d6e9a99c9121bc929f'): 11, ObjectId('636529d6e9a99c9121bc910b'): 12, ObjectId('636529d6e9a99c9121bca8f8'): 13, ObjectId('636529d6e9a99c9121bcb747'): 14, ObjectId('636529d6e9a99c9121bc99b2'): 15, ObjectId('636529d6e9a99c9121bca0ac'): 16, ObjectId('636529d5e9a99c9121bb3d42'): 17, ObjectId('636529d5e9a99c9121bb381c'): 18, ObjectId('636529d6e9a99c91

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [8]:

# HRank ASym Function

def HRank_ASym(A, B, UAP, UP, MPUPC, alpha=0.15):
    # Restart vector
    E_restart1 = np.full(len(A), 1/len(A))
    E_restart2 = np.full(len(B), 1/len(B))

    # Initial rank
    Vis_Prob1 = E_restart1.reshape(1, -1)    # Reshape for consistency
    Vis_Prob2 = E_restart2.reshape(1, -1)

    # Calculate MCP
    MCP = np.dot(UAP, np.dot(UP, MPUPC))

    # HRank iterations
    prev_iter1 = np.full(len(A), 1)
    prev_iter2 = np.full(len(B), 1)

    cn = 0
    while True:
        Vis_Prob1_copy = Vis_Prob1.copy()
        Vis_Prob2_copy = Vis_Prob2.copy()

        # Updated equation incorporating the provided formula
        Vis_Prob1 = alpha * np.dot(Vis_Prob1_copy, MCP) + (1 - alpha) * E_restart1
        Vis_Prob2 = alpha * np.dot(Vis_Prob2_copy, MCP.T) + (1 - alpha) * E_restart2

        # Check for convergence
        diff = max(np.max(np.absolute(np.nan_to_num(prev_iter1 - Vis_Prob1))), np.max(np.absolute(np.nan_to_num(prev_iter2 - Vis_Prob2))))
        if diff < 0.0001:
            break

        prev_iter1 = Vis_Prob1
        prev_iter2 = Vis_Prob2
        cn += 1

    # Sort and extract ranks
    rank1 = sorted(zip(Vis_Prob1[0], A), key=functools.cmp_to_key(custom_sort))
    rank2 = sorted(zip(Vis_Prob2[0], B), key=functools.cmp_to_key(custom_sort))

    return [x[1] for x in rank1], [x[1] for x in rank2]


In [9]:
import warnings
warnings.filterwarnings("ignore")
#Try 2
developers = list(developer_index.keys())
files = list(file_index.keys())
# Assuming developers are the same as reviewers
reviewers = developers


M_p1 = np.random.rand(len(reviewers), len(developers))
M_p2 = np.random.rand(len(developers), len(reviewers))
# # Call the HRank_ASym function with the extracted information
ranked_developers, _ = HRank_ASym(developers, reviewers, M_p1, M_p2 ,M_p2)
# Retrieve names and IDs of developers from the developer_index dictionary
developers_info = [(developer_index[developer], developer) for developer in ranked_developers]
# Sort the developers based on their ranks
print(ranked_developers)

[ObjectId('636529d5e9a99c9121bb254c'), ObjectId('636529d5e9a99c9121bb25d7'), ObjectId('636529d5e9a99c9121bb2718'), ObjectId('636529d5e9a99c9121bb2719'), ObjectId('636529d5e9a99c9121bb271a'), ObjectId('636529d5e9a99c9121bb271b'), ObjectId('636529d5e9a99c9121bb271c'), ObjectId('636529d5e9a99c9121bb271d'), ObjectId('636529d5e9a99c9121bb271e'), ObjectId('636529d5e9a99c9121bb271f'), ObjectId('636529d5e9a99c9121bb2720'), ObjectId('636529d5e9a99c9121bb2721'), ObjectId('636529d5e9a99c9121bb2722'), ObjectId('636529d5e9a99c9121bb2723'), ObjectId('636529d5e9a99c9121bb2724'), ObjectId('636529d5e9a99c9121bb2725'), ObjectId('636529d5e9a99c9121bb2726'), ObjectId('636529d5e9a99c9121bb2727'), ObjectId('636529d5e9a99c9121bb2728'), ObjectId('636529d5e9a99c9121bb2729'), ObjectId('636529d5e9a99c9121bb272a'), ObjectId('636529d5e9a99c9121bb272b'), ObjectId('636529d5e9a99c9121bb272c'), ObjectId('636529d5e9a99c9121bb272d'), ObjectId('636529d5e9a99c9121bb272e'), ObjectId('636529d5e9a99c9121bb272f'), ObjectId('6

In [10]:
import pymongo
from bson import ObjectId, decode_all

# Function to read and decode a BSON file
def read_bson_file(file_path):
    with open(file_path, 'rb') as f:
        data = f.read()
        documents = decode_all(data)
    return documents

# Load BSON data from files
final_identity_data = read_bson_file('final_identity.bson')
people_data = read_bson_file('people.bson')

# Convert to dictionaries for quick lookup
final_identity_dict = {entry['_id']: entry for entry in final_identity_data}
people_dict = {entry['_id']: entry for entry in people_data}

# Assuming ranked_developers is already present and contains developer IDs
# Extract top 20 ranked developer IDs
top_20_ids = [developer for developer in ranked_developers[:20]]

# Function to find people IDs from final_identity_data
def get_people_ids(final_identity_dict, dev_id):
    entry = final_identity_dict.get(ObjectId(dev_id))
    if entry:
        return entry['people']
    return []

# Function to find name and email by people ID from people_data
def find_name_and_email(people_dict, people_id):
    person = people_dict.get(ObjectId(people_id))
    if person:
        return person.get('name'), person.get('email')
    return None, None

# Initialize the results list
results = []
displayed_names = set()

# Match IDs in final_identity.bson and then in people.bson
for dev_id in top_20_ids:
    people_ids = get_people_ids(final_identity_dict, dev_id)
    for people_id in people_ids:
        name, email = find_name_and_email(people_dict, people_id)
        if name and email:
            name_lower = name.lower()
            if name_lower not in displayed_names:
                results.append({
                    'developer_id': dev_id,
                    'name': name,
                    'email': email
                })
                displayed_names.add(name_lower)

# Print the top 20 developers' details
for result in results:
    print(f"Developer ID: {result['developer_id']}, Name: {result['name']}, Email: {result['email']}")


Developer ID: 636529d5e9a99c9121bb254c, Name: dIon Gillard, Email: dion@multitask.com.au
Developer ID: 636529d5e9a99c9121bb254c, Name: dion, Email: dion@apache.org
Developer ID: 636529d5e9a99c9121bb254c, Name: dion gillard (JIRA), Email: commons-dev@jakarta.apache.org
Developer ID: 636529d5e9a99c9121bb254c, Name: dion.gillard@gmail.com, Email: dion.gillard@gmail.com
Developer ID: 636529d5e9a99c9121bb254c, Name: 'Dion Gillard', Email: dion.gillard@gmail.com
Developer ID: 636529d5e9a99c9121bb254c, Name: 'dion@multitask.com.au', Email: dion@multitask.com.au
Developer ID: 636529d5e9a99c9121bb254c, Name: dion.gillard, Email: dion.gillard@gmail.com
Developer ID: 636529d5e9a99c9121bb254c, Name: 'dIon Gillard ', Email: dion@multitask.com.au
Developer ID: 636529d5e9a99c9121bb254c, Name: dion_gillard, Email: dion_gillard@multitask.com.au
Developer ID: 636529d5e9a99c9121bb254c, Name: Dion  Gillard, Email: dion.gillard@gmail.com
Developer ID: 636529d5e9a99c9121bb25d7, Name: James Dumay, Email: jdu